Project

General

Profile

parseObjectList.pl

the perl script that checks id uniqueness - Rob Nahf, 2012-07-13 08:55

Download (1.1 KB)

 
1
# a quick script to look at the uniqueness of identifiers returned from a node.
2
# It uses the start and count parameters to get the full object list in 
3
# successive calls to the node.
4

    
5

    
6
use strict;
use warnings;

# Base URL of the node whose object list is checked.
my $NODE = "https://cn-orc-1.dataone.org/cn";
# Number of objects requested per listObjects call.
my $pageSize = 10000;

# Request zero objects: only the total="N" attribute of the response is
# needed here.  Scalar context matters -- a list-context call could hand
# back just the first line of the response.
my $response = listObjects($NODE, 0, 0);
$response = '' unless defined $response;

my ($total) = $response =~ /total="(\d+)"/;
die "could not read total object count from $NODE\n" unless defined $total;
print "total objects for $NODE : $total\n";

my %idSet;              # identifier => number of times it was seen
my $runningTotal = 0;   # objectInfo entries consumed so far; also the next start offset

while ($runningTotal < $total) {
    my $page = listObjects($NODE, $runningTotal, $pageSize);
    $page = '' unless defined $page;

    # Everything before the first <objectInfo> is the list header; drop it.
    my @oinfos = split(/<objectInfo>/, $page);
    shift @oinfos;

    # An empty page would never advance the offset, so stop instead of
    # requesting the same page forever.
    unless (@oinfos) {
        warn "no objectInfo entries returned at start=$runningTotal; stopping early\n";
        last;
    }

    print STDERR "**running total = $runningTotal\n" if ($runningTotal % 10000 == 0);

    foreach my $oinfo (@oinfos) {
        # Non-greedy so the capture stops at the first closing tag.
        my ($id) = $oinfo =~ /<identifier>(.+?)<\/identifier>/;

        # Test definedness, not truth: "0" is a legal identifier.
        if (defined $id) {
            $idSet{$id}++;
        } else {
            # Show entries whose identifier could not be extracted.
            print "$oinfo\n";
        }
    }

    # Advance by the number of entries actually returned, whether or not an
    # identifier was found in each, so the next page never overlaps this one.
    $runningTotal += scalar @oinfos;
}
print "number of unique identifiers: " . scalar(keys %idSet) . "\n";

# Report every identifier that was returned more than once.
foreach my $id (sort keys %idSet) {
    print "  $id : $idSet{$id}\n" if $idSet{$id} > 1;
}
39

    
40
# Fetch one page of the object list from a node using curl.
#
#   $baseUrl - base URL of the node; "/v1/object" is appended to it
#   $start   - value sent as the "start" query parameter
#   $count   - value sent as the "count" query parameter
#
# Returns the response body as one string, in scalar and list context alike
# (backticks would return a list of lines in list context).
# Dies if curl cannot be started or exits with a non-zero status.
sub listObjects {
    my ($baseUrl, $start, $count) = @_;

    my $url = "$baseUrl/v1/object?start=$start&count=$count";

    # List-form pipe open runs curl directly with no shell in between, so
    # characters in the URL (quotes, '&', ...) cannot be reinterpreted.
    # --silent drops the progress meter; --show-error keeps real errors on STDERR.
    open(my $curl, '-|', 'curl', '--silent', '--show-error', $url)
        or die "cannot run curl: $!\n";

    # Slurp the whole response in one read.
    my $body = do { local $/; <$curl> };
    $body = '' unless defined $body;

    # close() on a pipe reports failure when the child exits non-zero.
    close($curl)
        or die "curl failed for $url (exit status " . ($? >> 8) . ")\n";

    return $body;
}
Add picture from clipboard (Maximum size: 14.8 MB)