parseObjectList.pl
1 |
# A quick script to check the uniqueness of the identifiers returned from a node.
# It uses the start and count parameters to fetch the full object list in
# successive calls to the node.
4 |
|
5 |
|
6 |
# Base URL of the node to query, and the number of objects to request per
# listObjects call.
my $NODE     = "https://cn-orc-1.dataone.org/cn";
my $pageSize = 10000;

# A count=0 request is issued only to read the total="N" attribute from the
# reply. The call is made in scalar context on purpose: in list context
# listObjects hands the command output back line by line, so the former
# "($response) = listObjects(...)" kept just the first line of the reply and
# lost the total whenever the attribute was not on that line.
my $response = listObjects($NODE, 0, 0);
my ($total) = defined $response ? $response =~ /total="(\d+)"/ : ();

# Without a total the paging loop below has nothing to iterate towards, so
# fail loudly instead of printing an empty count.
die "could not read the total object count from $NODE\n" unless defined $total;
print "total objects for $NODE : $total\n";
12 |
|
13 |
# Walk the object list one page at a time and count how often each identifier
# is returned. $runningTotal is the number of objectInfo entries consumed so
# far and doubles as the start offset of the next request.
my $runningTotal = 0;
my %idSet;

while ($runningTotal < $total) {
    my $response = listObjects($NODE, $runningTotal, $pageSize);
    $response = '' unless defined $response;

    # Everything before the first <objectInfo> is the list header; drop it.
    my @oinfos = split("<objectInfo>", $response);
    shift @oinfos;

    # Progress is reported once per page rather than only when the offset
    # happens to be a multiple of 10000.
    print STDERR "**running total = $runningTotal\n";

    # An empty page means nothing came back for this offset (failed request,
    # or a list shorter than the advertised total). Stop here instead of
    # requesting the same offset forever.
    if (!@oinfos) {
        warn "no objects returned at start=$runningTotal; "
           . "stopping before the advertised total of $total\n";
        last;
    }

    foreach my $oinfo (@oinfos) {
        # Count every entry, with or without a readable identifier, so the
        # next request starts exactly where this page ended. Counting only
        # parsed identifiers would make pages overlap and report duplicates
        # that the node never returned.
        $runningTotal++;

        # Non-greedy so the capture stops at the first closing tag; tested
        # with defined() so that an identifier of "0" is still counted.
        my ($id) = $oinfo =~ m{<identifier>(.+?)</identifier>};
        if (defined $id) {
            $idSet{$id}++;
        }
        else {
            # Show the raw entry so it can be inspected by hand.
            print "$oinfo\n";
        }
    }
}
32 |
# Report how many distinct identifiers were seen, then list, in sorted order,
# each identifier that was returned more than once with its occurrence count.
my $uniqueCount = keys %idSet;
print "number of unique identifiers: $uniqueCount\n";

for my $id (grep { $idSet{$_} > 1 } sort keys %idSet) {
    print " $id : $idSet{$id}\n";
}
39 |
|
40 |
# listObjects($baseUrl, $start, $count)
#
# Fetches "$baseUrl/v1/object?start=$start&count=$count" with curl and
# returns whatever curl wrote to standard output.
#
#   $baseUrl - base URL of the node, without a trailing slash
#   $start   - value sent as the "start" query parameter
#   $count   - value sent as the "count" query parameter
#
# Returns the output as one string in scalar context and as a list of lines
# in list context (the same shapes the backtick operator produces). Returns
# undef / the empty list when curl cannot be started.
sub listObjects {
    my ($baseUrl, $start, $count) = @_;

    my $url = "$baseUrl/v1/object?start=$start&count=$count";

    # List-form pipe open runs curl directly, without a shell, so quotes or
    # other shell metacharacters in the URL cannot alter the command.
    # -s hides the progress meter, -S still lets curl report real errors.
    my $curl;
    unless (open($curl, '-|', 'curl', '-s', '-S', $url)) {
        warn "cannot run curl for $url: $!\n";
        return;
    }

    my @lines = <$curl>;

    # close() on a pipe returns false when the command exited non-zero;
    # report it, but still return whatever output was produced.
    close($curl)
        or warn "curl failed for $url (exit status " . ($? >> 8) . ")\n";

    return wantarray ? @lines : join('', @lines);
}