#!/usr/bin/perl # # $Id: verifier.pl,v 1.3 2000/03/28 21:11:50 alberto Exp alberto $ # # ADS Bibcode verifier script. # Reads a list of bibcodes given either on the command line or in standard # input, generates a query to an ADS server which verifies the existence # of each bibcode, and outputs the list of verified bibcodes to standard # output. # # Examples: # verifier.pl 1999A+A...352...19R 2002abcde.123....1X 1998aspc..145..378E # # NOTE: to each bibcode verified by this script corresponds a URL of # http://$server/cgi-bin/bib_query?BIBCODE # this is the most general ADS URL for the document identified by # BIBCODE and is guaranteed to work in the future. # Please make sure to use the canonical bibcode returned by the script. # # Requires the libwww-perl-5 available from http://www.linpro.no/lwp/ # Written by Alberto Accomazzi # # $Log: verifier.pl,v $ # Revision 1.3 2000/03/28 21:11:50 alberto # Modified to return table listing input bibcodes vs. # canonical bibcodes verified with ADS. # Now defaults to a POST rather than GET from verifier script. # # Revision 1.2 1999/03/23 22:51:21 alberto # Enforced checks on bibcode syntax. # # Revision 1.1 1999/03/23 22:46:19 alberto # Initial revision # # # customizeable variables: $server = "adsabs.harvard.edu"; $database = "ALL"; $debug = 0; # this is the list of valid ADS database keys @databases = qw( AST INST PHY ALL ); # ($script = $0) =~ s:^.*/::; $cgi = "/cgi-bin/verify"; $format = "PLAINTEXT"; $method = 'POST'; $version = sprintf("%s/%d.%02d", q$RCSfile: verifier.pl,v $ =~ /:\s*(\w+)/, q$Revision: 1.3 $ =~ /(\d+)\.(\d+)/); use LWP::UserAgent; $ua = new LWP::UserAgent; # customize user agent to carry this script's signature $ua->agent("$version " . $ua->agent); # read options from command line while ($ARGV[0] =~ /^-\w/) { $_ = shift(@ARGV); if (/^-database/) { $database = shift(@ARGV); } elsif (/^-debug/) { $debug++; } elsif (/^-get/) { $method = 'GET'; } elsif (/^-url/) { $return = 'url'; } else { die "$script: unknown option \"$_\""; } } # check to see if selected database appears in the list of legal ones die "$script: unknown database \"$database\"" unless (grep(/\Q$database\E/,@databases)); # get bibcodes either from command line or standard input if (@ARGV) { @bibcodes = @ARGV; } else { warn "$script: reading bibcodes from stdin...\n"; @bibcodes = (); while () { s/^\s+|\s+$//g; next unless /\S/; push(@bibcodes,$_); } } die "$script: no input bibcodes specified!\n" unless (@bibcodes); warn "$script: read ", scalar(@bibcodes), " input bibcodes\n"; warn "$script: input bibcodes are: ", join(", ",@bibcodes), "\n" if ($debug); # first check bibcodes validity @bibcodes = &check_bibcodes(@bibcodes); die "$script: no valid bibcodes found!\n" unless (@bibcodes); my $query = &make_query(@bibcodes); my $url = "http://$server$cgi"; if ($method eq 'GET') { $url .= '?' . $query; $query = ""; } warn "$script: target url is ", $url, "\n", "$script: method is ", $method, "\n" if ($debug); warn "$script: content is ", $query, "\n" if ($debug and $method eq 'POST'); my $req = new HTTP::Request($method,$url); if ($query) { $req->header('Content-Type','application/x-www-form-urlencoded'); $req->header('Content-Length',length($query)); $req->content($query); } my $res = $ua->request($req); die "$script: HTTP request failed!\n" unless ($res->is_success); warn "$script: content is: ", $res->content if ($debug); @valid = &parse_content($res->content); my $verified = 0; while ($bibcode = shift(@bibcodes)) { my $valid = shift(@valid); print $bibcode, "\t", $valid, "\n"; $verified++ if ($valid); } warn "$script: verified ", $verified, " bibcodes with ADS\n"; # this routine has become much more complicated now that we # have to account for recognizing canonical bibcodes which may # be different from input ones; we should really do this all # in XML... sub parse_content { my @db_contents = split(/^ADS.*Bibcode Verification/,$_[0]); my $content = shift(@db_contents); my @valid = (); die "$script: error parsing response header from ADS verifier: $content\n" if ($content); while (defined($content = shift(@db_contents))) { my @lines = split(/\n/,$content); my $index = 0; while (defined($line = shift(@lines))) { if ($line =~ /^YES,\s+(\S{19})/i) { $valid[$index++] = $1; } elsif ($line =~ /^NO,\s+(\S{19})/i) { $valid[$index++] = ""; } } } return @valid; } # returns url-encoded query sub make_query { my $query = "db_key=$database&data_type=$format"; my $bib; while (defined($bib = shift)) { # escape ampersands in bibcodes since they go in URL $bib =~ s/[\&\+]/\%26/g; $query .= '&' . $bib; } return $query; } # returns valid bibcodes sub check_bibcodes { my @valid = (); while (defined($bib = shift)) { unless ($bib =~ /(\d{4}\D\S{13}[A-Z.:])/) { warn "$script: invalid bibcode \"$bib\" skipped\n"; next; } push(@valid,$bib); } return @valid; }