Bib-Tools
view release on metacpan or search on metacpan
lib/Bib/Tools.pm view on Meta::CPAN
sub _split_duplicates {
# split list of references into three lists: one with refs that have unique doi's, one with no doi's
#and one with all the rest (with duplicate doi's)
my $self = shift @_;
my @refs=@{$self->{refs}};
my @newrefs;
foreach my $ref (@refs) {
my $doi = $ref->doi();
if (!defined($doi) || length($doi)==0) {push @{$self->{nodoi_refs}}, $ref; next; }# skip entries with no DOI
my $found = 0;
foreach my $ref2 (@newrefs) {
if ($ref2->doi() eq $doi) {
$found = 1;
}
}
if (!$found) {
push @newrefs, $ref;
} else {
push @{$self->{duprefs}}, $ref;
}
}
$self->{refs} = \@newrefs;
}
####################################################################################
sub append {
# add new reference to end of existing list
my $self = shift @_;
my $ref = shift @_;
push @{$self->{refs}}, $ref;
}
####################################################################################
sub add_details {
# given an array of raw strings, try to convert into paper references
my $self = shift @_;
foreach my $cites (@_) {
$self->{last} = Bib::CrossRef->new();
$self->{last}->parse_text($cites);
$self->append($self->{last});
sleep 1/(0.001+$self->{ratelimit}); # rate limit queries to crossref
}
$self->_split_duplicates();
}
####################################################################################
sub add_google {
# scrape paper details from google scholar personal page -- nb: no doi info on google, so use crossref.org to obtain this
# nb: doesn't work with google scholar search results
my $self = shift @_;
my $url = shift @_;
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0');
my $req = HTTP::Request->new(GET => $url);
my $res = $ua->request($req);
if ($res->is_success) {
my $tree= HTML::TreeBuilder::XPath->new;
$tree->parse($res->decoded_content);
my @atitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/a[@class="gsc_a_at"]');
my @authors=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][1]');
my @jtitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][2]');
my $len1 = @atitles; my $len2 = @authors; my $len3 = @jtitles;
if (($len1 != $len2) || ($len1 != $len3) || ($len2 != $len3)) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors/$len3 journals.");return []}
for (my $i = 0; $i<$len1; $i++) {
# these are already utf8
$authors[$i] = decode_entities($authors[$i]);
$atitles[$i] = decode_entities($atitles[$i]);
$jtitles[$i] = decode_entities($jtitles[$i]);
my $temp = $authors[$i].", ".$atitles[$i].", ".$jtitles[$i];
my $r = Bib::CrossRef->new;
$r->parse_text($temp);
$jtitles[$i] =~ m/\s([0-9][0-9][0-9][0-9])$/;
my $year=$1;
if ((length($year)==4) && ($r->date ne $1)) {
$r->_setscore(0.5); # mismatch in year, probably bad
}
$self->append($r);
}
} else {
$self->_err("Problem with $url: ".$res->status_line);
}
}
####################################################################################
sub add_google_search {
# scrape paper details from google scholar search results -- *not* from persons scholar home page
my $self = shift @_;
my $url = shift @_;
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0');
my $req = HTTP::Request->new(GET => $url);
my $res = $ua->request($req);
if ($res->is_success) {
my $tree= HTML::TreeBuilder::XPath->new;
$tree->parse($res->decoded_content);
my @atitles=$tree->findvalues('//div[@class="gs_ri"]/h3/a');
my @authors=$tree->findvalues('//div[@class="gs_a"]');
my $len1 = @atitles; my $len2 = @authors;
if ($len1 != $len2) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors.");return [];}
my @cites=();
for (my $i = 0; $i<$len1; $i++) {
$authors[$i] = decode_entities($authors[$i]);
$atitles[$i] = decode_entities($atitles[$i]);
my $str = $authors[$i].", ".$atitles[$i];
if (length($str)>5) { # a potentially useful entry ?
push @cites, $authors[$i].", ".$atitles[$i];
}
}
$self->add_details(@cites);
} else {
$self->_err("Problem with $url: ".$res->status_line);
}
}
####################################################################################
sub _dblp_setauth {
my $self = shift @_; my $r = shift @_; my $cite = shift @_;
if (ref($cite->{'author'}) eq "HASH") {
$r->_setauthcount(1);
$r->_setauth(1,$cite->{'author'});
} else {
my $count = 0;
foreach my $au (@{$cite->{'author'}}) {
$count++;
$r->_setauth($count, $au);
}
$r->_setauthcount($count);
}
}
####################################################################################
sub add_dblp {
# get details using DBLP XML API
my $self = shift @_;
my $url = shift @_;
my $maxnum = shift @_; if (!defined($maxnum)) {$maxnum=-1;}
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0');
my $req = HTTP::Request->new(GET => $url);
my $res = $ua->request($req);
if ($res->is_success) {
my $xs = XML::Simple->new();
my $data = $xs->XMLin($res->decoded_content);
my @cites; my @ctemp;
if (defined $data->{'r'}) {
# a person page
@cites = $data->{'r'};
} elsif (defined $data->{'article'}) {
# its xml for a single article
$ctemp[0] = $data;
push @cites, \@ctemp;
}
my $num=0;
foreach my $c (@{$cites[0]}) {
$num++; if ($maxnum>0 && $num>$maxnum) {last;} # mainly for testing
my @k = keys %{$c};
my $cite = $c->{$k[0]};
my $ee = $cite->{'ee'};
if ($ee =~ m/dx.doi.org/) {
# we have a DOI, lets call crossref
$ee =~ s/http:\/\/dx.doi.org\///;
my $r = Bib::CrossRef->new;
$r->parse_text($ee);
if ($r->score >=1) {
if (!defined $r->authcount || $r->authcount==0) {
# shouldn't happen, but sometimes doi data lacks authors so use dblp data
$self->_dblp_setauth($r,$cite);
}
$self->append($r);
next; # move on to next record
}
}
my $jtitle='';
if (defined $cite->{'journal'}) {
$jtitle = $cite->{'journal'};
} elsif (defined $cite->{'booktitle'}) {
$jtitle = $cite->{'booktitle'};
}
my $temp = $cite->{'year'}.' '.$cite->{'title'}.' '.$jtitle. ' ';
if (ref($cite->{'author'}) eq "HASH") {
$temp .= $cite->{'author'};
} else {
foreach my $au (@{$cite->{'author'}}) { $temp .= $au.", ";}
}
my $r = Bib::CrossRef->new;
$r->parse_text($temp);
if ($r->score >= 1) {
# found an ok match, lets use it
$self->append($r);
next; # move on
}
# we got a poor match, lets use the rest of the dblp data
$r = Bib::CrossRef->new;
if (exists $cite->{'publtype'}) {
$r->_setgenre($cite->{'publtype'});
} elsif ($k[0] =~ m/article/) {
$r->_setgenre('article');
} elsif ($k[0] =~ m/inproceedings/) {
$r->_setgenre('proceeding');
} elsif ($k[0] =~ m/informal/) {
$r->_setgenre('preprint');
} else {
lib/Bib/Tools.pm view on Meta::CPAN
my $auth='';
my $c = $cite->{'work-contributors'}->{'contributor'};
if (ref($c) eq "HASH") {
# single author
if ($c->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {$auth = $c->{'credit-name'}->{'content'};}
} else {
# multiple authors
foreach my $au (@{$c}) {
if ($au->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {$auth .= $au->{'credit-name'}->{'content'}.", ";}
}
}
return $auth;
}
####################################################################################
sub _orcid_setauth {
# use an orcid entry to set citation author list (using orcid bibtex data if appropriate)
my $self = shift @_; my $r = shift @_;
my $cite = shift @_; my $entry = shift @_;
my $c = $cite->{'work-contributors'}->{'contributor'};
if (defined $c) {
# we have an orcid author entry
my $authcount=0;
if (ref($c) eq "HASH") {
# single author
if ($c->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {
$authcount++;
$r->_setauth($authcount,$c->{'credit-name'}->{'content'});
}
} else {
# multiple authors
foreach my $au (@{$c}) {
if ($au->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {
$authcount++;
$r->_setauth($authcount,$au->{'credit-name'}->{'content'});
}
}
}
$r->_setauthcount($authcount);
if ($authcount>0) {return;} # found some authors, finish up
}
# no author info, lets see if bibtex has any author info
if (defined $entry) {
$self->_bibtex_setauth($r,$entry);
}
}
####################################################################################
sub add_orcid {
# get paper details from orcid using API
my $self = shift @_; my $orcid_id = shift @_;
my $ua = LWP::UserAgent->new;
my $req = HTTP::Request->new(GET => "http://pub.orcid.org/$orcid_id/orcid-works/");
my $res = $ua->request($req);
if ($res->is_success) {
my $xs = XML::Simple->new();
# the orcid response is utf8 xml
my $data = $xs->XMLin($res->decoded_content);
my @cites = $data->{'orcid-profile'}->{'orcid-activities'}->{'orcid-works'}->{'orcid-work'};
foreach my $cite (@{$cites[0]}) {
my $entry = undef;
if ($cite->{'work-citation'}->{'work-citation-type'} =~ m/bibtex/) {
# we have a bibtex reference, extract some extra info
my $bibtex = $cite->{'work-citation'}->{'citation'};
open my $fh, '<', \$bibtex;
my $parser = BibTeX::Parser->new($fh);
$entry = $parser->next;
if (!$entry->parse_ok) {$entry = undef;}
}
my $doi = _orcid_getdoi($cite);
if ((defined $doi) && (length($doi)>5)) { # we seem to have a DOI
# use DOI to search.crossref.org
my $r = Bib::CrossRef->new;
$r->parse_text($doi);
if ($r->score>=1) {
if (!defined $r->authcount || $r->authcount==0) {
# shouldn't happen, but sometimes doi data lacks authors so use orcid data
$self->_orcid_setauth($r,$cite,$entry);
}
$self->append($r);
next; # move on
}
}
# use title etc to search.crossref.org
my $date; my $atitle; my $jtitle;
if (exists $cite->{'publication-date'}->{'year'}) {$date = $cite->{'publication-date'}->{'year'};}
if (exists $cite->{'work-title'}->{'title'}) {$atitle = $cite->{'work-title'}->{'title'};}
if (exists $cite->{'journal-title'}) {$jtitle = $cite->{'journal-title'};}
my $auth = _orcid_getauth($cite);
my $temp=$auth.' '.$date.' '.$atitle.' '.$jtitle;
if (length($temp)>10 && length($date)>0 && length($atitle)+length($auth)>0) { # we have a potentially useful search string
my $r = Bib::CrossRef->new;
$r->parse_text($temp);
if ($r->score >= 1) {
# found an ok match, lets use it
$self->append($r);
next; # move on
}
}
# for a poor match, try to extract rest of info from orcid
my $r = Bib::CrossRef->new;
$r->_setdate($date); $r->_setatitle($atitle); $r->_setjtitle($jtitle);
if (exists $cite->{'work-type'}) {$r->_setgenre($cite->{'work-type'});}
$self->_orcid_setauth($r,$cite);
$self->_bibtex_parse($r,$entry);
$r->_setscore(1);
$r->_setquery($temp);
# add manually constructed record
$self->append($r);
}
$self->_split_duplicates();
} else {
$self->_err("Problem with orcid.org: ".$res->status_line);
}
}
####################################################################################
sub _find_pubmed {
my $c = shift @_;
my $name = shift @_;
my $term = shift @_;
foreach my $item (@{$c}) {
if ($item->{'Name'} eq $name) {
return $item->{$term};
}
}
return undef;
}
####################################################################################
sub add_pubmed {
# add results from a pubmed query
my ($self,$q) = @_;
my $ua = LWP::UserAgent->new;
$q =~ s/\s+/+/g;
my $req = HTTP::Request->new(GET => "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?usehistory=y&db=pubmed&term=".$q);
my $res = $ua->request($req);
if ($res->is_success) {
my $web = $1 if ($res->decoded_content =~ /<WebEnv>(\S+)<\/WebEnv>/);
my $key = $1 if ($res->decoded_content =~ /<QueryKey>(\d+)<\/QueryKey>/);
$req = HTTP::Request->new(GET => "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&query_key=$key&WebEnv=$web");
$res = $ua->request($req);
if ($res->is_success) {
my $xs = XML::Simple->new();
my $data = $xs->XMLin($res->decoded_content);
my @cites = $data->{'DocSum'};
foreach my $cite (@{$cites[0]}) {
my $c = $cite->{'Item'};
if (ref($c) ne "ARRAY") {next;}
my $r = Bib::CrossRef->new;
my $doi = _find_pubmed($c,'DOI','content');
if (defined $doi) {
# PubMed is reliable, no need to call crossref
# my $r = Bib::CrossRef->new;
# $r->parse_text($doi);
# $self->append($r);
# next; # move on
$r->_setdoi($doi);
$r->_seturl('http://dx.doi.org/'.$doi);
}
$r->_setjtitle(_find_pubmed($c,'FullJournalName','content'));
$r->_setatitle(_find_pubmed($c,'Title','content'));
my $date = _find_pubmed($c,'PubDate','content');
$date =~ m/^([0-9][0-9][0-9][0-9])/;
$r->_setdate($1); # extract the year
$r->_setvolume(_find_pubmed($c,'Volume','content'));
$r->_setissue(_find_pubmed($c,'Issue','content'));
my $p = _find_pubmed($c,'Pages','content');
my @bits = split('-',$p);
$r->_setspage($bits[0]); $r->_setepage($bits[1]);
my $aulist = _find_pubmed($c,'AuthorList','Item');
my $authcount=0;
if (ref($aulist) ne "ARRAY") {
$authcount = 1;
$r->_setauth($authcount,$aulist->{'content'});
} else {
foreach my $au (@{$aulist}) {
$authcount++;
$r->_setauth($authcount,$au->{'content'});
}
}
$r->_setauthcount($authcount);
my $g = _find_pubmed($c,'FullJournalName','Item');
$r->_setgenre($g->{'content'});
$r->_setscore(1);
#$r->_setquery($auth." ".$temp);
# add manually constructed record
$self->append($r);
}
}
$self->_split_duplicates();
return;
}
$self->_err("Problem with http://eutils.ncbi.nlm.nih.gov: ".$res->status_line);
}
####################################################################################
sub _rem_brackets {
# remove {} brackets from bibtex entry
my $self = shift @_; my $str = shift @_;
$str =~ s/[\{\}]//g;
$str =~ s/\\textquotesingle/\\'/g;
return $str
( run in 0.991 second using v1.01-cache-2.11-cpan-39bf76dae61 )