Bib-Tools

 view release on metacpan or  search on metacpan

lib/Bib/Tools.pm  view on Meta::CPAN

sub _split_duplicates {
  # split list of references into three lists: one with refs that have unique doi's, one with no doi's
  #and one with all the rest (with duplicate doi's)
  my $self = shift @_;
  my @refs=@{$self->{refs}};
  
  my @newrefs;
  foreach my $ref (@refs) {
    my $doi = $ref->doi();
    if (!defined($doi) || length($doi)==0) {push @{$self->{nodoi_refs}}, $ref; next; }# skip entries with no DOI
    my $found = 0;
    foreach my $ref2 (@newrefs) {
      if ($ref2->doi() eq $doi) {
        $found = 1;
      }
    }
    if (!$found) {
      push @newrefs, $ref;
    } else {
      push @{$self->{duprefs}}, $ref;
    }
  }
  $self->{refs} = \@newrefs;
}

####################################################################################
sub append {
  # add new reference to end of existing list
  my $self = shift @_;
  my $ref = shift @_;
  push @{$self->{refs}}, $ref;
}

####################################################################################
sub add_details {
  # given an array of raw strings, try to convert into paper references
  
  my $self = shift @_;
  foreach my $cites (@_) {
    $self->{last} = Bib::CrossRef->new();
    $self->{last}->parse_text($cites);
    $self->append($self->{last});
    sleep 1/(0.001+$self->{ratelimit}); # rate limit queries to crossref
  }
  $self->_split_duplicates();
}

####################################################################################
sub add_google {
  # scrape paper details from google scholar personal page -- nb: no doi info on google, so use crossref.org to obtain this
  # nb: doesn't work with google scholar search results
  
  my $self = shift @_;
  my $url = shift @_;
  my $ua = LWP::UserAgent->new;
  $ua->agent('Mozilla/5.0');
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $tree= HTML::TreeBuilder::XPath->new;
    $tree->parse($res->decoded_content);
    my @atitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/a[@class="gsc_a_at"]');
    my @authors=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][1]');
    my @jtitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][2]');
    my $len1 = @atitles; my $len2 = @authors; my $len3 = @jtitles;
    if (($len1 != $len2) || ($len1 != $len3) || ($len2 != $len3)) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors/$len3 journals.");return []}
    for (my $i = 0; $i<$len1; $i++) {
      # these are already utf8
      $authors[$i] = decode_entities($authors[$i]);
      $atitles[$i] = decode_entities($atitles[$i]);
      $jtitles[$i] = decode_entities($jtitles[$i]);
      my $temp = $authors[$i].", ".$atitles[$i].", ".$jtitles[$i];
      my $r = Bib::CrossRef->new;
      $r->parse_text($temp);
      $jtitles[$i] =~ m/\s([0-9][0-9][0-9][0-9])$/;
      my $year=$1;
      if ((length($year)==4) && ($r->date ne $1)) {
        $r->_setscore(0.5); # mismatch in year, probably bad
      }
      $self->append($r);
    }
  } else {
    $self->_err("Problem with $url: ".$res->status_line);
  }
}

####################################################################################
sub add_google_search {
  # scrape paper details from google scholar search results -- *not* from persons scholar home page

  my $self = shift @_;
  my $url = shift @_;
  my $ua = LWP::UserAgent->new;
  $ua->agent('Mozilla/5.0');
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $tree= HTML::TreeBuilder::XPath->new;
    $tree->parse($res->decoded_content);
    my @atitles=$tree->findvalues('//div[@class="gs_ri"]/h3/a');
    my @authors=$tree->findvalues('//div[@class="gs_a"]');
    my $len1 = @atitles; my $len2 = @authors;
    if ($len1 != $len2) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors.");return [];}
    my @cites=();
    for (my $i = 0; $i<$len1; $i++) {
      $authors[$i] = decode_entities($authors[$i]);
      $atitles[$i] = decode_entities($atitles[$i]);
      my $str = $authors[$i].", ".$atitles[$i];
      if (length($str)>5) { # a potentially useful entry ?
        push @cites, $authors[$i].", ".$atitles[$i];
      }
    }
    $self->add_details(@cites);
  } else {
    $self->_err("Problem with $url: ".$res->status_line);
  }
}

####################################################################################
sub _dblp_setauth {
  my $self = shift @_; my $r = shift @_; my $cite = shift @_;
  
  if (ref($cite->{'author'}) eq "HASH") {
    $r->_setauthcount(1);
    $r->_setauth(1,$cite->{'author'});
  } else {
    my $count = 0;
    foreach my $au (@{$cite->{'author'}}) {
      $count++;
      $r->_setauth($count, $au);
    }
    $r->_setauthcount($count);
  }
}

####################################################################################
sub add_dblp {
  # get details using DBLP XML API
  
  my $self = shift @_;
  my $url = shift @_;
  my $maxnum = shift @_; if (!defined($maxnum)) {$maxnum=-1;}
  
  my $ua = LWP::UserAgent->new;
  $ua->agent('Mozilla/5.0');
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $xs = XML::Simple->new();
    my $data = $xs->XMLin($res->decoded_content);
    my @cites; my @ctemp;
    if (defined $data->{'r'}) {
       # a person page
       @cites = $data->{'r'};
    } elsif (defined $data->{'article'}) { 
       # its xml for a single article
       $ctemp[0] = $data;
       push @cites, \@ctemp;
    }
    my $num=0;
    foreach my $c (@{$cites[0]}) {
      $num++; if ($maxnum>0 && $num>$maxnum) {last;}  # mainly for testing
      my @k = keys %{$c};
      my $cite = $c->{$k[0]};
      my $ee = $cite->{'ee'};
      if ($ee =~ m/dx.doi.org/) {
        # we have a DOI, lets call crossref
        $ee =~ s/http:\/\/dx.doi.org\///;
        my $r = Bib::CrossRef->new;
        $r->parse_text($ee);
        if ($r->score >=1) {
          if (!defined $r->authcount || $r->authcount==0) {
            # shouldn't happen, but sometimes doi data lacks authors so use dblp data
            $self->_dblp_setauth($r,$cite);
          }
          $self->append($r);
          next; # move on to next record
        }
      }
      my $jtitle='';
      if (defined $cite->{'journal'}) {
        $jtitle = $cite->{'journal'};
      } elsif (defined $cite->{'booktitle'}) {
        $jtitle = $cite->{'booktitle'};
      }
      my $temp = $cite->{'year'}.' '.$cite->{'title'}.' '.$jtitle. ' ';
      if (ref($cite->{'author'}) eq "HASH") {
        $temp .= $cite->{'author'};
      } else {
        foreach my $au (@{$cite->{'author'}}) { $temp .= $au.", ";}
      }
      my $r = Bib::CrossRef->new;
      $r->parse_text($temp);
      if ($r->score >= 1) {
        # found an ok match, lets use it
        $self->append($r);
        next; # move on
      }
      
      # we got a poor match, lets use the rest of the dblp data
      $r = Bib::CrossRef->new;
      if (exists $cite->{'publtype'}) {
        $r->_setgenre($cite->{'publtype'});
      } elsif ($k[0] =~ m/article/) {
        $r->_setgenre('article');
      } elsif ($k[0] =~ m/inproceedings/) {
        $r->_setgenre('proceeding');
      } elsif ($k[0] =~ m/informal/) {
        $r->_setgenre('preprint');
      } else {

lib/Bib/Tools.pm  view on Meta::CPAN

  my $auth='';
  my $c = $cite->{'work-contributors'}->{'contributor'};
  if (ref($c) eq "HASH") {
    # single author
    if ($c->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {$auth = $c->{'credit-name'}->{'content'};}
  } else {
    # multiple authors
    foreach my $au (@{$c}) {
      if ($au->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {$auth .= $au->{'credit-name'}->{'content'}.", ";}
    }
  }
  return $auth;
}

####################################################################################
sub _orcid_setauth {
  # use an orcid entry to set citation author list (using orcid bibtex data if appropriate)
  my $self = shift @_; my $r = shift @_;
  my $cite = shift @_; my $entry = shift @_;
  
  my $c = $cite->{'work-contributors'}->{'contributor'};
  if (defined $c) {
    # we have an orcid author entry
    my $authcount=0;
    if (ref($c) eq "HASH") {
      # single author
      if ($c->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {
        $authcount++;
        $r->_setauth($authcount,$c->{'credit-name'}->{'content'});
      }
    } else {
      # multiple authors
      foreach my $au (@{$c}) {
        if ($au->{'contributor-attributes'}->{'contributor-role'} =~ m/author/) {
          $authcount++;
          $r->_setauth($authcount,$au->{'credit-name'}->{'content'});
        }
      }
    }
    $r->_setauthcount($authcount);
    if ($authcount>0) {return;} # found some authors, finish up
  }
  # no author info, lets see if bibtex has any author info
  if (defined $entry) {
    $self->_bibtex_setauth($r,$entry);
  }
}

####################################################################################
sub add_orcid {
  # get paper details from orcid using API
  
  my $self = shift @_; my $orcid_id = shift @_;
  
  my $ua = LWP::UserAgent->new;
  my $req = HTTP::Request->new(GET => "http://pub.orcid.org/$orcid_id/orcid-works/");
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $xs = XML::Simple->new();
    # the orcid response is utf8 xml
    my $data = $xs->XMLin($res->decoded_content);
    my @cites = $data->{'orcid-profile'}->{'orcid-activities'}->{'orcid-works'}->{'orcid-work'};
    foreach my $cite (@{$cites[0]}) {
      my $entry = undef;
      if ($cite->{'work-citation'}->{'work-citation-type'} =~ m/bibtex/) {
        # we have a bibtex reference, extract some extra info
        my $bibtex = $cite->{'work-citation'}->{'citation'};
        open my $fh, '<', \$bibtex;
        my $parser = BibTeX::Parser->new($fh);
        $entry = $parser->next;
        if (!$entry->parse_ok) {$entry = undef;}
      }
      my $doi = _orcid_getdoi($cite);
      if ((defined $doi) && (length($doi)>5)) { # we seem to have a DOI
        # use DOI to search.crossref.org
        my $r = Bib::CrossRef->new;
        $r->parse_text($doi);
        if ($r->score>=1) {
          if (!defined $r->authcount || $r->authcount==0) {
            # shouldn't happen, but sometimes doi data lacks authors so use orcid data
            $self->_orcid_setauth($r,$cite,$entry);
          }
          $self->append($r);
          next;  # move on
        }
      }
      # use title etc to search.crossref.org
      my $date; my $atitle; my $jtitle;
      if (exists $cite->{'publication-date'}->{'year'}) {$date = $cite->{'publication-date'}->{'year'};}
      if (exists $cite->{'work-title'}->{'title'}) {$atitle = $cite->{'work-title'}->{'title'};}
      if (exists $cite->{'journal-title'}) {$jtitle = $cite->{'journal-title'};}
      my $auth = _orcid_getauth($cite);
      my $temp=$auth.' '.$date.' '.$atitle.' '.$jtitle;
      if (length($temp)>10 && length($date)>0 && length($atitle)+length($auth)>0) { # we have a potentially useful search string
        my $r = Bib::CrossRef->new;
        $r->parse_text($temp);
        if ($r->score >= 1) {
          # found an ok match, lets use it
          $self->append($r);
          next; # move on
        }
      }
      # for a poor match, try to extract rest of info from orcid
      my $r = Bib::CrossRef->new;
      $r->_setdate($date); $r->_setatitle($atitle); $r->_setjtitle($jtitle);
      if (exists $cite->{'work-type'}) {$r->_setgenre($cite->{'work-type'});}
      $self->_orcid_setauth($r,$cite);
      $self->_bibtex_parse($r,$entry);
      $r->_setscore(1);
      $r->_setquery($temp);
      # add manually constructed record
      $self->append($r);
    }
    $self->_split_duplicates();
  } else {
    $self->_err("Problem with orcid.org: ".$res->status_line);
  }
}

####################################################################################
sub _find_pubmed {
  my $c = shift @_;
  my $name = shift @_;
  my $term = shift @_;
  foreach my $item (@{$c}) {
    if ($item->{'Name'} eq $name) {
      return $item->{$term};
    }
  }
  return undef;
}

####################################################################################
sub add_pubmed {
  # add results from a pubmed query
  my ($self,$q) = @_;

  my $ua = LWP::UserAgent->new;
  $q =~ s/\s+/+/g;
  my $req = HTTP::Request->new(GET => "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?usehistory=y&db=pubmed&term=".$q);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $web = $1 if ($res->decoded_content =~ /<WebEnv>(\S+)<\/WebEnv>/);
    my $key = $1 if ($res->decoded_content =~ /<QueryKey>(\d+)<\/QueryKey>/);
    $req = HTTP::Request->new(GET => "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&query_key=$key&WebEnv=$web");
    $res = $ua->request($req);
    if ($res->is_success) {
      my $xs = XML::Simple->new();
      my $data = $xs->XMLin($res->decoded_content);
      my @cites = $data->{'DocSum'};
      foreach my $cite (@{$cites[0]}) {
        my $c = $cite->{'Item'};
        if (ref($c) ne "ARRAY") {next;}
        my $r = Bib::CrossRef->new;
        my $doi = _find_pubmed($c,'DOI','content');
        if (defined $doi) {
          # PubMed is reliable, no need to call crossref
        #  my $r = Bib::CrossRef->new;
        #  $r->parse_text($doi);
        #  $self->append($r);
        #  next; # move on
          $r->_setdoi($doi);
          $r->_seturl('http://dx.doi.org/'.$doi);
        }
        $r->_setjtitle(_find_pubmed($c,'FullJournalName','content'));
        $r->_setatitle(_find_pubmed($c,'Title','content'));
        my $date = _find_pubmed($c,'PubDate','content');
        $date =~ m/^([0-9][0-9][0-9][0-9])/;
        $r->_setdate($1); # extract the year
        $r->_setvolume(_find_pubmed($c,'Volume','content'));
        $r->_setissue(_find_pubmed($c,'Issue','content'));
        my $p = _find_pubmed($c,'Pages','content');
        my @bits = split('-',$p);
        $r->_setspage($bits[0]); $r->_setepage($bits[1]);
        
        my $aulist = _find_pubmed($c,'AuthorList','Item');
        my $authcount=0;
        if (ref($aulist) ne "ARRAY") {
          $authcount = 1;
          $r->_setauth($authcount,$aulist->{'content'});
        } else {
          foreach my $au (@{$aulist}) {
            $authcount++;
            $r->_setauth($authcount,$au->{'content'});
          }
        }
        $r->_setauthcount($authcount);
        my $g = _find_pubmed($c,'FullJournalName','Item');
        $r->_setgenre($g->{'content'});
        $r->_setscore(1);
        #$r->_setquery($auth." ".$temp);
        # add manually constructed record
        $self->append($r);
      }
    }
    $self->_split_duplicates();
    return;
  }
  $self->_err("Problem with http://eutils.ncbi.nlm.nih.gov: ".$res->status_line);
}

####################################################################################
sub _rem_brackets {
  # remove {} brackets from bibtex entry
  my $self = shift @_; my $str = shift @_;
  
  $str =~ s/[\{\}]//g;
  $str =~ s/\\textquotesingle/\\'/g;
  return $str



( run in 0.991 second using v1.01-cache-2.11-cpan-39bf76dae61 )