Bib-Tools

 view release on metacpan or  search on metacpan

lib/Bib/Tools.pm  view on Meta::CPAN

############################################################
#
#   Bib::Tools - For managing collections of Bib::CrossRef references.
#
############################################################

package Bib::Tools;

use 5.8.8;
use strict;
use warnings;
no warnings 'uninitialized';

require Exporter;
use Bib::CrossRef;
use LWP::UserAgent;
use JSON qw/decode_json/;
use URI::Escape qw(uri_escape_utf8 uri_unescape);
use HTML::Entities qw(decode_entities encode_entities);
use HTML::TreeBuilder::XPath;
use XML::Simple qw(XMLin);
use BibTeX::Parser qw(new next);
use IO::File;
use vars qw($VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS @ISA);

#use LWP::Protocol::https;
#use Data::Dumper;

$VERSION = '0.17';
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(
sethtml clearhtml add_details add_google add_google_search add_orcid add_fromfile add_dblp add_pubmed
send_resp print print_nodoi num num_nodoi getref getref_nodoi append add_bibtex
);
%EXPORT_TAGS = (all => \@EXPORT_OK);

####################################################################################
sub new {
    my $self;
    # defaults
    $self->{refs} = []; # the references
    $self->{nodoi_refs} = []; 
    $self->{duprefs} = [];
    $self->{html}=0;
    $self->{ratelimit}=5; # limit of 5 crossref queries per sec
    $self->{last} = {};
    bless $self;
  
    my $ratelimit = $_[1];
    if (defined($ratelimit) && ($ratelimit>=0)) {$self->{ratelimit}=$ratelimit};
    return $self;
}

####################################################################################
sub sethtml {
  my $self = shift @_;
  $self->{html}=1;
}

####################################################################################
sub clearhtml {
  my $self = shift @_;
  $self->{html}=0;
}

####################################################################################
sub _err {
  my ($self, $str) = @_;
  if ($self->{html}) {
    print "<p style='color:red'>",$str,"</p>";
  } else {
    print $str,"\n";
  }
}

####################################################################################
sub _split_duplicates {
  # split list of references into three lists: one with refs that have unique doi's, one with no doi's
  #and one with all the rest (with duplicate doi's)
  my $self = shift @_;
  my @refs=@{$self->{refs}};
  
  my @newrefs;
  foreach my $ref (@refs) {
    my $doi = $ref->doi();
    if (!defined($doi) || length($doi)==0) {push @{$self->{nodoi_refs}}, $ref; next; }# skip entries with no DOI
    my $found = 0;
    foreach my $ref2 (@newrefs) {
      if ($ref2->doi() eq $doi) {
        $found = 1;
      }
    }
    if (!$found) {
      push @newrefs, $ref;
    } else {
      push @{$self->{duprefs}}, $ref;
    }
  }
  $self->{refs} = \@newrefs;
}

####################################################################################
sub append {
  # add new reference to end of existing list
  my $self = shift @_;
  my $ref = shift @_;
  push @{$self->{refs}}, $ref;
}

####################################################################################
sub add_details {
  # given an array of raw strings, try to convert into paper references
  
  my $self = shift @_;
  foreach my $cites (@_) {
    $self->{last} = Bib::CrossRef->new();
    $self->{last}->parse_text($cites);
    $self->append($self->{last});
    sleep 1/(0.001+$self->{ratelimit}); # rate limit queries to crossref
  }
  $self->_split_duplicates();
}

####################################################################################
sub add_google {
  # scrape paper details from google scholar personal page -- nb: no doi info on google, so use crossref.org to obtain this
  # nb: doesn't work with google scholar search results
  
  my $self = shift @_;
  my $url = shift @_;
  my $ua = LWP::UserAgent->new;
  $ua->agent('Mozilla/5.0');
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  if ($res->is_success) {
    my $tree= HTML::TreeBuilder::XPath->new;
    $tree->parse($res->decoded_content);
    my @atitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/a[@class="gsc_a_at"]');
    my @authors=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][1]');
    my @jtitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][2]');
    my $len1 = @atitles; my $len2 = @authors; my $len3 = @jtitles;
    if (($len1 != $len2) || ($len1 != $len3) || ($len2 != $len3)) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors/$len3 journals.");return []}
    for (my $i = 0; $i<$len1; $i++) {
      # these are already utf8
      $authors[$i] = decode_entities($authors[$i]);
      $atitles[$i] = decode_entities($atitles[$i]);
      $jtitles[$i] = decode_entities($jtitles[$i]);
      my $temp = $authors[$i].", ".$atitles[$i].", ".$jtitles[$i];
      my $r = Bib::CrossRef->new;
      $r->parse_text($temp);
      $jtitles[$i] =~ m/\s([0-9][0-9][0-9][0-9])$/;
      my $year=$1;
      if ((length($year)==4) && ($r->date ne $1)) {
        $r->_setscore(0.5); # mismatch in year, probably bad
      }
      $self->append($r);



( run in 1.135 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )