Bib-Tools
view release on metacpan or search on metacpan
lib/Bib/Tools.pm view on Meta::CPAN
############################################################
#
# Bib::Tools - For managing collections of Bib::CrossRef references.
#
############################################################
package Bib::Tools;
use 5.8.8;
use strict;
use warnings;
no warnings 'uninitialized';
require Exporter;
use Bib::CrossRef;
use LWP::UserAgent;
use JSON qw/decode_json/;
use URI::Escape qw(uri_escape_utf8 uri_unescape);
use HTML::Entities qw(decode_entities encode_entities);
use HTML::TreeBuilder::XPath;
use XML::Simple qw(XMLin);
use BibTeX::Parser qw(new next);
use IO::File;
use vars qw($VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS @ISA);
#use LWP::Protocol::https;
#use Data::Dumper;
$VERSION = '0.17';
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(
sethtml clearhtml add_details add_google add_google_search add_orcid add_fromfile add_dblp add_pubmed
send_resp print print_nodoi num num_nodoi getref getref_nodoi append add_bibtex
);
%EXPORT_TAGS = (all => \@EXPORT_OK);
####################################################################################
sub new {
my $self;
# defaults
$self->{refs} = []; # the references
$self->{nodoi_refs} = [];
$self->{duprefs} = [];
$self->{html}=0;
$self->{ratelimit}=5; # limit of 5 crossref queries per sec
$self->{last} = {};
bless $self;
my $ratelimit = $_[1];
if (defined($ratelimit) && ($ratelimit>=0)) {$self->{ratelimit}=$ratelimit};
return $self;
}
####################################################################################
sub sethtml {
my $self = shift @_;
$self->{html}=1;
}
####################################################################################
sub clearhtml {
my $self = shift @_;
$self->{html}=0;
}
####################################################################################
sub _err {
my ($self, $str) = @_;
if ($self->{html}) {
print "<p style='color:red'>",$str,"</p>";
} else {
print $str,"\n";
}
}
####################################################################################
sub _split_duplicates {
# split list of references into three lists: one with refs that have unique doi's, one with no doi's
#and one with all the rest (with duplicate doi's)
my $self = shift @_;
my @refs=@{$self->{refs}};
my @newrefs;
foreach my $ref (@refs) {
my $doi = $ref->doi();
if (!defined($doi) || length($doi)==0) {push @{$self->{nodoi_refs}}, $ref; next; }# skip entries with no DOI
my $found = 0;
foreach my $ref2 (@newrefs) {
if ($ref2->doi() eq $doi) {
$found = 1;
}
}
if (!$found) {
push @newrefs, $ref;
} else {
push @{$self->{duprefs}}, $ref;
}
}
$self->{refs} = \@newrefs;
}
####################################################################################
sub append {
# add new reference to end of existing list
my $self = shift @_;
my $ref = shift @_;
push @{$self->{refs}}, $ref;
}
####################################################################################
sub add_details {
# given an array of raw strings, try to convert into paper references
my $self = shift @_;
foreach my $cites (@_) {
$self->{last} = Bib::CrossRef->new();
$self->{last}->parse_text($cites);
$self->append($self->{last});
sleep 1/(0.001+$self->{ratelimit}); # rate limit queries to crossref
}
$self->_split_duplicates();
}
####################################################################################
sub add_google {
# scrape paper details from google scholar personal page -- nb: no doi info on google, so use crossref.org to obtain this
# nb: doesn't work with google scholar search results
my $self = shift @_;
my $url = shift @_;
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0');
my $req = HTTP::Request->new(GET => $url);
my $res = $ua->request($req);
if ($res->is_success) {
my $tree= HTML::TreeBuilder::XPath->new;
$tree->parse($res->decoded_content);
my @atitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/a[@class="gsc_a_at"]');
my @authors=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][1]');
my @jtitles=$tree->findvalues('//tr[@class="gsc_a_tr"]/td/div[@class="gs_gray"][2]');
my $len1 = @atitles; my $len2 = @authors; my $len3 = @jtitles;
if (($len1 != $len2) || ($len1 != $len3) || ($len2 != $len3)) {$self->_err("Problem parsing google page: mismatched $len1 titles/$len2 authors/$len3 journals.");return []}
for (my $i = 0; $i<$len1; $i++) {
# these are already utf8
$authors[$i] = decode_entities($authors[$i]);
$atitles[$i] = decode_entities($atitles[$i]);
$jtitles[$i] = decode_entities($jtitles[$i]);
my $temp = $authors[$i].", ".$atitles[$i].", ".$jtitles[$i];
my $r = Bib::CrossRef->new;
$r->parse_text($temp);
$jtitles[$i] =~ m/\s([0-9][0-9][0-9][0-9])$/;
my $year=$1;
if ((length($year)==4) && ($r->date ne $1)) {
$r->_setscore(0.5); # mismatch in year, probably bad
}
$self->append($r);
( run in 1.135 second using v1.01-cache-2.11-cpan-0bb4e1dffa6 )