Biblio-Document-Parser
view release on metacpan or search on metacpan
lib/Biblio/Document/Parser/Brody.pm view on Meta::CPAN
warn "No header/footers were found\n" if $DEBUG;
}
# Kill any bad chars
# local *lat2uni = convertor( 'latin1', 'utf8' );
# lat2uni(\$BIBL);
# if( $BIBL =~ /$RE_BOR/mi ) {
# $BIBL = $';
# } else {
# croak "FATAL: Unable to find reference section\n";
# }
my @REFS;
# Attempt to find the reference section
while( !@REFS && ($BIBL =~ /$RE_BOR/mi) && ($BIBL = $') ) {
my $c = 0;
# Count the number of occurences of [\d] over the next 2k of data or so
my $buffer = substr($BIBL, 0, 2048);
$c = 0;
while($buffer =~ m/^\s*\[\d+\]/mog) { last if ++$c == 5 }
if( $c >= 5 ) {
warn "Style = numbered square ([1])\n" if $DEBUG;
last if (@REFS = &style_numbered_square($BIBL));
}
# How about 1. notation
# $buffer = substr($BIBL, 0, 2046);
$c = 0;
while($buffer =~ m/^\s*(\d+)\./mog) { last if ++$c == 5 }
if( $c >= 5 ) {
warn "Style = numbered (1.)\n" if $DEBUG;
# $BIBL =~ s/^\s*(\d+)\./\[$1\]/mg;
last if (@REFS = &style_numbered($BIBL));
}
# Now we're getting desperate - hopefully its a name list followed by year
# $buffer = substr($BIBL, 0, 2048);
$c = 0;
while($buffer =~ m/^$RE_NAME_LIST_CHARS{10,40}[^\d\-]19|20\d{2}[^\d\-]/mog) { last if ++$c == 5 }
if( $c >= 5 ) {
warn "Style = years\n" if $DEBUG;
last if (@REFS = &style_years($BIBL));
}
# if( @REFS ) {
# last;
# } elsif( $BIBL =~ /$RE_BOR/mi ) {
# warn "Skipping section ...\n" if $DEBUG;
# $BIBL = $';
# } else {
# last;
# }
}
for( my $i = 0; $i < @REFS; $i++ ) {
my $ref = $REFS[$i] or next;
# $REFS[$i] = "[" . ($i+1) . "] " . unicode_string($ref);
$REFS[$i] = "[" . ($i+1) . "] " . $ref;
}
return grep { defined($_) && length($_) } @REFS;
}
#my ($BIBL, $buffer);
#$BIBL = '';
#my $lc = 0;
#die "FATAL: Input has gone beyond $MAX_SIZE byte limit" if read(STDIN,$BIBL,$MAX_SIZE) == $MAX_SIZE;
#die "Empty input" unless length($BIBL);
#while( read(STDIN,$buffer,4096) ) {
# $BIBL .= $buffer;
# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE;
#}
#while( <> ) {
# s/\f/\n\n/sg;
# $BIBL = $_ . $BIBL;
# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE;
# if( $_ =~ /^(?:\n\s*){3}/ ) {
# # Regexp matches for the end of the string are *really* bad performance
# # Lines are in reverse order!
# if( $BIBL =~ /^(?:\n\s*){3}([^\n]{0,40}\w+[^\n]{0,40})(?:\n\s*){2}/os ) {
# $HEADERS{header_to_regexp($1)}++;
# }
# }
#}
# Put the lines back in-order
#my @lines = split(/\n/,$BIBL);
#$BIBL = '';
#for(@lines) {
# $BIBL = $_ . "\n" . $BIBL;
#}
# Read in the document
#while( read(STDIN,$buffer,4096) ) {
# if( length($BIBL) > $MAX_SIZE ) {
# die "FATAL: Input has gone beyond $MAX_SIZE Bytes limit\n";
# }
# $BIBL .= $buffer;
#}
#print "Ref section:\n", $BIBL;
# Change to utf8
#use utf8;
#### REMAINING FUNCTIONS ARE INTERNAL OR DEPRECATED ####
sub end_of_references {
my $ref = shift;
if( $$ref =~ /${RE_EOR}/im ||
$$ref =~ /^\s*acknowledgements:/im ) {
lib/Biblio/Document/Parser/Brody.pm view on Meta::CPAN
# Join refs with the previous reference if they are very short or are quite short and don't start with ...(year)
for( my $i = 1; $i < @REFS; $i++ ) {
my $l = $REFS[$i];
$l =~ s/\s+//sg;
if( (length($l) < 30) ||
(length($l) < 50 && $REFS[$i] !~ /^$RE_NAME_LIST_CHARS{10,40}[^\d\-](\d{4})[^\d\-]/s) ) {
$REFS[$i-1] .= $REFS[$i];
splice(@REFS,$i,1);
$i--;
}
}
# If we find 3 sequential references without years near the beginning we probably have trailing garbage
my $lc = 0;
for( my $i = 10; $i < @REFS; $i++ ) {
if( $REFS[$i] =~ /^\D{10,50}19|20\d{2}/s ) {
$lc = 0;
} else {
$lc++;
}
if( $lc == 3 ) {
splice(@REFS,$i-2);
}
}
# Remove lines without any numbers that are quite long (excluding spaces)
for( my $i = 0; $i < @REFS; $i++ ) {
my $l = $REFS[$i];
$l =~ s/\s+//sg;
if( length($l) > 100 && $REFS[$i] !~ /\d/ ) {
splice(@REFS,$i,1);
}
}
# Prettify
map { $_ =~ s/\s+/ /sg; $_ =~ s/^\s+//; $_ =~ s/\s+$//s; } @REFS;
# This doesn't work - names are too icky
# Now go back in and split anything that looks like name, x (year)
# for( my $i = 0; $i < @REFS; $i++ ) {
# my @srefs = grep { $_ =~ /\S/ } split(/((?:[a-zA-Z\-\'\.]+\s*,\s*[a-zA-Z\.]+.{0,7})+\d{4}\b)/, $REFS[$i]);
# next unless @srefs > 2;
#print "Split reference:\n",
# (map { "PART: \"$_\"\n" } @srefs), "\n";
# }
#die;
return @REFS;
}
sub header_to_regexp {
my $header = shift;
$header =~ s/([\\\|\(\)\[\]\.\*\+\?\{\}])/\\$1/g;
$header =~ s/\s+/\\s+/g;
$header =~ s/\d+/\\d+/g;
return $header;
return q/(?:\n\s*){3}(/.$header.q/)(?:\n\s*){2}/;
}
#sub unicode_string {
# $_ = shift();
# s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg;
# s/([\x80-\xff])/sprintf("&#x%04x;",ord($1))/seg;
# return $_;
#}
1;
__END__
=back
=head1 AUTHOR
Written by Tim Brody <tdb01r@ecs.soton.ac.uk>
( run in 0.493 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )