Biblio-Document-Parser

 view release on metacpan or  search on metacpan

lib/Biblio/Document/Parser/Brody.pm  view on Meta::CPAN

		warn "No header/footers were found\n" if $DEBUG;
	}

	# Kill any bad chars
#	local *lat2uni = convertor( 'latin1', 'utf8' );
#	lat2uni(\$BIBL);

#	if( $BIBL =~ /$RE_BOR/mi ) {
#		$BIBL = $';
#	} else {
#		croak "FATAL: Unable to find reference section\n";
#	}


	my @REFS;

	# Attempt to find the reference section
	while( !@REFS && ($BIBL =~ /$RE_BOR/mi) && ($BIBL = $') ) {
		my $c = 0;

		# Count the number of occurences of [\d] over the next 2k of data or so
		my $buffer = substr($BIBL, 0, 2048);
		$c = 0;
		while($buffer =~ m/^\s*\[\d+\]/mog) { last if ++$c == 5 }
		if( $c >= 5 ) {
	warn "Style = numbered square ([1])\n" if $DEBUG;
			last if (@REFS = &style_numbered_square($BIBL));
		}

		# How about 1. notation
	#	$buffer = substr($BIBL, 0, 2046);
		$c = 0;
		while($buffer =~ m/^\s*(\d+)\./mog) { last if ++$c == 5 }
		if( $c >= 5 ) {
	warn "Style = numbered (1.)\n" if $DEBUG;
	#		$BIBL =~ s/^\s*(\d+)\./\[$1\]/mg;
			last if (@REFS = &style_numbered($BIBL));
		}

		# Now we're getting desperate - hopefully its a name list followed by year
		# $buffer = substr($BIBL, 0, 2048);
		$c = 0;
		while($buffer =~ m/^$RE_NAME_LIST_CHARS{10,40}[^\d\-]19|20\d{2}[^\d\-]/mog) { last if ++$c == 5 }
		if( $c >= 5 ) {
	warn "Style = years\n" if $DEBUG;
			last if (@REFS = &style_years($BIBL));
		}

#		if( @REFS ) {
#			last;
#		} elsif( $BIBL =~ /$RE_BOR/mi ) {
#	warn "Skipping section ...\n" if $DEBUG;
#			$BIBL = $';
#		} else {
#			last;
#		}
	}

	for( my $i = 0; $i < @REFS; $i++ ) {
		my $ref = $REFS[$i] or next;
#		$REFS[$i] = "[" . ($i+1) . "] " . unicode_string($ref);
		$REFS[$i] = "[" . ($i+1) . "] " . $ref;
	}

	return grep { defined($_) && length($_) } @REFS;
}

#my ($BIBL, $buffer);
#$BIBL = '';

#my $lc = 0;

#die "FATAL: Input has gone beyond $MAX_SIZE byte limit" if read(STDIN,$BIBL,$MAX_SIZE) == $MAX_SIZE;

#die "Empty input" unless length($BIBL);

#while( read(STDIN,$buffer,4096) ) {
#	$BIBL .= $buffer;
#	die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE;
#}


#while( <> ) {
#	s/\f/\n\n/sg;
#	$BIBL = $_ . $BIBL;
#	die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE;
#	if( $_ =~ /^(?:\n\s*){3}/ ) {
#		# Regexp matches for the end of the string are *really* bad performance
#		# Lines are in reverse order!
#		if( $BIBL =~ /^(?:\n\s*){3}([^\n]{0,40}\w+[^\n]{0,40})(?:\n\s*){2}/os ) {
#			$HEADERS{header_to_regexp($1)}++;
#		}
#	}
#}

# Put the lines back in-order
#my @lines = split(/\n/,$BIBL);
#$BIBL = '';
#for(@lines) {
#	$BIBL = $_ . "\n" . $BIBL;
#}

# Read in the document
#while( read(STDIN,$buffer,4096) ) {
#	if( length($BIBL) > $MAX_SIZE ) {
#		die "FATAL: Input has gone beyond $MAX_SIZE Bytes limit\n";
#	}
#	$BIBL .= $buffer;
#}

#print "Ref section:\n", $BIBL;

# Change to utf8
#use utf8;

#### REMAINING FUNCTIONS ARE INTERNAL OR DEPRECATED ####

sub end_of_references {
	my $ref = shift;
	if( $$ref =~ /${RE_EOR}/im ||
	    $$ref =~ /^\s*acknowledgements:/im ) {

lib/Biblio/Document/Parser/Brody.pm  view on Meta::CPAN


	# Join refs with the previous reference if they are very short or are quite short and don't start with ...(year)
	for( my $i = 1; $i < @REFS; $i++ ) {
		my $l = $REFS[$i];
		$l =~ s/\s+//sg;
		if( (length($l) < 30) ||
		    (length($l) < 50 && $REFS[$i] !~ /^$RE_NAME_LIST_CHARS{10,40}[^\d\-](\d{4})[^\d\-]/s) ) {
			$REFS[$i-1] .= $REFS[$i];
			splice(@REFS,$i,1);
			$i--;
		}
	}

	# If we find 3 sequential references without years near the beginning we probably have trailing garbage
	my $lc = 0;
	for( my $i = 10; $i < @REFS; $i++ ) {
		if( $REFS[$i] =~ /^\D{10,50}19|20\d{2}/s ) {
			$lc = 0;
		} else {
			$lc++;
		}
		if( $lc == 3 ) {
			splice(@REFS,$i-2);
		}
	}

	# Remove lines without any numbers that are quite long (excluding spaces)
	for( my $i = 0; $i < @REFS; $i++ ) {
		my $l = $REFS[$i];
		$l =~ s/\s+//sg;
		if( length($l) > 100 && $REFS[$i] !~ /\d/ ) {
			splice(@REFS,$i,1);
		}
	}

	# Prettify
	map { $_ =~ s/\s+/ /sg; $_ =~ s/^\s+//; $_ =~ s/\s+$//s; } @REFS;

# This doesn't work - names are too icky
	# Now go back in and split anything that looks like name, x (year)
#	for( my $i = 0; $i < @REFS; $i++ ) {
#		my @srefs = grep { $_ =~ /\S/ } split(/((?:[a-zA-Z\-\'\.]+\s*,\s*[a-zA-Z\.]+.{0,7})+\d{4}\b)/, $REFS[$i]);
#		next unless @srefs > 2;
#print "Split reference:\n",
#	(map { "PART: \"$_\"\n" } @srefs), "\n";
#	}
#die;

	return @REFS;
}

sub header_to_regexp {
	my $header = shift;
	$header =~ s/([\\\|\(\)\[\]\.\*\+\?\{\}])/\\$1/g;
       	$header =~ s/\s+/\\s+/g;
       	$header =~ s/\d+/\\d+/g;
	return $header;
       	return q/(?:\n\s*){3}(/.$header.q/)(?:\n\s*){2}/;
}

#sub unicode_string {
#	$_ = shift();
#	s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg;
#	s/([\x80-\xff])/sprintf("&#x%04x;",ord($1))/seg;
#	return $_;
#}

1;

__END__

=back

=head1 AUTHOR

Written by Tim Brody <tdb01r@ecs.soton.ac.uk>



( run in 0.493 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )