CAM-PDF
view release on metacpan or search on metacpan
lib/CAM/PDF/PageText.pm view on Meta::CPAN
package CAM::PDF::PageText;
use 5.006;
use warnings;
use strict;
our $VERSION = '1.60';
=head1 NAME
CAM::PDF::PageText - Extract text from PDF page tree
=head1 SYNOPSIS
my $pdf = CAM::PDF->new($filename);
my $pageone_tree = $pdf->getPageContentTree(1);
print CAM::PDF::PageText->render($pageone_tree);
=head1 DESCRIPTION
This module attempts to extract sequential text from a PDF page. This
is not a robust process, as PDF text is graphically laid out in
arbitrary order. This module uses a few heuristics to try to guess
what text goes next to what other text, but may be fooled easily by,
say, subscripts, non-horizontal text, changes in font, form fields,
etc.
All those disclaimers aside, it is useful for a quick dump of text
from a simple PDF file.
=head1 LICENSE
Same as L<CAM::PDF>
=head1 FUNCTIONS
=over
=item $pkg->render($pagetree)
=item $pkg->render($pagetree, $verbose)
Turn a page content tree into a string. This is a class method that
should be called like:
CAM::PDF::PageText->render($pagetree);
=cut
sub render
{
my $pkg = shift;
my $pagetree = shift;
my $verbose = shift;
my $str = q{};
my @stack = ([@{$pagetree->{blocks}}]);
my $in_textblock = 0;
## The stack is a list of blocks. We do depth-first on blocks, but
## we must be sure to traverse the children of the blocks in their
## original order.
while (@stack > 0)
{
# keep grabbing the same node until it's empty
my $node = $stack[-1];
if (ref $node)
{
if (@{$node} > 0) # Still has children?
{
my $block = shift @{$node}; # grab the next child
if ($block->{type} eq 'block')
{
if ($block->{name} eq 'BT')
{
# Insert a flag on the stack to say when we leave the BT block
push @stack, 'BT';
$in_textblock = 1;
}
push @stack, [@{$block->{value}}]; # descend
}
elsif ($in_textblock)
( run in 1.685 second using v1.01-cache-2.11-cpan-39bf76dae61 )