CAM-PDF

 view release on metacpan or  search on metacpan

lib/CAM/PDF/PageText.pm  view on Meta::CPAN

package CAM::PDF::PageText;

use 5.006;
use warnings;
use strict;

our $VERSION = '1.60';

=head1 NAME

CAM::PDF::PageText - Extract text from a PDF page content tree

=head1 SYNOPSIS

   my $pdf = CAM::PDF->new($filename);
   my $pageone_tree = $pdf->getPageContentTree(1);
   print CAM::PDF::PageText->render($pageone_tree);

=head1 DESCRIPTION

This module attempts to extract sequential text from a PDF page.  This
is not a robust process, as PDF text is graphically laid out in
arbitrary order.  This module uses a few heuristics to try to guess
what text goes next to what other text, but it may be fooled easily
by, say, subscripts, non-horizontal text, changes in font, form
fields, etc.

All those disclaimers aside, it is useful for a quick dump of text
from a simple PDF file.

=head1 LICENSE

Same as L<CAM::PDF>

=head1 FUNCTIONS

=over

=item $pkg->render($pagetree)

=item $pkg->render($pagetree, $verbose)

Turn a page content tree into a string.  This is a class method that
should be called like:

   CAM::PDF::PageText->render($pagetree);

=cut

sub render
{
   my $pkg      = shift;
   my $pagetree = shift;
   my $verbose  = shift;

   my $str          = q{};
   my @stack        = ([@{$pagetree->{blocks}}]);
   my $in_textblock = 0;

   ## The stack is a list of blocks.  We do depth-first on blocks, but
   ## we must be sure to traverse the children of the blocks in their
   ## original order.

   while (@stack > 0)
   {
      # keep grabbing the same node until it's empty
      my $node = $stack[-1];
      if (ref $node)
      {
         if (@{$node} > 0)   # Still has children?
         {
            my $block = shift @{$node};   # grab the next child
            if ($block->{type} eq 'block')
            {
               if ($block->{name} eq 'BT')
               {
                  # Insert a flag on the stack to say when we leave the BT block
                  push @stack, 'BT';
                  $in_textblock = 1;
               }
               push @stack, [@{$block->{value}}];  # descend
            }
            elsif ($in_textblock)



( run in 1.685 second using v1.01-cache-2.11-cpan-39bf76dae61 )