XHTML results from the CPAN

EBook-Tools

=head2 C<parse_text()>

Parses the C<'    '> (DATA.FRK) resource loaded into
C<< $self->{resources} >>, if present, extracting the text into
C<< $self->{text} >>, uncompressing it if necessary.  LZSS uncompression
will use the C<< $self->{lzsslengthbits} >> and
C<< $self->{lzssoffsetbits} >> attributes if present, and default to 3
length bits and 14 offset bits otherwise.

HTML headers and footers are then applied, and control codes replaced
with appropriate tags.

Returns the length of the raw uncompressed text before any HTML
modification was done, or undef if no text resource was found, the
text was encrypted, or no text could be extracted from the resource.

=cut

sub parse_text :method
{
    my $self = shift;
    my $subname = (caller(0))[3];
    croak($subname . "() called as a procedure!\n") unless(ref $self);
    debug(2,"DEBUG[",$subname,"]");

    return unless($self->{resources}->{'    '});

    $self->parse_resource_cm();
    my $lengthbits = $self->{lzsslengthbits} || 3;
    my $offsetbits = $self->{lzssoffsetbits} || 14;
    my $lzss = EBook::Tools::LZSS->new(lengthbits => $lengthbits,
                                       offsetbits => $offsetbits,
                                       windowstart => 1);
    my $textref;
    my $textlength;

    if($self->{encryption})
    {
        warn($subname,"(): encrypted text not supported!\n");
        return;
    }

    if($self->{compression})
    {
        $textref = $lzss->uncompress(\$self->{resources}->{'    '}->{data});
    }
    else
    {
        $textref = \$self->{resources}->{'    '}->{data};
    }
    $textlength = length($$textref);

    if(!$textlength)
    {
        carp($subname,"(): no text extracted from DATA.FRK resource!\n");
        return;
    }

    $self->{text} = <<'END';
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="CONTENT-TYPE" content="text/html; charset=windows-1252" />
END

    $self->{text} .= "  <title>$self->{title}</title>\n";
    $self->{text} .= "</head>\n<body>\n";

    my $pos = 0;
    my %ccharmap = (
        0x0A => "\n" . '<br style="page-break-before: always" />', # supported!
        0x0B => "\n<p>",
        0x0D => "<br />\n",
        0x0E => '',             # Start of <table>, not yet supported
        0x13 => '',             # End of table cell </td>, not yet supported
        0x14 => "\n<hr />\n",
        0x8E => "&eacute;",
        0xA0 => "&nbsp;",
        0xA5 => "&bull;",
        0xA8 => "&reg;",
        0xA9 => "&copy;",
        0xAA => "&trade;",
        0xAE => "&AElig;",
        0xC7 => "&laquo;",
        0xC8 => "&raquo;",
        0xC9 => "&hellip;",
        0xD0 => "&ndash;",
        0xD1 => "&mdash;",
        0xD2 => "&ldquo;",
        0xD3 => "&rdquo;",
        0xD4 => "&lsquo;",
        0xD5 => "&rsquo;",
        0xE1 => "&middot;",
        );

    while($pos < $textlength)
    {
        my $char = substr($$textref,$pos,1);
        my $ord = ord($char);

        if($ord == 0x0F)        # Image
        {
            $self->{text} .= $self->{offsetelements}->{$pos};
        }
        elsif(defined $ccharmap{$ord})
        {
            $self->{text} .= $ccharmap{$ord};
        }
        else
        {
            $self->{text} .= $char;
        }
        $pos++;
    }
    $self->{text} .= "\n</body>\n</html>";
    $self->{text} =~ s/\x15 .*? \x15//gx;        # Kill header - comment out?
    $self->{text} =~ s/\x16 .*? \x16//gx;        # Kill footer
    return $textlength;
( run in 1.554 second using v1.01-cache-2.11-cpan-df04353d9ac )