EBook-Tools
view release on metacpan or search on metacpan
lib/EBook/Tools/IMP.pm view on Meta::CPAN
=head2 C<parse_text()>
Parses the C<' '> (DATA.FRK) resource loaded into
C<< $self->{resources} >>, if present, extracting the text into
C<< $self->{text} >>, uncompressing it if necessary. LZSS uncompression
will use the C<< $self->{lzsslengthbits} >> and
C<< $self->{lzssoffsetbits} >> attributes if present, and default to 3
length bits and 14 offset bits otherwise.
HTML headers and footers are then applied, and control codes replaced
with appropriate tags.
Returns the length of the raw uncompressed text before any HTML
modification was done, or undef if no text resource was found, the
text was encrypted, or no text could be extracted from the resource.
=cut
sub parse_text :method
{
my $self = shift;
# (caller(0))[3] is the name of the currently executing sub; it is used
# as a prefix in the diagnostics below.
my $subname = (caller(0))[3];
# Refuse to run unless invoked on a reference (i.e. an object), since
# everything below reads and writes $self->{...}.
croak($subname . "() called as a procedure!\n") unless(ref $self);
debug(2,"DEBUG[",$subname,"]");
# No text resource loaded: nothing to parse. The bare return yields
# undef in scalar context and an empty list in list context.
return unless($self->{resources}->{' '});
# NOTE(review): presumably this fills in the compression, encryption and
# LZSS bit-width attributes read below -- confirm against
# parse_resource_cm().
$self->parse_resource_cm();
# '||' falls back to the default whenever the attribute is false, so an
# unset attribute and an explicit 0 both end up as 3 / 14.
my $lengthbits = $self->{lzsslengthbits} || 3;
my $offsetbits = $self->{lzssoffsetbits} || 14;
my $lzss = EBook::Tools::LZSS->new(lengthbits => $lengthbits,
offsetbits => $offsetbits,
windowstart => 1);
my $textref;
my $textlength;
# Encrypted text cannot be handled here; warn and return nothing.
if($self->{encryption})
{
warn($subname,"(): encrypted text not supported!\n");
return;
}
# $textref ends up pointing either at the LZSS-uncompressed text or
# directly at the raw resource data. uncompress() is expected to return
# a scalar reference, since the result is dereferenced below.
if($self->{compression})
{
$textref = $lzss->uncompress(\$self->{resources}->{' '}->{data});
}
else
{
$textref = \$self->{resources}->{' '}->{data};
}
# Length of the raw text before any HTML is added; this is the value
# returned on success.
$textlength = length($$textref);
if(!$textlength)
{
carp($subname,"(): no text extracted from DATA.FRK resource!\n");
return;
}
# Start the output with a fixed XHTML preamble. The heredoc is
# single-quoted, so nothing inside it is interpolated.
$self->{text} = <<'END';
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="CONTENT-TYPE" content="text/html; charset=windows-1252" />
END
# NOTE(review): the title is interpolated as-is, with no HTML escaping.
$self->{text} .= " <title>$self->{title}</title>\n";
$self->{text} .= "</head>\n<body>\n";
my $pos = 0;
# Replacement text keyed by character code. Codes mapped to the empty
# string are dropped from the output; codes not listed here (other than
# 0x0F, handled in the loop) are copied through unchanged.
my %ccharmap = (
0x0A => "\n" . '<br style="page-break-before: always" />', # supported!
0x0B => "\n<p>",
0x0D => "<br />\n",
0x0E => '', # Start of <table>, not yet supported
0x13 => '', # End of table cell </td>, not yet supported
0x14 => "\n<hr />\n",
0x8E => "é",
0xA0 => " ",
0xA5 => "•",
0xA8 => "®",
0xA9 => "©",
0xAA => "™",
0xAE => "Æ",
0xC7 => "«",
0xC8 => "»",
0xC9 => "…",
0xD0 => "–",
0xD1 => "—",
0xD2 => "“",
0xD3 => "”",
0xD4 => "‘",
0xD5 => "’",
0xE1 => "·",
);
# Walk the raw text one character at a time, appending the translated
# form of each character to $self->{text}.
while($pos < $textlength)
{
my $char = substr($$textref,$pos,1);
my $ord = ord($char);
if($ord == 0x0F) # Image
{
# Append whatever is stored for this text offset in
# $self->{offsetelements}.
# NOTE(review): there is no check that an entry exists for $pos; a
# missing entry would append undef -- confirm every 0x0F has one.
$self->{text} .= $self->{offsetelements}->{$pos};
}
elsif(defined $ccharmap{$ord})
{
$self->{text} .= $ccharmap{$ord};
}
else
{
$self->{text} .= $char;
}
$pos++;
}
$self->{text} .= "\n</body>\n</html>";
# Remove each shortest span delimited by a pair of 0x15 (resp. 0x16)
# characters, delimiters included. Under /x the spaces in the pattern
# are ignored. There is no /s, so '.' does not match a newline and a
# span containing one is left in place.
# NOTE(review): this runs after the map above has inserted newlines
# (e.g. for 0x0D) -- confirm that multi-line headers/footers are meant
# to survive.
$self->{text} =~ s/\x15 .*? \x15//gx; # Kill header - comment out?
$self->{text} =~ s/\x16 .*? \x16//gx; # Kill footer
return $textlength;
( run in 1.554 second using v1.01-cache-2.11-cpan-df04353d9ac )