html2latex
view release on metacpan or search on metacpan
HTML/Latex.pm view on Meta::CPAN
"my_type" => \&my_handler,
To %types.
=head2 Add support in the configuration file.
The format of the configuration file is in XML and can be found above
under CONFIGURATION FILE. The default XML file is at the bottom of
Latex.pm under __DATA__. Basically, for every tag you want to use your
new handler, use <tag> as follows:
<tag name="TAG_NAME" type="my_type">
<tex>TEX_PARAMETER</tex>
</tag>
TAG_NAME is, of course, the tag name. "my_type" is the name of the
type you assigned your handler to. TEX_PARAMETER is the value that
gets placed under $tex in the example handler.
That's it. Now HTML::Latex should obey the new handler and behave
correctly.
=head1 NOTES
If you call html2latex() on several URLs, any filename given after a
URL will continue to use the latest HOST given. Also, files default
to index.html, regardless of what the server thinks. So, if you use:
html2latex(http://slashdot.org)
html2latex(foo.html)
html2latex(http://linuxtoday.net)
html2latex(bar.html)
html2latex() will try to grab http://slashdot.org/index.html,
http://slashdot.org/foo.html, http://linuxtoday.net/index.html, and
http://linuxtoday.net/bar.html
=head1 BUGS
* Anything between <TABLE> and <TR> and <TD> is ignored. I do not
* Anything between <OL> or <UL> and <LI> will not be ignored, but will
really mess Latex up.
=cut
################### END DOCUMENTATION #######################
################### BEGIN DEFINITIONS #######################
# Probe for the optional modules.  Each one that loads cleanly is
# flagged in %present so later code can check for it before relying
# on it.
foreach my $module ('URI', 'LWP::Simple', 'Image::Magick') {
    # require() on a string expects a file path, so turn Foo::Bar
    # into Foo/Bar.pm before loading it.
    (my $file = "$module.pm") =~ s{::}{/}g;
    eval { require $file };
    $present{$module} = 1 unless $@;
}
# Dispatch table: the configuration file assigns a "type" to every
# tag, and this hash maps each type name to the function that handles
# tags of that type.
my %types = (
    command     => \&command_handler,
    environment => \&environment_handler,
    single      => \&single_handler,
    ignore      => \&texify,
    other       => \&other_handler,
    kill        => sub { '' },          # always yields the empty string
    image       => \&image_handler,
    table       => \&table_handler,
    pre         => \&pre_handler,       # Experimental; don't use
);
# Some characters typed in HTML need to be altered to be correct in
# Latex. These substitutions must be done in this specific order. The
# list covers all the foreign characters and special ASCII characters
# that need to be altered. A * next to the comment means the
# replacement doesn't really work or is faked. If an entry is
# commented out, that means it doesn't work at all.
my @specials = (
['<!--.*-->' , '' ], #comments
['\$' , '\$' ],
['\\\\(?!\$)', "\$\\backslash\$"], #\
['<' , '$<$' ],
['>' , '$>$' ],
['&' , '\&' ],
['%' , '\%' ],
['#' , '\#' ],
['{' , '\{' ],
['}' , '\}' ],
['_' , '\_' ],
['\^' , '\^{}' ],
[chr(161), '!`' ], #¡
#[chr(162), '' ], #¢*
[chr(163), '{\\pounds}' ], #£
#[chr(164), '' ], #¤*
[chr(165), '{Y\hspace*{-1.4ex}--}'], #¥*
[chr(166), '$|$' ], #¦*
[chr(167), '{\\S}' ], #§
[chr(168), '\\"{}' ], #¨
[chr(169), '{\\copyright}' ], #©
[chr(170), '$^{\underline{a}}$'], #ª*
[chr(171), '<<' ], #«
[chr(172), '$\\neg$' ], #¬
[chr(173), '$-$' ], #
#[chr(174), '' ], #®*
[chr(175), '$^-$' ], #¯
[chr(176), '$^{\\circ}$' ], #°
[chr(177), '$\\pm$' ], #±
[chr(178), '$^2$' ], #²
[chr(179), '$^3$' ], #³
[chr(180), '$^\\prime$' ], #´
[chr(181), '$\\mu$' ], #µ
[chr(182), '{\P}' ], #¶
[chr(183), '$\cdot$' ], #·
[chr(184), ',' ], #¸*
[chr(185), '$^1$' ], #¹
[chr(186), '$^{\\underline{\\circ}}$'], #º*
HTML/Latex.pm view on Meta::CPAN
# Measures a table.
# <1> the reference to the HTML::Element table.
# returns (number of rows, number of cells in the widest row)
sub find_table_lengths {
    my $table = shift;

    # Only care about TR children.  grep needs a real test here: a bare
    # string such as 'tr' is a constant true value and filters nothing,
    # which let text children through and made ->content_list die on
    # them below.  ref() skips plain-text children (they are not objects).
    my @rows = grep { ref($_) && $_->tag eq 'tr' } $table->content_list;

    my $max_row_length = 0;
    foreach my $row (@rows) {
        # Only care about the cell children.  TH is counted along with
        # TD so that a header row is never measured narrower than it is.
        my @columns = grep { ref($_) && $_->tag =~ /^t[dh]$/ }
            $row->content_list;
        $max_row_length = @columns if @columns > $max_row_length;
    }

    #       row_number      column_number
    return (scalar(@rows), $max_row_length);
}
# returns an array of column alignments ('l', 'r' or 'c'), one per cell
# of the table's first row; the array is empty when there is no row.
# <1> the reference to the HTML::Element table.
sub create_column_alignments {
    my $table = shift;
    my @column_alignments;

    # Only the first TR child matters.  The block form of grep is needed:
    # a bare string such as 'tr' is always true and filters nothing, so
    # text children used to reach ->attr below and die.
    my ($first_row) = grep { ref($_) && $_->tag eq 'tr' } $table->content_list;
    return @column_alignments unless $first_row;

    # HTML alignment => Latex column specifier; anything else is centered.
    my %latex_for = (left => 'l', right => 'r');

    # Only care about the cell children (TD, plus TH for header rows).
    my @columns = grep { ref($_) && $_->tag =~ /^t[dh]$/ }
        $first_row->content_list;
    foreach my $column (@columns) {
        my $align = $column->attr('align');

        # Attribute values are case-insensitive in HTML (align="LEFT").
        $align = defined $align ? lc $align : '';
        push @column_alignments, $latex_for{$align} || 'c';
    }
    return @column_alignments;
}
# converts an image from jpeg or gif into png
# returns the (relative) name of the png file if successful, and
# nothing when the file cannot be found, is not a gif/jpeg/png, or
# cannot be converted; the caller then falls back on the alt text.
# <1> filename
sub convert_image {
    my $source = shift;
    my ($absolute, $relative) = get_uri($source);

    # We can't even get at the file.
    return unless $absolute and $relative;

    # Match the suffix case-insensitively so FOO.GIF / FOO.JPG are
    # recognised too.
    my $suffix_re = qr/\.(?:gif|png|jpe?g)/i;
    my ($aname, $apath, $asuffix) = fileparse($absolute, $suffix_re);
    my ($rname, $rpath)           = fileparse($relative, $suffix_re);
    my $type = lc $asuffix;

    if ($type eq '.gif' || $type eq '.jpg' || $type eq '.jpeg') {
        # Picture is of a convertable type
        unless ($present{'Image::Magick'}) {
            # No Image::Magick.  Warn user and return nothing.
            print $LOG "IMG: Can't convert $source without Image::Magick; using alt\n";
            return;
        }

        # convert it with Image::Magick
        require Image::Magick;
        my $aoutput = "$apath$aname.png";    #write to and return with png
        my $routput = "$rpath$rname.png";
        my $image   = Image::Magick->new();

        # Image::Magick methods report problems through their return
        # value: an empty string on success, otherwise a message with an
        # embedded numeric code.  Codes below 400 are warnings (the
        # operation still went through); 400 and above mean it failed.
        foreach my $step (['Read', $absolute], ['Write', $aoutput]) {
            my ($method, $file) = @$step;
            my $status = $image->$method($file);
            next unless "$status";
            my ($code) = "$status" =~ /(\d+)/;
            if (!defined $code or $code >= 400) {
                undef $image;
                print $LOG "IMG: Can't convert $source ($method failed: $status); using alt\n";
                return;
            }
            print $LOG "IMG: Warning while converting $source ($method: $status)\n";
        }
        undef $image;
        print $LOG "IMG: Converted $source to $routput\n";
        return $routput;
    } elsif ($type eq '.png') {
        # It's a PNG for sure.  Keep the suffix exactly as found so the
        # returned name still refers to the existing file.
        return "$rpath$rname$asuffix";
    } else {
        # so, it's not a png,gif, or jpg.  That means it's an invalid.
        print $LOG "IMG: Invalid picture type: $source; using alt\n";
        return;
    }
}
# If the filename is really a URL, then go grab it, translate
# the name to the local file directory, and return that file name.
# Otherwise, just return the thing you got in.
# <1> is the URI
# [2] can specify to change the default host for subsiquent calls
# return ($absolute_path_to_file,$relative_path_to_file);
# The relative can be absolute itself (same as $absolute).
{
#variables to stay the same across calls of get_uri. It's used in
#case we get image URLs with no host or scheme or path.
my $HOST = undef; #global value of current HOST
my $PATH = undef; #path inside host where we start
my $SCHEME = undef; #scheme originally used
sub get_uri {
my ($uri,$absolute_local,$relative_local);
$uri = $absolute_local = $relative_local = shift;
print $LOG "looking for $uri\n" if $options->{debug};
my $override = shift || 0; #absolute means that you replace $HOST and $PATH
if(-f $uri){
# it's an absolute local file.
$PATH = dirname($uri) if $override;
print $LOG "returning $uri\n" if $options->{debug};
return ($uri,$uri);
} elsif(defined($PATH) && -f "$PATH/$uri") {
#it must be a local relative image
print $LOG "returning $PATH/$uri\n" if $options->{debug};
return ("$PATH/$uri",$uri);
} elsif($uri =~ m|://|){
#It's a full URL
# Load necessary modules if you can.
unless($present{'URI'}) {
print $LOG "NEED: Can't handle request of $uri without module URI\n";
return;
}
require URI;
URI->import();
unless($present{'LWP::Simple'}) {
print $LOG "NEED: Can't handle request of $uri without module LWP::Simple\n";
( run in 0.963 second using v1.01-cache-2.11-cpan-99c4e6809bf )