Alvis-Convert
view release on metacpan or search on metacpan
bin/html2plain view on Meta::CPAN
{
my ($html_txt,$plain_txt,$header);
if (exists($html_entries{$base_name}{htmlF}))
{
my $html_txt;
if (!defined(open(F,"<$html_entries{$base_name}{htmlF}")))
{
warn "Unable to open \"$html_entries{$base_name}{htmlF}\".\n";
next;
}
while (my $l=<F>)
{
$html_txt.=$l;
}
close(F);
($plain_txt,$header)=$C->clean($html_txt);
if (!defined($plain_txt))
{
warn "Getting the plain text for basename \"$base_name\" failed. " .
$C->errmsg() if
$Warnings;
$C->clearerr();
next;
}
}
else
{
warn "No HTML file for basename \"$base_name\".\n" if
$Warnings;
next;
}
if (!&_output_plain($plain_txt))
{
warn "Outputting the Alvis records for base name \"$base_name\" failed. " . $C->errmsg() if
$Warnings;
$C->clearerr();
next;
}
}
return 1;
}
# Writes one plain-text record to the next numbered output file.
#
# The target path is "$ODir/<bucket>/$outputN.$OutSuffix", where <bucket>
# is int($outputN / $NPerOurDir).  The file is written through a :utf8
# layer.
#
# Parameter: the text to write.
# Returns:   1 on success, 0 on failure (a warning is issued first).
#            $outputN is advanced only when the record was written.
sub _output_plain
{
    my $plain_txt=shift;

    my $dir=$ODir . '/' . int($outputN / $NPerOurDir);

    # The first record of each bucket is responsible for creating the
    # bucket directory.  A directory that already exists is not an error.
    if ($outputN % $NPerOurDir==0 && !-d $dir && !mkdir($dir))
    {
        warn "Cannot create output directory \"$dir\": $!\n";
        return 0;
    }

    my $out_f=$dir . '/' . $outputN . '.' . $OutSuffix;

    # Lexical handle: closed automatically on every exit path and cannot
    # collide with a global OUT used elsewhere.
    my $out_fh;
    if (!open($out_fh,">:utf8",$out_f))
    {
        warn "Cannot open output file \"$out_f\": $!\n";
        return 0;
    }

    # Buffered write errors (e.g. a full disk) may only surface at close,
    # so both results are checked before the record counts as written.
    my $written=print {$out_fh} $plain_txt;
    my $closed=close($out_fh);
    if (!$written || !$closed)
    {
        warn "Cannot write output file \"$out_f\": $!\n";
        return 0;
    }

    $outputN++;
    print "$outputN\r";    # progress indicator on STDOUT

    # Explicit success value: the caller tests the return value, which
    # must not depend on whether the progress print above succeeded.
    return 1;
}
__END__
=head1 NAME
html2plain.pl - HTML to plain text converter
=head1 SYNOPSIS
html2plain.pl [options] [source directory ...]
Options:
--html-ext HTML file identifying filename extension
--out-ext output filename extension
--out-dir output directory
--N-per-out-dir # of records per output directory
--source-encoding the encoding of the HTML files
--[no]assert-html assert that the document is HTML
--[no]symbolic-char-entities-to-chars
convert symbolic character entities to UTF-8
characters
--[no]numerical-char-entities-to-chars
convert numerical character entities to UTF-8
characters
--[no]clean-whitespace remove redundant whitespace
--[no]assert-assumptions assert that the document is in UTF-8
before actually converting to plain text
--help brief help message
--man full documentation
--[no]warnings warnings output flag
=head1 OPTIONS
=over 8
=item B<--html-ext>
Sets the filename extension used to identify HTML files.
Default value: 'html'.
=item B<--out-ext>
Sets the output filename extension.
Default value: 'plain'.
=item B<--out-dir>
Sets the output directory. Default value: '.'.
( run in 0.676 second using v1.01-cache-2.11-cpan-140bd7fdf52 )