Alvis-Convert
view release on metacpan or search on metacpan
bin/html2plain view on Meta::CPAN
# Parse the command line. A failed parse prints the usage text and exits
# with status 2.
unless (GetOptions('help|?'                             => \$PrintHelp,
                   'man'                                => \$PrintManual,
                   'warnings!'                          => \$Warnings,
                   'html-ext=s'                         => \$HTMLSuffix,
                   'assert-html!'                       => \$AssertHTML,
                   'symbolic-char-entities-to-chars!'   => \$ConvCharEnts,
                   'numerical-char-entities-to-chars!'  => \$ConvNumEnts,
                   'source-encoding=s'                  => \$SourceEncoding,
                   'clean-whitespace!'                  => \$CleanWS,
                   'assert-assumptions!'                => \$AssertAss,
                   'out-dir=s'                          => \$ODir,
                   'out-ext=s'                          => \$OutSuffix,
                   'N-per-out-dir=s'                    => \$NPerOurDir))
{
    pod2usage(2);
}

# --help and --man both end the program after printing documentation.
$PrintHelp   and pod2usage(1);
$PrintManual and pod2usage(-exitstatus => 0, -verbose => 2);

# Exactly one positional argument is expected: the source directory.
unless (@ARGV == 1)
{
    pod2usage(1);
}
my $SDir = shift @ARGV;

# Flush the currently selected output handle after every write.
$| = 1;

# The converter object shared by the subs below. The command-line switches
# are forwarded as given; the remaining settings are fixed here.
my $C = Alvis::HTML->new(
    alvisKeep               => 1,
    alvisRemove             => 1,
    obsolete                => 1,
    proprietary             => 1,
    xhtml                   => 1,
    wml                     => 1,
    keepAll                 => 0,
    assertHTML              => $AssertHTML,
    convertCharEnts         => $ConvCharEnts,
    convertNumEnts          => $ConvNumEnts,
    sourceEncoding          => $SourceEncoding,
    cleanWhitespace         => $CleanWS,
    assertSourceAssumptions => $AssertAss
);

# Paths already handled by _parse_entries().
my %Seen;

# Counter read by _output_plain() when it builds the output directory name.
my $outputN = 0;

# Run the conversion; a false return value is fatal.
_convert_collection($SDir, {htmlSuffix => $HTMLSuffix})
    or die("Conversion failed. " . $C->errmsg());
# Recursively scans filesystem entries and collects the HTML files among them.
#
# Arguments:
#   $entries      - array ref of paths (files and/or directories) to examine
#   $options      - hash ref; only {htmlSuffix} is read: the suffix (without
#                   the dot) that identifies an HTML file
#   $html_entries - hash ref filled in place: for every matching file,
#                   $html_entries->{<path minus ".suffix">}{htmlF} is set to
#                   the file's path
#
# The file-level %Seen hash ensures each path is processed only once, and
# the file-level $Warnings flag controls whether skipped entries are
# reported. The return value carries no meaning.
sub _parse_entries
{
    my ($entries, $options, $html_entries) = @_;

    # Core module, loaded locally so this sub does not rely on the file's
    # import list.
    require File::Glob;

    for my $e (@$entries)
    {
        next if $Seen{$e};
        $Seen{$e} = 1;

        if (-d $e)
        {
            # bsd_glob() treats the pattern as a single pattern, whereas the
            # built-in glob() splits it on whitespace and therefore never
            # descends into directories whose names contain spaces.
            my @children = File::Glob::bsd_glob("$e/*");
            _parse_entries(\@children, $options, $html_entries);
            next;
        }

        # Split at the last dot of the file name itself. The suffix may not
        # contain "/", so a dot in a parent directory name ("dir.d/README")
        # is not mistaken for the start of a suffix.
        my ($basename, $suffix) = $e =~ m{\A(.*)\.([^./]+)\z}s;
        if (!defined($suffix))
        {
            warn "Skipping non-suffixed non-directory entry \"$e\".\n" if
                $Warnings;
            next;
        }

        if ($suffix eq $options->{htmlSuffix})
        {
            $html_entries->{$basename}{htmlF} = $e;
        }
    }

    return;
}
# Converts every HTML file found under a directory tree to plain text.
#
# Arguments:
#   $root_dir - directory whose contents are scanned (recursively, through
#               _parse_entries())
#   $options  - hash ref handed on to _parse_entries(), which reads
#               {htmlSuffix}
#
# Each HTML file is read, passed to $C->clean(), and the resulting text is
# handed to _output_plain(). A file that cannot be opened, cleaned or output
# is skipped (with a warning where the original code warned) and processing
# continues with the next one.
#
# Returns 1 in all cases.
sub _convert_collection
{
    my ($root_dir, $options) = @_;

    # Core module, loaded locally so this sub does not rely on the file's
    # import list. bsd_glob() does not split the pattern on whitespace the
    # way the built-in glob() does, so a root directory with spaces in its
    # name is listed correctly.
    require File::Glob;
    my @entries = File::Glob::bsd_glob("$root_dir/*");

    my %html_entries;
    %Seen = ();

    my $progress_msg = "Parsing the source directory entries...";
    print "$progress_msg\r";
    _parse_entries(\@entries, $options, \%html_entries);
    # Overwrite the whole progress message, not just its first character.
    print((' ' x length($progress_msg)) . "\r");

    # Sorted so that the files are processed in the same order on every run.
    for my $base_name (sort keys %html_entries)
    {
        if (!exists($html_entries{$base_name}{htmlF}))
        {
            warn "No HTML file for basename \"$base_name\".\n" if
                $Warnings;
            next;
        }
        my $html_file = $html_entries{$base_name}{htmlF};

        # Three-argument open with a lexical handle: the file name is never
        # interpreted as an open mode, and no global handle is left behind.
        my $fh;
        if (!open($fh, '<', $html_file))
        {
            warn "Unable to open \"$html_file\".\n";
            next;
        }
        # Stays undefined for an empty file, exactly as before.
        my $html_txt;
        while (defined(my $line = <$fh>))
        {
            $html_txt .= $line;
        }
        close($fh);

        # clean() is called in list context; only its first return value is
        # used here.
        my ($plain_txt) = $C->clean($html_txt);
        if (!defined($plain_txt))
        {
            warn "Getting the plain text for basename \"$base_name\" failed. " .
                $C->errmsg() if
                $Warnings;
            $C->clearerr();
            next;
        }

        if (!_output_plain($plain_txt))
        {
            warn "Outputting the Alvis records for base name \"$base_name\" failed. " . $C->errmsg() if
                $Warnings;
            $C->clearerr();
            next;
        }
    }

    return 1;
}
sub _output_plain
{
my $plain_txt=shift;
my $out_f;
my $dir=$ODir . '/' .
int($outputN / $NPerOurDir);
( run in 1.175 second using v1.01-cache-2.11-cpan-39bf76dae61 )