App-Followme
view release on metacpan or search on metacpan
share/clean.pl view on Meta::CPAN
#!/usr/bin/perl
use strict;
use warnings;
use IO::File;
use Getopt::Std;
use App::Followme::Web;
# False outside of body tags, true inside
our $in_body;
use constant PATTERNS => <<EOQ;
# Rework quote paragraphs
(<p><font><b>*</b></font></p>) -> <p>(<b>*</b><br/>)</p>
# Remove cruft from comment paragraphs
<p><font>*</font></p> -> <p>*</p>
# Remove paragraphs containing breaks
<p><br/>*</p> ->
# Title
<p><font><font size="6"><b>*</b></font></font></p> -> <h1>*</h1>
# Subtitle
<h1>*</h1> -> <h2>*</h2>
# Remove font tags
<font>*</font> -> *
# Remove other span tags
<span>*</span> -> *
# Remove document styling
<style>*</style> ->
# Remove metadata tags
<meta> ->
EOQ
use constant TEMPLATE => <<EOQ;
<html>
<head>
<!-- section meta -->
<title></title>
<!-- endsection meta -->
</head>
<body>
<!-- section primary -->
<!-- endsection primary-->
<!-- section secondary -->
<!-- endsection secondary-->
</body>
</html>
EOQ
use constant METADATA => <<EOQ;
<title></title>
<meta name="description" content="">
<meta name="keywords" content="">
<meta name="author" content="">
EOQ
#----------------------------------------------------------------------
# Main
my %opt;
getopts('r', \%opt);
if ($opt{r}) {
revert_files(@ARGV);
} else {
my $patterns = read_patterns(PATTERNS);
clean_files($patterns, @ARGV);
}
#----------------------------------------------------------------------
# Add sections to cleaned file
sub add_sections {
my ($text, $template) = @_;
my @hold;
my @output;
my @input = web_split_at_tags($text);
foreach my $tag ($template =~ /<[^>]*>/g) {
if ($tag =~ /^<!--\s*end/) {
push(@hold, $tag);
} elsif ($tag =~ /^<!--/) {
if (@hold) {
share/clean.pl view on Meta::CPAN
die "Couldn't write file ($file): $!\n" unless $fd;
print $fd $text;
close($fd);
}
__END__
=encoding utf-8
=head1 NAME
clean.pl - Remove unwanted html from file
=head1 SYNOPSIS
perl clean.pl html_file [html_file]*
=head1 DESCRIPTION
Parse an html file into html tags and text. Use the patterns in a pattern
string to match and replace combinations of tags and text. Save the existing
file by appending a ~ to the file name and save the modified file under
the old name. Use the -r option to restore the original file under its old name.
In addition to the replacements in the patterns string, this script strips
options from html tags not mentioned in the patterns string, fills pararagraph
lines to 65 characters, and deletes consecutive blank lines.
=head1 CONFIGURATION
The PATTERNS string has two patterns on each line separated by
the string "->". The first pattern is matched in the html file and replaced
using the second pattern. The second pattern can be blank. In this case the
html matched by the first pattern is replace by an empty line. The file can
contain blank and comment lines in addition to pattern lines. Comment lines
start with a sharp character (#).
Patterns contain a combination to tags and text. Partial matches are done on
tage. For example, <div> matches any div tag while <div class="article">
only matches tags of the article class and <div class=*> matches any div tag
with a class option. A single star (*) matches any combination of text or tags.
Any other text is matched as a regular expression. Patterns can be grouped
inside of parentheses, which mean match one or more consecutive instances of
the pattern.
The replacement pattern contain the tags that replace the match and text. A
star in the pattern is replaced by the text matched by the corresponding star
in the match pattern. If the star is in a tag otion, set the option to
the value in the corresponding option in the match pattern. Any other text is
put in the output verbatim. Patterns can be grouped inside parentheses. The
pattern will be copied as many times as were matched in the correpsonding set
of parentheses in the match pattern.
The format is probably best explained by example:
# Replace strong tags by bold
<strong>*</strong> -> <b>*</b>
# Replace center tags
<center>*</center> -> <div class="centered">*</div>
# Remove font tags
<font>*</font> -> *
# Remove style tag and its content (empty replacement)
<style>*</style> ->
# Replace div tags with p tags, keep class
<div class=*>*</div> -> <p class=*>*</p>
# Replace list with breaks
<ul>(<li>*</li>)</ul> -> (*<br>)
# Remove page numbering (empty replacement)
<p>^\d+$</p> ->
=head1 LICENSE
Copyright (C) Bernie Simon.
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 AUTHOR
Bernie Simon E<lt>bernie.simon@gmail.comE<gt>
=cut
( run in 0.782 second using v1.01-cache-2.11-cpan-5735350b133 )