App-Followme

 view release on metacpan or  search on metacpan

share/clean.pl  view on Meta::CPAN

#!/usr/bin/perl

use strict;
use warnings;

use IO::File;
use Getopt::Std;

use App::Followme::Web;

# False outside of body tags, true inside
our $in_body;

use constant PATTERNS => <<EOQ;
# Rework quote paragraphs
(<p><font><b>*</b></font></p>)                      ->   <p>(<b>*</b><br/>)</p>
# Remove cruft from comment paragraphs
<p><font>*</font></p>                               ->   <p>*</p>
# Remove paragraphs containing breaks
<p><br/>*</p>                                       ->
# Title
<p><font><font size="6"><b>*</b></font></font></p>  ->   <h1>*</h1>
# Subtitle
<h1>*</h1>                                          ->   <h2>*</h2>
# Remove font tags
<font>*</font>                                      ->   *
# Remove other span tags
<span>*</span>                                      ->   *
# Remove document styling
<style>*</style>                                    ->
# Remove metadata tags
<meta>                                              ->
EOQ

use constant TEMPLATE => <<EOQ;
<html>
<head>
<!-- section meta -->
<title></title>
<!-- endsection meta -->
</head>
<body>
<!-- section primary -->
<!-- endsection primary-->
<!-- section secondary -->
<!-- endsection secondary-->
</body>
</html>
EOQ

use constant METADATA => <<EOQ;
<title></title>
<meta name="description" content="">
<meta name="keywords" content="">
<meta name="author" content="">
EOQ

#----------------------------------------------------------------------
# Main

my %opt;
getopts('r', \%opt);

if ($opt{r}) {
    revert_files(@ARGV);
} else {
    my $patterns = read_patterns(PATTERNS);
    clean_files($patterns, @ARGV);
}

#----------------------------------------------------------------------
# Add sections to cleaned file

sub add_sections {
    my ($text, $template) = @_;

    my @hold;
    my @output;
    my @input = web_split_at_tags($text);

    foreach my $tag ($template =~ /<[^>]*>/g) {
        if ($tag =~ /^<!--\s*end/) {
            push(@hold, $tag);

        } elsif ($tag =~ /^<!--/) {
            if (@hold) {

share/clean.pl  view on Meta::CPAN

    die "Couldn't write file ($file): $!\n" unless $fd;

    print $fd $text;
    close($fd);
}

__END__

=encoding utf-8

=head1 NAME

clean.pl - Remove unwanted html from file

=head1 SYNOPSIS

    perl clean.pl html_file [html_file]*

=head1 DESCRIPTION

Parse an html file into html tags and text. Use the patterns in a pattern
string to match and replace combinations of tags and text. Save the existing 
file by appending a ~ to the file name and save the modified file under 
the old name. Use the -r option to restore the original file under its old name.

In addition to the replacements in the patterns string, this script strips 
options from html tags not mentioned in the patterns string, fills pararagraph
lines to 65 characters, and deletes consecutive blank lines.

=head1 CONFIGURATION

The PATTERNS string has two patterns on each line separated by
the string "->". The first pattern is matched in the html file and replaced
using the second pattern. The second pattern can be blank. In this case the
html matched by the first pattern is replace by an empty line. The file can
contain blank and comment lines in addition to pattern lines. Comment lines
start with a sharp character (#). 

Patterns contain a combination to tags and text. Partial matches are done on 
tage. For example, <div> matches any div tag while <div class="article"> 
only matches tags of the article class and <div class=*> matches any div tag
with a class option. A single star (*) matches any combination of text or tags. 
Any other text is matched as a regular expression. Patterns can be grouped 
inside of parentheses, which mean match one or more consecutive instances of 
the pattern.

The replacement pattern contain the tags that replace the match and text. A
star in the pattern is replaced by the text matched by the corresponding star
in the match pattern. If the star is in a tag otion, set the option to 
the value in the corresponding option in the match pattern. Any other text is 
put in the output verbatim. Patterns can be grouped inside parentheses. The 
pattern will be copied as many times as were matched in the correpsonding set 
of parentheses in the match pattern.

The format is probably best explained by example:

    # Replace strong tags by bold
    <strong>*</strong> -> <b>*</b>
    # Replace center tags
    <center>*</center> -> <div class="centered">*</div>
    # Remove font tags
    <font>*</font> -> *
    # Remove style tag and its content (empty replacement)
    <style>*</style> ->
    # Replace div tags with p tags, keep class
    <div class=*>*</div> -> <p class=*>*</p>
    # Replace list with breaks
    <ul>(<li>*</li>)</ul> -> (*<br>)
    # Remove page numbering (empty replacement)
    <p>^\d+$</p> ->

=head1 LICENSE

Copyright (C) Bernie Simon.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 AUTHOR

Bernie Simon E<lt>bernie.simon@gmail.comE<gt>

=cut



( run in 0.782 second using v1.01-cache-2.11-cpan-5735350b133 )