App-Zapzi
view release on metacpan or search on metacpan
* (#1) Moved file slurp and other file operations to Path::Tiny
0.014 2014-05-10 18:35:44+07:00 Asia/Bangkok
* Removed requirement for GD module and system library
* Remove iframe tags when using HTML based transformers
* Improved error reporting for transformers
0.013 2014-03-05 20:29:23+07:00 Asia/Bangkok
* Remove font tag attributes when using HTMLExtractMain transformer
* Ignore errors from Text::Markdown about matching braces, as
processing can continue anyway
* Validate article IDs passed in on the command line
0.012 2013-10-29 15:09:24 Asia/Hong_Kong
* Added distributors to deliver eBooks after publication
* Copy distributor - copies file to another directory, e.g. an eReader
connected via USB cable
* Script distributor - runs a script with the eBook as parameter
t/lib/ZapziTestSchema.pm
t/release-distmeta.t
t/release-pod-coverage.t
t/release-pod-syntax.t
t/release-test-version.t
t/testfiles/bad-markdown.txt
t/testfiles/distribute-script-echo.pl
t/testfiles/distribute-script-error.pl
t/testfiles/empty.html
t/testfiles/empty.txt
t/testfiles/html-font.html
t/testfiles/html-fragment.html
t/testfiles/html-links.html
t/testfiles/html-no-title.html
t/testfiles/html-two-titles.html
t/testfiles/html-utf8.html
t/testfiles/sample.html
t/testfiles/sample.pm
t/testfiles/sample.txt
t/testfiles/sample.unknown
t/testfiles/ws-and-long-lines.txt
lib/App/Zapzi/Transformers/HTMLExtractMain.pm view on Meta::CPAN
sub _extract_html
{
    my ($self, $raw_html) = @_;

    # Ask HTML::ExtractMain for its result in 'tree' form rather than
    # as a string, so the clean-up passes below can work on it.
    my $tree = HTML::ExtractMain::extract_main_html(
        $raw_html,
        output_type => 'tree',
    );

    # A false result is handed back to the caller unchanged; there is
    # nothing to clean up in that case.
    return $tree unless $tree;

    # Post-process the extracted tree in place: strip font attributes,
    # then turn links into text if the user asked for that.
    $self->_remove_fonts($tree);
    $self->_optionally_deactivate_links($tree);

    return $tree;
}
sub _remove_fonts
{
    my ($self, $tree) = @_;

    # Font attributes rarely behave as intended on eReaders: colours
    # make no sense on a monochrome display and the requested font
    # families are unlikely to be installed. Clear every attribute on
    # each <font> element found in the tree; the elements themselves
    # are left where they are.
    my @font_elements = $tree->look_down(_tag => "font");

    foreach my $element (@font_elements)
    {
        foreach my $attr_name ($element->all_external_attr_names)
        {
            # Setting an attribute to undef is how it gets cleared.
            $element->attr($attr_name, undef);
        }
    }
}
sub _optionally_deactivate_links
{
my ($self, $tree) = @_;
# Turn links into text if option was requested.
my $option = App::Zapzi::UserConfig::get('deactivate_links');
t/05-transform.t view on Meta::CPAN
# Try an HTML file with two titles and leading/trailing whitespace.
# The fixture is fetched, run through the transformer, and the title
# picked by the transform is expected to be exactly 'Title 1'.
# NOTE(review): presumably 'Title 1' is one of the two titles with the
# surrounding whitespace trimmed - confirm against the fixture.
$f = App::Zapzi::FetchArticle->new(
source => 't/testfiles/html-two-titles.html');
ok( $f->fetch, 'Fetch HTML with two title tags' );
$tx = App::Zapzi::Transform->new(raw_article => $f);
isa_ok( $tx, 'App::Zapzi::Transform' );
ok( $tx->to_readable, 'Transform sample HTML file' );
is( $tx->title, 'Title 1',
'Title selected from HTML extract with two title tags');
# Try an HTML file with embedded font tags.
# The fixture uses 'yellow' as a font colour attribute value, so that
# word must not appear anywhere in the readable text once the font
# attributes have been removed.
$f = App::Zapzi::FetchArticle->new(
source => 't/testfiles/html-font.html');
ok( $f->fetch, 'Fetch HTML with font tags' );
$tx = App::Zapzi::Transform->new(raw_article => $f);
isa_ok( $tx, 'App::Zapzi::Transform' );
ok( $tx->to_readable, 'Transform sample HTML file with font tags' );
unlike( $tx->readable_text, qr/yellow/,
'Font attributes removed from HTML');
# Try an HTML file with links.
# Only the fetch and transform steps are checked in the lines below.
# NOTE(review): assertions on the transformed links presumably follow
# this excerpt - confirm in the full test file.
$f = App::Zapzi::FetchArticle->new(
source => 't/testfiles/html-links.html');
ok( $f->fetch, 'Fetch HTML with links' );
$tx = App::Zapzi::Transform->new(raw_article => $f);
isa_ok( $tx, 'App::Zapzi::Transform' );
ok( $tx->to_readable, 'Transform sample HTML file with links' );
t/testfiles/html-font.html view on Meta::CPAN
<title>Font tests </title>
<meta name="description" content="A testing document for App::Zapzi">
<meta name="author" content="Rupert Lane">
</head>
<body>
<h1>Lorem ipsum</h1>
<p>
<font color="red">Lorem ipsum dolor sit amet, consectetur
adipisicing elit, sed do eiusmod tempor incididunt ut labore et
dolore magna aliqua.</font>
<font color="yellow">Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea
commodo consequat. Duis aute irure dolor in reprehenderit in
voluptate velit esse cillum dolore eu fugiat nulla pariatur.</font>
Excepteur sint occaecat cupidatat non proident, sunt in culpa
qui officia deserunt mollit anim id est laborum.</p>
</body>
</html>
( run in 0.736 second using v1.01-cache-2.11-cpan-5735350b133 )