MSWord-ToHTML
view release on metacpan or search on metacpan
lib/MSWord/ToHTML/Roles/HasHTML.pm view on Meta::CPAN
package MSWord::ToHTML::Roles::HasHTML;
{
$MSWord::ToHTML::Roles::HasHTML::VERSION = '0.010';
}
use Moose::Role;
use namespace::autoclean;
use strictures 1;
use MSWord::ToHTML::HTML;
use MooseX::Method::Signatures;
use Digest::SHA1 qw/sha1_hex/;
use XML::LibXML;
use XML::LibXSLT;
use IO::All;
use Try::Tiny;
use autodie;
use HTML::HTML5::Writer;
use HTML::TreeBuilder;
use HTML::Entities;
use Encode;
use Encode::Guess;
use CSS;
use List::MoreUtils qw/any/;
use Path::Class::File;
use Cwd;
use feature 'say';
# Read-only, lazily built attribute: the XML document obtained by asking the
# role's own `parser` to load the DocBook XHTML 1.1 stylesheet URL.
has style => (
    is      => 'ro',
    lazy    => 1,
    default => sub {
        my ($self) = @_;

        # Options are kept in a list so they reach load_xml in a fixed order.
        my @load_options = (
            location =>
                'http://docbook.sourceforge.net/release/xsl/current/xhtml-1_1/docbook.xsl',
            no_cdata => 1,
        );
        return $self->parser->load_xml(@load_options);
    },
);
# Read-only XML::LibXML instance, constructed with default settings the
# first time the attribute is read.
has parser => (
    isa     => 'XML::LibXML',
    is      => 'ro',
    lazy    => 1,
    default => sub { return XML::LibXML->new(); },
);
# Read-only XML::LibXSLT instance, constructed with default settings the
# first time the attribute is read.
has transformer => (
    isa     => 'XML::LibXSLT',
    is      => 'ro',
    lazy    => 1,
    default => sub { return XML::LibXSLT->new(); },
);
# Read-only HTML::HTML5::Writer, built on first use with markup => 'html'.
# html_to_html5 uses it to serialise the re-parsed DOM.
has writer => (
    isa     => 'HTML::HTML5::Writer',
    is      => 'ro',
    lazy    => 1,
    default => sub {
        my @writer_options = ( markup => 'html' );
        return HTML::HTML5::Writer->new(@writer_options);
    },
);
# Read-only CSS object, built on first use and configured with the
# CSS::Parse::Heavy parser and the CSS::Adaptor::Debug adaptor.
has css => (
    isa     => 'CSS',
    is      => 'ro',
    lazy    => 1,
    default => sub {
        # A fresh hash per call, handed to the constructor by reference.
        my %css_options = (
            'parser' => 'CSS::Parse::Heavy',
            adaptor  => 'CSS::Adaptor::Debug',
        );
        return CSS->new( \%css_options );
    },
);
# Read-only HTML::HTML5::Parser instance, constructed the first time the
# attribute is read.
has 'html5_parser' => (
    is      => 'ro',
    isa     => 'HTML::HTML5::Parser',
    lazy    => 1,
    default => sub {
        # None of the `use` lines visible in this file load
        # HTML::HTML5::Parser (only HTML::HTML5::Writer is imported), so
        # load it on demand; otherwise ->new fails with "Can't locate
        # object method" unless some other module happened to load it.
        # `require` is a no-op when the module is already loaded.
        require HTML::HTML5::Parser;
        return HTML::HTML5::Parser->new;
    },
);
# Two-step conversion entry point: every argument of the call is forwarded
# to extract_base_html, and its (scalar) result is then passed through
# html_to_html5, whose return value is handed back to the caller.
method get_html {
    my $extracted = $self->extract_base_html(@_);

    return $self->html_to_html5($extracted);
}
# Hand $file to the role's XML::LibXML parser and return whatever its
# parse_file method produces.
method get_dom($file) {
    my $xml_parser = $self->parser;

    return $xml_parser->parse_file($file);
}
lib/MSWord/ToHTML/Roles/HasHTML.pm view on Meta::CPAN
}
if ($parsed_style) {
my @italic_selectors = grep { $_ }
map {
$_->selectors =~ /^(?<tag>\w+\.)(?<class>\w+)$/;
$+{class};
}
grep { $_->properties =~ /italic/ } @$parsed_style;
my @bold_selectors = grep { $_ }
map {
$_->selectors =~ /^(?<tag>\w+\.)(?<class>\w+)$/;
$+{class};
}
grep { $_->properties =~ /bold/ } @$parsed_style;
my %bolds = map { $_ => 1 } @bold_selectors;
my @both_selectors = grep { defined $bolds{$_} } @italic_selectors;
my %array_for = (
both => \@both_selectors,
bold => \@bold_selectors,
italic => \@italic_selectors
);
for my $type (qw/both bold italic/) {
for my $selector ( @{ $array_for{$type} } ) {
if ($selector) {
my @to_filter = $tree->look_down( 'class', $selector );
if ( @to_filter > 0 ) {
for my $el (@to_filter) {
if ( $type eq 'both' ) {
my $new_bold = HTML::Element->new('strong');
my $new_italic = HTML::Element->new('em');
$new_bold->push_content( $el->detach_content );
$new_italic->push_content($new_bold);
$el->replace_with($new_italic);
}
else {
my $new =
$type eq 'bold'
? HTML::Element->new('strong')
: HTML::Element->new('em');
$new->push_content( $el->detach_content );
$el->replace_with($new);
}
}
}
}
}
}
}
return $tree;
}
# Tidy the base HTML file in place, post-clean it, re-parse it and write the
# result out as "<name>.html" in the temporary directory.
#
# Parameters:
#   $base_html - IO::All object for the base HTML file; its stringified form
#                is used as the path given to tidy and as the prefix of the
#                "<path>.err" log and the "<path>_files" images location.
#
# Returns:
#   A MSWord::ToHTML::HTML object whose `file` is the path of the newly
#   written HTML file and whose `images` is "<path>_files" when that path
#   exists.
#
# Tidy is best-effort: a failure to run it is reported with warn() and the
# conversion carries on with whatever is in the file.
method html_to_html5( IO::All $base_html) {
    my @tidy_command = (
        "/usr/bin/tidy",                 "-f",
        "$base_html.err",                "-m",
        "-clean",                        "-quiet",
        "--preserve-entities",           "yes",
        "--indent-cdata",                "yes",
        "--escape-cdata",                "yes",
        "--repeated-attributes",         "keep-last",
        "--char-encoding",               "utf8",
        "--output-encoding",             "utf8",
        "--merge-spans",                 "yes",
        "--bare",                        "yes",
        "--logical-emphasis",            "yes",
        "--word-2000",                   "yes",
        "--drop-empty-paras",            "yes",
        "--drop-font-tags",              "yes",
        "--drop-proprietary-attributes", "yes",
        "--hide-endtags",                "no",
        "-language",                     "en",
        "--add-xml-decl",                "yes",
        "--output-xhtml",                "yes",
        "--tidy-mark",                   "no",
        "--doctype",                     "strict",
        "$base_html"
    );

    # system() reports failure through its return value rather than by
    # throwing (plain `use autodie` does not wrap it), so the status has to
    # be inspected explicitly. Tidy exits with 1 when it only had warnings,
    # which is routine for Word output, so only higher codes are reported.
    my $wait_status = system(@tidy_command);
    if ( $wait_status == -1 ) {
        warn "I could not tidy the base_html: $!\n";
    }
    elsif ( $wait_status & 127 ) {
        warn sprintf(
            "I could not tidy the base_html: tidy died with signal %d\n",
            $wait_status & 127 );
    }
    elsif ( ( $wait_status >> 8 ) > 1 ) {
        warn sprintf(
            "I could not tidy the base_html: tidy exited with status %d\n",
            $wait_status >> 8 );
    }

    # Splits "name.ext" into its two parts; shared by title and file name.
    my $stem_pattern = qr/\A(?<filename>.+?)(?<extension>\.\w+)\z/;

    # Human-readable title: collapse whitespace, drop parens and hyphens,
    # strip the extension, turn remaining punctuation into spaces.
    ( my $title = $self->file->filename ) =~ s/\s+/ /g;
    $title =~ s/\(|\)|\-//g;

    # Only read %+ when this very match succeeded, so a capture left over
    # from some earlier match can never leak into the title.
    my $new_title = ( $title =~ $stem_pattern && $+{filename} ) || 'Untitled';
    $new_title =~ s/[[:punct:]]/ /g;

    # Output file name: same cleaning, but lower-cased and with underscores.
    ( my $filename = lc $self->file->filename ) =~ s/\s+/_/g;
    $filename =~ s/\(|\)|\-//g;

    # A name without an extension does not match; keep the whole cleaned
    # name in that case instead of ending up with an undefined value.
    if ( $filename =~ $stem_pattern ) {
        $filename = $+{filename};
    }
    $filename =~ s/[[:punct:]]/_/g;
    $filename = 'untitled' unless length $filename;

    # Re-open the (now tidied) file as UTF-8 and run the post-clean step.
    $base_html = io("$base_html")->utf8;
    my $cleaned_html = $self->post_clean_html( $base_html, $new_title );

    # Parse the cleaned HTML and serialise the DOM with the HTML5 writer.
    my $new_dom = $self->parser->parse_html_fh( io("$cleaned_html") );
    my $html5   = $self->writer->document($new_dom);

    my $html5_file =
        io->catfile( io->tmpdir, $filename . '.html' )->utf8->print($html5);
    my $html5_images = "$base_html" . "_files";
    my $new_filename = $html5_file->file;

    return MSWord::ToHTML::HTML->new(
        file => "$new_filename",
        ( -e $html5_images ? ( images => $html5_images ) : () )
    );
}
1;
( run in 1.101 second using v1.01-cache-2.11-cpan-5a3173703d6 )