MSWord-ToHTML

 view release on metacpan or  search on metacpan

lib/MSWord/ToHTML/Roles/HasHTML.pm  view on Meta::CPAN

# Role that supplies the HTML-related attributes (parsers, writer, CSS
# helper) and the conversion methods defined further down in this file.
package MSWord::ToHTML::Roles::HasHTML;
{
  # Package version, set in its own bare block.
  $MSWord::ToHTML::Roles::HasHTML::VERSION = '0.010';
}

use Moose::Role;
use namespace::autoclean;
use strictures 1;
use MSWord::ToHTML::HTML;
use MooseX::Method::Signatures;
use Digest::SHA1 qw/sha1_hex/;
use XML::LibXML;
use XML::LibXSLT;
use IO::All;
use Try::Tiny;
use autodie;
use HTML::HTML5::Writer;
use HTML::TreeBuilder;
use HTML::Entities;
use Encode;
use Encode::Guess;
use CSS;
use List::MoreUtils qw/any/;
use Path::Class::File;
use Cwd;
use feature 'say';

# DocBook -> XHTML 1.1 stylesheet document, loaded on first access through
# this object's own XML parser. The location is a remote URL, so the first
# access asks the parser to load it from there.
has style => (
    is      => 'ro',
    lazy    => 1,
    default => sub {
        my ($self) = @_;
        my $xml_parser = $self->parser;
        return $xml_parser->load_xml(
            location =>
                'http://docbook.sourceforge.net/release/xsl/current/xhtml-1_1/docbook.xsl',
            no_cdata => 1,
        );
    },
);

# XML::LibXML parser instance, created on first use.
has parser => (
    is      => 'ro',
    isa     => 'XML::LibXML',
    lazy    => 1,
    default => sub { return XML::LibXML->new },
);
# XML::LibXSLT processor instance, created on first use.
has transformer => (
    is      => 'ro',
    isa     => 'XML::LibXSLT',
    lazy    => 1,
    default => sub { return XML::LibXSLT->new },
);

# HTML::HTML5::Writer configured with markup => 'html', created on first use.
has writer => (
    is      => 'ro',
    isa     => 'HTML::HTML5::Writer',
    lazy    => 1,
    default => sub {
        my %writer_options = ( markup => 'html' );
        return HTML::HTML5::Writer->new(%writer_options);
    },
);

# CSS object, created on first use, wired to the CSS::Parse::Heavy parser
# and the CSS::Adaptor::Debug adaptor.
has css => (
    is      => 'ro',
    isa     => 'CSS',
    lazy    => 1,
    default => sub {
        my $css_options = {
            parser  => 'CSS::Parse::Heavy',
            adaptor => 'CSS::Adaptor::Debug',
        };
        return CSS->new($css_options);
    },
);

# HTML::HTML5::Parser instance, created on first use.
has 'html5_parser' => (
    is      => 'ro',
    isa     => 'HTML::HTML5::Parser',
    lazy    => 1,
    default => sub {
        # Nothing in this file's import list loads HTML::HTML5::Parser, so
        # load it on demand instead of relying on another module having
        # pulled it in; without this, ->new can fail with "Can't locate
        # object method".
        require HTML::HTML5::Parser;
        return HTML::HTML5::Parser->new;
    },
);

# Entry point: forwards its arguments to extract_base_html (defined outside
# this view) and hands the result to html_to_html5, returning whatever that
# produces.
method get_html {
    # Keep the scalar assignment so extract_base_html is called in scalar
    # context before its result is passed on.
    my $raw_html = $self->extract_base_html(@_);
    return $self->html_to_html5($raw_html);
}

# Parse $file with this object's XML::LibXML parser and return the result
# of parse_file.
method get_dom($file) {
    my $xml_parser = $self->parser;
    return $xml_parser->parse_file($file);
}

lib/MSWord/ToHTML/Roles/HasHTML.pm  view on Meta::CPAN

    }

    if ($parsed_style) {
      my @italic_selectors = grep { $_ }
        map {
        $_->selectors =~ /^(?<tag>\w+\.)(?<class>\w+)$/;
        $+{class};
        }
        grep { $_->properties =~ /italic/ } @$parsed_style;
      my @bold_selectors = grep { $_ }
        map {
        $_->selectors =~ /^(?<tag>\w+\.)(?<class>\w+)$/;
        $+{class};
        }
        grep { $_->properties =~ /bold/ } @$parsed_style;
      my %bolds          = map  { $_ => 1 } @bold_selectors;
      my @both_selectors = grep { defined $bolds{$_} } @italic_selectors;
      my %array_for      = (
        both   => \@both_selectors,
        bold   => \@bold_selectors,
        italic => \@italic_selectors
      );

      for my $type (qw/both bold italic/) {
        for my $selector ( @{ $array_for{$type} } ) {
          if ($selector) {
            my @to_filter = $tree->look_down( 'class', $selector );
            if ( @to_filter > 0 ) {
              for my $el (@to_filter) {
                if ( $type eq 'both' ) {
                  my $new_bold   = HTML::Element->new('strong');
                  my $new_italic = HTML::Element->new('em');
                  $new_bold->push_content( $el->detach_content );
                  $new_italic->push_content($new_bold);
                  $el->replace_with($new_italic);
                }
                else {
                  my $new =
                    $type eq 'bold'
                    ? HTML::Element->new('strong')
                    : HTML::Element->new('em');
                  $new->push_content( $el->detach_content );
                  $el->replace_with($new);
                }
              }
            }
          }
        }
      }
    }
    return $tree;
}

# Clean up the HTML file $base_html and re-serialise it as HTML5.
#
# Steps:
#   1. Run /usr/bin/tidy over $base_html in place (-m), logging tidy's
#      messages to "$base_html.err". Tidying is best-effort: problems are
#      reported with warn() and processing continues.
#   2. Derive a human-readable title and a lower-case, underscore-separated
#      output basename from $self->file->filename.
#   3. Pass the tidied file through post_clean_html, parse the result, and
#      serialise it with the HTML5 writer into "<basename>.html" in the
#      temporary directory.
#
# Parameters:
#   $base_html - IO::All object for the HTML file to convert.
#
# Returns:
#   A MSWord::ToHTML::HTML object for the written file; its `images`
#   argument is set only when "<base_html>_files" exists on disk.
method html_to_html5( IO::All $base_html) {
    try {
        my $status = system(
            "/usr/bin/tidy",                 "-f",
            "$base_html.err",                "-m",
            "-clean",                        "-quiet",
            "--preserve-entities",           "yes",
            "--indent-cdata",                "yes",
            "--escape-cdata",                "yes",
            "--repeated-attributes",         "keep-last",
            "--char-encoding",               "utf8",
            "--output-encoding",             "utf8",
            "--merge-spans",                 "yes",
            "--bare",                        "yes",
            "--logical-emphasis",            "yes",
            "--word-2000",                   "yes",
            "--drop-empty-paras",            "yes",
            "--drop-font-tags",              "yes",
            "--drop-proprietary-attributes", "yes",
            "--hide-endtags",                "no",
            "-language",                     "en",
            "--add-xml-decl",                "yes",
            "--output-xhtml",                "yes",
            "--tidy-mark",                   "no",
            "--doctype",                     "strict",
            "$base_html"
        );

        # system() signals failure through its return value, not by dying,
        # so inspect it here. Tidy exits with 1 for mere warnings (routine
        # for Word output) and 2 for errors.
        if ( $status == -1 ) {
            warn "I could not run tidy on $base_html: $!\n";
        }
        elsif ( ( $status >> 8 ) >= 2 ) {
            warn "tidy reported errors for $base_html; see $base_html.err\n";
        }
    }
    catch {
        # Report the problem instead of discarding the message; tidying
        # stays best-effort, so we carry on with the untidied file.
        warn "I could not tidy the base_html: $_";
    };

    # Title: collapse whitespace, drop parentheses and hyphens, strip the
    # extension, then turn remaining punctuation into spaces.
    ( my $title = $self->file->filename ) =~ s/\s+/ /g;
    $title =~ s/\(|\)|\-//g;
    # Only read %+ after a successful match; otherwise it would reflect
    # some earlier, unrelated match.
    my $new_title =
        $title =~ /\A(?<filename>.+?)(?<extension>\.\w+)\z/
        ? $+{filename}
        : undef;
    $new_title ||= 'Untitled';
    $new_title =~ s/[[:punct:]]/ /g;

    # Output basename: same idea, but lower-cased and with underscores.
    ( my $filename = lc $self->file->filename ) =~ s/\s+/_/g;
    $filename =~ s/\(|\)|\-//g;
    # A name without an extension keeps its whole cleaned form rather than
    # becoming undef (which would be fatal in the substitution below).
    if ( $filename =~ /\A(?<filename>.+?)(?<extension>\.\w+)\z/ ) {
        $filename = $+{filename};
    }
    $filename =~ s/[[:punct:]]/_/g;

    $base_html = io("$base_html")->utf8;
    my $cleaned_html = $self->post_clean_html( $base_html, $new_title );
    my $new_dom      = $self->parser->parse_html_fh( io("$cleaned_html") );
    my $html5        = $self->writer->document($new_dom);

    my $html5_file = io->catfile( io->tmpdir, $filename . '.html' )->utf8->print($html5);
    # Companion "<name>_files" directory next to the source HTML, if any.
    my $html5_images = "$base_html" . "_files";
    my $new_filename = $html5_file->file;
    return MSWord::ToHTML::HTML->new(
      file => "$new_filename",
      ( -e $html5_images ? ( images => $html5_images ) : () )
    );
}

1;



( run in 1.101 second using v1.01-cache-2.11-cpan-5a3173703d6 )