App-optex-textconv
view release on metacpan or search on metacpan
lib/App/optex/textconv/ooxml/regex.pm view on Meta::CPAN
package App::optex::textconv::ooxml::regex;
our $VERSION = '1.07';
use v5.14;
use warnings;
use Carp;
use utf8;
use Encode;
use Data::Dumper;
use App::optex v0.3;
use App::optex::textconv::Converter 'import';
our @EXPORT_OK = qw(to_text get_list);
our @CONVERTER = (
[ qr/\.doc[xm]$/ => \&to_text ],
[ qr/\.ppt[xm]$/ => \&to_text ],
[ qr/\.xls[xm]$/ => \&to_text ],
);
sub xml2text {
local $_ = shift;
my $type = shift;
my $xml_re = qr/<\?xml\b[^>]*\?>\s*/;
return $_ unless /$xml_re/;
my @xml = grep { length } split /$xml_re/;
my @text = map { _xml2text($_, $type) } @xml;
join "\n", @text;
}
my %param = (
docx => { space => 2, separator => "" },
docm => { space => 2, separator => "" },
xlsx => { space => 1, separator => "\t" },
xlsm => { space => 1, separator => "\t" },
pptx => { space => 1, separator => "" },
pptm => { space => 1, separator => "" },
);
my $replace_reference = do {
my %hash = qw( amp & lt < gt > );
my @keys = keys %hash;
my $re = do { local $" = '|'; qr/&(@keys);/ };
sub { s/$re/$hash{$1}/g }
};
sub _xml2text {
local $_ = shift;
my $type = shift;
my $param = $param{$type} or die;
my @p;
my $fn_id = "";
while (m{
(?<footnote> <w:footnote \s+ w:id="(?<fn_id>\d+)" )
| <(?<tag>[apw]:p|si)\b[^>]*>(?<para>.*?)</\g{tag}>
}xsg)
{
if ($+{footnote}) {
$fn_id = $+{fn_id};
next;
}
my $para = $+{para};
my @s;
while ($para =~ m{
(?<fn_ref> <w:footnoteReference \s+ w:id="(?<fn_id>\d+)" )
| (?<footnote> <w:footnote \s+ w:id="(?<fn_id>\d+)" )
| (?<footnoteRef> <w:footnoteRef/> )
| (?<br> <[aw]:br/> )
| (?<tab> <w:tab/> | <w:tabs> )
| <(?<tag>(?:[apw]:)?t)\b[^>]*> (?<text>[^<]*?) </\g{tag}>
}xsg)
{
if ($+{fn_ref}) { push @s, "[^$+{fn_id}]" }
elsif ($+{footnote}) { $fn_id = $+{fn_id} }
elsif ($+{footnoteRef}) { push @s, "[^$fn_id]:" }
elsif ($+{br}) { push @s, "\n" }
elsif ($+{tab}) { push @s, " " }
elsif ($+{text} ne '') { push @s, $+{text} }
}
@s or next;
push @p, join($param->{separator}, @s) . ("\n" x $param->{space});
}
my $text = join '', @p;
$replace_reference->() for $text;
$text;
}
use Archive::Zip 1.37 qw( :ERROR_CODES :CONSTANTS );
sub to_text {
my $file = shift;
my $type = ($file =~ /\.((?:doc|xls|ppt)[xm])$/)[0] or return;
return '' if -z $file;
my $zip = Archive::Zip->new($file) or die;
my @contents;
for my $entry (get_list($zip, $type)) {
my $member = $zip->memberNamed($entry) or next;
my $xml = $member->contents or next;
my $text = xml2text $xml, $type or next;
$file = encode 'utf8', $file if utf8::is_utf8($file);
push @contents, "[ \"$file\" $entry ]\n\n$text";
}
join "\n", @contents;
}
sub get_list {
my($zip, $type) = @_;
if ($type =~ /^doc[xm]$/) {
map { "word/$_.xml" } qw(document endnotes footnotes);
}
elsif ($type =~ /^xls[xm]$/) {
map { "xl/$_.xml" } qw(sharedStrings);
}
elsif ($type =~ /^ppt[xm]$/) {
map { $_->[0] }
sort { $a->[1] <=> $b->[1] }
map { m{(ppt/slides/slide(\d+)\.xml)$} ? [ $1, $2 ] : () }
$zip->memberNames;
}
}
1;
( run in 0.815 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )