App-Greple-msdoc
view release on metacpan or search on metacpan
lib/App/Greple/msdoc.pm view on Meta::CPAN
# Put every tag and every run of character data on a line of its own.
#
# Works in place on $_ and ignores its arguments.  After each '>' the
# following text (everything up to the next '<') is replaced by that text
# surrounded by newlines, or by a single newline when there is no text.
#
# The emptiness test uses length() rather than truth, so a text node that
# is just "0" (e.g. <v>0</v>) is kept instead of being dropped.
#
# Returns the result of the substitution (the number of replacements made).
sub separate_xml {
    s{ (?<=>) ([^<]*) }{ length($1) ? "\n$1\n" : "\n" }gex;
}
# Re-format the XML held in $_ (modified in place) so that tags are written
# one per line, indented by nesting depth.
#
# Arguments are name/value pairs.  The value stored under the FILELABEL key
# is required (the sub dies without it) and is matched against the suffix
# patterns below to pick the elements that are kept on a single line
# together with their content.
#
# Returns the result of the substitution.
sub indent_xml {
    my %arg = @_;
    my $file = delete $arg{&FILELABEL} or die;

    # Suffix pattern => elements whose open tag, content and close tag
    # stay together on one line.
    my @inline_rules = (
        [ qr/\.doc[xm]$/, [ qw(w:t w:delText w:instrText wp:posOffset) ] ],
        [ qr/\.ppt[xm]$/, [ qw(a:t) ] ],
        [ qr/\.xls[xm]$/, [ qw(t v f formula1) ] ],
    );
    my %nonewline;
    for my $rule (@inline_rules) {
        my($suffix, $tags) = @$rule;
        next if $file !~ $suffix;
        $nonewline{$_} = 1 for @$tags;
    }

    my $level = 0;
    s{
        (?<mark>
          (?<single>
            < (?<tag>[\w:]+) [^>]* />
          )
          |
          (?<open>
            < (?<tag>[\w:]+) [^>]* (?<!/) >
          )
          |
          (?<close>
            < / (?<tag>[\w:]+) >
          )
        )
    }{
        my $opening = $+{open};
        my $closing = $+{close};
        my($lead, $trail);
        if (not $+{single} and $nonewline{$+{tag}}) {
            # Inline element: indent only before the open tag and break
            # the line only after the close tag.
            $lead  = $opening ? $indent_mark x $level : "";
            $trail = $closing ? "\n" : "";
        }
        else {
            # A close tag is written one level shallower; an open tag is
            # written at the current level and deepens what follows.
            $level-- if $closing;
            $lead = $indent_mark x $level;
            $level++ if $opening;
            $trail = "\n";
        }
        $lead . $+{mark} . $trail;
    }gex;
}
use Archive::Zip;
use App::optex::textconv::msdoc qw(to_text get_list);
# Dispatch table of XML formatters, keyed by format name.
# extract_content looks the selected format up here and, when an entry
# exists, calls it with $_ aliased to the XML text so that the text is
# rewritten in place.
my %formatter = (
'indent-xml' => \&indent_xml,
'separate-xml' => \&separate_xml,
);
# Replace STDIN with the extracted content of an MS Office document.
#
# Arguments are name/value pairs:
#   FILELABEL key - name of the document file (required)
#   format        - 'text', or a name ending in 'xml'; defaults to
#                   $default_format
#
# The sub forks with open(STDIN, '-|'): in the parent, STDIN then reads
# whatever the child prints, and the child's pid is returned.  The child
# never returns; it prints the content and exits, or dies with a message.
#
# Dies when no file name is given, when the name does not end in
# docx/docm/xlsx/xlsm/pptx/pptm, when the archive cannot be read, or when
# the format is not recognized.  Croaks when the fork fails.
sub extract_content {
    my %arg = @_;
    my $file = $arg{&FILELABEL} or die "extract_content: no file name given\n";
    my $type = ($file =~ /\.((?:doc|xls|ppt)[xm])$/)[0]
        or die "$file: unsupported file type\n";
    my $pid = open(STDIN, '-|') // croak "process fork failed: $!";
    binmode STDIN, ':encoding(utf8)';
    if ($pid) {
        # Parent: STDIN is now the read end of the pipe from the child.
        return $pid;
    }

    # Child process from here on.
    my $format = $arg{format} // $default_format;
    if ($format eq 'text') {
        # NOTE(review): to_text presumably returns UTF-8 encoded bytes,
        # since the result is decoded here - confirm against its module.
        print decode 'utf8', to_text($file);
        exit;
    }
    elsif ($format =~ /xml$/) {
        # Archive::Zip->new returns undef when the file cannot be read;
        # report that instead of failing later on an undefined object.
        my $zip = Archive::Zip->new($file)
            or die "$file: cannot read as a zip archive\n";
        my @xml;
        for my $entry (get_list($zip, $type)) {
            my $member = $zip->memberNamed($entry) or next;
            my $xml = $member->contents or next;
            push @xml, $xml;
        }
        my $xml = decode 'utf8', join "\n", @xml;
        if (my $sub = $formatter{$format}) {
            # "for" aliases $_ to $xml so the formatter edits it in place.
            $sub->(&FILELABEL => $file) for $xml;
        }
        print $xml;
        exit;
    }
    die "$format: unknown format\n";
}
1;
__DATA__
help default ignore
help --space Number of newlines after paragraph
help --separator Separator between strings
help --indent Indent XML data
help --indent-mark Specify text for indentation
help --type Specify document type (docx, pptx, xlsx)
help --dump Print entire data
help --msdoc-format ignore
option default \
--if '/\.(doc|ppt|xls)[xm]$/:&__PACKAGE__::extract_content'
builtin space=i $opt_space
builtin separator=s $opt_separator
builtin type=s $opt_type
builtin msdoc-format=s $default_format
define (#delText) <w:delText>.*?</w:delText>
##
## --indent, --indent-mark
##
option --indent --msdoc-format=indent-xml
builtin indent-mark=s $indent_mark
##
## --dump
##
option --dump --le &sub{} --need 0 --all --exit=0
option --indent-fold \
( run in 2.132 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )