App-Greple-msdoc

 view release on metacpan or  search on metacpan

lib/App/Greple/msdoc.pm  view on Meta::CPAN

#
# Put every tag and every text segment of the XML held in $_ on a line
# of its own.  Works in place on $_ and takes no arguments.
#
# A newline is inserted after each ">"; any text that follows the tag
# (up to the next "<") is kept and terminated by another newline.
# Returns the result of the substitution operator.
#
sub separate_xml {
    # Test the captured text with length(), not for truth: a text node
    # consisting of "0" is false in boolean context and would be lost.
    s{ (?<=>) ([^<]*) }{ length($1) ? "\n$1\n" : "\n" }gex;
}

#
# Re-format the XML held in $_ in place, writing tags one per line and
# prefixing them with $indent_mark repeated once per nesting level.
#
# Arguments are key/value pairs; the FILELABEL entry (required) gives
# the file name, whose suffix selects the set of elements that are kept
# on a single line together with their content.
#
# Returns the result of the substitution operator.
#
sub indent_xml {
    my %arg = @_;
    my $file = delete $arg{&FILELABEL} or die;

    # File-name pattern => elements printed without an inner line break.
    my @inline_rules = (
	[ qr/\.doc[xm]$/, [ qw(w:t w:delText w:instrText wp:posOffset) ] ],
	[ qr/\.ppt[xm]$/, [ qw(a:t) ] ],
	[ qr/\.xls[xm]$/, [ qw(t v f formula1) ] ],
    );
    my %nonewline;
    for my $rule (@inline_rules) {
	my($suffix_re, $tags) = @$rule;
	next unless $file =~ $suffix_re;
	$nonewline{$_} = 1 for @$tags;
    }

    my $level = 0;

    # For an inline element the opening tag is indented and the closing
    # tag ends the line; the nesting level is left alone.  Any other tag
    # gets a line to itself: a closing tag steps the level down before it
    # is printed, an opening tag steps it up afterwards.
    s{
	(?<mark>
	  (?<single>
	    < (?<tag>[\w:]+) [^>]* />
	  )
	  |
	  (?<open>
	    < (?<tag>[\w:]+) [^>]* (?<!/) >
	  )
	  |
	  (?<close>
	    < / (?<tag>[\w:]+) >
	  )
	)
    }{
	my $text = $+{mark};
	my($lead, $trail);
	if (!$+{single} && $nonewline{$+{tag}}) {
	    $lead  = $+{open}  ? $indent_mark x $level : "";
	    $trail = $+{close} ? "\n" : "";
	}
	else {
	    $level-- if $+{close};
	    $lead  = $indent_mark x $level;
	    $level++ if $+{open};
	    $trail = "\n";
	}
	$lead . $text . $trail;
    }gex;
}

use Archive::Zip;
use App::optex::textconv::msdoc qw(to_text get_list);

# Dispatch table from an XML format name to the routine that reformats
# the XML text.  extract_content looks the selected format up here and,
# when an entry exists, calls it with the XML aliased to $_.
my %formatter = (
    'indent-xml'   => \&indent_xml,
    'separate-xml' => \&separate_xml,
    );

#
# Replace STDIN with a pipe from a forked child process that prints the
# content of a .docx/.docm, .xlsx/.xlsm or .pptx/.pptm file.
#
# Arguments are key/value pairs:
#   FILELABEL  name of the document file (required)
#   format     output format; defaults to $default_format
#
# Formats:
#   text       plain text produced by to_text()
#   *xml       XML of the archive members listed by get_list(), joined
#              with newlines and passed through the matching %formatter
#              entry, if there is one
#
# In the parent process, returns the child's process id.  The child
# never returns: it prints to STDOUT and exits, or dies on an unreadable
# archive or an unknown format.
#
sub extract_content {
    my %arg = @_;
    my $file = $arg{&FILELABEL} or die "extract_content: no file name given";
    my $type = ($file =~ /\.((?:doc|xls|ppt)[xm])$/)[0]
	or die "$file: unsupported file name suffix";
    # open with '-|' forks: the parent reads the child's STDOUT via STDIN.
    my $pid = open(STDIN, '-|') // croak "process fork failed: $!";
    binmode STDIN, ':encoding(utf8)';
    if ($pid) {
	return $pid;
    }
    # Everything below runs in the child only.
    my $format = $arg{format} // $default_format;
    if ($format eq 'text') {
	print decode 'utf8', to_text($file);
	exit;
    } elsif ($format =~ /xml$/) {
	# Archive::Zip->new returns undef when the file cannot be read;
	# report that instead of failing later on an undefined object.
	my $zip = Archive::Zip->new($file)
	    or die "$file: cannot read as a zip archive";
	my @xml;
	for my $entry (get_list($zip, $type)) {
	    my $member = $zip->memberNamed($entry) or next;
	    my $xml = $member->contents or next;
	    push @xml, $xml;
	}
	my $xml = decode 'utf8', join "\n", @xml;
	if (my $sub = $formatter{$format}) {
	    # "for" aliases $_ to $xml so the formatter edits it in place.
	    $sub->(&FILELABEL => $file) for $xml;
	}
	print $xml;
	exit;
    }
    die "$format: unknown format";
}

1;

__DATA__

help	default		ignore
help	--space		Number of newlines after paragraph
help	--separator	Separator between strings
help	--indent	Indent XML data
help	--indent-mark	Specify text for indentation
help	--type		Specify document type (docx, pptx, xlsx)
help	--dump		Print entire data
help	--msdoc-format	ignore

option default \
	--if '/\.(doc|ppt|xls)[xm]$/:&__PACKAGE__::extract_content'

builtin space=i $opt_space
builtin separator=s $opt_separator
builtin type=s $opt_type
builtin msdoc-format=s $default_format

define (#delText) <w:delText>.*?</w:delText>

##
## --indent, --indent-mark
##
option --indent --msdoc-format=indent-xml
builtin indent-mark=s $indent_mark

##
## --dump
##
option --dump --le &sub{} --need 0 --all --exit=0

option --indent-fold \



( run in 2.132 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )