MsOffice-Word-Surgeon
view release on metacpan or search on metacpan
lib/MsOffice/Word/Surgeon/PackagePart.pm view on Meta::CPAN
package MsOffice::Word::Surgeon::PackagePart;
use 5.24.0;
use Moose;
use MooseX::StrictConstructor;
use MsOffice::Word::Surgeon::Carp;
use MsOffice::Word::Surgeon::Utils qw(maybe_preserve_spaces is_at_run_level parse_attrs decode_entities encode_entities);
use MsOffice::Word::Surgeon::Run;
use MsOffice::Word::Surgeon::Text;
use MsOffice::Word::Surgeon::Field;
use MsOffice::Word::Surgeon::BookmarkBoundary;
use XML::LibXML ();;
use List::Util qw(max);
use match::simple qw(match);
# Syntactic sugar for declaring "inner" attributes, i.e. attributes that are
# computed by the module instead of being received through the constructor.
#
#   has_inner $name => %options;
#
# is a shorthand for
#
#   has $name => (%options, lazy => 1, builder => "_$name", init_arg => undef);
#
# The ($@) prototype is kept so that call sites parse exactly as before.
# Returns whatever has() returns.
sub has_inner ($@) {
  my ($attr_name, @options) = @_;

  return has(
    $attr_name => @options,
    lazy       => 1,                # value is computed on first access ...
    builder    => "_$attr_name",    # ... by the method named "_<attribute>"
    init_arg   => undef,            # never accepted as a constructor argument
  );
}
# constant integers to specify indentation modes -- see L<XML::LibXML>
use constant XML_NO_INDENT => 0;
use constant XML_SIMPLE_INDENT => 1;
use namespace::clean -except => 'meta';
#======================================================================
# ATTRIBUTES
#======================================================================
# ---- attributes passed to the constructor ----

# the MsOffice::Word::Surgeon object; mandatory, stored as a weak reference
has 'surgeon' => (
  is       => 'ro',
  isa      => 'MsOffice::Word::Surgeon',
  required => 1,
  weak_ref => 1,
);

# mandatory string naming this part
has 'part_name' => (
  is       => 'ro',
  isa      => 'Str',
  required => 1,
);

# ---- attributes constructed by the module -- not received through the constructor ----
# (declared through has_inner: lazy, built by the method "_<attribute name>")

# read-write string; the _on_new_contents trigger is attached to it
has_inner 'contents' => (
  is      => 'rw',
  isa     => 'Str',
  trigger => \&_on_new_contents,
);

# array reference; can be reset through the clear_runs() method
has_inner 'runs' => (
  is      => 'ro',
  isa     => 'ArrayRef',
  clearer => 'clear_runs',
);

has_inner 'relationships' => (
  is  => 'ro',
  isa => 'ArrayRef',
);

has_inner 'images' => (
  is  => 'ro',
  isa => 'HashRef',
);

# boolean flags declared with is => 'bare' (no accessor methods), initially false
has 'contents_has_changed' => (
  is      => 'bare',
  isa     => 'Bool',
  default => 0,
);

has 'was_cleaned_up' => (
  is      => 'bare',
  isa     => 'Bool',
  default => 0,
);
#======================================================================
# GLOBAL VARIABLES
#======================================================================
# Regexes matching XML fragments that carry no interesting information and
# can therefore be removed. Keys are the names of the noise-reduction rules.
my %noise_reduction_regexes = (
  proof_checking      => qr(<w:(?:proofErr[^>]+|noProof/)>),
  revision_ids        => qr(\sw:rsid\w+="[^"]+"),
  complex_script_bold => qr(<w:bCs/>),
  page_breaks         => qr(<w:lastRenderedPageBreak/>),
  language            => qr(<w:lang w:val="[^/>]+/>),
  empty_run_props     => qr(<w:rPr></w:rPr>),
  soft_hyphens        => qr(<w:softHyphen/>),
);

# Names of the rules above, as an explicitly ordered list
# (a hash alone would not preserve any order).
my @noise_reduction_list = qw(
  proof_checking
  revision_ids
  complex_script_bold
  page_breaks
  language
  empty_run_props
  soft_hyphens
);
#======================================================================
# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
#======================================================================
sub _runs {
my $self = shift;
state $run_regex = qr[
<w:r> # opening tag for the run
(?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
(.*?) # run contents -- capture in $2
</w:r> # closing tag for the run
]x;
state $txt_regex = qr[
<w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
(.*?) # text contents -- capture in $1
</w:t> # closing tag for text
]x;
# split XML content into run fragments
my $contents = $self->contents;
my @run_fragments = split m[$run_regex], $contents, -1; # -1 : don't strip trailing items
my @runs;
# build internal RUN objects
RUN:
while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
$run_contents //= '';
# split XML of this run into text fragments
my @txt_fragments = split m[$txt_regex], $run_contents, -1; # -1 : don't strip trailing items
my @texts;
# build internal TEXT objects
TXT:
while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
( run in 2.281 seconds using v1.01-cache-2.11-cpan-8f98c5d2c55 )