MsOffice-Word-Surgeon
view release on metacpan or search on metacpan
lib/MsOffice/Word/Surgeon/PackagePart.pm view on Meta::CPAN
use namespace::clean -except => 'meta';
#======================================================================
# ATTRIBUTES
#======================================================================
# attributes passed to the constructor
has 'surgeon' => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
has 'part_name' => (is => 'ro', isa => 'Str', required => 1);
# attributes constructed by the module -- not received through the constructor
has_inner 'contents' => (is => 'rw', isa => 'Str', trigger => \&_on_new_contents);
has_inner 'runs' => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
has_inner 'relationships' => (is => 'ro', isa => 'ArrayRef');
has_inner 'images' => (is => 'ro', isa => 'HashRef');
has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
has 'was_cleaned_up' => (is => 'bare', isa => 'Bool', default => 0);
#======================================================================
# GLOBAL VARIABLES
#======================================================================
# Various regexes for removing uninteresting XML information
my %noise_reduction_regexes = (
proof_checking => qr(<w:(?:proofErr[^>]+|noProof/)>),
revision_ids => qr(\sw:rsid\w+="[^"]+"),
complex_script_bold => qr(<w:bCs/>),
page_breaks => qr(<w:lastRenderedPageBreak/>),
language => qr(<w:lang w:val="[^/>]+/>),
empty_run_props => qr(<w:rPr></w:rPr>),
soft_hyphens => qr(<w:softHyphen/>),
);
my @noise_reduction_list = qw/proof_checking revision_ids
complex_script_bold page_breaks language
empty_run_props soft_hyphens/;
#======================================================================
# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
#======================================================================
sub _runs {
my $self = shift;
state $run_regex = qr[
<w:r> # opening tag for the run
(?:<w:rPr>(.*?)</w:rPr>)? # run properties -- capture in $1
(.*?) # run contents -- capture in $2
</w:r> # closing tag for the run
]x;
state $txt_regex = qr[
<w:t(?:\ xml:space="preserve")?> # opening tag for the text contents
(.*?) # text contents -- capture in $1
</w:t> # closing tag for text
]x;
# split XML content into run fragments
my $contents = $self->contents;
my @run_fragments = split m[$run_regex], $contents, -1; # -1 : don't strip trailing items
my @runs;
# build internal RUN objects
RUN:
while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
$run_contents //= '';
# split XML of this run into text fragmentsn
my @txt_fragments = split m[$txt_regex], $run_contents, -1; # -1 : don't strip trailing items
my @texts;
# build internal TEXT objects
TXT:
while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
next TXT if !$xml_before_text && !length($txt_contents);
$_ //= '' for $xml_before_text, $txt_contents;
decode_entities($txt_contents);
push @texts, MsOffice::Word::Surgeon::Text->new(xml_before => $xml_before_text,
literal_text => $txt_contents);
}
# assemble TEXT objects into a RUN object
next RUN if !$xml_before_run && !@texts;
$_ //= '' for $xml_before_run, $props;
push @runs, MsOffice::Word::Surgeon::Run->new(xml_before => $xml_before_run,
props => $props,
inner_texts => \@texts);
}
return \@runs;
}
sub _relationships {
my $self = shift;
# xml that describes the relationships for this package part
my $rel_xml = $self->_rels_xml;
# parse the relationships and assemble into a sparse array indexed by relationship ids
my @relationships;
while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
my %attrs = parse_attrs($1);
$attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
($attrs{num} = $attrs{Id}) =~ s[^\D+][];
($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
$relationships[$attrs{num}] = \%attrs;
}
return \@relationships;
}
sub _images {
my $self = shift;
# get relationship ids associated with images
my %rel_image = map {$_->{Id} => $_->{Target}}
grep {$_ && $_->{short_type} eq 'image'}
$self->relationships->@*;
# get titles and relationship ids of images found within the part contents
my %image;
my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
DRAWING:
foreach my $drawing (@drawings) {
if ($drawing =~ m[<wp:docPr \s+ (.*?) />
.*?
<a:blip \s+ r:embed="(\w+)"]x) {
( run in 0.658 second using v1.01-cache-2.11-cpan-71847e10f99 )