MsOffice-Word-Surgeon

 view release on metacpan or  search on metacpan

lib/MsOffice/Word/Surgeon/PackagePart.pm  view on Meta::CPAN

use namespace::clean -except => 'meta';

#======================================================================
# ATTRIBUTES
#======================================================================

# attributes passed to the constructor
has       'surgeon'        => (is => 'ro', isa => 'MsOffice::Word::Surgeon', required => 1, weak_ref => 1);
has       'part_name'      => (is => 'ro', isa => 'Str',                     required => 1);

# attributes constructed by the module -- not received through the constructor
has_inner 'contents'       => (is => 'rw', isa => 'Str',      trigger => \&_on_new_contents);
has_inner 'runs'           => (is => 'ro', isa => 'ArrayRef', clearer => 'clear_runs');
has_inner 'relationships'  => (is => 'ro', isa => 'ArrayRef');
has_inner 'images'         => (is => 'ro', isa => 'HashRef');

has 'contents_has_changed' => (is => 'bare', isa => 'Bool', default => 0);
has 'was_cleaned_up'       => (is => 'bare', isa => 'Bool', default => 0);

#======================================================================
# GLOBAL VARIABLES
#======================================================================

# Various regexes for removing uninteresting XML information
my %noise_reduction_regexes = (
  proof_checking         => qr(<w:(?:proofErr[^>]+|noProof/)>),
  revision_ids           => qr(\sw:rsid\w+="[^"]+"),
  complex_script_bold    => qr(<w:bCs/>),
  page_breaks            => qr(<w:lastRenderedPageBreak/>),
  language               => qr(<w:lang w:val="[^/>]+/>),
  empty_run_props        => qr(<w:rPr></w:rPr>),
  soft_hyphens           => qr(<w:softHyphen/>),
 );
my @noise_reduction_list = qw/proof_checking revision_ids
                              complex_script_bold page_breaks language
                              empty_run_props soft_hyphens/;

#======================================================================
# LAZY ATTRIBUTE CONSTRUCTORS AND TRIGGERS
#======================================================================


sub _runs {
  my $self = shift;

  state $run_regex = qr[
    <w:r>                             # opening tag for the run
    (?:<w:rPr>(.*?)</w:rPr>)?         # run properties -- capture in $1
    (.*?)                             # run contents -- capture in $2
    </w:r>                            # closing tag for the run
  ]x;

  state $txt_regex = qr[
    <w:t(?:\ xml:space="preserve")?>  # opening tag for the text contents
    (.*?)                             # text contents -- capture in $1
    </w:t>                            # closing tag for text
  ]x;

  # split XML content into run fragments
  my $contents      = $self->contents;
  my @run_fragments = split m[$run_regex], $contents, -1; # -1 : don't strip trailing items
  my @runs;

  # build internal RUN objects
 RUN:
  while (my ($xml_before_run, $props, $run_contents) = splice @run_fragments, 0, 3) {
    $run_contents //= '';

    # split XML of this run into text fragmentsn
    my @txt_fragments = split m[$txt_regex], $run_contents, -1; # -1 : don't strip trailing items
    my @texts;

    # build internal TEXT objects
  TXT:
    while (my ($xml_before_text, $txt_contents) = splice @txt_fragments, 0, 2) {
      next TXT if !$xml_before_text && !length($txt_contents);
      $_ //= '' for $xml_before_text, $txt_contents;
      decode_entities($txt_contents);
      push @texts, MsOffice::Word::Surgeon::Text->new(xml_before   => $xml_before_text,
                                                      literal_text => $txt_contents);
    }

    # assemble TEXT objects into a RUN object
    next RUN if !$xml_before_run && !@texts;
    $_ //= '' for $xml_before_run, $props;
    push @runs, MsOffice::Word::Surgeon::Run->new(xml_before  => $xml_before_run,
                                                  props       => $props,
                                                  inner_texts => \@texts);
  }

  return \@runs;
}


sub _relationships {
  my $self = shift;

  # xml that describes the relationships for this package part
  my $rel_xml = $self->_rels_xml;

  # parse the relationships and assemble into a sparse array indexed by relationship ids
  my @relationships;
  while ($rel_xml =~ m[<Relationship\s+(.*?)/>]g) {
    my %attrs = parse_attrs($1);
    $attrs{$_} or croak "missing attribute '$_' in <Relationship> node" for qw/Id Type Target/;
    ($attrs{num}        = $attrs{Id})  =~ s[^\D+][];
    ($attrs{short_type} = $attrs{Type}) =~ s[^.*/][];
    $relationships[$attrs{num}] = \%attrs;
  }

  return \@relationships;
}


sub _images {
  my $self = shift;

  # get relationship ids associated with images
  my %rel_image  = map  {$_->{Id} => $_->{Target}}
                   grep {$_ && $_->{short_type} eq 'image'}
                   $self->relationships->@*;

  # get titles and relationship ids of images found within the part contents
  my %image;
  my @drawings = $self->contents =~ m[<w:drawing>(.*?)</w:drawing>]g;
 DRAWING:
  foreach my $drawing (@drawings) {
    if ($drawing =~ m[<wp:docPr \s+ (.*?) />
                      .*?
                      <a:blip \s+ r:embed="(\w+)"]x) {



( run in 0.658 second using v1.01-cache-2.11-cpan-71847e10f99 )