XML-AutoWriter

 view release on metacpan or  search on metacpan

lib/XML/ValidWriter.pm  view on Meta::CPAN

      croak "Function '$sym' not exported by '$pkg' or " . __PACKAGE__ ;
   }
}


my %escapees ;
$escapees{'&'}   = '&'  ;
$escapees{'<'}   = '&lt;'   ;
$escapees{'>'}   = '&gt;'   ;
$escapees{']>'}  = ']&gt;'  ;
$escapees{']]>'} = ']]&gt;' ;
$escapees{'"'}   = '&quot;' ;
$escapees{"'"}   = '&apos;' ;

# Takes a list, returns a list: don't use in scalar context.
sub _esc {
   croak "_esc used in scalar context" unless wantarray ;
   my $text ;
   return map {
      $text = $_ ;
      if ( $text =~ /([\x00-\x08\x0B\x0C\x0E-\x1F])/ ) {
	 croak sprintf(
	    "Illegal character 0x%02d (^%s) sent",
	    ord $1,
	    chr( ord( "A" ) + ord( $1 ) - 1 )
	 )
      }
      $text =~ s{([&<]|^>|^\]>|\]\]>)}{$escapees{$1}}eg ;
      $text ;
   } @_ ;
}


sub _esc1 {
   my $text = shift ;
   if ( $text =~ /([\x00-\x08\x0B\x0C\x0E-\x1F])/ ) {
      croak sprintf(
         "Invalid character 0x%02d (^%s) sent",
         ord $1,
	 chr( ord( "A" ) + ord( $1 ) - 1 )
      )
   }
   $text =~ s{([&<]|^>|^\]>|\]\]>)}{$escapees{$1}}eg ;
   return $text ;
}

sub _attr_esc1 {
   my $text = shift ;
   if ( $text =~ /([\x00-\x08\x0B\x0C\x0E-\x1F])/ ) {
      croak sprintf(
         "Invalid character 0x%02d (^%s) sent",
         ord $1,
	 chr( ord( "A" ) + ord( $1 ) - 1 )
      )
   }
   $text =~ s{([&<"'])}{$escapees{$1}}eg ;
   return $text ;
}


sub _esc_cdata_ends {
   ## This could be very memory hungry, but alas...
   my $text = join( '', @_ ) ;
   if ( $text =~ /([\x00-\x08\x0B\x0C\x0E-\x1F])/ ) {
      croak sprintf(
         "Invalid character 0x%02d (^%s) sent",
         ord $1,
	 chr( ord( "A" ) + ord( $1 ) - 1 )
      )
   }
   $text =~ s{\]\]>}{]]]]><![CDATA[>}g ;
   return $text ;
}


=item characters

   characters( "escaped text", "& more" ) ;
   $writer->characters( "escaped text", "& more" ) ;

Emits character data.  Character data will be escaped before output, by either
transforming 'E<lt>' and '&' to &lt; and &amp;, or by enclosing in a
'C<E<lt>![CDATA[...]]E<gt>>' bracket, depending on which will be more
human-readable, according to the module.

=cut

sub characters {
   my XML::ValidWriter $self = &_self ;
   my $to = $self->{OUTPUT} || select ;

   croak "Can't emit characters before the root element"
      if ! defined $self->{EMITTED_ROOT} ;

   my $stack = $self->{STACK} ;
   croak "Can't emit characters outside of the root element"
      unless @$stack ;

   my XML::VWElement $end_elt = $stack->[-1];
   my $open_elt = $self->getDoctype->element_decl( $end_elt->{NAME} ) ;

   croak "Element '$open_elt->{NAME}' can't contain #PCDATA"
      unless ! $open_elt || $open_elt->can_contain_pcdata ;

   croak "Undefined value passed to characters() in <$open_elt->{NAME}>"
      if grep ! defined $_, @_ ;

   my $length ;
   my $decide_cdata = $self->{STRAGGLERS} eq '>' ;
   my $in_cdata_mode ;

   if ( $decide_cdata ) {
      my $escs = 0 ;
      my $cdata_ends = 0 ;
      my $cdata_escs = 0 ;
      my $pos ;

      ## I assume that splitting CDATA ends between chunks is very
      ## rare.  If an app does that a lot, then this could guess 'wrong'
      ## and use CDATA escapes in a situation where they result in more
      ## bytes out than <& escaping would.
      for ( @_ ) {
	 $escs += tr/<&// ;
	 $pos = 0 ;
	 ++$cdata_ends while ( $pos = index $_, ']]>', $pos + 3 ) >= 0 ;
	 $cdata_escs += tr/\x00-\x08\x0b\x0c\x0e-\x1f// ;
	 $length += length $_ ;
      }
      ## Each &lt; or &amp; is 4 or 5 chars.
      ## Each ]]]]><![CDATA[< is 15.
      ## Each ]]>&#xN;<![CDATA[ is 17 or 18.
      ## We ## add 12 since <![CDATA[]]> is 12 chars.
      $in_cdata_mode = 4.5*$escs > 15*$cdata_ends + 17.75*$cdata_escs + 12 ;
   }
   else {
      $in_cdata_mode = $self->{STRAGGLERS} eq ']]>' ;
      $length += length $_ for @_ ;
   }

   return unless $length ;

   ## I chose to stay in or out of CDATA mode for an element
   ## in order to keep document structure relatively simple...to keep human
   ## readers from getting confused between escaping modes.
   ## This may lead to degeneracy if it's an (SG|X)ML document being emitted in
   ## an element, so this may change.
   if ( $in_cdata_mode ) {
      if ( $self->{STRAGGLERS} eq ']]>' ) {
	 ## Don't emit ']]><![CDATA[' between consecutive CDATA character
	 ## chunks.
         $self->{STRAGGLERS} = '' ;
      }
      else {
	 $self->{STRAGGLERS} .= '<![CDATA['
      }
      if ( ref $to eq 'SCALAR' ) {
	 $$to = join( '',
	    $$to,
	    $self->{STRAGGLERS},
	    _esc_cdata_ends( $self->{CDATA_END_PART}, @_ )
	 ) ;

	 $self->{CDATA_END_PART} = 
	    $$to =~ s/(\]\]?)(?!\n)\Z//
	       ? $1
	       : '' ;

      }
      else {
	 no strict 'refs' ;

	 my $chunk = _esc_cdata_ends( $self->{CDATA_END_PART}, @_ ) ;
	 $self->{CDATA_END_PART} = 
	    $chunk =~ s/(\]\]?)(?!\n)\Z//
	       ? $1
	       : '' ;

	 print $to $self->{STRAGGLERS}, $chunk
	    or croak "$! writing chars in <$open_elt->{NAME}>" ;

      }

      $self->{STRAGGLERS} = ']]>' ;
   }
   else {
      if ( ref $to eq 'SCALAR' ) {
	 $$to .= $self->{STRAGGLERS} ;
	 $$to .= _esc1( join( '', @_ ) ) ;
      }
      else {
	 no strict 'refs' ;
	 print $to $self->{STRAGGLERS}, _esc( @_ )
	    or croak "$! writing chars in <$open_elt->{NAME}>" ;
      }
      $self->{STRAGGLERS} = '' ;
#      $self->{CDATA_END_PART} = '' ;
   }

   $stack->[-1]->add_content( '#PCDATA' )
      if @{$stack} ;

   $self->{WAS_END_TAG} = 0 ;

   return ;
}


=item dataElement

   $writer->dataElement( $tag ) ;
   $writer->dataElement( $tag, $content ) ;
   $writer->dataElement( $tag, $content, attr1 => $val1, ... ) ;
   dataElement( $tag ) ;
   dataElement( $tag, $content ) ;
   dataElement( $tag, $content, attr1 => $val1, ... ) ;

Does the equivalent to

   ## Split the optional args in to attributes and elements arrays.
   $writer->startTag( $tag, @attributes ) ;
   $writer->characters( $content ) ;
   $writer->endTag( $tag ) ;

This function is exportable as dataElement(), and is also exported
for each element 'foo' found in the DTD as foo().

=cut

sub dataElement {
   my XML::ValidWriter $self = shift ;

   my ( $tag ) = shift ;

lib/XML/ValidWriter.pm  view on Meta::CPAN

sub getDataMode {
   my XML::ValidWriter $self = shift ;

   return $self->{DATA_MODE} ;
}


=item getDoctype

   $dtd = getDoctype ;
   $dtd = $writer->getDoctype ;

This is used to get the writer's XML::Doctype object.

=cut

sub getDoctype {
   my XML::ValidWriter $self = &_self ;
   return $self->{DOCTYPE} ;
}

=item getOutput

   $fh = getOutput ;
   $fh = $writer->getOutput ;

Gets the filehandle an XML::ValidWriter sends output to.

=cut

sub getOutput {
   my XML::ValidWriter $self = &_self ;
   return $self->{OUTPUT} ;
}


=item rawCharacters

   rawCharacters( "<unescaped text>", "& more text" ) ;
   $writer->rawCharacters( "<unescaped text>", "& more text" ) ;

This allows you to emit raw text without any escape processing.  The text
is not examined for tags, so you can invalidate your document and even
corrupt it's well-formedness.

=cut

## This is called everywhere to emit raw characters *except* characters(),
## which must go direct because it uses STRAGGLERS and CDATA_END_PART
## differently.
sub rawCharacters {
   my XML::ValidWriter $self = &_self ;

   my $to= $self->{OUTPUT} || select ;

   return unless grep length $_, @_ ;

   if ( ref $to eq 'SCALAR' ) {
      $$to .= join(
         '',
         _esc_cdata_ends( $self->{CDATA_END_PART} ),
	 $self->{STRAGGLERS},
	 @_
      ) ;
      $self->{AT_BOL} = substr( $$to, -1, 1 ) eq "\n" ;
   }
   else {
      no strict 'refs' ;

      for ( my $i = $#_ ; $i >= 0 ; --$i ) {
         next unless length $_[$i] ;
	 $self->{AT_BOL} = substr( $_[$i], -1, 1 ) eq "\n" ;
	 last ;
      }

      print $to
         _esc_cdata_ends( $self->{CDATA_END_PART} ),
         $self->{STRAGGLERS},
	 @_ or croak $!;
   }
   $self->{CDATA_END_PART} = '' ;
   $self->{STRAGGLERS} = '' ;
}


=item reset

   $writer->reset ;        # Not a function!

Resets a writer to be initialized, but not have emitted anything.

This is useful if you need to abort output, but want to reuse the
XML::ValidWriter.

=cut

sub reset {
   my XML::ValidWriter $self = shift ;
   $self->{STACK} = [] ;

   # If we should warn, clear the flag that says we checked it & vice versa
   $self->{CHECKED_XML_DECL} = ! $self->{SHOULD_WARN} ;

   ## I'd use assignement to a slice here, but older perls...
   $self->{IS_STANDALONE}   = 0 ;
   $self->{EMITTED_DOCTYPE} = undef ;
   $self->{EMITTED_ROOT}    = undef ;
   $self->{EMITTED_XML}     = undef ;

   $self->{AT_BOL}          = 1 ;
   $self->{WAS_END_TAG}     = 1 ;
   $self->{STRAGGLERS}      = '' ;
   $self->{CDATA_END_PART} = '' ;

   if ( defined $self->{FILE_NAME} ) {
      if ( defined $self->{OUTPUT} ) {
	 close $self->{OUTPUT} or croak "$! closing '$self->{FILE_NAME}'." ;
      }
      else {
	 require Symbol ;
	 $self->{OUTPUT} = Symbol::gensym() ;
      }
      eval "use Fcntl ; 1" or croak $@ ;
      open(
	 $self->{OUTPUT},
	 ">$self->{FILE_NAME}",
      ) 
	  or croak "$!: $self->{FILE_NAME}" ;
   }

   return ;
}



=item setDataMode



( run in 0.684 second using v1.01-cache-2.11-cpan-39bf76dae61 )