XML-LibXML

 view release on metacpan or  search on metacpan

LibXML.pm  view on Meta::CPAN

{
  my ($runtime_version) = LIBXML_RUNTIME_VERSION() =~ /^(\d+)/;
  if ( $runtime_version < LIBXML_VERSION ) {
    warn "Warning: XML::LibXML compiled against libxml2 ".LIBXML_VERSION.
      ", but runtime libxml2 is older $runtime_version\n";
  }
}


#-------------------------------------------------------------------------#
# parser flags                                                            #
#-------------------------------------------------------------------------#

# Copied directly from http://xmlsoft.org/html/libxml-parser.html#xmlParserOption
use constant {
  XML_PARSE_RECOVER	  => 1,	       # recover on errors
  XML_PARSE_NOENT	  => 2,	       # substitute entities
  XML_PARSE_DTDLOAD	  => 4,	       # load the external subset
  XML_PARSE_DTDATTR	  => 8,	       # default DTD attributes
  XML_PARSE_DTDVALID	  => 16,       # validate with the DTD
  XML_PARSE_NOERROR	  => 32,       # suppress error reports
  XML_PARSE_NOWARNING	  => 64,       # suppress warning reports
  XML_PARSE_PEDANTIC	  => 128,      # pedantic error reporting
  XML_PARSE_NOBLANKS	  => 256,      # remove blank nodes
  XML_PARSE_SAX1	  => 512,      # use the SAX1 interface internally
  XML_PARSE_XINCLUDE	  => 1024,     # Implement XInclude substitution
  XML_PARSE_NONET	  => 2048,     # Forbid network access
  XML_PARSE_NODICT	  => 4096,     # Do not reuse the context dictionary
  XML_PARSE_NSCLEAN	  => 8192,     # remove redundant namespaces declarations
  XML_PARSE_NOCDATA	  => 16384,    # merge CDATA as text nodes
  XML_PARSE_NOXINCNODE	  => 32768,    # do not generate XINCLUDE START/END nodes
  XML_PARSE_COMPACT	  => 65536,    # compact small text nodes; no modification of the tree allowed afterwards
                                       # (will possibly crash if you try to modify the tree)
  XML_PARSE_OLD10	  => 131072,   # parse using XML-1.0 before update 5
  XML_PARSE_NOBASEFIX	  => 262144,   # do not fixup XINCLUDE xml#base uris
  XML_PARSE_HUGE	  => 524288,   # relax any hardcoded limit from the parser
  XML_PARSE_OLDSAX	  => 1048576,  # parse using SAX2 interface from before 2.7.0
  HTML_PARSE_RECOVER => (1<<0),       # suppress error reports
  HTML_PARSE_NOERROR  => (1<<5),       # suppress error reports
};

$XML_LIBXML_PARSE_DEFAULTS = ( XML_PARSE_NODICT );

# this hash is made global so that applications can add names for new
# libxml2 parser flags as temporary workaround

%PARSER_FLAGS = (
  recover		 => XML_PARSE_RECOVER,
  expand_entities	 => XML_PARSE_NOENT,
  load_ext_dtd	         => XML_PARSE_DTDLOAD,
  complete_attributes	 => XML_PARSE_DTDATTR,
  validation		 => XML_PARSE_DTDVALID,
  suppress_errors	 => XML_PARSE_NOERROR,
  suppress_warnings	 => XML_PARSE_NOWARNING,
  pedantic_parser	 => XML_PARSE_PEDANTIC,
  no_blanks		 => XML_PARSE_NOBLANKS,
  expand_xinclude	 => XML_PARSE_XINCLUDE,
  xinclude		 => XML_PARSE_XINCLUDE,
  no_network		 => XML_PARSE_NONET,
  clean_namespaces	 => XML_PARSE_NSCLEAN,
  no_cdata		 => XML_PARSE_NOCDATA,
  no_xinclude_nodes	 => XML_PARSE_NOXINCNODE,
  old10		         => XML_PARSE_OLD10,
  no_base_fix		 => XML_PARSE_NOBASEFIX,
  huge		         => XML_PARSE_HUGE,
  oldsax		 => XML_PARSE_OLDSAX,
);

my %OUR_FLAGS = (
  recover => 'XML_LIBXML_RECOVER',
  line_numbers => 'XML_LIBXML_LINENUMBERS',
  URI => 'XML_LIBXML_BASE_URI',
  base_uri => 'XML_LIBXML_BASE_URI',
  gdome => 'XML_LIBXML_GDOME',
  ext_ent_handler => 'ext_ent_handler',
);

sub _parser_options {
  my ($self, $opts) = @_;

  # currently dictionaries break XML::LibXML memory management

  my $flags;

  if (ref($self)) {
    $flags = ($self->{XML_LIBXML_PARSER_OPTIONS}||0);
  } else {
    $flags = $XML_LIBXML_PARSE_DEFAULTS;		# safety precaution
  }

  my ($key, $value);
  while (($key,$value) = each %$opts) {
    my $f = $PARSER_FLAGS{ $key };
    if (defined $f) {
      if ($value) {
	$flags |= $f
      } else {
	$flags &= ~$f;
      }
    } elsif ($key eq 'set_parser_flags') { # this can be used to pass flags XML::LibXML does not yet know about
      $flags |= $value;
    } elsif ($key eq 'unset_parser_flags') {
      $flags &= ~$value;
    }

  }
  return $flags;
}

my %compatibility_flags = (
  XML_LIBXML_VALIDATION => 'validation',
  XML_LIBXML_EXPAND_ENTITIES => 'expand_entities',
  XML_LIBXML_PEDANTIC => 'pedantic_parser',
  XML_LIBXML_NONET => 'no_network',
  XML_LIBXML_EXT_DTD => 'load_ext_dtd',
  XML_LIBXML_COMPLETE_ATTR => 'complete_attributes',
  XML_LIBXML_EXPAND_XINCLUDE => 'expand_xinclude',
  XML_LIBXML_NSCLEAN => 'clean_namespaces',
  XML_LIBXML_KEEP_BLANKS => 'keep_blanks',
  XML_LIBXML_LINENUMBERS => 'line_numbers',
);

LibXML.pm  view on Meta::CPAN

}

# perl style
sub process_xincludes {
    my $self = shift;
    my $doc = shift;
    my $opts = shift;
    my $options = $self->_parser_options($opts);

    my $rv;
    $self->_init_callbacks();
    eval {
        $rv = $self->_processXIncludes($doc || " ", $options);
    };
    my $err = $@;
    $self->_cleanup_callbacks();
    if ( $err ) {
        chomp $err unless ref $err;
        croak $@;
    }
    return $rv;
}

#-------------------------------------------------------------------------#
# HTML parsing functions                                                  #
#-------------------------------------------------------------------------#

sub _html_options {
  my ($self,$opts)=@_;
  $opts = {} unless ref $opts;
  #  return (undef,undef) unless ref $opts;
  my $flags = 0;
  {
    my $recover = exists $opts->{recover} ? $opts->{recover} : $self->recover;

    if ($recover)
    {
      $flags |= HTML_PARSE_RECOVER;
      if ($recover == 2)
      {
        $flags |= HTML_PARSE_NOERROR;
      }
    }
  }

  $flags |=     4 if $opts->{no_defdtd}; # default is ON: injects DTD as needed
  $flags |=    32 if exists $opts->{suppress_errors} ? $opts->{suppress_errors} : $self->get_option('suppress_errors');
  # This is to fix https://rt.cpan.org/Ticket/Display.html?id=58024 :
  # <quote>
  # In XML::LibXML, warnings are not suppressed when specifying the recover
  # or recover_silently flags as per the following excerpt from the manpage:
  # </quote>
  if ($self->recover_silently)
  {
      $flags |= 32;
  }
  $flags |=    64 if $opts->{suppress_warnings};
  $flags |=   128 if exists $opts->{pedantic_parser} ? $opts->{pedantic_parser} : $self->pedantic_parser;
  $flags |=   256 if exists $opts->{no_blanks} ? $opts->{no_blanks} : !$self->keep_blanks;
  $flags |=  2048 if exists $opts->{no_network} ? $opts->{no_network} : !$self->no_network;
  $flags |= 16384 if $opts->{no_cdata};
  $flags |= 65536 if $opts->{compact}; # compact small text nodes; no modification
                                         # of the tree allowed afterwards
                                         # (WILL possibly CRASH IF YOU try to MODIFY THE TREE)
  $flags |= 524288 if $opts->{huge}; # relax any hardcoded limit from the parser
  $flags |= 1048576 if $opts->{oldsax}; # parse using SAX2 interface from before 2.7.0

  return ($opts->{URI},$opts->{encoding},$flags);
}

sub parse_html_string {
    my ($self,$str,$opts) = @_;
    croak("parse_html_string is not a class method! Create a parser object with XML::LibXML->new first!") unless ref $self;
    croak("parse already in progress") if $self->{_State_};

    unless ( defined $str and length $str ) {
        croak("Empty String");
    }
    $self->{_State_} = 1;
    my $result;

    $self->_init_callbacks();
    eval {
      $result = $self->_parse_html_string( $str,
					   $self->_html_options($opts)
					  );
    };
    my $err = $@;
    $self->{_State_} = 0;
    if ($err) {
      chomp $err unless ref $err;
      $self->_cleanup_callbacks();
      croak $err;
    }

    $self->_cleanup_callbacks();

    return $result;
}

sub parse_html_file {
    my ($self,$file,$opts) = @_;
    croak("parse_html_file is not a class method! Create a parser object with XML::LibXML->new first!") unless ref $self;
    croak("parse already in progress") if $self->{_State_};
    $self->{_State_} = 1;
    my $result;

    $self->_init_callbacks();
    eval { $result = $self->_parse_html_file($file,
					     $self->_html_options($opts)
					    ); };
    my $err = $@;
    $self->{_State_} = 0;
    if ($err) {
      chomp $err unless ref $err;
      $self->_cleanup_callbacks();
      croak $err;
    }

    $self->_cleanup_callbacks();



( run in 1.582 second using v1.01-cache-2.11-cpan-39bf76dae61 )