XML-LibXML
view release on metacpan or search on metacpan
{
my ($runtime_version) = LIBXML_RUNTIME_VERSION() =~ /^(\d+)/;
if ( $runtime_version < LIBXML_VERSION ) {
warn "Warning: XML::LibXML compiled against libxml2 ".LIBXML_VERSION.
", but runtime libxml2 is older $runtime_version\n";
}
}
#-------------------------------------------------------------------------#
# parser flags #
#-------------------------------------------------------------------------#
# Copied directly from http://xmlsoft.org/html/libxml-parser.html#xmlParserOption
use constant {
XML_PARSE_RECOVER => 1, # recover on errors
XML_PARSE_NOENT => 2, # substitute entities
XML_PARSE_DTDLOAD => 4, # load the external subset
XML_PARSE_DTDATTR => 8, # default DTD attributes
XML_PARSE_DTDVALID => 16, # validate with the DTD
XML_PARSE_NOERROR => 32, # suppress error reports
XML_PARSE_NOWARNING => 64, # suppress warning reports
XML_PARSE_PEDANTIC => 128, # pedantic error reporting
XML_PARSE_NOBLANKS => 256, # remove blank nodes
XML_PARSE_SAX1 => 512, # use the SAX1 interface internally
XML_PARSE_XINCLUDE => 1024, # Implement XInclude substitution
XML_PARSE_NONET => 2048, # Forbid network access
XML_PARSE_NODICT => 4096, # Do not reuse the context dictionary
XML_PARSE_NSCLEAN => 8192, # remove redundant namespaces declarations
XML_PARSE_NOCDATA => 16384, # merge CDATA as text nodes
XML_PARSE_NOXINCNODE => 32768, # do not generate XINCLUDE START/END nodes
XML_PARSE_COMPACT => 65536, # compact small text nodes; no modification of the tree allowed afterwards
# (will possibly crash if you try to modify the tree)
XML_PARSE_OLD10 => 131072, # parse using XML-1.0 before update 5
XML_PARSE_NOBASEFIX => 262144, # do not fixup XINCLUDE xml#base uris
XML_PARSE_HUGE => 524288, # relax any hardcoded limit from the parser
XML_PARSE_OLDSAX => 1048576, # parse using SAX2 interface from before 2.7.0
HTML_PARSE_RECOVER => (1<<0), # suppress error reports
HTML_PARSE_NOERROR => (1<<5), # suppress error reports
};
$XML_LIBXML_PARSE_DEFAULTS = ( XML_PARSE_NODICT );
# this hash is made global so that applications can add names for new
# libxml2 parser flags as temporary workaround
%PARSER_FLAGS = (
recover => XML_PARSE_RECOVER,
expand_entities => XML_PARSE_NOENT,
load_ext_dtd => XML_PARSE_DTDLOAD,
complete_attributes => XML_PARSE_DTDATTR,
validation => XML_PARSE_DTDVALID,
suppress_errors => XML_PARSE_NOERROR,
suppress_warnings => XML_PARSE_NOWARNING,
pedantic_parser => XML_PARSE_PEDANTIC,
no_blanks => XML_PARSE_NOBLANKS,
expand_xinclude => XML_PARSE_XINCLUDE,
xinclude => XML_PARSE_XINCLUDE,
no_network => XML_PARSE_NONET,
clean_namespaces => XML_PARSE_NSCLEAN,
no_cdata => XML_PARSE_NOCDATA,
no_xinclude_nodes => XML_PARSE_NOXINCNODE,
old10 => XML_PARSE_OLD10,
no_base_fix => XML_PARSE_NOBASEFIX,
huge => XML_PARSE_HUGE,
oldsax => XML_PARSE_OLDSAX,
);
my %OUR_FLAGS = (
recover => 'XML_LIBXML_RECOVER',
line_numbers => 'XML_LIBXML_LINENUMBERS',
URI => 'XML_LIBXML_BASE_URI',
base_uri => 'XML_LIBXML_BASE_URI',
gdome => 'XML_LIBXML_GDOME',
ext_ent_handler => 'ext_ent_handler',
);
sub _parser_options {
my ($self, $opts) = @_;
# currently dictionaries break XML::LibXML memory management
my $flags;
if (ref($self)) {
$flags = ($self->{XML_LIBXML_PARSER_OPTIONS}||0);
} else {
$flags = $XML_LIBXML_PARSE_DEFAULTS; # safety precaution
}
my ($key, $value);
while (($key,$value) = each %$opts) {
my $f = $PARSER_FLAGS{ $key };
if (defined $f) {
if ($value) {
$flags |= $f
} else {
$flags &= ~$f;
}
} elsif ($key eq 'set_parser_flags') { # this can be used to pass flags XML::LibXML does not yet know about
$flags |= $value;
} elsif ($key eq 'unset_parser_flags') {
$flags &= ~$value;
}
}
return $flags;
}
my %compatibility_flags = (
XML_LIBXML_VALIDATION => 'validation',
XML_LIBXML_EXPAND_ENTITIES => 'expand_entities',
XML_LIBXML_PEDANTIC => 'pedantic_parser',
XML_LIBXML_NONET => 'no_network',
XML_LIBXML_EXT_DTD => 'load_ext_dtd',
XML_LIBXML_COMPLETE_ATTR => 'complete_attributes',
XML_LIBXML_EXPAND_XINCLUDE => 'expand_xinclude',
XML_LIBXML_NSCLEAN => 'clean_namespaces',
XML_LIBXML_KEEP_BLANKS => 'keep_blanks',
XML_LIBXML_LINENUMBERS => 'line_numbers',
);
}
# perl style
sub process_xincludes {
my $self = shift;
my $doc = shift;
my $opts = shift;
my $options = $self->_parser_options($opts);
my $rv;
$self->_init_callbacks();
eval {
$rv = $self->_processXIncludes($doc || " ", $options);
};
my $err = $@;
$self->_cleanup_callbacks();
if ( $err ) {
chomp $err unless ref $err;
croak $@;
}
return $rv;
}
#-------------------------------------------------------------------------#
# HTML parsing functions #
#-------------------------------------------------------------------------#
sub _html_options {
my ($self,$opts)=@_;
$opts = {} unless ref $opts;
# return (undef,undef) unless ref $opts;
my $flags = 0;
{
my $recover = exists $opts->{recover} ? $opts->{recover} : $self->recover;
if ($recover)
{
$flags |= HTML_PARSE_RECOVER;
if ($recover == 2)
{
$flags |= HTML_PARSE_NOERROR;
}
}
}
$flags |= 4 if $opts->{no_defdtd}; # default is ON: injects DTD as needed
$flags |= 32 if exists $opts->{suppress_errors} ? $opts->{suppress_errors} : $self->get_option('suppress_errors');
# This is to fix https://rt.cpan.org/Ticket/Display.html?id=58024 :
# <quote>
# In XML::LibXML, warnings are not suppressed when specifying the recover
# or recover_silently flags as per the following excerpt from the manpage:
# </quote>
if ($self->recover_silently)
{
$flags |= 32;
}
$flags |= 64 if $opts->{suppress_warnings};
$flags |= 128 if exists $opts->{pedantic_parser} ? $opts->{pedantic_parser} : $self->pedantic_parser;
$flags |= 256 if exists $opts->{no_blanks} ? $opts->{no_blanks} : !$self->keep_blanks;
$flags |= 2048 if exists $opts->{no_network} ? $opts->{no_network} : !$self->no_network;
$flags |= 16384 if $opts->{no_cdata};
$flags |= 65536 if $opts->{compact}; # compact small text nodes; no modification
# of the tree allowed afterwards
# (WILL possibly CRASH IF YOU try to MODIFY THE TREE)
$flags |= 524288 if $opts->{huge}; # relax any hardcoded limit from the parser
$flags |= 1048576 if $opts->{oldsax}; # parse using SAX2 interface from before 2.7.0
return ($opts->{URI},$opts->{encoding},$flags);
}
sub parse_html_string {
my ($self,$str,$opts) = @_;
croak("parse_html_string is not a class method! Create a parser object with XML::LibXML->new first!") unless ref $self;
croak("parse already in progress") if $self->{_State_};
unless ( defined $str and length $str ) {
croak("Empty String");
}
$self->{_State_} = 1;
my $result;
$self->_init_callbacks();
eval {
$result = $self->_parse_html_string( $str,
$self->_html_options($opts)
);
};
my $err = $@;
$self->{_State_} = 0;
if ($err) {
chomp $err unless ref $err;
$self->_cleanup_callbacks();
croak $err;
}
$self->_cleanup_callbacks();
return $result;
}
sub parse_html_file {
my ($self,$file,$opts) = @_;
croak("parse_html_file is not a class method! Create a parser object with XML::LibXML->new first!") unless ref $self;
croak("parse already in progress") if $self->{_State_};
$self->{_State_} = 1;
my $result;
$self->_init_callbacks();
eval { $result = $self->_parse_html_file($file,
$self->_html_options($opts)
); };
my $err = $@;
$self->{_State_} = 0;
if ($err) {
chomp $err unless ref $err;
$self->_cleanup_callbacks();
croak $err;
}
$self->_cleanup_callbacks();
( run in 1.582 second using v1.01-cache-2.11-cpan-39bf76dae61 )