App-Music-ChordPro
view release on metacpan or search on metacpan
lib/ChordPro/lib/SVGPDF/Parser.pm view on Meta::CPAN
class SVGPDF::Parser;
use File::LoadLines;
use Carp;
field $debug;
method parse_file ( $fname, %args ) {
$debug = $args{debug} if defined $args{debug};
my $data = loadlines( $fname, { split => 0, chomp => 0 } );
$self->parse( $data, %args );
}
method parse ( $data, %args ) {
if ( $debug ) {
# Make it easier to read/write long lines and disable parts.
$data =~ s/^#.*//mg;
$data =~ s/\\[\n\r]+\s*//g;
}
$self->_parse( $data, %args );
}
# The _parse method is a modified version of XML::Tiny. All comments
# and restrictions of L<XML::Tiny> are applicable.
# Main modification is to allow whitespace elements in <text> elements.
# These are significant in SVG.
# Since we're aiming at SVG parsing, and SVG is strict XML but often
# wrapped in an (X)HTML document, the parser functionality is set
# to no fatal_declarations and strict_entity_parsing.
field $re_name;
field %emap;
method _parse ( $data, %params) {
my $elem = { content => [] };
# TODO: Accept whitespace tokens by default within <text> elements.
my $whitespace_tokens = $params{whitespace_tokens};
$re_name //= '[:_a-z][\\w:\\.-]*';
%emap = qw( lt < gt > amp & quot " apos ' );
my $fixent = sub ( $e ) {
$e =~ s/&#(\d+);/chr($1)/ge && return $e;
$e =~ s/&#(x[0-9a-f]+);/chr(hex($1))/gie && return $e;
$e =~ s/&(lt|gt|quot|apos|amp);/$emap{$1}/ge && return $e;
croak( "SVG Parser: Illegal ampersand or entity \"$1\"" )
if $e =~ /(&[^;]{0,10})/;
$e;
};
croak( "SVG Parser: No elements" ) if !defined($data) || $data !~ /\S/;
# Illegal low-ASCII chars.
croak( "SVG Parser: Not well-formed (illegal low-ASCII chars)" )
if $data =~ /[\x00-\x08\x0b\x0c\x0e-\x1f]/;
# Turn CDATA into PCDATA.
$data =~ s{<!\[CDATA\[(.*?)]]>}{
$_ = $1.chr(0); # this makes sure that empty CDATAs become
s/([&<>'"])/ # the empty string and aren't just thrown away.
$1 eq '&' ? '&' :
$1 eq '<' ? '<' :
$1 eq '"' ? '"' :
$1 eq "'" ? ''' :
'>'
/eg;
$_;
}egs;
croak( "SVG Parser: Not well-formed (CDATA not delimited or bad comment)" )
if $data =~ /]]>/ # ]]> not delimiting CDATA
|| $data =~ /<!--(.*?)--->/s # ---> can't end a comment
|| grep { $_ && /--/ }
( $data =~ /^\s+|<!--(.*?)-->|\s+$/gs); # -- in comm
# Strip leading/trailing whitespace and comments (which don't nest - phew!).
$data =~ s/^\s+|<!--(.*?)-->|\s+$//gs;
# Turn quoted > in attribs into >.
# Double- and single-quoted attrib values get done seperately.
while ( $data =~ s/($re_name\s*=\s*"[^"]*)>([^"]*")/$1>$2/gsi ) {}
while ( $data =~ s/($re_name\s*=\s*'[^']*)>([^']*')/$1>$2/gsi ) {}
if ( $params{fatal_declarations} && $data =~ /<!(ENTITY|DOCTYPE)/ ) {
croak( "SVG Parser: Unexpected \"$1\"" );
}
# The abc2svg generator forgets the close the body. Fix it.
if ( $data =~ /\<meta\s+name="generator"\s+content="abc2svg/ ) {
$data =~ s;</div>\s*</html>;</div></body></html>;;
$whitespace_tokens++;
}
# Ignore empty tokens/whitespace tokens.
foreach my $token ( grep { length }
split( /(<[^>]+>)/, $data ) ) {
next if $token =~ /^\s+$/s && !$whitespace_tokens;
next if $token =~ /<\?$re_name.*?\?>/is
|| $token =~ /^<!(ENTITY|DOCTYPE)/i;
if ( $token =~ m!^</($re_name)\s*>!i ) { # close tag
croak( "SVG Parser: Not well-formed (at \"$token\")" )
if $elem->{name} ne $1;
$elem = delete $elem->{parent};
}
elsif ( $token =~ /^<$re_name(\s[^>]*)*(\s*\/)?>/is ) { # open tag
my ( $tagname, $attribs_raw ) =
( $token =~ m!<(\S*)(.*?)(\s*/)?>!s );
# First make attribs into a list so we can spot duplicate keys.
my $attrib = [
# Do double- and single- quoted attribs seperately.
$attribs_raw =~ /\s($re_name)\s*=\s*"([^"]*?)"/gi,
$attribs_raw =~ /\s($re_name)\s*=\s*'([^']*?)'/gi
];
if ( @{$attrib} == 2 * keys %{{@{$attrib}}} ) {
$attrib = { @{$attrib} }
}
else {
croak( "SVG Parser: Not well-formed (duplicate attribute)" );
}
( run in 0.704 second using v1.01-cache-2.11-cpan-0d23b851a93 )