Serge

 view release on metacpan or  search on metacpan

lib/Serge/Engine/Plugin/parse_xml.pm  view on Meta::CPAN

<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body style="font-family: sans-serif; font-size: 120%">

<p>
# This is an automatically generated message.

The following parsing errors were found when attempting to localize resource files.
</p>

$text

</body>
</html>
|;

        Serge::Mail::send_html_message(
            $email_from, # from
            $email_to, # to (list)
            $email_subject, # subject
            $text # message body
        );
    }

}

sub parse {
    my ($self, $textref, $callbackref, $lang) = @_;

    die 'callbackref not specified' unless $callbackref;

    die 'node_match not specified' unless $self->{data}->{node_match};

    my $node_match = $self->{data}->{node_match} || [];
    my $node_exclude = $self->{data}->{node_exclude} || [];
    my $node_html = $self->{data}->{node_html} || [];

    # Make a copy of the string as we will change it

    my $text = $$textref;

    # Replace the symbolic entities as we are not going to expand them

    $text =~ s/&(\w+);/'__HTML__ENTITY__'.$1.'__'/ge;

    # Wrap CDATA blocks inside special '__CDATA' tag
    # to be able to reconstruct it later

    $text =~ s/(<\!\[CDATA\[.*?\]\]>)/'<__CDATA>'._escape_pi_and_comments($1).'<\/__CDATA>'/sge;

    # Wrap processing instruction inside special '__PI' tag
    # to be able to reconstruct it later

    $text =~ s/<\?(.*?)\?>/<__PI><\!\[CDATA\[$1\]\]><\/__PI>/sg;

    # Wrap HTML comment inside special '__COMMENT' tag
    # to be able to reconstruct it later

    $text =~ s/<\!--(.*?)-->/<__COMMENT><\!\[CDATA\[$1\]\]><\/__COMMENT>/sg;

    # Restore escaped processing instructions and comments inside cdata

    $text = _unescape_pi_and_comments($text);

    # Add the dummy root tag for XML to be valid

    $text = '<__ROOT>'.$text.'</__ROOT>';

    # Create XML parser object

    use XML::Parser;
    my $parser = new XML::Parser(Style => 'IxTree');

    # Parse XML

    my $tree;
    eval {
        $tree = $parser->parse($text);
    };
    if ($@) {
        my $error_text = $@;
        $error_text =~ s/\t/ /g;
        $error_text =~ s/^\s+//s;

        $self->{errors}->{$self->{parent}->{engine}->{current_file_rel}} = $error_text;

        die $error_text;
    }

    # Add the empty attributes hash to the root tag (for uniform processing)

    unshift @$tree, {};

    # Process tree recursively and generate the localized output

    my $out = $self->render_tag_recursively('', $tree, $callbackref, $lang, '');

    return $lang ? $out : undef;
}

sub _escape_pi_and_comments {
    my $text = shift;

    $text =~ s/<\?/__PI_START__/sg;
    $text =~ s/\?>/__PI_END__/sg;
    $text =~ s/<\!--/__COMMENT_START__/sg;
    $text =~ s/-->/__COMMENT_END__/sg;

    return $text;
}

sub _unescape_pi_and_comments {
    my $text = shift;

    $text =~ s/__PI_START__/<\?/sg;
    $text =~ s/__PI_END__/\?>/sg;
    $text =~ s/__COMMENT_START__/<\!--/sg;
    $text =~ s/__COMMENT_END__/-->/sg;

    return $text;
}

sub process_text_node {
    my ($self, $path, $attrs, $strref, $callbackref, $lang, $cdata, $noquotes) = @_;

    # Check if node path matches our expectations

    my $ok = undef;

    # Test if node path matches the mask

    foreach my $rule (@{$self->{data}->{node_match}}) {
        if (ref($rule) eq "HASH") {
            my $prule = $rule->{path};
            if ($path =~ m/$prule/) {
                my $attrs_ok = 1;
                foreach my $name (keys %{$rule->{attributes}}) {
                    my $arule = $rule->{attributes}->{$name};
                    if ($attrs->{$name} !~ m/$arule/) {
                        print "\t\t\tattribute '$name' [".$attrs->{$name}."] doesn't match rule '$arule'\n" if $self->{parent}->{debug};
                        $attrs_ok = undef;
                        last;
                    }
                }
                if ($attrs_ok) {
                    $ok = 1;
                    last;
                }
            } else {
                print "\t\t\tpath doesn't match\n" if $self->{parent}->{debug};
            }
        } else { # treat rule as a string
            if ($path =~ m/$rule/) {
                $ok = 1;
                last;
            }
        }
    }

    # Test if node path does not match the exclusion mask

    if ($ok) {
        foreach my $rule (@{$self->{data}->{node_exclude}}) {
            if (ref($rule) eq "HASH") {
                my $prule = $rule->{path};
                if ($path =~ m/$prule/) {
                    my $attrs_ok = 1;
                    foreach my $name (keys %{$rule->{attributes}}) {
                        my $arule = $rule->{attributes}->{$name};
                        if ($attrs->{$name} !~ m/$arule/) {
                            print "\t\t\t[exclude] attribute '$name' [".$attrs->{$name}."] doesn't match rule '$arule'\n" if $self->{parent}->{debug};
                            $attrs_ok = undef;
                            last;
                        }
                    }
                    if ($attrs_ok) {
                        $ok = undef;
                        last;
                    }
                } else {
                    print "\t\t\t[exclude] path doesn't match\n" if $self->{parent}->{debug};
                }
            } else { # treat rule as a string
                if ($path =~ m/$rule/) {

lib/Serge/Engine/Plugin/parse_xml.pm  view on Meta::CPAN

                        last;
                    }
                } else {
                    print "\t\t\tpath doesn't match\n" if $self->{parent}->{debug};
                }
            } else { # treat rule as a string
                if ($path =~ m/$rule/) {
                    $is_html = 1;
                    last;
                }
            }
        }
    }

    if ($self->{parent}->{debug}) {
        if ($ok) {
            if ($is_html) {
                print "\t\t[ok, HTML mode] $path\n";
            } else {
                print "\t\t[ok] $path\n";
            }
        } else {
            print "\t\t[--] $path\n";
        }
    }

    # reconstruct original XML with symbolic entities
    # (do this before we exit to make sure all text nodes, even those
    # not matching the mask, will be restored)
    $$strref =~ s/__HTML__ENTITY__(\w+?)__/&$1;/g;

    # now exit if the node doesn't match the mask
    return unless $ok;

    # in InDesign mode, strip the line break Unicode symbols since these are generally English-specific
    # (? need to verify ?)
    if ($self->{data}->{xml_kind_indesign}) {
        $$strref =~ s/\x{2028}//g; # Unicode Character 'LINE SEPARATOR' (U+2028)
    }

    # trim the string
    my $trimmed = $$strref;

    $trimmed =~ s/^\s+//sg;
    $trimmed =~ s/\s+$//sg;

    # 1) skip empty strings
    # 2) skip strings consisting of non-alphabet characters (bullets, arrows, etc.)
    # 3) skip strings representing plain numbers
    if ($trimmed ne '' && $trimmed !~ m/^(\W+|\d+)$/) {
        # in InDesign mode, preserve the leading and trailing whitespace
        my ($leading_whitespace, $trailing_whitespace);
        if ($self->{data}->{xml_kind_indesign}) {
            ($$strref =~ m/^(\s+)/) && ($leading_whitespace = $1);
            ($$strref =~ m/(\s+)$/) && ($trailing_whitespace = $1);
        }

        $$strref = $trimmed;

        # unescape basic XML entities unless we're inside CDATA block
        xml_unescape_strref($strref) unless $cdata;

        if ($is_html) {
            # if node is html, pass its text to html parser for string extraction
            # if html_parser fails to parse the XML due to errors,
            # it will die(), and this will be catched in main application

            # lazy-load html parser plugin
            # (parse_php_xhtml or the one specified in html_parser config node)
            if (!$self->{html_parser}) {
                if (exists $self->{data}->{html_parser}) {
                    $self->{html_parser} = $self->load_plugin_from_node(
                        'Serge::Engine::Plugin', $self->{data}->{html_parser}
                    );
                } else {
                    # fallback to loading parse_php_xhtml with default parameters
                    eval('use Serge::Engine::Plugin::parse_php_xhtml; $self->{html_parser} = Serge::Engine::Plugin::parse_php_xhtml->new($self->{parent});');
                    ($@) && die "Can't load parser plugin 'parse_php_xhtml': $@";
                    print "Loaded HTML parser plugin for HTML nodes\n" if $self->{parent}->{debug};
                }
            }

            $self->{html_parser}->{current_file_rel} = $self->{parent}->{engine}->{current_file_rel}.":$path";
            if ($lang) {
                $$strref = $self->{html_parser}->parse($strref, $callbackref, $lang);
                if (defined $$strref) {
                    # escape unsafe xml chars unless we're in CDATA block
                    xml_escape_strref($strref, $noquotes) unless $cdata;
                } else {
                    $$strref = $trimmed;
                }
            } else {
                $self->{html_parser}->parse($strref, $callbackref);
            }
        } else {
            # additionally unescape Android-specific stuff, if requested
            _android_unescape($strref) if ($self->{data}->{xml_kind_android});

            if ($lang) {
                $$strref = &$callbackref($$strref, undef, $path, undef, $lang);
            } else {
                &$callbackref($$strref, undef, $path, undef, undef);
            }

            # escape Android-specific stuff if requested
            _android_escape($strref) if ($self->{data}->{xml_kind_android});

            # preserve symbolic entities from escaping
            $$strref =~ s/&(\w+);/'__HTML__ENTITY__'.$1.'__'/ge;

            # escape unsafe xml chars (in Android mode, do not xml-escape quotes)
            $noquotes = $noquotes || $self->{data}->{xml_kind_android};
            xml_escape_strref($strref, $noquotes) unless $cdata;

            # restore symbolic entities
            $$strref =~ s/__HTML__ENTITY__(\w+?)__/&$1;/g;

            # in InDesign mode, make sure the leading and trailing whitespace
            # is restored to the original values
            if ($self->{data}->{xml_kind_indesign}) {
                $$strref =~ s/^(\s+)/$leading_whitespace/e;
                $$strref =~ s/(\s+)$/$trailing_whitespace/e;
            }
        }
    }
}

sub _android_unescape {
    my ($strref) = @_;

    $$strref =~ s/\\'/'/g; # Android-specific apostrophe unescaping
    $$strref =~ s/\\"/"/g; # Android-specific quote unescaping
}

sub _android_escape {
    my ($strref) = @_;

    $$strref =~ s/'/\\'/g; # Android-specific apostrophe escaping
    $$strref =~ s/"/\\"/g; # Android-specific quote escaping

}

sub _dummy_callback {
    my ($s) = @_;
    return $s;
}

sub render_tag_recursively {
    my ($self, $name, $subtree, $callbackref, $lang, $path, $cdata, $parent_attrs) = @_;
    my $attrs = $subtree->[0];

    $cdata = 1 if (($name eq '__CDATA') || ($name eq '__COMMENT') || ($name eq '__PI'));

    my $inner_xml = '';

    for (my $i = 0; $i < (scalar(@$subtree) - 1) / 2; $i++) {
        my $tagname = $subtree->[1 + $i*2];
        my $tagtree = $subtree->[1 + $i*2 + 1];

        # do not process text inside processing instructions
        # TODO: this can potentially be a conditional option, disabled by default
        if ($tagname eq '__PI') {
            $inner_xml .= $self->render_tag_recursively($tagname, $tagtree, \&_dummy_callback, $lang, $path, $cdata, $attrs);
            next;
        }

        if ($tagname ne '0') {
            # node does not contain plain text, render the subtree

            my $tagpath;
            if (($tagname eq '__ROOT') || ($tagname eq '__CDATA') || ($tagname eq '__COMMENT') || ($tagname eq '__PI')) {
                $tagpath = $path;
            } else {
                $tagpath = $path.'/'.$tagname;
            }

            if ($lang) {
                $inner_xml .= $self->render_tag_recursively($tagname, $tagtree, $callbackref, $lang, $tagpath, $cdata, $attrs);
            } else {
                $self->render_tag_recursively($tagname, $tagtree, $callbackref, $lang, $tagpath, $cdata, $attrs);
            }
        } else {
            # tagtree holds a string for text nodes

            my $str = $tagtree;

            $self->process_text_node($path, $parent_attrs, \$str, $callbackref, $lang, $cdata, 1);

            if ($lang) {
                $inner_xml .= $str;
            }
        }
    }

    # Generating the string consisting of [ attr="value"] pairs

    my $attrs_text;

    foreach my $key (sort keys %$attrs) {
        my $str = $attrs->{$key};

        my $tagpath = $path.'@'.$str;

        $self->process_text_node($tagpath, $attrs, \$str, $callbackref, $lang, undef, undef);

        if ($lang) {
            $attrs_text .= " $key=\"$str\"";
        }
    }

    # Construct and return the tag string with its inner xml

    if ($lang) {
        if ($name eq '__CDATA') {
            return '<![CDATA['.$inner_xml.']]>';
        }

        if ($name eq '__COMMENT') {
            return '<!--'.$inner_xml.'-->';
        }

        if ($name eq '__PI') {
            return '<?'.$inner_xml.'?>';
        }

        if (($name ne '') && ($name ne '__ROOT')) {
            return "<$name$attrs_text>$inner_xml</$name>";
        }

        return $inner_xml;
    }
}

1;



( run in 1.097 second using v1.01-cache-2.11-cpan-39bf76dae61 )