Marpa-R2

 view release on metacpan or  search on metacpan

html/lib/Marpa/R2/HTML.pm  view on Meta::CPAN

    my $ref_type        = ref $document_ref;
    Marpa::R2::exception('Arg to parse() must be ref to string')
        if not $ref_type
            or $ref_type ne 'SCALAR'
            or not defined ${$document_ref};

    my $document = $self->{document} = $document_ref;

    my ($core_rules,   $runtime_tag,
        $rank_by_name, $is_empty_element,
        $primary_group_by_tag
    ) = $self->{config}->contents();
    $self->{is_empty_element} = $is_empty_element;
    if ($self->{dump_config}) {
         return $self->{config}->as_string();
    }
    my @action_by_rule_id = ();
    $self->{action_by_rule_id} = \@action_by_rule_id;
    my $thin_grammar = Marpa::R2::Thin::G->new( { if => 1 } );
    my $tracer = Marpa::R2::Thin::Trace->new($thin_grammar);
    $self->{tracer}                  = $tracer;

    RULE: for my $rule ( @{$core_rules} ) {
        my $lhs    = $rule->{lhs};
        my $rhs    = $rule->{rhs};
        my $min    = $rule->{min};
        my $action = $rule->{action};
        my @symbol_ids = ();
        for my $symbol_name ( $lhs, @{$rhs} ) {
            push @symbol_ids,
                $tracer->symbol_by_name($symbol_name)
                // $tracer->symbol_new($symbol_name);
        }
        my ($lhs_id, @rhs_ids) = @symbol_ids;
        my $rule_id;
        if ( defined $min ) {
            $rule_id =
                $thin_grammar->sequence_new( $lhs_id, $rhs_ids[0],
                { min => $min } );
        }
        else {
            $rule_id = $thin_grammar->rule_new( $lhs_id, \@rhs_ids );
        }
        $action_by_rule_id[$rule_id] = $action;
    } ## end RULE: for my $rule ( @{$core_rules} )

    # Some constants that we will use a lot
    my $SYMID_CRUFT = $tracer->symbol_by_name('CRUFT');
    my $SYMID_CDATA = $tracer->symbol_by_name('CDATA');
    my $SYMID_PCDATA = $tracer->symbol_by_name('PCDATA');
    my $SYMID_WHITESPACE = $tracer->symbol_by_name('WHITESPACE');
    my $SYMID_PI = $tracer->symbol_by_name('PI');
    my $SYMID_C = $tracer->symbol_by_name('C');
    my $SYMID_D = $tracer->symbol_by_name('D');
    my $SYMID_EOF = $tracer->symbol_by_name('EOF');

    my @raw_tokens = ();
    my $p          = HTML::Parser->new(
        api_version => 3,
        start_h     => [
            \@raw_tokens, q{tagname,'S',line,column,offset,offset_end,is_cdata,attr}
        ],
        end_h =>
            [ \@raw_tokens, q{tagname,'E',line,column,offset,offset_end,is_cdata} ],
        text_h => [
            \@raw_tokens,
            qq{'$SYMID_WHITESPACE','T',line,column,offset,offset_end,is_cdata}
        ],
        comment_h =>
            [ \@raw_tokens, qq{'$SYMID_C','C',line,column,offset,offset_end,is_cdata} ],
        declaration_h =>
            [ \@raw_tokens, qq{'$SYMID_D','D',line,column,offset,offset_end,is_cdata} ],
        process_h =>
            [ \@raw_tokens, qq{'$SYMID_PI','PI',line,column,offset,offset_end,is_cdata} ],
        unbroken_text => 1
    );

    $p->parse( ${$document} );
    $p->eof;

    my @html_parser_tokens = ();
    HTML_PARSER_TOKEN:
    for my $raw_token (@raw_tokens) {
        my ( undef, $token_type, $line, $column, $offset, $offset_end, $is_cdata, $attr ) =
            @{$raw_token};

        PROCESS_TOKEN_TYPE: {
            if ($is_cdata) {
                $raw_token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] =
                    $SYMID_CDATA;
                last PROCESS_TOKEN_TYPE;
            }
            if ( $token_type eq 'T' ) {

                # White space as defined in HTML 4.01
                # space (x20); ASCII tab (x09); ASCII form feed (x0C;); Zero-width space (x200B)
                # and the two characters which appear in line breaks:
                # carriage return (x0D) and line feed (x0A)
                # I avoid the Perl character codes because I do NOT want
                # localization
                $raw_token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] =
                 $SYMID_PCDATA if
                    substr(
                        ${$document}, $offset, ( $offset_end - $offset )
                    ) =~ / [^\x09\x0A\x0C\x0D\x20\x{200B}] /oxms;

                last PROCESS_TOKEN_TYPE;
            } ## end if ( $token_type eq 'T' )
            if ( $token_type eq 'E' or $token_type eq 'S' ) {

                # If it's a virtual token from HTML::Parser,
                # pretend it never existed.
                # HTML::Parser supplies missing
                # end tags for title elements, but for no
                # others.
                # This is not helpful and we need to special-case
                # these zero-length tags and throw them away.
                next HTML_PARSER_TOKEN if $offset_end <= $offset;

                my $tag_name = $raw_token
                    ->[Marpa::R2::HTML::Internal::Token::TAG_NAME];
                my $terminal    = $token_type . q{_} . $tag_name;
                my $terminal_id = $tracer->symbol_by_name($terminal);
                if ( not defined $terminal_id ) {
                    my $group_symbol = $primary_group_by_tag->{$tag_name}
                        // 'GRP_anywhere';
                    my $contents = $runtime_tag->{$tag_name} // 'FLO_mixed';
                    my @symbol_names = (
                        $group_symbol,
                        'ELE_' . $tag_name,
                        'S_' . $tag_name,
                        $contents, 'E_' . $tag_name
                    );
                    my @symbol_ids = ();
                    SYMBOL: for my $symbol_name (@symbol_names) {
                        my $symbol_id = $tracer->symbol_by_name($symbol_name);
                        if ( not defined $symbol_id ) {
                            $symbol_id = $tracer->symbol_new($symbol_name);
                        }
                        push @symbol_ids, $symbol_id;
                    } ## end SYMBOL: for my $symbol_name (@symbol_names)
                    my ( $top_id, $lhs_id, @rhs_ids ) = @symbol_ids;
                    $thin_grammar->rule_new( $top_id, [$lhs_id] );
                    my $element_rule_id =
                        $thin_grammar->rule_new( $lhs_id, \@rhs_ids );
                    $action_by_rule_id[$element_rule_id] = 'ELE_' . $tag_name;
                    $terminal_id = $tracer->symbol_by_name($terminal);



( run in 1.749 second using v1.01-cache-2.11-cpan-ceb78f64989 )