Marpa-R2
view release on metacpan or search on metacpan
html/lib/Marpa/R2/HTML.pm view on Meta::CPAN
my $ref_type = ref $document_ref;
Marpa::R2::exception('Arg to parse() must be ref to string')
if not $ref_type
or $ref_type ne 'SCALAR'
or not defined ${$document_ref};
my $document = $self->{document} = $document_ref;
my ($core_rules, $runtime_tag,
$rank_by_name, $is_empty_element,
$primary_group_by_tag
) = $self->{config}->contents();
$self->{is_empty_element} = $is_empty_element;
if ($self->{dump_config}) {
return $self->{config}->as_string();
}
my @action_by_rule_id = ();
$self->{action_by_rule_id} = \@action_by_rule_id;
my $thin_grammar = Marpa::R2::Thin::G->new( { if => 1 } );
my $tracer = Marpa::R2::Thin::Trace->new($thin_grammar);
$self->{tracer} = $tracer;
RULE: for my $rule ( @{$core_rules} ) {
my $lhs = $rule->{lhs};
my $rhs = $rule->{rhs};
my $min = $rule->{min};
my $action = $rule->{action};
my @symbol_ids = ();
for my $symbol_name ( $lhs, @{$rhs} ) {
push @symbol_ids,
$tracer->symbol_by_name($symbol_name)
// $tracer->symbol_new($symbol_name);
}
my ($lhs_id, @rhs_ids) = @symbol_ids;
my $rule_id;
if ( defined $min ) {
$rule_id =
$thin_grammar->sequence_new( $lhs_id, $rhs_ids[0],
{ min => $min } );
}
else {
$rule_id = $thin_grammar->rule_new( $lhs_id, \@rhs_ids );
}
$action_by_rule_id[$rule_id] = $action;
} ## end RULE: for my $rule ( @{$core_rules} )
# Some constants that we will use a lot
my $SYMID_CRUFT = $tracer->symbol_by_name('CRUFT');
my $SYMID_CDATA = $tracer->symbol_by_name('CDATA');
my $SYMID_PCDATA = $tracer->symbol_by_name('PCDATA');
my $SYMID_WHITESPACE = $tracer->symbol_by_name('WHITESPACE');
my $SYMID_PI = $tracer->symbol_by_name('PI');
my $SYMID_C = $tracer->symbol_by_name('C');
my $SYMID_D = $tracer->symbol_by_name('D');
my $SYMID_EOF = $tracer->symbol_by_name('EOF');
my @raw_tokens = ();
my $p = HTML::Parser->new(
api_version => 3,
start_h => [
\@raw_tokens, q{tagname,'S',line,column,offset,offset_end,is_cdata,attr}
],
end_h =>
[ \@raw_tokens, q{tagname,'E',line,column,offset,offset_end,is_cdata} ],
text_h => [
\@raw_tokens,
qq{'$SYMID_WHITESPACE','T',line,column,offset,offset_end,is_cdata}
],
comment_h =>
[ \@raw_tokens, qq{'$SYMID_C','C',line,column,offset,offset_end,is_cdata} ],
declaration_h =>
[ \@raw_tokens, qq{'$SYMID_D','D',line,column,offset,offset_end,is_cdata} ],
process_h =>
[ \@raw_tokens, qq{'$SYMID_PI','PI',line,column,offset,offset_end,is_cdata} ],
unbroken_text => 1
);
$p->parse( ${$document} );
$p->eof;
my @html_parser_tokens = ();
HTML_PARSER_TOKEN:
for my $raw_token (@raw_tokens) {
my ( undef, $token_type, $line, $column, $offset, $offset_end, $is_cdata, $attr ) =
@{$raw_token};
PROCESS_TOKEN_TYPE: {
if ($is_cdata) {
$raw_token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] =
$SYMID_CDATA;
last PROCESS_TOKEN_TYPE;
}
if ( $token_type eq 'T' ) {
# White space as defined in HTML 4.01
# space (x20); ASCII tab (x09); ASCII form feed (x0C;); Zero-width space (x200B)
# and the two characters which appear in line breaks:
# carriage return (x0D) and line feed (x0A)
# I avoid the Perl character codes because I do NOT want
# localization
$raw_token->[Marpa::R2::HTML::Internal::Token::TOKEN_ID] =
$SYMID_PCDATA if
substr(
${$document}, $offset, ( $offset_end - $offset )
) =~ / [^\x09\x0A\x0C\x0D\x20\x{200B}] /oxms;
last PROCESS_TOKEN_TYPE;
} ## end if ( $token_type eq 'T' )
if ( $token_type eq 'E' or $token_type eq 'S' ) {
# If it's a virtual token from HTML::Parser,
# pretend it never existed.
# HTML::Parser supplies missing
# end tags for title elements, but for no
# others.
# This is not helpful and we need to special-case
# these zero-length tags and throw them away.
next HTML_PARSER_TOKEN if $offset_end <= $offset;
my $tag_name = $raw_token
->[Marpa::R2::HTML::Internal::Token::TAG_NAME];
my $terminal = $token_type . q{_} . $tag_name;
my $terminal_id = $tracer->symbol_by_name($terminal);
if ( not defined $terminal_id ) {
my $group_symbol = $primary_group_by_tag->{$tag_name}
// 'GRP_anywhere';
my $contents = $runtime_tag->{$tag_name} // 'FLO_mixed';
my @symbol_names = (
$group_symbol,
'ELE_' . $tag_name,
'S_' . $tag_name,
$contents, 'E_' . $tag_name
);
my @symbol_ids = ();
SYMBOL: for my $symbol_name (@symbol_names) {
my $symbol_id = $tracer->symbol_by_name($symbol_name);
if ( not defined $symbol_id ) {
$symbol_id = $tracer->symbol_new($symbol_name);
}
push @symbol_ids, $symbol_id;
} ## end SYMBOL: for my $symbol_name (@symbol_names)
my ( $top_id, $lhs_id, @rhs_ids ) = @symbol_ids;
$thin_grammar->rule_new( $top_id, [$lhs_id] );
my $element_rule_id =
$thin_grammar->rule_new( $lhs_id, \@rhs_ids );
$action_by_rule_id[$element_rule_id] = 'ELE_' . $tag_name;
$terminal_id = $tracer->symbol_by_name($terminal);
( run in 1.749 second using v1.01-cache-2.11-cpan-ceb78f64989 )