HTML-HTML5-Parser
view release on metacpan or search on metacpan
lib/HTML/HTML5/Parser/Tokenizer.pm view on Meta::CPAN
## "list of scripts that will execute as soon as possible" or the first
## script in the "list of scripts that will execute asynchronously",
## has completed loading. If one has, then it MUST be executed
## and removed from the list.
## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
## (This requirement was dropped from HTML5 spec, unfortunately.)
my $is_space = {
0x0009 => 1, # CHARACTER TABULATION (HT)
0x000A => 1, # LINE FEED (LF)
#0x000B => 0, # LINE TABULATION (VT)
0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
0x000D => 1, # CARRIAGE RETURN (CR)
0x0020 => 1, # SPACE (SP)
};
sub KEY_ELSE_CHAR () { 255 }
sub KEY_ULATIN_CHAR () { 254 }
sub KEY_LLATIN_CHAR () { 253 }
sub KEY_EOF_CHAR () { 252 }
sub KEY_SPACE_CHAR () { 251 }
my $Action;
my $XMLAction;
$Action->[DATA_STATE]->[0x0026] = {
name => 'data &',
state => ENTITY_STATE, # "entity data state" + "consume a character reference"
state_set => {entity_add => -1, prev_state => DATA_STATE},
};
$Action->[DATA_STATE]->[0x003C] = {
name => 'data <',
state => TAG_OPEN_STATE,
};
$Action->[DATA_STATE]->[KEY_EOF_CHAR] = {
name => 'data eof',
emit => END_OF_FILE_TOKEN,
reconsume => 1,
};
$Action->[DATA_STATE]->[0x0000] = {
name => 'data null',
emit => CHARACTER_TOKEN,
error => 'NULL',
};
$Action->[DATA_STATE]->[KEY_ELSE_CHAR] = {
name => 'data else',
emit => CHARACTER_TOKEN,
emit_data_read_until => qq{\x00<&},
};
$XMLAction->[DATA_STATE]->[0x005D] = { # ]
name => 'data ]',
state => DATA_MSE1_STATE,
emit => CHARACTER_TOKEN,
};
$XMLAction->[DATA_STATE]->[KEY_ELSE_CHAR] = {
name => 'data else xml',
emit => CHARACTER_TOKEN,
emit_data_read_until => qq{\x00<&\]},
};
$Action->[RCDATA_STATE]->[0x0026] = {
name => 'rcdata &',
state => ENTITY_STATE, # "entity data state" + "consume a character reference"
state_set => {entity_add => -1, prev_state => RCDATA_STATE},
};
$Action->[RCDATA_STATE]->[0x003C] = {
name => 'rcdata <',
state => RCDATA_LT_STATE,
};
$Action->[RCDATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[RCDATA_STATE]->[0x0000] = {
name => 'rcdata null',
emit => CHARACTER_TOKEN,
emit_data => "\x{FFFD}",
error => 'NULL',
};
$Action->[RCDATA_STATE]->[KEY_ELSE_CHAR] = {
name => 'rcdata else',
emit => CHARACTER_TOKEN,
emit_data_read_until => qq{\x00<&},
};
$Action->[RAWTEXT_STATE]->[0x003C] = {
name => 'rawtext <',
state => RAWTEXT_LT_STATE,
};
$Action->[RAWTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[RAWTEXT_STATE]->[0x0000] = $Action->[RCDATA_STATE]->[0x0000];
$Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR] = {
name => 'rawtext else',
emit => CHARACTER_TOKEN,
emit_data_read_until => qq{\x00<},
};
$Action->[SCRIPT_DATA_STATE]->[0x003C] = {
name => 'script data <',
state => SCRIPT_DATA_LT_STATE,
};
$Action->[SCRIPT_DATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[SCRIPT_DATA_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
$Action->[SCRIPT_DATA_STATE]->[KEY_ELSE_CHAR] = $Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR];
$Action->[PLAINTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[PLAINTEXT_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
$Action->[PLAINTEXT_STATE]->[KEY_ELSE_CHAR] = {
name => 'plaintext else',
emit => CHARACTER_TOKEN,
emit_data_read_until => qq{\x00},
};
# "Tag open state" is known as "tag state" in XML5.
$Action->[TAG_OPEN_STATE]->[0x0021] = {
name => 'tag open !',
state => MARKUP_DECLARATION_OPEN_STATE,
};
$Action->[TAG_OPEN_STATE]->[0x002F] = {
name => 'tag open /',
state => CLOSE_TAG_OPEN_STATE,
};
$Action->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
name => 'tag open uc',
ct => {
type => START_TAG_TOKEN,
delta => 1,
append_tag_name => 0x0020, # UC -> lc
},
state => TAG_NAME_STATE,
};
$XMLAction->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
name => 'tag open uc xml',
ct => {
type => START_TAG_TOKEN,
delta => 1,
append_tag_name => 0x0000,
},
state => TAG_NAME_STATE,
};
$Action->[TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
name => 'tag open lc',
ct => {
type => START_TAG_TOKEN,
delta => 1,
append_tag_name => 0x0000,
},
state => TAG_NAME_STATE,
};
$Action->[TAG_OPEN_STATE]->[0x003F] = {
name => 'tag open ?',
state => BOGUS_COMMENT_STATE,
error => 'pio',
error_delta => 1,
ct => {
type => COMMENT_TOKEN,
},
reconsume => 1, ## $self->{nc} is intentionally left as is
};
$XMLAction->[TAG_OPEN_STATE]->[0x003F] = { # ?
name => 'tag open ? xml',
state => PI_STATE,
};
$Action->[TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
$Action->[TAG_OPEN_STATE]->[0x003E] = { # >
name => 'tag open else',
error => 'bare stago',
error_delta => 1,
state => DATA_STATE,
reconsume => 1,
emit => CHARACTER_TOKEN,
emit_data => '<',
emit_delta => 1,
};
$Action->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = $Action->[TAG_OPEN_STATE]->[0x003E];
$XMLAction->[TAG_OPEN_STATE]->[0x0000] = {
name => 'tag open null xml',
ct => {
type => START_TAG_TOKEN,
delta => 1,
append_tag_name => 0xFFFD,
},
error => 'NULL',
state => TAG_NAME_STATE,
};
## XML5: "<:" has a parse error.
$XMLAction->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
name => 'tag open else xml',
ct => {
type => START_TAG_TOKEN,
delta => 1,
append_tag_name => 0x0000,
},
state => TAG_NAME_STATE,
};
$Action->[RCDATA_LT_STATE]->[0x002F] = {
name => 'rcdata lt /',
state => RCDATA_END_TAG_OPEN_STATE,
buffer => {clear => 1},
};
$Action->[RAWTEXT_LT_STATE]->[0x002F] = {
name => 'rawtext lt /',
state => RAWTEXT_END_TAG_OPEN_STATE,
buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x002F] = {
name => 'script data lt /',
state => SCRIPT_DATA_END_TAG_OPEN_STATE,
buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[0x002F] = {
name => 'script data escaped lt /',
state => SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE,
buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x0021] = {
name => 'script data lt !',
state => SCRIPT_DATA_ESCAPE_START_STATE,
emit => CHARACTER_TOKEN,
emit_data => '<!',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ULATIN_CHAR] = {
name => 'script data escaped lt uc',
emit => CHARACTER_TOKEN,
emit_data => '<',
emit_data_append => 1,
buffer => {clear => 1, append => 0x0020}, # UC -> lc
state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_LLATIN_CHAR] = {
name => 'script data escaped lt lc',
emit => CHARACTER_TOKEN,
emit_data => '<',
emit_data_append => 1,
buffer => {clear => 1, append => 0x0000},
state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
$Action->[RCDATA_LT_STATE]->[KEY_ELSE_CHAR] = {
name => 'rcdata lt else',
state => RCDATA_STATE,
reconsume => 1,
emit => CHARACTER_TOKEN,
emit_data => '<',
};
$Action->[RAWTEXT_LT_STATE]->[KEY_ELSE_CHAR] = {
name => 'rawtext lt else',
state => RAWTEXT_STATE,
reconsume => 1,
emit => CHARACTER_TOKEN,
emit_data => '<',
};
$Action->[SCRIPT_DATA_LT_STATE]->[KEY_ELSE_CHAR] = {
name => 'script data lt else',
state => SCRIPT_DATA_STATE,
reconsume => 1,
emit => CHARACTER_TOKEN,
emit_data => '<',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
name => 'script data escaped lt else',
state => SCRIPT_DATA_ESCAPED_STATE,
reconsume => 1,
emit => CHARACTER_TOKEN,
emit_data => '<',
};
## XXX "End tag token" in latest HTML5 and in XML5.
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
name => 'end tag open uc',
ct => {
type => END_TAG_TOKEN,
delta => 2,
append_tag_name => 0x0020, # UC -> lc
},
state => TAG_NAME_STATE,
};
$XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
name => 'end tag open uc xml',
ct => {
type => END_TAG_TOKEN,
delta => 2,
append_tag_name => 0x0000,
},
state => TAG_NAME_STATE,
};
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
name => 'end tag open lc',
ct => {
type => END_TAG_TOKEN,
delta => 2,
append_tag_name => 0x0000,
},
state => TAG_NAME_STATE,
};
$Action->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
name => 'end tag open >',
error => 'empty end tag',
error_delta => 2, # "<" in "</>"
state => DATA_STATE,
};
lib/HTML/HTML5/Parser/Tokenizer.pm view on Meta::CPAN
quirks => 1,
line => $self->{line_prev},
column => $self->{column_prev} - 7,
};
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
redo A;
} else {
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
line => $self->{line_prev},
column => $self->{column_prev} - 1 - length $self->{kwd});
$self->{state} = BOGUS_COMMENT_STATE;
## Reconsume.
$self->{ct} = {type => COMMENT_TOKEN,
data => $self->{kwd},
line => $self->{line_prev},
column => $self->{column_prev} - 1 - length $self->{kwd},
};
redo A;
}
} elsif ($state == MD_CDATA_STATE) {
if ($nc == {
'[' => 0x0043, # C
'[C' => 0x0044, # D
'[CD' => 0x0041, # A
'[CDA' => 0x0054, # T
'[CDAT' => 0x0041, # A
'[CDATA' => NEVER_CHAR, # ([)
}->{$self->{kwd}}) {
## Stay in the state.
$self->{kwd} .= chr $nc;
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
redo A;
} elsif ($self->{kwd} eq '[CDATA' and
$nc == 0x005B) { # [
if ($self->{is_xml} and
not $self->{tainted} and
@{$self->{open_elements} or []} == 0) {
$self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
line => $self->{line_prev},
column => $self->{column_prev} - 7);
$self->{tainted} = 1;
} else {
}
$self->{ct} = {type => CHARACTER_TOKEN,
data => '',
line => $self->{line_prev},
column => $self->{column_prev} - 7};
$self->{state} = CDATA_SECTION_STATE;
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
redo A;
} else {
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
line => $self->{line_prev},
column => $self->{column_prev} - 1 - length $self->{kwd});
$self->{state} = BOGUS_COMMENT_STATE;
## Reconsume.
$self->{ct} = {type => COMMENT_TOKEN,
data => $self->{kwd},
line => $self->{line_prev},
column => $self->{column_prev} - 1 - length $self->{kwd},
};
redo A;
}
} elsif ($state == COMMENT_START_STATE) {
if ($nc == 0x002D) { # -
$self->{state} = COMMENT_START_DASH_STATE;
if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
$self->{line_prev} = $self->{line};
$self->{column_prev} = $self->{column};
$self->{column}++;
$self->{nc}
= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
} else {
$self->{set_nc}->($self);
}
redo A;
} elsif ($nc == 0x003E) { # >
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
if ($self->{in_subset}) {
$self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
} else {
( run in 0.825 second using v1.01-cache-2.11-cpan-172d661cebc )