HTML-HTML5-Parser

 view release on metacpan or  search on metacpan

lib/HTML/HTML5/Parser/Tokenizer.pm  view on Meta::CPAN

## "list of scripts that will execute as soon as possible" or the first
## script in the "list of scripts that will execute asynchronously",
## has completed loading.  If one has, then it MUST be executed
## and removed from the list.

## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
## (This requirement was dropped from HTML5 spec, unfortunately.)

my $is_space = {
  0x0009 => 1, # CHARACTER TABULATION (HT)
  0x000A => 1, # LINE FEED (LF)
  #0x000B => 0, # LINE TABULATION (VT)
  0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
  0x000D => 1, # CARRIAGE RETURN (CR)
  0x0020 => 1, # SPACE (SP)
};

sub KEY_ELSE_CHAR () { 255 }
sub KEY_ULATIN_CHAR () { 254 }
sub KEY_LLATIN_CHAR () { 253 }
sub KEY_EOF_CHAR () { 252 }
sub KEY_SPACE_CHAR () { 251 }

my $Action;
my $XMLAction;
$Action->[DATA_STATE]->[0x0026] = {
  name => 'data &',
  state => ENTITY_STATE, # "entity data state" + "consume a character reference"
  state_set => {entity_add => -1, prev_state => DATA_STATE},
};
$Action->[DATA_STATE]->[0x003C] = {
  name => 'data <',
  state => TAG_OPEN_STATE,
};
$Action->[DATA_STATE]->[KEY_EOF_CHAR] = {
  name => 'data eof',
  emit => END_OF_FILE_TOKEN,
  reconsume => 1,
};
$Action->[DATA_STATE]->[0x0000] = {
  name => 'data null',
  emit => CHARACTER_TOKEN,
  error => 'NULL',
};
$Action->[DATA_STATE]->[KEY_ELSE_CHAR] = {
  name => 'data else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00<&},
};
  $XMLAction->[DATA_STATE]->[0x005D] = { # ]
    name => 'data ]',
    state => DATA_MSE1_STATE,
    emit => CHARACTER_TOKEN,
  };
  $XMLAction->[DATA_STATE]->[KEY_ELSE_CHAR] = {
    name => 'data else xml',
    emit => CHARACTER_TOKEN,
    emit_data_read_until => qq{\x00<&\]},
  };
$Action->[RCDATA_STATE]->[0x0026] = {
  name => 'rcdata &',
  state => ENTITY_STATE, # "entity data state" + "consume a character reference"
  state_set => {entity_add => -1, prev_state => RCDATA_STATE},
};
$Action->[RCDATA_STATE]->[0x003C] = {
  name => 'rcdata <',
  state => RCDATA_LT_STATE,
};
$Action->[RCDATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[RCDATA_STATE]->[0x0000] = {
  name => 'rcdata null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
};
$Action->[RCDATA_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rcdata else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00<&},
};
$Action->[RAWTEXT_STATE]->[0x003C] = {
  name => 'rawtext <',
  state => RAWTEXT_LT_STATE,
};
$Action->[RAWTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[RAWTEXT_STATE]->[0x0000] = $Action->[RCDATA_STATE]->[0x0000];
$Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rawtext else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00<},
};
$Action->[SCRIPT_DATA_STATE]->[0x003C] = {
  name => 'script data <',
  state => SCRIPT_DATA_LT_STATE,
};
$Action->[SCRIPT_DATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[SCRIPT_DATA_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
$Action->[SCRIPT_DATA_STATE]->[KEY_ELSE_CHAR] = $Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR];
$Action->[PLAINTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[PLAINTEXT_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
$Action->[PLAINTEXT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'plaintext else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00},
};
# "Tag open state" is known as "tag state" in XML5.
$Action->[TAG_OPEN_STATE]->[0x0021] = {
  name => 'tag open !',
  state => MARKUP_DECLARATION_OPEN_STATE,
};
$Action->[TAG_OPEN_STATE]->[0x002F] = {
  name => 'tag open /',
  state => CLOSE_TAG_OPEN_STATE,
};
$Action->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'tag open uc',
  ct => {
    type => START_TAG_TOKEN,
    delta => 1,
    append_tag_name => 0x0020, # UC -> lc
  },
  state => TAG_NAME_STATE,
};
  $XMLAction->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
    name => 'tag open uc xml',
    ct => {
      type => START_TAG_TOKEN,
      delta => 1,
      append_tag_name => 0x0000,
    },
    state => TAG_NAME_STATE,
  };
$Action->[TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'tag open lc',
  ct => {
    type => START_TAG_TOKEN,
    delta => 1,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE,
};
$Action->[TAG_OPEN_STATE]->[0x003F] = {
  name => 'tag open ?',
  state => BOGUS_COMMENT_STATE,
  error => 'pio',
  error_delta => 1,
  ct => {
    type => COMMENT_TOKEN,
  },
  reconsume => 1, ## $self->{nc} is intentionally left as is
};
  $XMLAction->[TAG_OPEN_STATE]->[0x003F] = { # ?
    name => 'tag open ? xml',
    state => PI_STATE,
  };
$Action->[TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
$Action->[TAG_OPEN_STATE]->[0x003E] = { # >
  name => 'tag open else',
  error => 'bare stago',
  error_delta => 1,
  state => DATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_delta => 1,
};
$Action->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = $Action->[TAG_OPEN_STATE]->[0x003E];
  $XMLAction->[TAG_OPEN_STATE]->[0x0000] = {
    name => 'tag open null xml',
    ct => {
      type => START_TAG_TOKEN,
      delta => 1,
      append_tag_name => 0xFFFD,
    },
    error => 'NULL',
    state => TAG_NAME_STATE,
  };
  ## XML5: "<:" has a parse error.
  $XMLAction->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
    name => 'tag open else xml',
    ct => {
      type => START_TAG_TOKEN,
      delta => 1,
      append_tag_name => 0x0000,
    },
    state => TAG_NAME_STATE,
  };
$Action->[RCDATA_LT_STATE]->[0x002F] = {
  name => 'rcdata lt /',
  state => RCDATA_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[RAWTEXT_LT_STATE]->[0x002F] = {
  name => 'rawtext lt /',
  state => RAWTEXT_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x002F] = {
  name => 'script data lt /',
  state => SCRIPT_DATA_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[0x002F] = {
  name => 'script data escaped lt /',
  state => SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x0021] = {
  name => 'script data lt !',
  state => SCRIPT_DATA_ESCAPE_START_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<!',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'script data escaped lt uc',
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_data_append => 1,
  buffer => {clear => 1, append => 0x0020}, # UC -> lc
  state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'script data escaped lt lc',
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_data_append => 1,
  buffer => {clear => 1, append => 0x0000},
  state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
$Action->[RCDATA_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rcdata lt else',
  state => RCDATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[RAWTEXT_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rawtext lt else',
  state => RAWTEXT_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data lt else',
  state => SCRIPT_DATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escaped lt else',
  state => SCRIPT_DATA_ESCAPED_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
## XXX "End tag token" in latest HTML5 and in XML5.
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'end tag open uc',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0020, # UC -> lc
  },
  state => TAG_NAME_STATE,
};
  $XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
    name => 'end tag open uc xml',
    ct => {
      type => END_TAG_TOKEN,
      delta => 2,
      append_tag_name => 0x0000,
    },
    state => TAG_NAME_STATE,
  };
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'end tag open lc',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE,
};
$Action->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
  name => 'end tag open >',
  error => 'empty end tag',
  error_delta => 2, # "<" in "</>"
  state => DATA_STATE,
};

lib/HTML/HTML5/Parser/Tokenizer.pm  view on Meta::CPAN

                                  quirks => 1,
                                  line => $self->{line_prev},
                                  column => $self->{column_prev} - 7,
                                 };
        
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }
  
        redo A;
      } else {
                
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                        line => $self->{line_prev},
                        column => $self->{column_prev} - 1 - length $self->{kwd});
        $self->{state} = BOGUS_COMMENT_STATE;
        ## Reconsume.
        $self->{ct} = {type => COMMENT_TOKEN,
                                  data => $self->{kwd},
                                  line => $self->{line_prev},
                                  column => $self->{column_prev} - 1 - length $self->{kwd},
                                 };
        redo A;
      }
    } elsif ($state == MD_CDATA_STATE) {
      if ($nc == {
            '[' => 0x0043, # C
            '[C' => 0x0044, # D
            '[CD' => 0x0041, # A
            '[CDA' => 0x0054, # T
            '[CDAT' => 0x0041, # A
            '[CDATA' => NEVER_CHAR, # ([)
          }->{$self->{kwd}}) {
        
        ## Stay in the state.
        $self->{kwd} .= chr $nc;
        
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }
  
        redo A;
      } elsif ($self->{kwd} eq '[CDATA' and
               $nc == 0x005B) { # [
        if ($self->{is_xml} and 
            not $self->{tainted} and
            @{$self->{open_elements} or []} == 0) {
          
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
                          line => $self->{line_prev},
                          column => $self->{column_prev} - 7);
          $self->{tainted} = 1;
        } else {
          
        }

        $self->{ct} = {type => CHARACTER_TOKEN,
                                  data => '',
                                  line => $self->{line_prev},
                                  column => $self->{column_prev} - 7};
        $self->{state} = CDATA_SECTION_STATE;
        
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }
  
        redo A;
      } else {
        
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                        line => $self->{line_prev},
                        column => $self->{column_prev} - 1 - length $self->{kwd});
        $self->{state} = BOGUS_COMMENT_STATE;
        ## Reconsume.
        $self->{ct} = {type => COMMENT_TOKEN,
                                  data => $self->{kwd},
                                  line => $self->{line_prev},
                                  column => $self->{column_prev} - 1 - length $self->{kwd},
                                 };
        redo A;
      }
    } elsif ($state == COMMENT_START_STATE) {
      if ($nc == 0x002D) { # -
        
        $self->{state} = COMMENT_START_DASH_STATE;
        
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }
  
        redo A;
      } elsif ($nc == 0x003E) { # >
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
        if ($self->{in_subset}) {
          
          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
        } else {



( run in 0.825 second using v1.01-cache-2.11-cpan-172d661cebc )