HTML-Parser
view release on metacpan or search on metacpan
s = parse_buf(aTHX_ p_state, s, end, utf8, self);
continue;
}
if (!p_state->strict_comment && *s == '<') {
char *s1 = s + 1;
if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') {
/* some kind of unterminated markup. Report rest as a comment */
token_pos_t token;
token.beg = s + 1;
token.end = end;
report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
s = end;
}
}
break;
}
if (s < end) {
/* report rest as text */
report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
}
SvREFCNT_dec(p_state->buf);
p_state->buf = 0;
}
if (p_state->pend_text && SvOK(p_state->pend_text))
flush_pending_text(p_state, self);
if (p_state->ignoring_element) {
/* document not balanced */
SvREFCNT_dec(p_state->ignoring_element);
p_state->ignoring_element = 0;
}
report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);
/* reset state */
p_state->offset = 0;
if (p_state->line)
p_state->line = 1;
p_state->column = 0;
p_state->start_document = 0;
p_state->literal_mode = 0;
p_state->is_cdata = 0;
return;
}
if (p_state->utf8_mode)
sv_utf8_downgrade(chunk, 0);
if (p_state->buf && SvOK(p_state->buf)) {
sv_catsv(p_state->buf, chunk);
beg = SvPV(p_state->buf, len);
utf8 = SvUTF8(p_state->buf);
}
else {
beg = SvPV(chunk, len);
utf8 = SvUTF8(chunk);
if (p_state->offset == 0 && DOWARN) {
/* Print warnings if we find unexpected Unicode BOM forms */
if (p_state->argspec_entity_decode &&
!(p_state->attr_encoded && p_state->argspec_entity_decode == ARG_ATTR) &&
!p_state->utf8_mode && (
(!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) ||
(utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) ||
(!utf8 && probably_utf8_chunk(aTHX_ beg, len))
)
)
{
warn("Parsing of undecoded UTF-8 will give garbage when decoding entities");
}
if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) {
warn("Parsing string decoded with wrong endianness");
}
if (!utf8 && len >= 4 &&
(strnEQ(beg, "\x00\x00\xFE\xFF", 4) ||
strnEQ(beg, "\xFE\xFF\x00\x00", 4))
)
{
warn("Parsing of undecoded UTF-32");
}
else if (!utf8 && len >= 2 &&
(strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2))
)
{
warn("Parsing of undecoded UTF-16");
}
}
}
if (!len)
return; /* nothing to do */
end = beg + len;
s = parse_buf(aTHX_ p_state, beg, end, utf8, self);
if (s == end || p_state->eof) {
if (p_state->buf) {
SvOK_off(p_state->buf);
}
}
else {
/* need to keep rest in buffer */
if (p_state->buf) {
/* chop off some chars at the beginning */
if (SvOK(p_state->buf)) {
sv_chop(p_state->buf, s);
}
else {
sv_setpvn(p_state->buf, s, end - s);
if (utf8)
SvUTF8_on(p_state->buf);
else
SvUTF8_off(p_state->buf);
}
}
else {
p_state->buf = newSVpv(s, end - s);
if (utf8)
SvUTF8_on(p_state->buf);
( run in 0.506 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )