HTML-Parser
view release on metacpan or search on metacpan
/*
* Copyright 1999-2016, Gisle Aas
* Copyright 1999-2000, Michael A. Chase
*
* This library is free software; you can redistribute it and/or
* modify it under the same terms as Perl itself.
*/
#ifndef EXTERN
#define EXTERN extern
#endif
#include "hctype.h" /* isH...() macros */
#include "tokenpos.h" /* dTOKEN; PUSH_TOKEN() */
/*
 * Elements whose start tag switches the parser into literal mode.
 *
 * After a start tag, the tag name is compared (lower-cased) against each
 * entry whose 'len' equals the tag length; on a match the parser's
 * literal_mode is set to 'str' and its is_cdata flag to 'is_cdata'.
 * The table is scanned until an entry with len == 0 is reached, so the
 * all-zero sentinel must stay last.
 *
 * Note: the storage-class specifier comes first ("static const"); placing
 * it after a qualifier is an obsolescent form in ISO C.
 */
static const
struct literal_tag {
    int len;        /* length of 'str', used as a cheap pre-check */
    char* str;      /* element name in lower case */
    int is_cdata;   /* value copied to p_state->is_cdata on a match */
}
literal_mode_elem[] =
{
    {6, "script", 1},
    {5, "style", 1},
    {3, "xmp", 1},
    {6, "iframe", 1},
    {9, "plaintext", 1},
    {5, "title", 0},
    {8, "textarea", 0},
    {0, 0, 0}       /* sentinel: terminates the scan */
};
/*
 * Byte codes that make up a compiled argspec string.  Each byte of the
 * string is cast back to one of these values when an event is reported.
 */
enum argcode {
    /* Numbering starts at 1: a 0 byte would terminate the argspec string. */
    ARG_SELF = 1,
    ARG_TOKENS,
    ARG_TOKENPOS,
    ARG_TOKEN0,
    ARG_TAGNAME,
    ARG_TAG,
    ARG_ATTR,
    ARG_ATTRARR,
    ARG_ATTRSEQ,
    ARG_TEXT,
    ARG_DTEXT,
    ARG_IS_CDATA,
    ARG_SKIPPED_TEXT,
    ARG_OFFSET,
    ARG_OFFSET_END,
    ARG_LENGTH,
    ARG_LINE,
    ARG_COLUMN,
    ARG_EVENT,
    ARG_UNDEF,

    /* Always keep ARG_LITERAL as the last real argument code.  In the
     * compiled string it is followed by a length byte and the literal's
     * bytes. */
    ARG_LITERAL,

    /* Extra flags; these are always encoded first in the argspec string. */
    ARG_FLAG_FLAT_ARRAY
};
/*
 * Textual argspec names, one per enum argcode value.
 *
 * The entries MUST stay in the same order as enum argcode.  Because
 * ARG_SELF is 1, the name at index i belongs to argcode i + 1.
 * ARG_LITERAL and ARG_FLAG_FLAT_ARRAY deliberately have no entry here
 * (ARG_LITERAL is not compared by name).
 */
static const char *const argname[] = {
    "self",             /* ARG_SELF         */
    "tokens",           /* ARG_TOKENS       */
    "tokenpos",         /* ARG_TOKENPOS     */
    "token0",           /* ARG_TOKEN0       */
    "tagname",          /* ARG_TAGNAME      */
    "tag",              /* ARG_TAG          */
    "attr",             /* ARG_ATTR         */
    "@attr",            /* ARG_ATTRARR      */
    "attrseq",          /* ARG_ATTRSEQ      */
    "text",             /* ARG_TEXT         */
    "dtext",            /* ARG_DTEXT        */
    "is_cdata",         /* ARG_IS_CDATA     */
    "skipped_text",     /* ARG_SKIPPED_TEXT */
    "offset",           /* ARG_OFFSET       */
    "offset_end",       /* ARG_OFFSET_END   */
    "length",           /* ARG_LENGTH       */
    "line",             /* ARG_LINE         */
    "column",           /* ARG_COLUMN       */
    "event",            /* ARG_EVENT        */
    "undef",            /* ARG_UNDEF        */
};
/*
 * Option predicates.  Each one is true when the parser is in xml_mode or
 * when the corresponding individual option is set, i.e. xml_mode implies
 * all three.
 */

/* Names keep their case; when false, tag and attribute names are
 * lower-cased before being reported. */
#define CASE_SENSITIVE(p_state) \
((p_state)->xml_mode || (p_state)->case_sensitive)
/* Tag names are scanned with the NAME_FIRST/NAME_CHAR character classes
 * instead of the looser "anything but space or '>'" class. */
#define STRICT_NAMES(p_state) \
((p_state)->xml_mode || (p_state)->strict_names)
/* A '/' directly before the closing '>' of a start tag marks an empty
 * element. */
#define ALLOW_EMPTY_TAG(p_state) \
((p_state)->xml_mode || (p_state)->empty_element_tags)
/* Forward declaration: report_event() calls flush_pending_text(), which
 * is defined further down. */
static void flush_pending_text(PSTATE* p_state, SV* self);
/*
* Parser functions.
*
* parse() - top level entry point.
* deals with text and calls one of its
* subordinate parse_*() routines after
* looking at the first char after "<"
* parse_decl() - deals with declarations <!...>
* parse_comment() - deals with <!-- ... -->
* parse_marked_section - deals with <![ ... [ ... ]]>
* parse_end() - deals with end tags </...>
* parse_start() - deals with start tags <A...>
* parse_process() - deals with processing instructions <?...>
* parse_null() - deals with anything else <....>
*
* report_event() - called whenever any of the parse*() routines
* has recognized something.
*/
static void
report_event(PSTATE* p_state,
event_id_t event,
char *beg, char *end, U32 utf8,
token_pos_t *tokens, int num_tokens,
SV* self
)
{
struct p_handler *h;
dTHX;
dSP;
AV *array;
STRLEN my_na;
char *argspec;
char *s;
STRLEN offset;
STRLEN line;
STRLEN column;
#define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b))
SvUTF8_off(tagname);
if (!CASE_SENSITIVE(p_state))
sv_lower(aTHX_ tagname);
if (p_state->ignoring_element) {
if (sv_eq(p_state->ignoring_element, tagname)) {
if (event == E_START)
p_state->ignore_depth++;
else if (--p_state->ignore_depth == 0) {
SvREFCNT_dec(p_state->ignoring_element);
p_state->ignoring_element = 0;
}
}
goto IGNORE_EVENT;
}
if (p_state->ignore_elements &&
hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0))
{
if (event == E_START) {
p_state->ignoring_element = newSVsv(tagname);
p_state->ignore_depth = 1;
}
goto IGNORE_EVENT;
}
if (p_state->ignore_tags &&
hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0))
{
goto IGNORE_EVENT;
}
if (p_state->report_tags &&
!hv_fetch_ent(p_state->report_tags, tagname, 0, 0))
{
goto IGNORE_EVENT;
}
}
else if (p_state->ignoring_element) {
goto IGNORE_EVENT;
}
}
h = &p_state->handlers[event];
if (!h->cb) {
/* event = E_DEFAULT; */
h = &p_state->handlers[E_DEFAULT];
if (!h->cb)
goto IGNORE_EVENT;
}
if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
/* FALSE scalar ('' or 0) means IGNORE this event */
return;
}
if (p_state->unbroken_text && event == E_TEXT) {
/* should buffer text */
if (!p_state->pend_text)
p_state->pend_text = newSV(256);
if (SvOK(p_state->pend_text)) {
if (p_state->is_cdata != p_state->pend_text_is_cdata) {
flush_pending_text(p_state, self);
SPAGAIN;
goto INIT_PEND_TEXT;
}
}
else {
INIT_PEND_TEXT:
p_state->pend_text_offset = offset;
p_state->pend_text_line = line;
p_state->pend_text_column = column;
p_state->pend_text_is_cdata = p_state->is_cdata;
sv_setpvs(p_state->pend_text, "");
if (!utf8)
SvUTF8_off(p_state->pend_text);
}
if (utf8 && !SvUTF8(p_state->pend_text))
sv_utf8_upgrade(p_state->pend_text);
if (utf8 || !SvUTF8(p_state->pend_text)) {
sv_catpvn(p_state->pend_text, beg, end - beg);
}
else {
SV *tmp = newSVpvn(beg, end - beg);
sv_utf8_upgrade(tmp);
sv_catsv(p_state->pend_text, tmp);
SvREFCNT_dec(tmp);
}
return;
}
else if (p_state->pend_text && SvOK(p_state->pend_text)) {
flush_pending_text(p_state, self);
SPAGAIN;
}
/* At this point we have decided to generate an event callback */
argspec = h->argspec ? SvPV(h->argspec, my_na) : "";
if (SvTYPE(h->cb) == SVt_PVAV) {
if (*argspec == ARG_FLAG_FLAT_ARRAY) {
argspec++;
array = (AV*)h->cb;
}
else {
/* start sub-array for accumulator array */
array = newAV();
}
}
else {
array = 0;
if (*argspec == ARG_FLAG_FLAT_ARRAY)
argspec++;
/* start argument stack for callback */
ENTER;
SAVETMPS;
PUSHMARK(SP);
}
for (s = argspec; *s; s++) {
SV* arg = 0;
int push_arg = 1;
enum argcode argcode = (enum argcode)*s;
switch( argcode ) {
case ARG_SELF:
arg = sv_mortalcopy(self);
break;
case ARG_TOKENS:
attrval = newSVsv(p_state->bool_attr_val);
else
attrval = newSVsv(attrname);
}
if (!CASE_SENSITIVE(p_state))
sv_lower(aTHX_ attrname);
if (argcode == ARG_ATTR) {
if (hv_exists_ent(hv, attrname, 0) ||
!hv_store_ent(hv, attrname, attrval, 0)) {
SvREFCNT_dec(attrval);
}
SvREFCNT_dec(attrname);
}
else { /* ARG_ATTRARR */
if (array) {
av_push(array, attrname);
av_push(array, attrval);
}
else {
mXPUSHs(attrname);
mXPUSHs(attrval);
}
}
}
}
else if (argcode == ARG_ATTRARR) {
push_arg = 0;
}
break;
case ARG_ATTRSEQ: /* (v2 compatibility stuff) */
if (event == E_START) {
AV* av = newAV();
int i;
for (i = 1; i < num_tokens; i += 2) {
SV* attrname = newSVpvn(tokens[i].beg,
tokens[i].end-tokens[i].beg);
if (utf8)
SvUTF8_on(attrname);
if (!CASE_SENSITIVE(p_state))
sv_lower(aTHX_ attrname);
av_push(av, attrname);
}
arg = sv_2mortal(newRV_noinc((SV*)av));
}
break;
case ARG_TEXT:
arg = sv_2mortal(newSVpvn(beg, end - beg));
if (utf8)
SvUTF8_on(arg);
break;
case ARG_DTEXT:
if (event == E_TEXT) {
arg = sv_2mortal(newSVpvn(beg, end - beg));
if (utf8)
SvUTF8_on(arg);
if (!p_state->is_cdata) {
if (p_state->utf8_mode) {
sv_utf8_decode(arg);
sv_utf8_upgrade(arg);
}
decode_entities(aTHX_ arg, p_state->entity2char, 1);
if (p_state->utf8_mode)
SvUTF8_off(arg);
}
}
break;
case ARG_IS_CDATA:
if (event == E_TEXT) {
arg = boolSV(p_state->is_cdata);
}
break;
case ARG_SKIPPED_TEXT:
arg = sv_2mortal(p_state->skipped_text);
p_state->skipped_text = newSVpvs("");
break;
case ARG_OFFSET:
arg = sv_2mortal(newSViv(offset));
break;
case ARG_OFFSET_END:
arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg)));
break;
case ARG_LENGTH:
arg = sv_2mortal(newSViv(CHR_DIST(end, beg)));
break;
case ARG_LINE:
arg = sv_2mortal(newSViv(line));
break;
case ARG_COLUMN:
arg = sv_2mortal(newSViv(column));
break;
case ARG_EVENT:
assert(event >= 0 && event < EVENT_COUNT);
arg = sv_2mortal(newSVpv(event_id_str[event], 0));
break;
case ARG_LITERAL:
{
int len = (unsigned char)s[1];
arg = sv_2mortal(newSVpvn(s+2, len));
if (SvUTF8(h->argspec))
SvUTF8_on(arg);
s += len + 1;
}
break;
case ARG_UNDEF:
arg = sv_mortalcopy(&PL_sv_undef);
break;
default:
arg = sv_2mortal(newSVpvf("Bad argspec %d", *s));
break;
}
if (push_arg) {
if (!arg)
arg = sv_mortalcopy(&PL_sv_undef);
if (array) {
/* have to fix mortality here or add mortality to
* XPUSHs after removing it from the switch cases.
*/
}
else if (*s == '"' || *s == '\'') {
char *string_beg = s;
s++;
while (s < end && *s != *string_beg && *s != '\\')
s++;
if (*s == *string_beg) {
/* literal */
int len = s - string_beg - 1;
unsigned char buf[2];
if (len > 255)
croak("Literal string is longer than 255 chars in argspec");
buf[0] = ARG_LITERAL;
buf[1] = len;
sv_catpvn(argspec, (char*)buf, 2);
sv_catpvn(argspec, string_beg+1, len);
s++;
}
else if (*s == '\\') {
croak("Backslash reserved for literal string in argspec");
}
else {
croak("Unterminated literal string in argspec");
}
}
else {
croak("Bad argspec (%s)", s);
}
while (isHSPACE(*s))
s++;
if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) {
/* end of '@{ ... }' */
s++;
while (isHSPACE(*s))
s++;
if (s < end)
croak("Bad argspec: stuff after @{...} (%s)", s);
}
if (s == end)
break;
if (*s != ',') {
croak("Missing comma separator in argspec");
}
s++;
while (isHSPACE(*s))
s++;
}
return argspec;
}
/*
 * Report the text accumulated in p_state->pend_text as a single E_TEXT
 * event and then mark the buffer as empty.
 *
 * The pending buffer must exist and be defined (asserted).  While the
 * event is being reported the parser state is temporarily rewritten:
 *   - unbroken_text and pend_text are cleared so that report_event()
 *     neither buffers the text again nor tries to flush recursively;
 *   - is_cdata, offset, line and column are replaced by the values that
 *     were recorded when buffering of this text started.
 * All six fields are put back afterwards.
 */
static void
flush_pending_text(PSTATE* p_state, SV* self)
{
    dTHX;

    /* Snapshot of everything we are about to override. */
    SV*    pending        = p_state->pend_text;
    bool   saved_unbroken = p_state->unbroken_text;
    bool   saved_is_cdata = p_state->is_cdata;
    STRLEN saved_offset   = p_state->offset;
    STRLEN saved_line     = p_state->line;
    STRLEN saved_column   = p_state->column;

    assert(pending && SvOK(pending));

    /* Disable buffering for the duration of the callback ... */
    p_state->pend_text     = 0;
    p_state->unbroken_text = 0;

    /* ... and present the state as it was when the text was first seen. */
    p_state->is_cdata = p_state->pend_text_is_cdata;
    p_state->offset   = p_state->pend_text_offset;
    p_state->line     = p_state->pend_text_line;
    p_state->column   = p_state->pend_text_column;

    report_event(p_state, E_TEXT,
                 SvPVX(pending), SvEND(pending),
                 SvUTF8(pending), 0, 0, self);

    /* An undefined pend_text means "nothing buffered". */
    SvOK_off(pending);

    /* Restore the live parser state. */
    p_state->column        = saved_column;
    p_state->line          = saved_line;
    p_state->offset        = saved_offset;
    p_state->is_cdata      = saved_is_cdata;
    p_state->pend_text     = pending;
    p_state->unbroken_text = saved_unbroken;
}
/*
 * Find the '>' that ends a tag, skipping over quoted strings.
 *
 * Tries to emulate the quote skipping behaviour observed in MSIE:
 * a '"' or '\'' opens a quoted string only when the preceding character
 * is a space or '=' (the position before 'beg' counts as a space), and
 * the string is closed by the next occurrence of the same quote char.
 *
 * Returns a pointer to the first '>' found outside a quoted string, or
 * 'end' if there is none in [beg, end).
 */
static char*
skip_until_gt(char *beg, char *end)
{
    char open_quote = '\0';   /* quote char we are inside of, or '\0' */
    char last = ' ';          /* character preceding *p */
    char *p;

    for (p = beg; p < end; last = *p++) {
        const char c = *p;

        if (c == '>' && !open_quote)
            return p;

        if (c != '"' && c != '\'')
            continue;

        if (c == open_quote)
            open_quote = '\0';          /* end of quoted string */
        else if (!open_quote && (last == ' ' || last == '='))
            open_quote = c;             /* start of quoted string */
    }
    return end;
}
static char*
parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg;
if (p_state->strict_comment) {
dTOKENS(4);
char *start_com = s; /* also used to signal inside/outside */
while (1) {
/* try to locate "--" */
FIND_DASH_DASH:
/* printf("find_dash_dash: [%s]\n", s); */
while (s < end && *s != '-' && *s != '>')
s++;
if (s == end) {
FREE_TOKENS;
return beg;
}
if (*s == '>') {
s++;
if (start_com)
goto FIND_DASH_DASH;
/* we are done recognizing all comments, make callbacks */
report_event(p_state, E_COMMENT,
beg - 4, s, utf8,
tokens, num_tokens,
self);
FREE_TOKENS;
/* try to locate /--\s*>/ which signals end-of-comment */
LOCATE_END:
while (s < end && *s != '-')
s++;
token.end = s;
if (s < end) {
s++;
if (*s == '-') {
s++;
while (isHSPACE(*s))
s++;
if (*s == '>') {
s++;
/* yup */
report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
return s;
}
}
if (s < end) {
s = token.end + 1;
goto LOCATE_END;
}
}
if (s == end)
return beg;
}
return 0;
}
#ifdef MARKED_SECTION
/*
 * Recompute p_state->ms and p_state->is_cdata from p_state->ms_stack.
 *
 * Every element of ms_stack is dereferenced as a reference to an array
 * of keyword strings.  Each keyword is mapped to a marked_section_t
 * value ("include", "rcdata", "cdata", "ignore"; anything else counts
 * as MS_NONE) and p_state->ms ends up as the greatest value found over
 * the whole stack, or MS_NONE when the stack is missing or empty.
 * is_cdata is then set to true exactly when the result is MS_CDATA.
 */
static void
marked_section_update(PSTATE* p_state)
{
    dTHX;
    AV* sections = p_state->ms_stack;

    p_state->ms = MS_NONE;

    if (sections) {
        int last_section = av_len(sections);
        int sec;

        for (sec = 0; sec <= last_section; sec++) {
            SV** entry = av_fetch(sections, sec, 0);
            AV* keywords;
            int last_keyword;
            int k;

            if (!entry)
                continue;

            keywords = (AV*)SvRV(*entry);
            last_keyword = av_len(keywords);
            assert(SvTYPE(keywords) == SVt_PVAV);

            for (k = 0; k <= last_keyword; k++) {
                SV** kw = av_fetch(keywords, k, 0);
                STRLEN kw_len;
                char *kw_str;
                enum marked_section_t kind;

                if (!kw)
                    continue;

                kw_str = SvPV(*kw, kw_len);
                kind = strEQ(kw_str, "include") ? MS_INCLUDE
                     : strEQ(kw_str, "rcdata")  ? MS_RCDATA
                     : strEQ(kw_str, "cdata")   ? MS_CDATA
                     : strEQ(kw_str, "ignore")  ? MS_IGNORE
                     :                            MS_NONE;

                /* keep the greatest value seen so far */
                if (kind > p_state->ms)
                    p_state->ms = kind;
            }
        }
    }

    p_state->is_cdata = (p_state->ms == MS_CDATA);
}
static char*
parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
dTHX;
char *s;
AV* tokens = 0;
if (!p_state->marked_sections)
return 0;
assert(beg[0] == '<');
assert(beg[1] == '!');
assert(beg[2] == '[');
s = beg + 3;
FIND_NAMES:
while (isHSPACE(*s))
s++;
while (isHNAME_FIRST(*s)) {
char *name_start = s;
char *name_end;
SV *name;
s++;
while (isHNAME_CHAR(*s))
s++;
name_end = s;
while (isHSPACE(*s))
s++;
if (s == end)
goto PREMATURE;
if (!tokens)
tokens = newAV();
name = newSVpvn(name_start, name_end - name_start);
if (utf8)
SvUTF8_on(name);
av_push(tokens, sv_lower(aTHX_ name));
}
if (*s == '-') {
s++;
if (*s == '-') {
/* comment */
s++;
while (1) {
while (s < end && *s != '-')
s++;
if (s == end)
goto PREMATURE;
s++; /* skip first '-' */
if (*s == '-') {
s++;
/* comment finished */
goto FIND_NAMES;
}
}
}
else {
char *word_start = s;
while (s < end && isHNOT_SPACE_GT(*s)) {
if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
if ((s + 1) == end)
goto PREMATURE;
if (*(s + 1) == '>')
break;
}
s++;
}
if (s == end)
goto PREMATURE;
PUSH_TOKEN(word_start, s);
}
while (isHSPACE(*s))
s++;
if (s == end)
goto PREMATURE;
}
else {
PUSH_TOKEN(0, 0); /* boolean attr value */
}
}
if (ALLOW_EMPTY_TAG(p_state) && *s == '/') {
s++;
if (s == end)
goto PREMATURE;
empty_tag = 1;
}
if (*s == '>') {
s++;
/* done */
report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self);
if (empty_tag) {
report_event(p_state, E_END, s, s, utf8, tokens, 1, self);
}
else if (!p_state->xml_mode) {
/* find out if this start tag should put us into literal_mode
*/
int i;
int tag_len = tokens[0].end - tokens[0].beg;
for (i = 0; literal_mode_elem[i].len; i++) {
if (tag_len == literal_mode_elem[i].len) {
/* try to match it */
char *s = beg + 1;
char *t = literal_mode_elem[i].str;
int len = tag_len;
while (len) {
if (toLOWER(*s) != *t)
break;
s++;
t++;
if (!--len) {
/* found it */
p_state->literal_mode = literal_mode_elem[i].str;
p_state->is_cdata = literal_mode_elem[i].is_cdata;
/* printf("Found %s\n", p_state->literal_mode); */
goto END_OF_LITERAL_SEARCH;
}
}
}
}
END_OF_LITERAL_SEARCH:
;
}
FREE_TOKENS;
return s;
}
FREE_TOKENS;
return 0;
PREMATURE:
FREE_TOKENS;
return beg;
}
static char*
parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg+2;
hctype_t name_first, name_char;
if (STRICT_NAMES(p_state)) {
name_first = HCTYPE_NAME_FIRST;
name_char = HCTYPE_NAME_CHAR;
}
else {
name_first = name_char = HCTYPE_NOT_SPACE_GT;
}
if (isHCTYPE(*s, name_first)) {
token_pos_t tagname;
tagname.beg = s;
s++;
while (s < end && isHCTYPE(*s, name_char))
s++;
tagname.end = s;
if (p_state->strict_end) {
while (isHSPACE(*s))
s++;
}
else {
s = skip_until_gt(s, end);
}
if (s < end) {
if (*s == '>') {
s++;
/* a complete end tag has been recognized */
report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self);
return s;
}
}
#include "pfunc.h" /* declares the parsefunc[] */
#endif /* USE_PFUNC */
static char*
parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
char *s = beg;
char *t = beg;
char *new_pos;
while (!p_state->eof) {
/*
* At the start of this loop we will always be ready for eating text
* or a new tag. We will never be inside some tag. The 't' points
* to where we started and the 's' is advanced as we go.
*/
while (p_state->literal_mode) {
char *l = p_state->literal_mode;
char *end_text;
while (s < end && *s != '<') {
s++;
}
if (s == end) {
s = t;
goto DONE;
}
end_text = s;
s++;
/* here we rely on '\0' termination of perl svpv buffers */
if (*s == '/') {
s++;
while (*l && toLOWER(*s) == *l) {
s++;
l++;
}
if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) {
/* matched it all */
token_pos_t end_token;
end_token.beg = end_text + 2;
end_token.end = s;
while (isHSPACE(*s))
s++;
if (*s == '>') {
s++;
if (t != end_text)
report_event(p_state, E_TEXT, t, end_text, utf8,
0, 0, self);
report_event(p_state, E_END, end_text, s, utf8,
&end_token, 1, self);
p_state->literal_mode = 0;
p_state->is_cdata = 0;
t = s;
}
}
}
}
#ifdef MARKED_SECTION
while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) {
while (s < end && *s != ']')
s++;
if (*s == ']') {
char *end_text = s;
s++;
if (*s == ']' && *(s + 1) == '>') {
s += 2;
/* marked section end */
if (t != end_text)
report_event(p_state, E_TEXT, t, end_text, utf8,
0, 0, self);
report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self);
t = s;
SvREFCNT_dec(av_pop(p_state->ms_stack));
marked_section_update(p_state);
continue;
}
}
if (s == end) {
s = t;
goto DONE;
}
}
#endif
/* first we try to match as much text as possible */
while (s < end && *s != '<') {
#ifdef MARKED_SECTION
if (p_state->ms && *s == ']') {
char *end_text = s;
s++;
if (*s == ']') {
s++;
if (*s == '>') {
s++;
report_event(p_state, E_TEXT, t, end_text, utf8,
0, 0, self);
report_event(p_state, E_NONE, end_text, s, utf8,
0, 0, self);
t = s;
SvREFCNT_dec(av_pop(p_state->ms_stack));
marked_section_update(p_state);
continue;
}
}
}
#endif
s++;
}
if (s != t) {
if (*s == '<') {
report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);
token_pos_t t;
char dummy;
t.beg = p_state->literal_mode;
t.end = p_state->literal_mode + strlen(p_state->literal_mode);
report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
}
else {
p_state->pending_end_tag = p_state->literal_mode;
}
p_state->literal_mode = 0;
s = parse_buf(aTHX_ p_state, s, end, utf8, self);
continue;
}
if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') {
p_state->no_dash_dash_comment_end = 1;
s = parse_buf(aTHX_ p_state, s, end, utf8, self);
continue;
}
if (!p_state->strict_comment && *s == '<') {
char *s1 = s + 1;
if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') {
/* some kind of unterminated markup. Report rest as as comment */
token_pos_t token;
token.beg = s + 1;
token.end = end;
report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
s = end;
}
}
break;
}
if (s < end) {
/* report rest as text */
report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
}
SvREFCNT_dec(p_state->buf);
p_state->buf = 0;
}
if (p_state->pend_text && SvOK(p_state->pend_text))
flush_pending_text(p_state, self);
if (p_state->ignoring_element) {
/* document not balanced */
SvREFCNT_dec(p_state->ignoring_element);
p_state->ignoring_element = 0;
}
report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);
/* reset state */
p_state->offset = 0;
if (p_state->line)
p_state->line = 1;
p_state->column = 0;
p_state->start_document = 0;
p_state->literal_mode = 0;
p_state->is_cdata = 0;
return;
}
if (p_state->utf8_mode)
sv_utf8_downgrade(chunk, 0);
if (p_state->buf && SvOK(p_state->buf)) {
sv_catsv(p_state->buf, chunk);
beg = SvPV(p_state->buf, len);
utf8 = SvUTF8(p_state->buf);
}
else {
beg = SvPV(chunk, len);
utf8 = SvUTF8(chunk);
if (p_state->offset == 0 && DOWARN) {
/* Print warnings if we find unexpected Unicode BOM forms */
if (p_state->argspec_entity_decode &&
!(p_state->attr_encoded && p_state->argspec_entity_decode == ARG_ATTR) &&
!p_state->utf8_mode && (
(!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) ||
(utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) ||
(!utf8 && probably_utf8_chunk(aTHX_ beg, len))
)
)
{
warn("Parsing of undecoded UTF-8 will give garbage when decoding entities");
}
if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) {
warn("Parsing string decoded with wrong endianness");
}
if (!utf8 && len >= 4 &&
(strnEQ(beg, "\x00\x00\xFE\xFF", 4) ||
strnEQ(beg, "\xFE\xFF\x00\x00", 4))
)
{
warn("Parsing of undecoded UTF-32");
}
else if (!utf8 && len >= 2 &&
(strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2))
)
{
warn("Parsing of undecoded UTF-16");
}
}
}
if (!len)
return; /* nothing to do */
end = beg + len;
s = parse_buf(aTHX_ p_state, beg, end, utf8, self);
if (s == end || p_state->eof) {
if (p_state->buf) {
SvOK_off(p_state->buf);
}
}
else {
/* need to keep rest in buffer */
if (p_state->buf) {
( run in 1.159 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )