HTML-Parser

 view release on metacpan or  search on metacpan

hparser.c  view on Meta::CPAN

/*
 * Copyright 1999-2016, Gisle Aas
 * Copyright 1999-2000, Michael A. Chase
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the same terms as Perl itself.
 */

#ifndef EXTERN
#define EXTERN extern
#endif

#include "hctype.h"    /* isH...() macros */
#include "tokenpos.h"  /* dTOKEN; PUSH_TOKEN() */


/*
 * Elements whose content is scanned in "literal mode".
 *
 * When parse_start() recognizes one of these start tags (and the parser
 * is not in xml_mode) it stores the matching 'str' in
 * p_state->literal_mode and copies 'is_cdata' to p_state->is_cdata;
 * parse_buf() then treats everything up to the corresponding end tag
 * as text.
 *
 *   len       length of 'str'; compared with the tag name length before
 *             the name itself is compared.  A 'len' of 0 ends the table.
 *   str       element name in lower case (the candidate tag name is
 *             lower-cased byte by byte before it is compared).
 *   is_cdata  value given to p_state->is_cdata while inside the element.
 *
 * 'str' is deliberately left as a non-const 'char *': the pointer is
 * assigned to 'char *' variables by the code that consults this table.
 *
 * The storage-class specifier is written first; putting it after a
 * qualifier ("const static") is an obsolescent form that compilers
 * warn about.
 */
static const
struct literal_tag {
    int len;
    char *str;
    int is_cdata;
}
literal_mode_elem[] =
{
    {6, "script", 1},
    {5, "style", 1},
    {3, "xmp", 1},
    {6, "iframe", 1},
    {9, "plaintext", 1},
    {5, "title", 0},
    {8, "textarea", 0},
    {0, 0, 0}
};

/*
 * One-byte codes that make up a compiled argspec string.
 *
 * report_event() walks the handler's compiled argspec one byte at a time
 * and produces one callback argument per code.  The codes start at 1
 * because the compiled string is NUL-terminated and scanned until '\0'.
 * When a code yields no value for the current event, undef is passed.
 *
 * The order of the codes must match the argname[] table.
 */
enum argcode {
    ARG_SELF = 1,  /* need to avoid '\0' in argspec string */
    ARG_TOKENS,
    ARG_TOKENPOS,
    ARG_TOKEN0,
    ARG_TAGNAME,
    ARG_TAG,
    ARG_ATTR,           /* attributes stored in a hash; the first value
			 * seen for a given name is the one kept */
    ARG_ATTRARR,        /* attribute names and values pushed as a flat list */
    ARG_ATTRSEQ,        /* E_START only: array ref of the attribute names */
    ARG_TEXT,           /* the event's source text, beg..end */
    ARG_DTEXT,          /* E_TEXT only: text with entities decoded unless
			 * p_state->is_cdata is set */
    ARG_IS_CDATA,       /* E_TEXT only: boolean copy of p_state->is_cdata */
    ARG_SKIPPED_TEXT,   /* hands over p_state->skipped_text and replaces it
			 * with a fresh empty string */
    ARG_OFFSET,
    ARG_OFFSET_END,     /* offset plus the length of beg..end */
    ARG_LENGTH,         /* length of beg..end (characters when utf8) */
    ARG_LINE,
    ARG_COLUMN,
    ARG_EVENT,          /* event name taken from event_id_str[] */
    ARG_UNDEF,          /* always undef */
    ARG_LITERAL, /* Always keep last */
			/* followed in the compiled argspec by one length
			 * byte and that many bytes of literal text */

    /* extra flags always encoded first */
    ARG_FLAG_FLAT_ARRAY /* only looked for in the first byte of a compiled
			 * argspec, where it is skipped before the codes
			 * are processed */
};

/*
 * Names accepted in an argspec, one per enum argcode value.
 *
 * Entry 0 belongs to ARG_SELF (whose value is 1), so the entry for a
 * given code sits at index (code - 1).  ARG_LITERAL and
 * ARG_FLAG_FLAT_ARRAY have no entry here, so the table must not be
 * indexed with them.
 */
static const char * const argname[] = {
    /* Must be in the same order as enum argcode */
    "self",     /* ARG_SELF */
    "tokens",   /* ARG_TOKENS */
    "tokenpos", /* ARG_TOKENPOS */
    "token0",   /* ARG_TOKEN0 */
    "tagname",  /* ARG_TAGNAME */
    "tag",      /* ARG_TAG */
    "attr",     /* ARG_ATTR */
    "@attr",    /* ARG_ATTRARR */
    "attrseq",  /* ARG_ATTRSEQ */
    "text",     /* ARG_TEXT */
    "dtext",    /* ARG_DTEXT */
    "is_cdata", /* ARG_IS_CDATA */
    "skipped_text", /* ARG_SKIPPED_TEXT */
    "offset",   /* ARG_OFFSET */
    "offset_end", /* ARG_OFFSET_END */
    "length",   /* ARG_LENGTH */
    "line",     /* ARG_LINE */
    "column",   /* ARG_COLUMN */
    "event",    /* ARG_EVENT */
    "undef",    /* ARG_UNDEF */
    /* ARG_LITERAL (not compared) */
    /* ARG_FLAG_FLAT_ARRAY */
};

/*
 * Effective parser options.  Each macro takes a PSTATE pointer and is
 * true when xml_mode is set or when the corresponding individual option
 * is set, i.e. xml_mode implies all three.
 */

/* When false, tag and attribute names are lower-cased before use. */
#define CASE_SENSITIVE(p_state) \
         ((p_state)->xml_mode || (p_state)->case_sensitive)
/* When true, tag names are restricted to the NAME_FIRST/NAME_CHAR
 * character classes instead of "anything but space or '>'". */
#define STRICT_NAMES(p_state) \
         ((p_state)->xml_mode || (p_state)->strict_names)
/* When true, a '/' directly before the closing '>' of a start tag marks
 * an empty element. */
#define ALLOW_EMPTY_TAG(p_state) \
         ((p_state)->xml_mode || (p_state)->empty_element_tags)

static void flush_pending_text(PSTATE* p_state, SV* self);

/*
 * Parser functions.
 *
 *   parse()                       - top level entry point.
 *                                   deals with text and calls one of its
 *                                   subordinate parse_*() routines after
 *                                   looking at the first char after "<"
 *     parse_decl()                - deals with declarations         <!...>
 *       parse_comment()           - deals with <!-- ... -->
 *       parse_marked_section      - deals with <![ ... [ ... ]]>
 *     parse_end()                 - deals with end tags             </...>
 *     parse_start()               - deals with start tags           <A...>
 *     parse_process()             - deals with process instructions <?...>
 *     parse_null()                - deals with anything else        <....>
 *
 *     report_event() - called whenever any of the parse*() routines
 *                      has recognized something.
 */

static void
report_event(PSTATE* p_state,
	     event_id_t event,
	     char *beg, char *end, U32 utf8,
	     token_pos_t *tokens, int num_tokens,
	     SV* self
	    )
{
    struct p_handler *h;
    dTHX;
    dSP;
    AV *array;
    STRLEN my_na;
    char *argspec;
    char *s;
    STRLEN offset;
    STRLEN line;
    STRLEN column;

    #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b))

hparser.c  view on Meta::CPAN

		SvUTF8_off(tagname);
	    if (!CASE_SENSITIVE(p_state))
		sv_lower(aTHX_ tagname);

	    if (p_state->ignoring_element) {
		if (sv_eq(p_state->ignoring_element, tagname)) {
		    if (event == E_START)
			p_state->ignore_depth++;
		    else if (--p_state->ignore_depth == 0) {
			SvREFCNT_dec(p_state->ignoring_element);
			p_state->ignoring_element = 0;
		    }
		}
		goto IGNORE_EVENT;
	    }

	    if (p_state->ignore_elements &&
		hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0))
	    {
		if (event == E_START) {
		    p_state->ignoring_element = newSVsv(tagname);
		    p_state->ignore_depth = 1;
		}
		goto IGNORE_EVENT;
	    }

	    if (p_state->ignore_tags &&
		hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0))
	    {
		goto IGNORE_EVENT;
	    }
	    if (p_state->report_tags &&
		!hv_fetch_ent(p_state->report_tags, tagname, 0, 0))
	    {
		goto IGNORE_EVENT;
	    }
	}
	else if (p_state->ignoring_element) {
	    goto IGNORE_EVENT;
	}
    }

    h = &p_state->handlers[event];
    if (!h->cb) {
	/* event = E_DEFAULT; */
	h = &p_state->handlers[E_DEFAULT];
	if (!h->cb)
	    goto IGNORE_EVENT;
    }

    if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
	/* FALSE scalar ('' or 0) means IGNORE this event */
	return;
    }

    if (p_state->unbroken_text && event == E_TEXT) {
	/* should buffer text */
	if (!p_state->pend_text)
	    p_state->pend_text = newSV(256);
	if (SvOK(p_state->pend_text)) {
	    if (p_state->is_cdata != p_state->pend_text_is_cdata) {
		flush_pending_text(p_state, self);
		SPAGAIN;
		goto INIT_PEND_TEXT;
	    }
	}
	else {
	INIT_PEND_TEXT:
	    p_state->pend_text_offset = offset;
	    p_state->pend_text_line = line;
	    p_state->pend_text_column = column;
	    p_state->pend_text_is_cdata = p_state->is_cdata;
	    sv_setpvs(p_state->pend_text, "");
	    if (!utf8)
		SvUTF8_off(p_state->pend_text);
	}
	if (utf8 && !SvUTF8(p_state->pend_text))
	    sv_utf8_upgrade(p_state->pend_text);
	if (utf8 || !SvUTF8(p_state->pend_text)) {
	    sv_catpvn(p_state->pend_text, beg, end - beg);
	}
	else {
	    SV *tmp = newSVpvn(beg, end - beg);
	    sv_utf8_upgrade(tmp);
	    sv_catsv(p_state->pend_text, tmp);
	    SvREFCNT_dec(tmp);
	}
	return;
    }
    else if (p_state->pend_text && SvOK(p_state->pend_text)) {
	flush_pending_text(p_state, self);
	SPAGAIN;
    }

    /* At this point we have decided to generate an event callback */

    argspec = h->argspec ? SvPV(h->argspec, my_na) : "";

    if (SvTYPE(h->cb) == SVt_PVAV) {

	if (*argspec == ARG_FLAG_FLAT_ARRAY) {
	    argspec++;
	    array = (AV*)h->cb;
	}
	else {
	    /* start sub-array for accumulator array */
	    array = newAV();
	}
    }
    else {
	array = 0;
	if (*argspec == ARG_FLAG_FLAT_ARRAY)
	    argspec++;

	/* start argument stack for callback */
	ENTER;
	SAVETMPS;
	PUSHMARK(SP);
    }

    for (s = argspec; *s; s++) {
	SV* arg = 0;
	int push_arg = 1;
	enum argcode argcode = (enum argcode)*s;

	switch( argcode ) {

	case ARG_SELF:
	    arg = sv_mortalcopy(self);
	    break;

	case ARG_TOKENS:

hparser.c  view on Meta::CPAN

			    attrval = newSVsv(p_state->bool_attr_val);
			else
			    attrval = newSVsv(attrname);
		    }

		    if (!CASE_SENSITIVE(p_state))
			sv_lower(aTHX_ attrname);

		    if (argcode == ARG_ATTR) {
			if (hv_exists_ent(hv, attrname, 0) ||
			    !hv_store_ent(hv, attrname, attrval, 0)) {
			    SvREFCNT_dec(attrval);
			}
			SvREFCNT_dec(attrname);
		    }
		    else { /* ARG_ATTRARR */
			if (array) {
			    av_push(array, attrname);
			    av_push(array, attrval);
			}
			else {
			    mXPUSHs(attrname);
			    mXPUSHs(attrval);
			}
		    }
		}
	    }
	    else if (argcode == ARG_ATTRARR) {
		push_arg = 0;
	    }
	    break;

	case ARG_ATTRSEQ:       /* (v2 compatibility stuff) */
	    if (event == E_START) {
		AV* av = newAV();
		int i;
		for (i = 1; i < num_tokens; i += 2) {
		    SV* attrname = newSVpvn(tokens[i].beg,
					    tokens[i].end-tokens[i].beg);
		    if (utf8)
			SvUTF8_on(attrname);
		    if (!CASE_SENSITIVE(p_state))
			sv_lower(aTHX_ attrname);
		    av_push(av, attrname);
		}
		arg = sv_2mortal(newRV_noinc((SV*)av));
	    }
	    break;

	case ARG_TEXT:
	    arg = sv_2mortal(newSVpvn(beg, end - beg));
	    if (utf8)
		SvUTF8_on(arg);
	    break;

	case ARG_DTEXT:
	    if (event == E_TEXT) {
		arg = sv_2mortal(newSVpvn(beg, end - beg));
		if (utf8)
		    SvUTF8_on(arg);
		if (!p_state->is_cdata) {
		    if (p_state->utf8_mode) {
			sv_utf8_decode(arg);
                        sv_utf8_upgrade(arg);
                    }
		    decode_entities(aTHX_ arg, p_state->entity2char, 1);
		    if (p_state->utf8_mode)
			SvUTF8_off(arg);
		}
	    }
	    break;

	case ARG_IS_CDATA:
	    if (event == E_TEXT) {
		arg = boolSV(p_state->is_cdata);
	    }
	    break;

        case ARG_SKIPPED_TEXT:
	    arg = sv_2mortal(p_state->skipped_text);
	    p_state->skipped_text = newSVpvs("");
            break;

	case ARG_OFFSET:
	    arg = sv_2mortal(newSViv(offset));
	    break;

	case ARG_OFFSET_END:
	    arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg)));
	    break;

	case ARG_LENGTH:
	    arg = sv_2mortal(newSViv(CHR_DIST(end, beg)));
	    break;

	case ARG_LINE:
	    arg = sv_2mortal(newSViv(line));
	    break;

	case ARG_COLUMN:
	    arg = sv_2mortal(newSViv(column));
	    break;

	case ARG_EVENT:
	    assert(event >= 0 && event < EVENT_COUNT);
	    arg = sv_2mortal(newSVpv(event_id_str[event], 0));
	    break;

	case ARG_LITERAL:
	{
	    int len = (unsigned char)s[1];
	    arg = sv_2mortal(newSVpvn(s+2, len));
	    if (SvUTF8(h->argspec))
		SvUTF8_on(arg);
	    s += len + 1;
	}
	break;

	case ARG_UNDEF:
	    arg = sv_mortalcopy(&PL_sv_undef);
	    break;

	default:
	    arg = sv_2mortal(newSVpvf("Bad argspec %d", *s));
	    break;
	}

	if (push_arg) {
	    if (!arg)
		arg = sv_mortalcopy(&PL_sv_undef);

	    if (array) {
		/* have to fix mortality here or add mortality to
		 * XPUSHs after removing it from the switch cases.
		 */

hparser.c  view on Meta::CPAN

	}
	else if (*s == '"' || *s == '\'') {
	    char *string_beg = s;
	    s++;
	    while (s < end && *s != *string_beg && *s != '\\')
		s++;
	    if (*s == *string_beg) {
		/* literal */
		int len = s - string_beg - 1;
		unsigned char buf[2];
		if (len > 255)
		    croak("Literal string is longer than 255 chars in argspec");
		buf[0] = ARG_LITERAL;
		buf[1] = len;
		sv_catpvn(argspec, (char*)buf, 2);
		sv_catpvn(argspec, string_beg+1, len);
		s++;
	    }
	    else if (*s == '\\') {
		croak("Backslash reserved for literal string in argspec");
	    }
	    else {
		croak("Unterminated literal string in argspec");
	    }
	}
	else {
	    croak("Bad argspec (%s)", s);
	}

	while (isHSPACE(*s))
	    s++;

	if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) {
	    /* end of '@{ ... }' */
	    s++;
	    while (isHSPACE(*s))
		s++;
	    if (s < end)
		croak("Bad argspec: stuff after @{...} (%s)", s);
	}

	if (s == end)
	    break;
	if (*s != ',') {
	    croak("Missing comma separator in argspec");
	}
	s++;
	while (isHSPACE(*s))
	    s++;
    }
    return argspec;
}


/*
 * Deliver the text buffered in p_state->pend_text as a single E_TEXT
 * event.
 *
 * The caller must make sure that pend_text exists and is defined.
 *
 * While report_event() runs, the parser state is made to look as it did
 * when buffering of this text started (offset, line, column, is_cdata).
 * unbroken_text and pend_text are cleared for the duration of the call
 * so that report_event() generates the callback instead of buffering
 * the text again or trying to flush it a second time.  Afterwards the
 * buffer is marked undefined (it is kept for reuse) and every field
 * that was overridden gets its previous value back.
 */
static void
flush_pending_text(PSTATE* p_state, SV* self)
{
    dTHX;
    /* values to put back once the event has been reported */
    SV*    pending        = p_state->pend_text;
    STRLEN saved_offset   = p_state->offset;
    STRLEN saved_line     = p_state->line;
    STRLEN saved_column   = p_state->column;
    bool   saved_is_cdata = p_state->is_cdata;
    bool   saved_unbroken = p_state->unbroken_text;

    assert(p_state->pend_text && SvOK(p_state->pend_text));

    /* switch to the state recorded when the pending text began */
    p_state->offset        = p_state->pend_text_offset;
    p_state->line          = p_state->pend_text_line;
    p_state->column        = p_state->pend_text_column;
    p_state->is_cdata      = p_state->pend_text_is_cdata;
    p_state->pend_text     = 0;
    p_state->unbroken_text = 0;

    report_event(p_state, E_TEXT,
		 SvPVX(pending), SvEND(pending),
		 SvUTF8(pending), 0, 0, self);

    /* nothing pending any more; the SV itself is reused later */
    SvOK_off(pending);

    p_state->unbroken_text = saved_unbroken;
    p_state->pend_text     = pending;
    p_state->is_cdata      = saved_is_cdata;
    p_state->offset        = saved_offset;
    p_state->line          = saved_line;
    p_state->column        = saved_column;
}

/*
 * Find the '>' that terminates a tag, skipping over quoted strings the
 * way MSIE was observed to do.
 *
 * A '"' or '\'' opens a quoted string only when the preceding character
 * is a space or '=' (the position before 'beg' counts as a space).  The
 * string is closed by the next occurrence of the same quote character.
 * A '>' inside a quoted string is not a terminator.
 *
 * Returns a pointer to the terminating '>' within [beg, end), or 'end'
 * when there is none.
 */
static char*
skip_until_gt(char *beg, char *end)
{
    char *p;
    char open_quote = '\0';   /* quote char of the string we are inside */
    char last = ' ';          /* character seen just before *p */

    for (p = beg; p < end; last = *p++) {
	switch (*p) {
	case '>':
	    if (open_quote == '\0')
		return p;
	    break;

	case '"':
	case '\'':
	    if (open_quote == *p)
		open_quote = '\0';      /* closing quote */
	    else if (open_quote == '\0' && (last == ' ' || last == '='))
		open_quote = *p;        /* opening quote */
	    break;

	default:
	    break;
	}
    }
    return end;
}

static char*
parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *s = beg;

    if (p_state->strict_comment) {
	dTOKENS(4);
	char *start_com = s;  /* also used to signal inside/outside */

	while (1) {
	    /* try to locate "--" */
	FIND_DASH_DASH:
	    /* printf("find_dash_dash: [%s]\n", s); */
	    while (s < end && *s != '-' && *s != '>')
		s++;

	    if (s == end) {
		FREE_TOKENS;
		return beg;
	    }

	    if (*s == '>') {
		s++;
		if (start_com)
		    goto FIND_DASH_DASH;

		/* we are done recognizing all comments, make callbacks */
		report_event(p_state, E_COMMENT,
			     beg - 4, s, utf8,
			     tokens, num_tokens,
			     self);
		FREE_TOKENS;

hparser.c  view on Meta::CPAN

	/* try to locate /--\s*>/ which signals end-of-comment */
    LOCATE_END:
	while (s < end && *s != '-')
	    s++;
	token.end = s;
	if (s < end) {
	    s++;
	    if (*s == '-') {
		s++;
		while (isHSPACE(*s))
		    s++;
		if (*s == '>') {
		    s++;
		    /* yup */
		    report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self);
		    return s;
		}
	    }
	    if (s < end) {
		s = token.end + 1;
		goto LOCATE_END;
	    }
	}

	if (s == end)
	    return beg;
    }

    return 0;
}


#ifdef MARKED_SECTION

/*
 * Recompute p_state->ms and p_state->is_cdata from p_state->ms_stack.
 *
 * Every element of ms_stack is expected to be a reference to an array of
 * marked section keywords.  Each keyword is mapped to a marked_section_t
 * value ("include", "rcdata", "cdata", "ignore"; anything else maps to
 * MS_NONE) and the largest value found anywhere on the stack becomes
 * p_state->ms.  An absent or empty stack gives MS_NONE.
 *
 * The keywords are compared case-sensitively against lower-case strings;
 * parse_marked_section() lower-cases the names before it stores them.
 *
 * NOTE(review): taking the maximum relies on the marked_section_t
 * enumerators being declared in order of increasing precedence; the
 * declaration is not in this file -- confirm there.
 */
static void
marked_section_update(PSTATE* p_state)
{
    dTHX;
    /* we look at p_state->ms_stack to determine p_state->ms */
    AV* ms_stack = p_state->ms_stack;
    p_state->ms = MS_NONE;

    if (ms_stack) {
	/* av_len() returns the highest index (-1 when empty); keep its
	 * full width instead of narrowing it to int */
	SSize_t stack_len = av_len(ms_stack);
	SSize_t stack_idx;
	for (stack_idx = 0; stack_idx <= stack_len; stack_idx++) {
	    SV** entry = av_fetch(ms_stack, stack_idx, 0);
	    if (entry) {
		AV* tokens = (AV*)SvRV(*entry);
		SSize_t tokens_len;
		SSize_t i;
		/* verify that this is an array before it is used as one */
		assert(SvTYPE(tokens) == SVt_PVAV);
		tokens_len = av_len(tokens);
		for (i = 0; i <= tokens_len; i++) {
		    SV** token_svp = av_fetch(tokens, i, 0);
		    if (token_svp) {
			STRLEN len;
			char *token_str = SvPV(*token_svp, len);
			enum marked_section_t token;
			if (strEQ(token_str, "include"))
			    token = MS_INCLUDE;
			else if (strEQ(token_str, "rcdata"))
			    token = MS_RCDATA;
			else if (strEQ(token_str, "cdata"))
			    token = MS_CDATA;
			else if (strEQ(token_str, "ignore"))
			    token = MS_IGNORE;
			else
			    token = MS_NONE;
			/* strongest keyword seen so far wins */
			if (p_state->ms < token)
			    p_state->ms = token;
		    }
		}
	    }
	}
    }
    /* printf("MS %d\n", p_state->ms); */
    p_state->is_cdata = (p_state->ms == MS_CDATA);
    return;
}


static char*
parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    dTHX;
    char *s;
    AV* tokens = 0;

    if (!p_state->marked_sections)
	return 0;

    assert(beg[0] == '<');
    assert(beg[1] == '!');
    assert(beg[2] == '[');
    s = beg + 3;

FIND_NAMES:
    while (isHSPACE(*s))
	s++;
    while (isHNAME_FIRST(*s)) {
	char *name_start = s;
	char *name_end;
	SV *name;
	s++;
	while (isHNAME_CHAR(*s))
	    s++;
	name_end = s;
	while (isHSPACE(*s))
	    s++;
	if (s == end)
	    goto PREMATURE;

	if (!tokens)
	    tokens = newAV();
	name = newSVpvn(name_start, name_end - name_start);
	if (utf8)
	    SvUTF8_on(name);
	av_push(tokens, sv_lower(aTHX_ name));
    }
    if (*s == '-') {
	s++;
	if (*s == '-') {
	    /* comment */
	    s++;
	    while (1) {
		while (s < end && *s != '-')
		    s++;
		if (s == end)
		    goto PREMATURE;

		s++;  /* skip first '-' */
		if (*s == '-') {
		    s++;
		    /* comment finished */
		    goto FIND_NAMES;
		}
	    }

hparser.c  view on Meta::CPAN

	    }
	    else {
		char *word_start = s;
		while (s < end && isHNOT_SPACE_GT(*s)) {
		    if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) {
			if ((s + 1) == end)
			    goto PREMATURE;
			if (*(s + 1) == '>')
			    break;
		    }
		    s++;
		}
		if (s == end)
		    goto PREMATURE;
		PUSH_TOKEN(word_start, s);
	    }
	    while (isHSPACE(*s))
		s++;
	    if (s == end)
		goto PREMATURE;
	}
	else {
	    PUSH_TOKEN(0, 0); /* boolean attr value */
	}
    }

    if (ALLOW_EMPTY_TAG(p_state) && *s == '/') {
	s++;
	if (s == end)
	    goto PREMATURE;
	empty_tag = 1;
    }

    if (*s == '>') {
	s++;
	/* done */
	report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self);
	if (empty_tag) {
	    report_event(p_state, E_END, s, s, utf8, tokens, 1, self);
	}
	else if (!p_state->xml_mode) {
	    /* find out if this start tag should put us into literal_mode
	     */
	    int i;
	    int tag_len = tokens[0].end - tokens[0].beg;

	    for (i = 0; literal_mode_elem[i].len; i++) {
		if (tag_len == literal_mode_elem[i].len) {
		    /* try to match it */
		    char *s = beg + 1;
		    char *t = literal_mode_elem[i].str;
		    int len = tag_len;
		    while (len) {
			if (toLOWER(*s) != *t)
			    break;
			s++;
			t++;
			if (!--len) {
			    /* found it */
			    p_state->literal_mode = literal_mode_elem[i].str;
			    p_state->is_cdata = literal_mode_elem[i].is_cdata;
			    /* printf("Found %s\n", p_state->literal_mode); */
			    goto END_OF_LITERAL_SEARCH;
			}
		    }
		}
	    }
	END_OF_LITERAL_SEARCH:
	    ;
	}

	FREE_TOKENS;
	return s;
    }

    FREE_TOKENS;
    return 0;

PREMATURE:
    FREE_TOKENS;
    return beg;
}


static char*
parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *s = beg+2;
    hctype_t name_first, name_char;

    if (STRICT_NAMES(p_state)) {
	name_first = HCTYPE_NAME_FIRST;
	name_char  = HCTYPE_NAME_CHAR;
    }
    else {
	name_first = name_char = HCTYPE_NOT_SPACE_GT;
    }

    if (isHCTYPE(*s, name_first)) {
	token_pos_t tagname;
	tagname.beg = s;
	s++;
	while (s < end && isHCTYPE(*s, name_char))
	    s++;
	tagname.end = s;

	if (p_state->strict_end) {
	    while (isHSPACE(*s))
		s++;
	}
	else {
	    s = skip_until_gt(s, end);
	}
	if (s < end) {
	    if (*s == '>') {
		s++;
		/* a complete end tag has been recognized */
		report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self);
		return s;
	    }
	}

hparser.c  view on Meta::CPAN




#include "pfunc.h"                   /* declares the parsefunc[] */
#endif /* USE_PFUNC */

static char*
parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    char *s = beg;
    char *t = beg;
    char *new_pos;

    while (!p_state->eof) {
	/*
	 * At the start of this loop we will always be ready for eating text
	 * or a new tag.  We will never be inside some tag.  The 't' points
	 * to where we started and the 's' is advanced as we go.
	 */

	while (p_state->literal_mode) {
	    char *l = p_state->literal_mode;
	    char *end_text;

	    while (s < end && *s != '<') {
		s++;
	    }

	    if (s == end) {
		s = t;
		goto DONE;
	    }

	    end_text = s;
	    s++;

	    /* here we rely on '\0' termination of perl svpv buffers */
	    if (*s == '/') {
		s++;
		while (*l && toLOWER(*s) == *l) {
		    s++;
		    l++;
		}

		if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) {
		    /* matched it all */
		    token_pos_t end_token;
		    end_token.beg = end_text + 2;
		    end_token.end = s;

		    while (isHSPACE(*s))
			s++;
		    if (*s == '>') {
			s++;
			if (t != end_text)
			    report_event(p_state, E_TEXT, t, end_text, utf8,
					 0, 0, self);
			report_event(p_state, E_END,  end_text, s, utf8,
				     &end_token, 1, self);
			p_state->literal_mode = 0;
			p_state->is_cdata = 0;
			t = s;
		    }
		}
	    }
	}

#ifdef MARKED_SECTION
	while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) {
	    while (s < end && *s != ']')
		s++;
	    if (*s == ']') {
		char *end_text = s;
		s++;
		if (*s == ']' && *(s + 1) == '>') {
		    s += 2;
		    /* marked section end */
		    if (t != end_text)
			report_event(p_state, E_TEXT, t, end_text, utf8,
				     0, 0, self);
		    report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self);
		    t = s;
		    SvREFCNT_dec(av_pop(p_state->ms_stack));
		    marked_section_update(p_state);
		    continue;
		}
	    }
	    if (s == end) {
		s = t;
		goto DONE;
	    }
	}
#endif

	/* first we try to match as much text as possible */
	while (s < end && *s != '<') {
#ifdef MARKED_SECTION
	    if (p_state->ms && *s == ']') {
		char *end_text = s;
		s++;
		if (*s == ']') {
		    s++;
		    if (*s == '>') {
			s++;
			report_event(p_state, E_TEXT, t, end_text, utf8,
				     0, 0, self);
			report_event(p_state, E_NONE, end_text, s, utf8,
				     0, 0, self);
			t = s;
			SvREFCNT_dec(av_pop(p_state->ms_stack));
			marked_section_update(p_state);
			continue;
		    }
		}
	    }
#endif
	    s++;
	}
	if (s != t) {
	    if (*s == '<') {
		report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self);

hparser.c  view on Meta::CPAN

			token_pos_t t;
			char dummy;
			t.beg = p_state->literal_mode;
			t.end = p_state->literal_mode + strlen(p_state->literal_mode);
			report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
		    }
		    else {
			p_state->pending_end_tag = p_state->literal_mode;
		    }
		    p_state->literal_mode = 0;
		    s = parse_buf(aTHX_ p_state, s, end, utf8, self);
		    continue;
		}

		if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') {
		    p_state->no_dash_dash_comment_end = 1;
		    s = parse_buf(aTHX_ p_state, s, end, utf8, self);
		    continue;
		}

		if (!p_state->strict_comment && *s == '<') {
		    char *s1 = s + 1;
		    if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') {
			/* some kind of unterminated markup.  Report rest as as comment */
			token_pos_t token;
			token.beg = s + 1;
			token.end = end;
			report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
			s = end;
		    }
		}

		break;
	    }

	    if (s < end) {
		/* report rest as text */
		report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
	    }

	    SvREFCNT_dec(p_state->buf);
	    p_state->buf = 0;
	}
	if (p_state->pend_text && SvOK(p_state->pend_text))
	    flush_pending_text(p_state, self);

	if (p_state->ignoring_element) {
	    /* document not balanced */
	    SvREFCNT_dec(p_state->ignoring_element);
	    p_state->ignoring_element = 0;
	}
	report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);

	/* reset state */
	p_state->offset = 0;
	if (p_state->line)
	    p_state->line = 1;
	p_state->column = 0;
	p_state->start_document = 0;
	p_state->literal_mode = 0;
	p_state->is_cdata = 0;
	return;
    }

    if (p_state->utf8_mode)
	sv_utf8_downgrade(chunk, 0);

    if (p_state->buf && SvOK(p_state->buf)) {
	sv_catsv(p_state->buf, chunk);
	beg = SvPV(p_state->buf, len);
	utf8 = SvUTF8(p_state->buf);
    }
    else {
	beg = SvPV(chunk, len);
	utf8 = SvUTF8(chunk);
	if (p_state->offset == 0 && DOWARN) {
	    /* Print warnings if we find unexpected Unicode BOM forms */
	    if (p_state->argspec_entity_decode &&
		!(p_state->attr_encoded && p_state->argspec_entity_decode == ARG_ATTR) &&
		!p_state->utf8_mode && (
                 (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) ||
		 (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) ||
		 (!utf8 && probably_utf8_chunk(aTHX_ beg, len))
		)
	       )
	    {
		warn("Parsing of undecoded UTF-8 will give garbage when decoding entities");
	    }
	    if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) {
		warn("Parsing string decoded with wrong endianness");
	    }
	    if (!utf8 && len >= 4 &&
		(strnEQ(beg, "\x00\x00\xFE\xFF", 4) ||
		 strnEQ(beg, "\xFE\xFF\x00\x00", 4))
		)
	    {
		warn("Parsing of undecoded UTF-32");
	    }
	    else if (!utf8 && len >= 2 &&
		     (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2))
		)
	    {
		warn("Parsing of undecoded UTF-16");
	    }
	}
    }

    if (!len)
	return; /* nothing to do */

    end = beg + len;
    s = parse_buf(aTHX_ p_state, beg, end, utf8, self);

    if (s == end || p_state->eof) {
	if (p_state->buf) {
	    SvOK_off(p_state->buf);
	}
    }
    else {
	/* need to keep rest in buffer */
	if (p_state->buf) {



( run in 1.159 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )