HTML-Parser

 view release on metacpan or  search on metacpan

hparser.c  view on Meta::CPAN

		    s = parse_buf(aTHX_ p_state, s, end, utf8, self);
		    continue;
		}

		if (!p_state->strict_comment && *s == '<') {
		    char *s1 = s + 1;
		    if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') {
			/* some kind of unterminated markup.  Report rest as a comment */
			token_pos_t token;
			token.beg = s + 1;
			token.end = end;
			report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self);
			s = end;
		    }
		}

		break;
	    }

	    if (s < end) {
		/* report rest as text */
		report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self);
	    }

	    SvREFCNT_dec(p_state->buf);
	    p_state->buf = 0;
	}
	if (p_state->pend_text && SvOK(p_state->pend_text))
	    flush_pending_text(p_state, self);

	if (p_state->ignoring_element) {
	    /* document not balanced */
	    SvREFCNT_dec(p_state->ignoring_element);
	    p_state->ignoring_element = 0;
	}
	report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);

	/* reset state */
	p_state->offset = 0;
	if (p_state->line)
	    p_state->line = 1;
	p_state->column = 0;
	p_state->start_document = 0;
	p_state->literal_mode = 0;
	p_state->is_cdata = 0;
	return;
    }

    if (p_state->utf8_mode)
	sv_utf8_downgrade(chunk, 0);

    if (p_state->buf && SvOK(p_state->buf)) {
	sv_catsv(p_state->buf, chunk);
	beg = SvPV(p_state->buf, len);
	utf8 = SvUTF8(p_state->buf);
    }
    else {
	beg = SvPV(chunk, len);
	utf8 = SvUTF8(chunk);
	if (p_state->offset == 0 && DOWARN) {
	    /* Print warnings if we find unexpected Unicode BOM forms */
	    if (p_state->argspec_entity_decode &&
		!(p_state->attr_encoded && p_state->argspec_entity_decode == ARG_ATTR) &&
		!p_state->utf8_mode && (
                 (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) ||
		 (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) ||
		 (!utf8 && probably_utf8_chunk(aTHX_ beg, len))
		)
	       )
	    {
		warn("Parsing of undecoded UTF-8 will give garbage when decoding entities");
	    }
	    if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) {
		warn("Parsing string decoded with wrong endianness");
	    }
	    if (!utf8 && len >= 4 &&
		(strnEQ(beg, "\x00\x00\xFE\xFF", 4) ||
		 strnEQ(beg, "\xFE\xFF\x00\x00", 4))
		)
	    {
		warn("Parsing of undecoded UTF-32");
	    }
	    else if (!utf8 && len >= 2 &&
		     (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2))
		)
	    {
		warn("Parsing of undecoded UTF-16");
	    }
	}
    }

    if (!len)
	return; /* nothing to do */

    end = beg + len;
    s = parse_buf(aTHX_ p_state, beg, end, utf8, self);

    if (s == end || p_state->eof) {
	if (p_state->buf) {
	    SvOK_off(p_state->buf);
	}
    }
    else {
	/* need to keep rest in buffer */
	if (p_state->buf) {
	    /* chop off some chars at the beginning */
	    if (SvOK(p_state->buf)) {
		sv_chop(p_state->buf, s);
	    }
	    else {
		sv_setpvn(p_state->buf, s, end - s);
		if (utf8)
		    SvUTF8_on(p_state->buf);
		else
		    SvUTF8_off(p_state->buf);
	    }
	}
	else {
	    p_state->buf = newSVpv(s, end - s);
	    if (utf8)
		SvUTF8_on(p_state->buf);



( run in 0.506 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )