HTML-Parser

 view release on metacpan or  search on metacpan

Changes  view on Meta::CPAN

  * The offset/line/column counters were not properly reset
     after eof.

3.23     2001-05-01
  * The $p->ignore_elements filter did not work as it should if
     handlers for start/end events were not registered.

3.22     2001-04-17
  * The <textarea> element is now parsed in literal mode, i.e. no other tags
     are recognized until the </textarea> tag is seen.  Unlike other literal
     elements, the text content is not 'cdata'.
  * The XML &apos; entity is decoded.  The apos-char itself is still encoded as
     &#39; as &apos; is not really an HTML entity, and not recognized by many HTML
     browsers.

3.21     2001-04-10
  * Fix a memory leak which occurred when using filter methods.
  * Avoid a few compiler warnings (DEC C):
        - Trailing comma found in enumerator list
        - "unsigned char" is not compatible with "const char".
  * Doc update.

Changes  view on Meta::CPAN

  * Michael once again fixed my bad English in the HTML::Parser
    documentation.
  * netscape_buggy_comment will carp instead of warn
  * updated TODO/README
  * Documented that HTML::Filter is deprecated.
  * Made backslash reserved in literal argspec strings.
  * Added several new test scripts.

2.99_94     1999-12-08
  * (should almost be a 3.00 candidate)
  * Renamed 'cdata_flag' as 'is_cdata'.
  * Dropped support for wrapping callback handler and argspec
    in an array and passing a reference to $p->handler.  It
    created ambiguities when you want to pass an array as
    handler destination and not update argspec.  The wrapping
    for constructor arguments is unchanged.
  * Reworked the documentation after updates from Michael.
  * Simplified internal check_handler().  It should probably simply
    be inlined in handler() again.
  * Added argspec 'length' and 'undef'
  * Fix statement-less label.  Fix suggested by Matthew Langford

Parser.xs  view on Meta::CPAN


    pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params));
    pstate2->offset = pstate->offset;
    pstate2->line = pstate->line;
    pstate2->column = pstate->column;
    pstate2->start_document = pstate->start_document;
    pstate2->parsing = pstate->parsing;
    pstate2->eof = pstate->eof;

    pstate2->literal_mode = pstate->literal_mode;
    pstate2->is_cdata = pstate->is_cdata;
    pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end;
    pstate2->pending_end_tag = pstate->pending_end_tag;

    /* Clone the unbroken_text buffer along with the cdata flag and the
     * offset/line/column recorded for the start of the buffered text.
     * Every pend_text_* field is copied from its own counterpart, so the
     * cloned state reports the same position for its pending text. */
    pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params));
    pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata;
    pstate2->pend_text_offset = pstate->pend_text_offset;
    pstate2->pend_text_line = pstate->pend_text_line;
    pstate2->pend_text_column = pstate->pend_text_column;

    pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params));

#ifdef MARKED_SECTION
    pstate2->ms = pstate->ms;
    pstate2->ms_stack =
	(AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params));

README  view on Meta::CPAN

        entities for characters outside the range 0..255 are left unchanged.

        This passes undef except for "text" events.

    "event"
        Event causes the event name to be passed.

        The event name is one of "text", "start", "end", "declaration",
        "comment", "process", "start_document" or "end_document".

    "is_cdata"
        Is_cdata causes a TRUE value to be passed if the event is inside a
        CDATA section or between literal start and end tags ("script",
        "style", "xmp", "iframe" and "plaintext"). Text inside "title" and
        "textarea" is also parsed in literal mode, but it is not flagged as
        CDATA.

        If the flag is FALSE for a text event, then you should normally
        either use "dtext" or decode the entities yourself before the text
        is processed further.

    "length"
        Length causes the number of bytes of the source text of the event to
        be passed.

README  view on Meta::CPAN


VERSION 2 COMPATIBILITY
    When an "HTML::Parser" object is constructed with no arguments, a set of
    handlers is automatically provided that is compatible with the old
    HTML::Parser version 2 callback methods.

    This is equivalent to the following method calls:

        $p->handler(start   => "start",   "self, tagname, attr, attrseq, text");
        $p->handler(end     => "end",     "self, tagname, text");
        $p->handler(text    => "text",    "self, text, is_cdata");
        $p->handler(process => "process", "self, token0, text");
        $p->handler(
            comment => sub {
                my ($self, $tokens) = @_;
                for (@$tokens) { $self->comment($_); }
            },
            "self, tokens"
        );
        $p->handler(
            declaration => sub {

hparser.c  view on Meta::CPAN

#endif

#include "hctype.h"    /* isH...() macros */
#include "tokenpos.h"  /* dTOKEN; PUSH_TOKEN() */


const static
struct literal_tag {
    int len;
    char* str;
    int is_cdata;
}
literal_mode_elem[] =
{
    {6, "script", 1},
    {5, "style", 1},
    {3, "xmp", 1},
    {6, "iframe", 1},
    {9, "plaintext", 1},
    {5, "title", 0},
    {8, "textarea", 0},

hparser.c  view on Meta::CPAN

    "tokens",   /* ARG_TOKENS */
    "tokenpos", /* ARG_TOKENPOS */
    "token0",   /* ARG_TOKEN0 */
    "tagname",  /* ARG_TAGNAME */
    "tag",      /* ARG_TAG */
    "attr",     /* ARG_ATTR */
    "@attr",    /* ARG_ATTRARR */
    "attrseq",  /* ARG_ATTRSEQ */
    "text",     /* ARG_TEXT */
    "dtext",    /* ARG_DTEXT */
    "is_cdata", /* ARG_IS_CDATA */
    "skipped_text", /* ARG_SKIPPED_TEXT */
    "offset",   /* ARG_OFFSET */
    "offset_end", /* ARG_OFFSET_END */
    "length",   /* ARG_LENGTH */
    "line",     /* ARG_LINE */
    "column",   /* ARG_COLUMN */
    "event",    /* ARG_EVENT */
    "undef",    /* ARG_UNDEF */
    /* ARG_LITERAL (not compared) */
    /* ARG_FLAG_FLAT_ARRAY */

hparser.c  view on Meta::CPAN

    if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
	/* FALSE scalar ('' or 0) means IGNORE this event */
	return;
    }

    if (p_state->unbroken_text && event == E_TEXT) {
	/* should buffer text */
	if (!p_state->pend_text)
	    p_state->pend_text = newSV(256);
	if (SvOK(p_state->pend_text)) {
	    if (p_state->is_cdata != p_state->pend_text_is_cdata) {
		flush_pending_text(p_state, self);
		SPAGAIN;
		goto INIT_PEND_TEXT;
	    }
	}
	else {
	INIT_PEND_TEXT:
	    p_state->pend_text_offset = offset;
	    p_state->pend_text_line = line;
	    p_state->pend_text_column = column;
	    p_state->pend_text_is_cdata = p_state->is_cdata;
	    sv_setpvs(p_state->pend_text, "");
	    if (!utf8)
		SvUTF8_off(p_state->pend_text);
	}
	if (utf8 && !SvUTF8(p_state->pend_text))
	    sv_utf8_upgrade(p_state->pend_text);
	if (utf8 || !SvUTF8(p_state->pend_text)) {
	    sv_catpvn(p_state->pend_text, beg, end - beg);
	}
	else {

hparser.c  view on Meta::CPAN

	    arg = sv_2mortal(newSVpvn(beg, end - beg));
	    if (utf8)
		SvUTF8_on(arg);
	    break;

	case ARG_DTEXT:
	    if (event == E_TEXT) {
		arg = sv_2mortal(newSVpvn(beg, end - beg));
		if (utf8)
		    SvUTF8_on(arg);
		if (!p_state->is_cdata) {
		    if (p_state->utf8_mode) {
			sv_utf8_decode(arg);
                        sv_utf8_upgrade(arg);
                    }
		    decode_entities(aTHX_ arg, p_state->entity2char, 1);
		    if (p_state->utf8_mode)
			SvUTF8_off(arg);
		}
	    }
	    break;

	case ARG_IS_CDATA:
	    if (event == E_TEXT) {
		arg = boolSV(p_state->is_cdata);
	    }
	    break;

        case ARG_SKIPPED_TEXT:
	    arg = sv_2mortal(p_state->skipped_text);
	    p_state->skipped_text = newSVpvs("");
            break;

	case ARG_OFFSET:
	    arg = sv_2mortal(newSViv(offset));

hparser.c  view on Meta::CPAN

    return argspec;
}


/*
 * Report the text accumulated in p_state->pend_text as a single E_TEXT
 * event and then mark the buffer as holding nothing.
 *
 * The event is reported with the cdata flag and the offset/line/column
 * that were recorded in the pend_text_* fields, not with the parser's
 * current values.  The current values are therefore stashed, overridden
 * for the duration of the report_event() call, and put back afterwards.
 * unbroken_text is cleared and pend_text detached while the event is
 * reported; both are reinstated before returning.
 *
 * The caller must ensure that pend_text exists and is defined.
 */
static void
flush_pending_text(PSTATE* p_state, SV* self)
{
    dTHX;
    SV*    pending;
    bool   saved_unbroken_text;
    bool   saved_is_cdata;
    STRLEN saved_offset;
    STRLEN saved_line;
    STRLEN saved_column;

    assert(p_state->pend_text && SvOK(p_state->pend_text));

    /* stash the live parser state */
    pending             = p_state->pend_text;
    saved_unbroken_text = p_state->unbroken_text;
    saved_is_cdata      = p_state->is_cdata;
    saved_offset        = p_state->offset;
    saved_line          = p_state->line;
    saved_column        = p_state->column;

    /* present the state recorded for the buffered text */
    p_state->pend_text     = 0;
    p_state->unbroken_text = 0;
    p_state->is_cdata      = p_state->pend_text_is_cdata;
    p_state->offset        = p_state->pend_text_offset;
    p_state->line          = p_state->pend_text_line;
    p_state->column        = p_state->pend_text_column;

    report_event(p_state, E_TEXT,
                 SvPVX(pending), SvEND(pending),
                 SvUTF8(pending), 0, 0, self);

    /* an undefined buffer is what signals "no pending text" */
    SvOK_off(pending);

    /* reinstate the live parser state */
    p_state->pend_text     = pending;
    p_state->unbroken_text = saved_unbroken_text;
    p_state->is_cdata      = saved_is_cdata;
    p_state->offset        = saved_offset;
    p_state->line          = saved_line;
    p_state->column        = saved_column;
}

static char*
skip_until_gt(char *beg, char *end)
{
    /* tries to emulate quote skipping behaviour observed in MSIE */
    char *s = beg;

hparser.c  view on Meta::CPAN

		int i;
		assert(SvTYPE(tokens) == SVt_PVAV);
		for (i = 0; i <= tokens_len; i++) {
		    SV** svp = av_fetch(tokens, i, 0);
		    if (svp) {
			STRLEN len;
			char *token_str = SvPV(*svp, len);
			enum marked_section_t token;
			if (strEQ(token_str, "include"))
			    token = MS_INCLUDE;
			else if (strEQ(token_str, "rcdata"))
			    token = MS_RCDATA;
			else if (strEQ(token_str, "cdata"))
			    token = MS_CDATA;
			else if (strEQ(token_str, "ignore"))
			    token = MS_IGNORE;
			else
			    token = MS_NONE;
			if (p_state->ms < token)
			    p_state->ms = token;
		    }
		}
	    }
	}
    }
    /* printf("MS %d\n", p_state->ms); */
    p_state->is_cdata = (p_state->ms == MS_CDATA);
    return;
}


static char*
parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
    dTHX;
    char *s;
    AV* tokens = 0;

hparser.c  view on Meta::CPAN

		    char *t = literal_mode_elem[i].str;
		    int len = tag_len;
		    while (len) {
			if (toLOWER(*s) != *t)
			    break;
			s++;
			t++;
			if (!--len) {
			    /* found it */
			    p_state->literal_mode = literal_mode_elem[i].str;
			    p_state->is_cdata = literal_mode_elem[i].is_cdata;
			    /* printf("Found %s\n", p_state->literal_mode); */
			    goto END_OF_LITERAL_SEARCH;
			}
		    }
		}
	    }
	END_OF_LITERAL_SEARCH:
	    ;
	}

hparser.c  view on Meta::CPAN

		    while (isHSPACE(*s))
			s++;
		    if (*s == '>') {
			s++;
			if (t != end_text)
			    report_event(p_state, E_TEXT, t, end_text, utf8,
					 0, 0, self);
			report_event(p_state, E_END,  end_text, s, utf8,
				     &end_token, 1, self);
			p_state->literal_mode = 0;
			p_state->is_cdata = 0;
			t = s;
		    }
		}
	    }
	}

#ifdef MARKED_SECTION
	while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) {
	    while (s < end && *s != ']')
		s++;

hparser.c  view on Meta::CPAN

	}
	report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);

	/* reset state */
	p_state->offset = 0;
	if (p_state->line)
	    p_state->line = 1;
	p_state->column = 0;
	p_state->start_document = 0;
	p_state->literal_mode = 0;
	p_state->is_cdata = 0;
	return;
    }

    if (p_state->utf8_mode)
	sv_utf8_downgrade(chunk, 0);

    if (p_state->buf && SvOK(p_state->buf)) {
	sv_catsv(p_state->buf, chunk);
	beg = SvPV(p_state->buf, len);
	utf8 = SvUTF8(p_state->buf);

hparser.h  view on Meta::CPAN

    SV* buf;
    STRLEN offset;
    STRLEN line;
    STRLEN column;
    bool start_document;
    bool parsing;
    bool eof;

    /* special parsing modes */
    char* literal_mode;
    bool  is_cdata;
    bool  no_dash_dash_comment_end;
    char *pending_end_tag;

    /* unbroken_text option needs a buffer of pending text */
    SV*    pend_text;
    bool   pend_text_is_cdata;
    STRLEN pend_text_offset;
    STRLEN pend_text_line;
    STRLEN pend_text_column;

    /* skipped text is accumulated here */
    SV* skipped_text;

#ifdef MARKED_SECTION
    /* marked section support */
    enum marked_section_t ms;

lib/HTML/Parser.pm  view on Meta::CPAN

    my %arg = @_;
    my $api_version = delete $arg{api_version} || (@_ ? 3 : 2);
    if ($api_version >= 4) {
	require Carp;
	Carp::croak("API version $api_version not supported " .
		    "by HTML::Parser $VERSION");
    }

    if ($api_version < 3) {
	# Set up method callbacks compatible with HTML-Parser-2.xx
	$self->handler(text    => "text",    "self,text,is_cdata");
	$self->handler(end     => "end",     "self,tagname,text");
	$self->handler(process => "process", "self,token0,text");
	$self->handler(start   => "start",
		                  "self,tagname,attr,attrseq,text");

	$self->handler(comment =>
		       sub {
			   my($self, $tokens) = @_;
			   for (@$tokens) {
			       $self->comment($_);

lib/HTML/Parser.pm  view on Meta::CPAN


This passes undef except for C<text> events.

=item C<event>

Event causes the event name to be passed.

The event name is one of C<text>, C<start>, C<end>, C<declaration>,
C<comment>, C<process>, C<start_document> or C<end_document>.

=item C<is_cdata>

Is_cdata causes a TRUE value to be passed if the event is inside a CDATA
section or between literal start and end tags (C<script>,
C<style>, C<xmp>, C<iframe> and C<plaintext>).  Text inside C<title> and
C<textarea> is also parsed in literal mode, but it is not flagged as CDATA.

If the flag is FALSE for a text event, then you should normally
either use C<dtext> or decode the entities yourself before the text is
processed further.

=item C<length>

Length causes the number of bytes of the source text of the event to

lib/HTML/Parser.pm  view on Meta::CPAN

=head1 VERSION 2 COMPATIBILITY

When an C<HTML::Parser> object is constructed with no arguments, a set
of handlers is automatically provided that is compatible with the old
HTML::Parser version 2 callback methods.

This is equivalent to the following method calls:

    $p->handler(start   => "start",   "self, tagname, attr, attrseq, text");
    $p->handler(end     => "end",     "self, tagname, text");
    $p->handler(text    => "text",    "self, text, is_cdata");
    $p->handler(process => "process", "self, token0, text");
    $p->handler(
        comment => sub {
            my ($self, $tokens) = @_;
            for (@$tokens) { $self->comment($_); }
        },
        "self, tokens"
    );
    $p->handler(
        declaration => sub {

lib/HTML/TokeParser.pm  view on Meta::CPAN

our $VERSION = '3.83';

use Carp ();
use HTML::Entities qw(decode_entities);
use HTML::Tagset ();

my %ARGS =
(
 start       => "'S',tagname,attr,attrseq,text",
 end         => "'E',tagname,text",
 text        => "'T',text,is_cdata",
 process     => "'PI',token0,text",
 comment     => "'C',text",
 declaration => "'D',text",

 # options that default on
 unbroken_text => 1,
);


sub new

t/argspec.t  view on Meta::CPAN

my $decl  = '<!ENTITY nbsp   CDATA "&#160;" -- no-break space -->';
my $com1  = '<!-- Comment -->';
my $com2  = '<!-- Comment -- -- Comment -->';
my $start = '<a href="foo">';
my $end   = '</a>';
my $empty = "<IMG SRC='foo'/>";
my $proc  = '<? something completely different ?>';

my @argspec = qw(
    self offset length event tagname tag token0 text
    is_cdata dtext tokens tokenpos attr attrseq
);

my @result;
my $p = HTML::Parser->new(
    default_h      => [\@result, join(',', @argspec)],
    strict_comment => 1,
    xml_mode       => 1
);

my @tests = (    # string, expected results

t/marked-sect.t  view on Meta::CPAN

4.3:49 text "\n"
5.4:54 text "\nINCLUDE\nSTUFF\n"
8.3:72 text "\n.."
9.2:75 start "<h1>"
9.6:79 text "Test"
9.10:83 end "</h1>"
9.15:88 text "\n"
10.0:89 end_document ""
EOT

    my $doc    = "<Tag><![CDATA[This is cdata]]></Tag>";
    my $result = "";
    $p = HTML::Parser->new(
        marked_sections => 1,
        handlers        => {
            default => [sub { $result .= join("", @_); }, "skipped_text,text"]
        }
    )->parse($doc)->eof;
    is($doc, $result);

    $text = "";

t/plaintext.t  view on Meta::CPAN

use strict;
use warnings;

use HTML::Parser ();
use Test::More tests => 3;

my @data;
my $p = HTML::Parser->new(api_version => 3);
$p->handler(default => \@data, '@{event, text, is_cdata}');
$p->parse(<<EOT)->eof;
<xmp><foo></xmp>x<plaintext><foo>
</plaintext>
foo
EOT

for (@data) {
    $_ = "" unless defined;
}

t/textarea.t  view on Meta::CPAN

# Accumulates one line per call to tdump().
my $dump = "";

# Append the arguments to $dump as one "|"-separated, newline-terminated
# record.  Undefined arguments are shown as "<undef>" and embedded
# newlines as a literal backslash-n.  The arguments are copied first, so
# the caller's values are left untouched.
sub tdump {
   my @fields = map {
      my $field = defined($_) ? $_ : "<undef>";
      $field =~ s/\n/\\n/g;
      $field;
   } @_;
   $dump .= join("|", @fields) . "\n";
}

my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]);
$p->parse($html)->eof;

#diag $dump;

is($dump, <<'EOT');
start_document||<undef>|<undef>
start|<html>|<undef>|<undef>
text|\n|\n|
start|<title>|<undef>|<undef>
text|This is a <nice> title|This is a <nice> title|

t/unbroken-text.t  view on Meta::CPAN

use strict;
use warnings;

use HTML::Parser ();
use Test::More tests => 3;

# Accumulates a trace of every event the handlers see.
my $text = "";

# Text handler: appends "[KIND:offset:line.column:text]" to $text, where
# KIND is "CDATA" when the first argument is true and "TEXT" otherwise.
sub text {
    my ($is_cdata, $offset, $line, $col, $t) = @_;
    my $kind = $is_cdata ? "CDATA" : "TEXT";
    $text .= "[$kind:$offset:$line.$col:$t]";
}

# Tag handler: appends its single argument (the tag's source text) to $text.
sub tag {
    my ($markup) = @_;
    $text .= $markup;
}

my $p = HTML::Parser->new(
    unbroken_text => 1,
    text_h        => [\&text, "is_cdata,offset,line,column,text"],
    start_h       => [\&tag, "text"],
    end_h         => [\&tag, "text"],
);

$p->parse("foo ");
$p->parse("bar ");
$p->parse("<foo>");
$p->parse("bar\n");
$p->parse("</foo>");
$p->parse("<xmp>xmp</xmp>");

t/unbroken-text.t  view on Meta::CPAN


$text = "";
$p->eof;

#diag $text;
is($text, "[TEXT:37:2.20:atend]");


$p = HTML::Parser->new(
    unbroken_text => 1,
    text_h        => [\&text, "is_cdata,offset,line,column,text"],
);

$text = "";
$p->parse("foo");
$p->parse("<foo");
$p->parse(">bar\n");
$p->parse("foo<xm");
$p->parse("p>xmp");
$p->parse("</xmp");
$p->parse(">bar");



( run in 0.705 second using v1.01-cache-2.11-cpan-454fe037f31 )