HTML-Parser
view release on metacpan or search on metacpan
* The offset/line/column counters were not properly reset
after eof.
3.23 2001-05-01
* The $p->ignore_elements filter did not work as it should if
handlers for start/end events were not registered.
3.22 2001-04-17
* The <textarea> element is now parsed in literal mode, i.e. no other tags
recognized until the </textarea> tag is seen. Unlike other literal elements,
the text content is not 'cdata'.
* The XML &apos; entity is decoded. The apos-char itself is still encoded as
&#39; as &apos; is not really an HTML entity, and not recognized by many HTML
browsers.
3.21 2001-04-10
* Fix a memory leak which occurred when using filter methods.
* Avoid a few compiler warnings (DEC C):
- Trailing comma found in enumerator list
- "unsigned char" is not compatible with "const char".
* Doc update.
* Michael once again fixed my bad English in the HTML::Parser
documentation.
* netscape_buggy_comment will carp instead of warn
* updated TODO/README
* Documented that HTML::Filter is deprecated.
* Made backslash reserved in literal argspec strings.
* Added several new test scripts.
2.99_94 1999-12-08
* (should almost be a 3.00 candidate)
* Renamed 'cdata_flag' as 'is_cdata'.
* Dropped support for wrapping callback handler and argspec
in an array and passing a reference to $p->handler. It
created ambiguities when you want to pass an array as
handler destination and not update argspec. The wrapping
for constructor arguments is unchanged.
* Reworked the documentation after updates from Michael.
* Simplified internal check_handler(). It should probably simply
be inlined in handler() again.
* Added argspec 'length' and 'undef'
* Fix statement-less label. Fix suggested by Matthew Langford
pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params));
pstate2->offset = pstate->offset;
pstate2->line = pstate->line;
pstate2->column = pstate->column;
pstate2->start_document = pstate->start_document;
pstate2->parsing = pstate->parsing;
pstate2->eof = pstate->eof;
pstate2->literal_mode = pstate->literal_mode;
pstate2->is_cdata = pstate->is_cdata;
pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end;
pstate2->pending_end_tag = pstate->pending_end_tag;
/* Duplicate the pending-text buffer together with the cdata flag and
 * position recorded for it.  Every pend_text_* field is copied from the
 * same-named field of the source state; in particular the line number
 * must come from pend_text_line, not from pend_text_offset. */
pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params));
pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata;
pstate2->pend_text_offset = pstate->pend_text_offset;
pstate2->pend_text_line = pstate->pend_text_line;
pstate2->pend_text_column = pstate->pend_text_column;
pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params));
#ifdef MARKED_SECTION
pstate2->ms = pstate->ms;
pstate2->ms_stack =
(AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params));
entities for characters outside the range 0..255 are left unchanged.
This passes undef except for "text" events.
"event"
Event causes the event name to be passed.
The event name is one of "text", "start", "end", "declaration",
"comment", "process", "start_document" or "end_document".
"is_cdata"
Is_cdata causes a TRUE value to be passed if the event is inside a
CDATA section or between literal start and end tags ("script",
"style", "xmp", "iframe", "title", "textarea" and "plaintext").
If the flag is FALSE for a text event, then you should normally
either use "dtext" or decode the entities yourself before the text
is processed further.
"length"
Length causes the number of bytes of the source text of the event to
be passed.
VERSION 2 COMPATIBILITY
When an "HTML::Parser" object is constructed with no arguments, a set of
handlers is automatically provided that is compatible with the old
HTML::Parser version 2 callback methods.
This is equivalent to the following method calls:
$p->handler(start => "start", "self, tagname, attr, attrseq, text");
$p->handler(end => "end", "self, tagname, text");
$p->handler(text => "text", "self, text, is_cdata");
$p->handler(process => "process", "self, token0, text");
$p->handler(
comment => sub {
my ($self, $tokens) = @_;
for (@$tokens) { $self->comment($_); }
},
"self, tokens"
);
$p->handler(
declaration => sub {
#endif
#include "hctype.h" /* isH...() macros */
#include "tokenpos.h" /* dTOKEN; PUSH_TOKEN() */
const static
struct literal_tag {
int len;
char* str;
int is_cdata;
}
literal_mode_elem[] =
{
{6, "script", 1},
{5, "style", 1},
{3, "xmp", 1},
{6, "iframe", 1},
{9, "plaintext", 1},
{5, "title", 0},
{8, "textarea", 0},
"tokens", /* ARG_TOKENS */
"tokenpos", /* ARG_TOKENPOS */
"token0", /* ARG_TOKEN0 */
"tagname", /* ARG_TAGNAME */
"tag", /* ARG_TAG */
"attr", /* ARG_ATTR */
"@attr", /* ARG_ATTRARR */
"attrseq", /* ARG_ATTRSEQ */
"text", /* ARG_TEXT */
"dtext", /* ARG_DTEXT */
"is_cdata", /* ARG_IS_CDATA */
"skipped_text", /* ARG_SKIPPED_TEXT */
"offset", /* ARG_OFFSET */
"offset_end", /* ARG_OFFSET_END */
"length", /* ARG_LENGTH */
"line", /* ARG_LINE */
"column", /* ARG_COLUMN */
"event", /* ARG_EVENT */
"undef", /* ARG_UNDEF */
/* ARG_LITERAL (not compared) */
/* ARG_FLAG_FLAT_ARRAY */
if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
/* FALSE scalar ('' or 0) means IGNORE this event */
return;
}
if (p_state->unbroken_text && event == E_TEXT) {
/* should buffer text */
if (!p_state->pend_text)
p_state->pend_text = newSV(256);
if (SvOK(p_state->pend_text)) {
if (p_state->is_cdata != p_state->pend_text_is_cdata) {
flush_pending_text(p_state, self);
SPAGAIN;
goto INIT_PEND_TEXT;
}
}
else {
INIT_PEND_TEXT:
p_state->pend_text_offset = offset;
p_state->pend_text_line = line;
p_state->pend_text_column = column;
p_state->pend_text_is_cdata = p_state->is_cdata;
sv_setpvs(p_state->pend_text, "");
if (!utf8)
SvUTF8_off(p_state->pend_text);
}
if (utf8 && !SvUTF8(p_state->pend_text))
sv_utf8_upgrade(p_state->pend_text);
if (utf8 || !SvUTF8(p_state->pend_text)) {
sv_catpvn(p_state->pend_text, beg, end - beg);
}
else {
arg = sv_2mortal(newSVpvn(beg, end - beg));
if (utf8)
SvUTF8_on(arg);
break;
case ARG_DTEXT:
if (event == E_TEXT) {
arg = sv_2mortal(newSVpvn(beg, end - beg));
if (utf8)
SvUTF8_on(arg);
if (!p_state->is_cdata) {
if (p_state->utf8_mode) {
sv_utf8_decode(arg);
sv_utf8_upgrade(arg);
}
decode_entities(aTHX_ arg, p_state->entity2char, 1);
if (p_state->utf8_mode)
SvUTF8_off(arg);
}
}
break;
case ARG_IS_CDATA:
if (event == E_TEXT) {
arg = boolSV(p_state->is_cdata);
}
break;
case ARG_SKIPPED_TEXT:
arg = sv_2mortal(p_state->skipped_text);
p_state->skipped_text = newSVpvs("");
break;
case ARG_OFFSET:
arg = sv_2mortal(newSViv(offset));
return argspec;
}
/*
 * Report the buffered (pending) text as a single E_TEXT event.
 *
 * For the duration of the report the parser state is switched to the
 * values that were recorded for the pending text (cdata flag, offset,
 * line and column), unbroken_text is cleared and pend_text is detached
 * from the state -- presumably so that report_event() does not buffer
 * the text a second time.  Afterwards the buffer is marked undefined
 * (its SV is kept for reuse) and the previous state is put back.
 */
static void
flush_pending_text(PSTATE* p_state, SV* self)
{
    dTHX;
    SV*    pending        = p_state->pend_text;
    bool   saved_unbroken = p_state->unbroken_text;
    bool   saved_is_cdata = p_state->is_cdata;
    STRLEN saved_offset   = p_state->offset;
    STRLEN saved_line     = p_state->line;
    STRLEN saved_column   = p_state->column;

    assert(pending && SvOK(pending));

    /* Switch to the state that was recorded for the pending text. */
    p_state->pend_text     = 0;
    p_state->unbroken_text = 0;
    p_state->is_cdata      = p_state->pend_text_is_cdata;
    p_state->offset        = p_state->pend_text_offset;
    p_state->line          = p_state->pend_text_line;
    p_state->column        = p_state->pend_text_column;

    report_event(p_state, E_TEXT,
                 SvPVX(pending), SvEND(pending),
                 SvUTF8(pending), 0, 0, self);

    /* Mark the buffer as empty (undefined) but keep it allocated. */
    SvOK_off(pending);

    /* Put the caller's state back. */
    p_state->column        = saved_column;
    p_state->line          = saved_line;
    p_state->offset        = saved_offset;
    p_state->is_cdata      = saved_is_cdata;
    p_state->pend_text     = pending;
    p_state->unbroken_text = saved_unbroken;
}
static char*
skip_until_gt(char *beg, char *end)
{
/* tries to emulate quote skipping behaviour observed in MSIE */
char *s = beg;
int i;
assert(SvTYPE(tokens) == SVt_PVAV);
for (i = 0; i <= tokens_len; i++) {
SV** svp = av_fetch(tokens, i, 0);
if (svp) {
STRLEN len;
char *token_str = SvPV(*svp, len);
enum marked_section_t token;
if (strEQ(token_str, "include"))
token = MS_INCLUDE;
else if (strEQ(token_str, "rcdata"))
token = MS_RCDATA;
else if (strEQ(token_str, "cdata"))
token = MS_CDATA;
else if (strEQ(token_str, "ignore"))
token = MS_IGNORE;
else
token = MS_NONE;
if (p_state->ms < token)
p_state->ms = token;
}
}
}
}
}
/* printf("MS %d\n", p_state->ms); */
p_state->is_cdata = (p_state->ms == MS_CDATA);
return;
}
static char*
parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self)
{
dTHX;
char *s;
AV* tokens = 0;
char *t = literal_mode_elem[i].str;
int len = tag_len;
while (len) {
if (toLOWER(*s) != *t)
break;
s++;
t++;
if (!--len) {
/* found it */
p_state->literal_mode = literal_mode_elem[i].str;
p_state->is_cdata = literal_mode_elem[i].is_cdata;
/* printf("Found %s\n", p_state->literal_mode); */
goto END_OF_LITERAL_SEARCH;
}
}
}
}
END_OF_LITERAL_SEARCH:
;
}
while (isHSPACE(*s))
s++;
if (*s == '>') {
s++;
if (t != end_text)
report_event(p_state, E_TEXT, t, end_text, utf8,
0, 0, self);
report_event(p_state, E_END, end_text, s, utf8,
&end_token, 1, self);
p_state->literal_mode = 0;
p_state->is_cdata = 0;
t = s;
}
}
}
}
#ifdef MARKED_SECTION
while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) {
while (s < end && *s != ']')
s++;
}
report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self);
/* reset state */
p_state->offset = 0;
if (p_state->line)
p_state->line = 1;
p_state->column = 0;
p_state->start_document = 0;
p_state->literal_mode = 0;
p_state->is_cdata = 0;
return;
}
if (p_state->utf8_mode)
sv_utf8_downgrade(chunk, 0);
if (p_state->buf && SvOK(p_state->buf)) {
sv_catsv(p_state->buf, chunk);
beg = SvPV(p_state->buf, len);
utf8 = SvUTF8(p_state->buf);
SV* buf;
STRLEN offset;
STRLEN line;
STRLEN column;
bool start_document;
bool parsing;
bool eof;
/* special parsing modes */
char* literal_mode;
bool is_cdata;
bool no_dash_dash_comment_end;
char *pending_end_tag;
/* unbroken_text option needs a buffer of pending text */
SV* pend_text;
bool pend_text_is_cdata;
STRLEN pend_text_offset;
STRLEN pend_text_line;
STRLEN pend_text_column;
/* skipped text is accumulated here */
SV* skipped_text;
#ifdef MARKED_SECTION
/* marked section support */
enum marked_section_t ms;
lib/HTML/Parser.pm view on Meta::CPAN
my %arg = @_;
my $api_version = delete $arg{api_version} || (@_ ? 3 : 2);
if ($api_version >= 4) {
require Carp;
Carp::croak("API version $api_version not supported " .
"by HTML::Parser $VERSION");
}
if ($api_version < 3) {
# Set up method callbacks compatible with HTML-Parser-2.xx
$self->handler(text => "text", "self,text,is_cdata");
$self->handler(end => "end", "self,tagname,text");
$self->handler(process => "process", "self,token0,text");
$self->handler(start => "start",
"self,tagname,attr,attrseq,text");
$self->handler(comment =>
sub {
my($self, $tokens) = @_;
for (@$tokens) {
$self->comment($_);
lib/HTML/Parser.pm view on Meta::CPAN
This passes undef except for C<text> events.
=item C<event>
Event causes the event name to be passed.
The event name is one of C<text>, C<start>, C<end>, C<declaration>,
C<comment>, C<process>, C<start_document> or C<end_document>.
=item C<is_cdata>
Is_cdata causes a TRUE value to be passed if the event is inside a CDATA
section or between literal start and end tags (C<script>,
C<style>, C<xmp>, C<iframe>, C<title>, C<textarea> and C<plaintext>).
If the flag is FALSE for a text event, then you should normally
either use C<dtext> or decode the entities yourself before the text is
processed further.
=item C<length>
Length causes the number of bytes of the source text of the event to
lib/HTML/Parser.pm view on Meta::CPAN
=head1 VERSION 2 COMPATIBILITY
When an C<HTML::Parser> object is constructed with no arguments, a set
of handlers is automatically provided that is compatible with the old
HTML::Parser version 2 callback methods.
This is equivalent to the following method calls:
$p->handler(start => "start", "self, tagname, attr, attrseq, text");
$p->handler(end => "end", "self, tagname, text");
$p->handler(text => "text", "self, text, is_cdata");
$p->handler(process => "process", "self, token0, text");
$p->handler(
comment => sub {
my ($self, $tokens) = @_;
for (@$tokens) { $self->comment($_); }
},
"self, tokens"
);
$p->handler(
declaration => sub {
lib/HTML/TokeParser.pm view on Meta::CPAN
our $VERSION = '3.83';
use Carp ();
use HTML::Entities qw(decode_entities);
use HTML::Tagset ();
my %ARGS =
(
start => "'S',tagname,attr,attrseq,text",
end => "'E',tagname,text",
text => "'T',text,is_cdata",
process => "'PI',token0,text",
comment => "'C',text",
declaration => "'D',text",
# options that default on
unbroken_text => 1,
);
sub new
t/argspec.t view on Meta::CPAN
my $decl = '<!ENTITY nbsp CDATA " " -- no-break space -->';
my $com1 = '<!-- Comment -->';
my $com2 = '<!-- Comment -- -- Comment -->';
my $start = '<a href="foo">';
my $end = '</a>';
my $empty = "<IMG SRC='foo'/>";
my $proc = '<? something completely different ?>';
my @argspec = qw(
self offset length event tagname tag token0 text
is_cdata dtext tokens tokenpos attr attrseq
);
my @result;
my $p = HTML::Parser->new(
default_h => [\@result, join(',', @argspec)],
strict_comment => 1,
xml_mode => 1
);
my @tests = ( # string, expected results
t/marked-sect.t view on Meta::CPAN
4.3:49 text "\n"
5.4:54 text "\nINCLUDE\nSTUFF\n"
8.3:72 text "\n.."
9.2:75 start "<h1>"
9.6:79 text "Test"
9.10:83 end "</h1>"
9.15:88 text "\n"
10.0:89 end_document ""
EOT
my $doc = "<Tag><![CDATA[This is cdata]]></Tag>";
my $result = "";
$p = HTML::Parser->new(
marked_sections => 1,
handlers => {
default => [sub { $result .= join("", @_); }, "skipped_text,text"]
}
)->parse($doc)->eof;
is($doc, $result);
$text = "";
t/plaintext.t view on Meta::CPAN
use strict;
use warnings;
use HTML::Parser ();
use Test::More tests => 3;
my @data;
my $p = HTML::Parser->new(api_version => 3);
$p->handler(default => \@data, '@{event, text, is_cdata}');
$p->parse(<<EOT)->eof;
<xmp><foo></xmp>x<plaintext><foo>
</plaintext>
foo
EOT
for (@data) {
$_ = "" unless defined;
}
t/textarea.t view on Meta::CPAN
my $dump = "";
# Append one record to $dump: the arguments joined with "|" and followed
# by a newline.  Undefined arguments are rendered as "<undef>" and every
# embedded newline as a literal backslash-n.  The work is done on copies,
# so the caller's values are left untouched.
sub tdump {
    my @fields = map {
        my $field = defined $_ ? $_ : "<undef>";
        $field =~ s/\n/\\n/g;
        $field;
    } @_;
    $dump .= join("|", @fields) . "\n";
}
my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]);
$p->parse($html)->eof;
#diag $dump;
is($dump, <<'EOT');
start_document||<undef>|<undef>
start|<html>|<undef>|<undef>
text|\n|\n|
start|<title>|<undef>|<undef>
text|This is a <nice> title|This is a <nice> title|
t/unbroken-text.t view on Meta::CPAN
use strict;
use warnings;
use HTML::Parser ();
use Test::More tests => 3;
my $text = "";
# Text handler: appends "[KIND:offset:line.column:text]" to $text, where
# KIND is "CDATA" when the first argument is true and "TEXT" otherwise.
sub text {
    my ($is_cdata, $offset, $line, $col, $chunk) = @_;
    my $kind = $is_cdata ? "CDATA" : "TEXT";
    $text .= "[" . $kind . ":" . $offset . ":" . $line . "." . $col . ":" . $chunk . "]";
}
# Tag handler: appends its first argument, unchanged, to $text.
sub tag {
    my ($markup) = @_;
    $text .= $markup;
}
my $p = HTML::Parser->new(
unbroken_text => 1,
text_h => [\&text, "is_cdata,offset,line,column,text"],
start_h => [\&tag, "text"],
end_h => [\&tag, "text"],
);
$p->parse("foo ");
$p->parse("bar ");
$p->parse("<foo>");
$p->parse("bar\n");
$p->parse("</foo>");
$p->parse("<xmp>xmp</xmp>");
t/unbroken-text.t view on Meta::CPAN
$text = "";
$p->eof;
#diag $text;
is($text, "[TEXT:37:2.20:atend]");
$p = HTML::Parser->new(
unbroken_text => 1,
text_h => [\&text, "is_cdata,offset,line,column,text"],
);
$text = "";
$p->parse("foo");
$p->parse("<foo");
$p->parse(">bar\n");
$p->parse("foo<xm");
$p->parse("p>xmp");
$p->parse("</xmp");
$p->parse(">bar");
( run in 0.705 second using v1.01-cache-2.11-cpan-454fe037f31 )