perlSGML.1997Sep
view release on metacpan or search on metacpan
lib/SGML/Parser.pm view on Meta::CPAN
*TypeMSC = \4;
*TypePIO = \5;
*TypeSTagO = \6;
$namestart = 'A-Za-z';
$namechars = '\w\.\-';
##########################################################################
##**********************************************************************##
## PUBLIC METHODS
##**********************************************************************##
##----------------------------------------------------------------------
## new() constructor.
##
sub new {
my $this = { };
my $class = shift;
## Private variables
$this->{'_input_stack'} = [ ]; # Input stack
$this->{'_input'} = undef; # Reference to current input info
$this->{'_buf'} = ''; # Working buffer
$this->{'_open_ms'} = 0; # Number of open marked sections
$this->{'_open_ms_ign'} = 0; # Number of open marked sections in
# ignore mared section
## Public variables
$this->{'mode'} = $ModePCData; # Parsing mode: Can be set
# by method callbacks to control
# recognition modes.
bless $this, $class;
$this;
}
##----------------------------------------------------------------------
## parse_data() parses an SGML instance specified by either a
## reference to a filehandle or a reference to a scalar string.
## If a scalar string, the input will get modified. Therefore,
## the callar may need to make a copy before calling parse_data
## if the original input is needed afterwards.
##
## The delimiters defined by the reference concrete syntax are
## assumed.
##
## Example usage:
##
## $parser->parse_data(\*FILE, "file.sgm", $init_buffer_txt,
## $line_no_start);
##
## Only the first argument is required. The other are optional.
##
## The routine calls callback methods for the various events
## that can occur. It is up to the methods to make sense of
## the data.
##
## The following lists the various methods invoked during parsing:
##
## $this->cdata($cdata);
## $this->char_ref($funcname_or_charnum);
## $this->comment_decl(\@comments);
## $this->end_tag($gi);
## $txt = $this->entity_ref($entname);
## $this->ignored_data($data);
## $this->marked_sect_close();
## $this->marked_sect_open($status_keyword, $status_spec);
## $txt = $this->parm_entity_ref($entname);
## $this->processing_inst($pidata);
## $this->start_tag($gi, $attr_spec);
##
## $this->error($message);
##
## The entity reference methods can return a string. If so,
## it is prepended to the current buffer for parsing. Return
## the empty string, or undef, if no text should be parsed.
##
## These methods should be redefined by subclasses to perform
## whatever parsing tasks are required.
##
sub parse_data {
my $this = shift; # Self reference
my $href = { };
my $in = shift; # Input (filehandle or a string reference)
$href->{'_label'} = shift; # Input label (Optional)
my $buf = shift || ''; # Initial buffer (Optional)
$href->{'_ln'} = shift || 0;# Starting line number (Optional).
my($before, $after, $type, $tmp);
my($m1, $gi, $name);
## Set values for subsequent calls to _get_line()
if (ref($in) eq 'SCALAR') {
$href->{'_string'} = $in;
$href->{'_fh'} = undef;
} else {
$href->{'_string'} = undef;
$href->{'_fh'} = $in;
}
push(@{$this->{'_input_stack'}}, $this->{'_input'})
if $this->{'_input'};
$this->{'_input'} = $href;
$this->{'mode'} = $ModePCData;
# Eval code to capture die's
eval {
## Parse input
LOOP: while (defined($buf)) {
# Fill working buffer if empty
if ($buf eq '') {
last LOOP unless defined($buf = $this->_get_line());
}
#--------------------------------------------------------------
# Check for markup. Choose match that occurs earliest in
# string.
#--------------------------------------------------------------
($before, $after, $type, $m1) = (undef,'','','');
# Pcdata mode checks
if ($this->{'mode'} == $ModePCData) {
if ($buf =~ m@<([!?/>$namestart])@o) {
$before = $`; $m1 = $1; $after = $';
BLK: {
if ($m1 eq '!') { $type = $TypeMDO; last BLK; }
if ($m1 eq '?') { $type = $TypePIO; last BLK; }
if ($m1 eq '/') { $type = $TypeETagO; last BLK; }
if ($m1 eq '>') { $type = $TypeSTagO; last BLK; }
$type = $TypeSTagO;
}
}
}
# Check for entity reference
if ($this->{'mode'} == $ModePCData or
$this->{'mode'} == $ModeRCData or
$this->{'mode'} == $ModeMSRCData) {
if ($buf =~ m@\&([#$namestart])@o) {
if (!defined($before) or length($before) > length($`)) {
$before = $`; $m1 = $1; $after = $';
$type = $TypeERO;
}
}
}
# Check for cdata mode
if ($this->{'mode'} == $ModeCData) {
if ($buf =~ m|<(/)|) {
if (!defined($before) or length($before) > length($`)) {
$before = $`; $m1 = $1; $after = $';
$type = $TypeETagO;
}
}
}
# Check for marked section close
if ($this->{'mode'} != $ModeCData) {
if ($buf =~ m|\]\]>|) {
if (!defined($before) or length($before) > length($`)) {
$before = $`; $after = $';
$type = $TypeMSC;
}
}
}
# Check for marked section opens while ignoring
if ($this->{'mode'} == $ModeIgnore) {
if ($buf =~ m|<!\[|) {
if ($type == $TypeMSC and length($before) > length($`)) {
$this->{'_open_ms_ign'}++;
}
}
}
#--------------------------------------------------------------
# Now, check what the type is and process accordingly.
#--------------------------------------------------------------
## Invoke cdata callback if any before text -------------------
if ($before ne '') {
$this->{'mode'} == $ModeIgnore ?
$this->ignored_data($before) : $this->cdata($before);
}
## Entity reference -------------------------------------------
if ($type == $TypeERO) {
$buf = $after;
$name = $m1;
if ($name eq '#') { # Character reference
if ($buf =~ s/^([$namechars]+);?//o) {
$name = $1;
}
$this->char_ref($name);
} else { # General entity reference
if ($buf =~ s/^([$namechars]*);?//o) {
$name .= $1;
}
$buf = $this->entity_ref($name) . $buf;
}
next LOOP;
}
## End tag ----------------------------------------------------
if ($type == $TypeETagO) {
$buf = $after;
$gi = '';
# Get rest of generic identifier
if ($buf =~ s/^([$namechars]*)\s*//o) {
$gi = $1;
}
# Read up to tagc
ETAG: while (1) {
if ($buf =~ />/o) { $buf = $'; last ETAG; }
if (!defined($buf = $this->_get_line())) {
$this->error("Unexpected EOF; end tag not closed");
}
}
$this->end_tag($gi);
$this->{'mode'} = $ModePCData;
next LOOP;
}
## Start tag --------------------------------------------------
if ($type == $TypeSTagO) {
$gi = $m1; $buf = $after;
# Check for null start tag
if ($gi eq '>') {
$this->start_tag('', '');
next LOOP;
}
# Get rest of generic identifier
if ($buf =~ s/^([$namechars]*)//o) { $gi .= $1; }
# Get attribute specification list and tagc
$attr = '';
lib/SGML/Parser.pm view on Meta::CPAN
"marked section start");
}
}
if ($tmp =~ /%([$namechars])/o) {
$keyword = $this->parm_entity_ref($1);
} else {
($keyword = $tmp) =~ s/\s//g;
}
$keyword = uc $keyword;
$this->marked_sect_open($keyword, $tmp);
if ($keyword eq "IGNORE") {
$this->{'mode'} = $ModeIgnore;
} elsif ($keyword eq "RCDATA") {
$this->{'mode'} = $ModeMSRCData;
} elsif ($keyword eq "CDATA") {
$this->{'mode'} = $ModeMSCData;
} else {
$this->{'_open_ms'}++;
}
next LOOP;
} # end marked section open
if ($buf =~ s/^--//) { ## Comment declaration
my(@comms) = ();
# Outer loop for comment declaration as a whole
COMDCL: while (1) {
$tmp = '';
# Inner loop for each comment block in declaration
COMM: while (1) {
if ($buf =~ /--/o) {
$tmp .= $`; $buf = $'; last COMM;
}
$tmp .= $buf;
if (!defined($buf = $this->_get_line())) {
$this->error("Unexpected EOF; " .
"Comment not closed");
last COMM;
}
}
# Push comment block on list
push(@comms, $tmp);
last COMM unless defined($buf);
# Check for declaration close or another comment block
while ($buf !~ /\S/) {
if (!defined($buf = $this->_get_line())) {
$this->error("Unexpected EOF; " .
"Comment declaration " .
"not closed");
last COMDCL;
}
}
if ($buf =~ s/^\s*--//o) {
next COMDCL;
} elsif ($buf =~ s/^\s*>//o) {
last COMDCL;
} else { # punt
$this->error("Invalid cdata outside of comment");
next COMDCL;
}
}
$this->comment_decl(\@comms);
next LOOP;
} # end comment
$buf = "<!" . $buf;
} # end markup declaration
## If not markup, invoke cdata callback -----------------------
$this->{'mode'} == $ModeIgnore ?
$this->ignored_data($buf) :
$this->cdata($buf);
$buf = '';
}
}; # End eval
$this->{'_input'} = pop(@{$this->{'_input_stack'}});
# Return buffer. May contain data if parsing was aborted, otherwise
# should be undef.
$buf;
}
##----------------------------------------------------------------------
## get_line_no() retrieves the current line number of the input.
## Method useful in callback routines.
##
sub get_line_no {
my $this = shift;
$this->{'_input'}{'_ln'};
}
##----------------------------------------------------------------------
## get_input_label() retrieves the label given to the input being
## parsed. Label is defined when the parse_data method is called.
## Method useful in callback routines.
##
sub get_input_label {
my $this = shift;
$this->{'_input'}{'_label'};
}
##########################################################################
##**********************************************************************##
## CALLBACK METHODS
##**********************************************************************##
## Subclasses are to redefine callback methods to perform
## whatever actions are desired.
##**********************************************************************##
sub cdata { }
sub char_ref { }
sub comment_decl { }
sub end_tag { }
sub entity_ref { undef }
sub ignored_data { }
sub marked_sect_close { }
sub marked_sect_open { }
sub parm_entity_ref { undef }
sub processing_inst { }
sub start_tag { }
sub error {
my $this = shift;
my $label = $this->get_input_label();
my $line = $this->get_line_no();
warn(ref($this), ":$label:Line $line:", @_, "\n");
}
##########################################################################
##**********************************************************************##
## PRIVATE METHODS
##**********************************************************************##
##----------------------------------------------------------------------
## _get_line() retrieves the next line from input. undef is
## returned if end of input is reached.
##
sub _get_line {
my $this = shift;
my $ret = undef;
my $href = $this->{'_input'};
my($sref, $fh);
if (defined($fh = $href->{'_fh'})) {
$href->{'_ln'} = $. if defined($ret = <$fh>);
} elsif (defined($sref = $href->{'_string'})) {
if ($$sref =~ s%(.*?${/})%%o) {
$ret = $1;
$href->{'_ln'}++;
} elsif ($$sref ne '') {
$ret = $$sref;
$href->{'_string'} = undef;
$href->{'_ln'}++;
} else {
$href->{'_string'} = undef;
}
}
$ret;
}
##########################################################################
1;
( run in 0.668 second using v1.01-cache-2.11-cpan-437f7b0c052 )