AI-Categorizer
view release on metacpan or search on metacpan
lib/AI/Categorizer/Document/XML.pm view on Meta::CPAN
package AI::Categorizer::Document::XML;
use strict;
use AI::Categorizer::Document;
use base qw(AI::Categorizer::Document);
use XML::SAX;
# Register the SAX handler class under the name 'xml_handler'; parse()
# asks for an instance by that name via create_contained_object().
__PACKAGE__->contained_objects(
    xml_handler => 'AI::Categorizer::Document::XML::Handler',
);
### Constructors
sub parse {
    my ($self, %args) = @_;

    # Named arguments:
    #   content       - string holding the XML text to parse
    #   elementWeight - hash of <elementName, weight> pairs, forwarded to the
    #                   handler as its 'weights' argument
    my ($xml_text, $weight_of) = @args{qw(content elementWeight)};

    # Build the SAX handler registered as 'xml_handler'; it receives the
    # parser's events and collects the resulting text in its own buffer.
    my $handler = $self->create_contained_object(
        'xml_handler',
        weights => $weight_of,
    );

    # Obtain a SAX parser wired to that handler.
    my $parser = XML::SAX::ParserFactory->parser(Handler => $handler);

    # Parse the XML string; the handler's methods are called along the way.
    $parser->parse_string($xml_text);

    # Read back what the handler collected. The call is kept in scalar
    # context so that exactly one value ends up under the 'body' key.
    my $collected = $handler->getContent;

    # Hand the text back wrapped in a hash reference keyed by 'body'.
    return { body => $collected };
}
##########################################################################
package AI::Categorizer::Document::XML::Handler;
use strict;
use base qw(XML::SAX::Base);
# Input: named arguments; 'weights' holds the <elementName, weight> pairs
# Output: a new object of this class
# Description: constructor
sub new {
    my $class = shift;
    my %args  = @_;

    # Let the parent class (XML::SAX::Base, see 'use base' above) build the
    # underlying object.
    my $self = $class->SUPER::new();

    # Initialise the handler's own state in one step:
    #   weightHash    - the caller's 'weights' argument: <elementName, weight>
    #                   pairs, where the weight is how many times an element's
    #                   text is to be duplicated. Per the original author's
    #                   notes it is consumed by end_element, which is outside
    #                   the part of the file visible here.
    #   content       - buffer for the text gathered while parsing; starts empty
    #   locationArray - per-level start offsets into 'content', one slot for
    #                   each element from the root down to the element being
    #                   visited; only the slots below levelPointer are
    #                   meaningful (again per the original notes)
    @{$self}{qw(weightHash content locationArray)} = ($args{weights}, '', []);

    return $self;
}
# Input: the document event data passed by the parser (unused)
# Output: None
# Description:
# Called when the parser starts processing the document.
# It is called only once per document.
# Resets the element depth to 0 and empties the content buffer.
sub start_document {
    my $self = shift;
    my ($document_info) = @_;    # accepted but not used

    # Depth of the element currently being visited. start_element calls
    # arrive in pre-order, and the first (root) element is level 0.
    $self->{levelPointer} = 0;

    # The parent class's start_document is deliberately not invoked; the
    # call is kept here, disabled, exactly as the original author left it.
    #$self->SUPER::start_document($document_info);

    # Start with an empty buffer; all text gathered during the parse is
    # appended here. This stays the last statement so the sub's implicit
    # return value is unchanged.
    $self->{content} = "";
}
# Input: the document event data passed by the parser (unused)
# Output: None
# Description:
# Called when the parser reaches the end of the document.
# It is called only once per document.
# There is nothing to do here.
sub end_document {
    # No work is needed at the end of the document. The arguments are only
    # unpacked, and because there is no explicit return the sub yields
    # whatever this list assignment evaluates to.
    my ($self, $document_info) = @_;
    # The parent class's end_document is intentionally left uncalled:
    #$self->SUPER::end_document($document_info);
}
# Input
# LocalName: $el->{LocalName}
# NamespaceURI: $el->{NamespaceURI}
# Name $el->{Name}
# Prefix $el->{Prefix}
# Attributes $el->{Attributes}
# for each attribute
# LocalName: $el->{LocalName}
# NamespaceURI: $el->{NamespaceURI}
# Name $el->{Name}
( run in 0.504 second using v1.01-cache-2.11-cpan-39bf76dae61 )