cppAdaptive2
view release on metacpan or search on metacpan
src/dlib/xml_parser/xml_parser_kernel_1.h view on Meta::CPAN
);
private:
// -----------------------------------
// attribute_list interface implementation
class attrib_list : public attribute_list
{
public:
    // container holding the attribute name/value pairs; every member
    // function below simply forwards to it
    map list;

    // reports whether key is in the domain of list
    bool is_in_list (const std::string& key) const
    {
        return list.is_in_domain(key);
    }

    // looks up the entry stored in list under key
    const std::string& operator[] (const std::string& key) const
    {
        return list[key];
    }

    // ---- enumeration functions, all delegated to list ----

    bool at_start () const
    {
        return list.at_start();
    }

    void reset () const
    {
        list.reset();
    }

    bool current_element_valid () const
    {
        return list.current_element_valid();
    }

    const type& element () const
    {
        return list.element();
    }

    type& element ()
    {
        return list.element();
    }

    bool move_next () const
    {
        return list.move_next();
    }

    unsigned long size () const
    {
        return list.size();
    }
};
// -----------------------------------
enum token_type
{
    element_start = 0,  // opening tag of an element
    element_end   = 1,  // closing tag of an element
    empty_element = 2,  // single tag that forms an empty element
    pi            = 3,  // processing instruction
    chars         = 4,  // non-markup data found between tags
    chars_cdata   = 5,  // data taken from a CDATA section
    eof           = 6,  // returned once the end of input is reached
    error         = 7,  // the tokenizer could not decide which category
                        // the next token belongs to
    dtd           = 8,  // covers an entire dtd
    comment       = 9   // covers a comment
};
/*
notes about the tokens:
the tokenizer guarantees that the following tokens do not
contain the '<' character except as the first character of the token:
element_start, element_end, empty_element, and pi. they also only
contain the '>' character as their last character.
it is also guaranteed that pi is at least of the form <??>. that
is to say that it always begins with <? and ends with ?>.
it is also guaranteed that all markup tokens will begin with the '<'
character and end with the '>'. there won't be any leading or
trailing whitespace. such whitespace is considered a chars token.
*/
// private member functions
inline void get_next_token(
std::istream& in,
std::string& token_text,
int& token_kind,
unsigned long& line_number
);
/*!
ensures
- reads the next token from in; token_text is erased first and then
receives the text of the token
- token_kind == the kind of the token found (a token_type value)
- line_number is incremented every time a '\n' is encountered
- entity references are translated into the characters they represent,
but only for chars tokens
!*/
inline int parse_element (
const std::string& token,
std::string& name,
attrib_list& atts
);
/*!
requires
- token is a token of kind element_start or empty_element
ensures
- gets the element name and puts it into the string name
- parses out the attributes and puts them into the attribute_list atts
- returns 0 upon success or
returns -1 if it failed to parse token
!*/
// parses the text of a pi (processing instruction) token into target and
// data. parse() treats a return value of 0 as success and then passes
// target and data on to the document handlers' processing_instruction().
inline int parse_pi (
const std::string& token,
std::string& target,
std::string& data
);
/*!
src/dlib/xml_parser/xml_parser_kernel_1.h view on Meta::CPAN
// ----------------------------------------------------------------------------------------
void xml_parser::clear()
{
    // forget every registered handler by emptying both handler lists
    eh_list.clear();
    dh_list.clear();
}
// ----------------------------------------------------------------------------------------
void xml_parser::
parse (
std::istream& in
)
{
DLIB_CASSERT ( in.fail() == false ,
"\tvoid xml_parser::parse"
<< "\n\tthe input stream must not be in the fail state"
<< "\n\tthis: " << this
);
// save which exceptions in will throw and make it so it won't throw any
// for the life of this function
std::ios::iostate old_exceptions = in.exceptions();
// set it to not throw anything
in.exceptions(std::ios::goodbit);
try
{
unsigned long line_number = 1;
// skip any whitespace before the start of the document
while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' )
{
if (in.peek() == '\n')
++line_number;
in.get();
}
stack tags; // this stack contains the last start tag seen
bool seen_fatal_error = false;
bool seen_root_tag = false; // this is true after we have seen the root tag
// notify all the document_handlers that we are about to begin parsing
for (unsigned long i = 0; i < dh_list.size(); ++i)
{
dh_list[i]->start_document();
}
std::string chars_buf; // used to collect chars data between consecutive
// chars and chars_cdata tokens so that
// document_handlers receive all chars data between
// tags in one call
// variables to be used with the parsing functions
attrib_list atts;
std::string name;
std::string target;
std::string data;
// variables to use with the get_next_token() function
std::string token_text;
int token_kind;
get_next_token(in,token_text,token_kind,line_number);
while (token_kind != eof)
{
bool is_empty = false; // this becomes true when this token is an empty_element
switch (token_kind)
{
case empty_element: is_empty = true;
case element_start:
{
seen_root_tag = true;
int status = parse_element(token_text,name,atts);
// if there was no error parsing the element
if (status == 0)
{
// notify all the document_handlers
for (unsigned long i = 0; i < dh_list.size(); ++i)
{
dh_list[i]->start_element(line_number,name,atts);
if (is_empty)
dh_list[i]->end_element(line_number,name);
}
}
else
{
seen_fatal_error = true;
}
// if this is an element_start token then push the name of
// the element on to the stack
if (token_kind == element_start)
{
tags.push(name);
}
}break;
// ----------------------------------------
case element_end:
src/dlib/xml_parser/xml_parser_kernel_1.h view on Meta::CPAN
}
}
else
{
seen_fatal_error = true;
}
}break;
// ----------------------------------------
case pi:
{
int status = parse_pi (token_text,target,data);
// if there was no error parsing the processing instruction
if (status == 0)
{
// notify all the document_handlers
for (unsigned long i = 0; i < dh_list.size(); ++i)
{
dh_list[i]->processing_instruction(line_number,target,data);
}
}
else
{
// notify all the error_handlers
for (unsigned long i = 0; i < eh_list.size(); ++i)
{
eh_list[i]->error(line_number);
}
}
while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' )
{
if (in.peek() == '\n')
++line_number;
in.get();
}
}break;
// ----------------------------------------
case chars:
{
if (tags.size() != 0)
{
chars_buf += token_text;
}
else if (token_text.find_first_not_of(" \t\r\n") != std::string::npos)
{
// you can't have non whitespace chars data outside the root element
seen_fatal_error = true;
}
}break;
// ----------------------------------------
case chars_cdata:
{
if (tags.size() != 0)
{
chars_buf += token_text;
}
else
{
// you can't have chars_data outside the root element
seen_fatal_error = true;
}
}break;
// ----------------------------------------
case eof:
break;
// ----------------------------------------
case error:
{
seen_fatal_error = true;
}break;
// ----------------------------------------
case dtd: // fall though
case comment: // do nothing
break;
// ----------------------------------------
}
// if there was a fatal error then quit loop
if (seen_fatal_error)
break;
// if we have seen the last tag then quit the loop
if (tags.size() == 0 && seen_root_tag)
break;
get_next_token(in,token_text,token_kind,line_number);
// if the next token is not a chars or chars_cdata token then flush
// the chars_buf to the document_handlers
if ( (token_kind != chars) &&
(token_kind != chars_cdata) &&
(token_kind != dtd) &&
(token_kind != comment) &&
(chars_buf.size() != 0)
)
{
// notify all the document_handlers
for (unsigned long i = 0; i < dh_list.size(); ++i)
{
dh_list[i]->characters(chars_buf);
}
chars_buf.erase();
}
} //while (token_kind != eof)
// you can't have any unmatched tags or any fatal errors
if (tags.size() != 0 || seen_fatal_error)
{
// notify all the error_handlers
for (unsigned long i = 0; i < eh_list.size(); ++i)
{
eh_list[i]->fatal_error(line_number);
}
}
// notify all the document_handlers that we have ended parsing
for (unsigned long i = 0; i < dh_list.size(); ++i)
{
dh_list[i]->end_document();
}
}
catch (...)
{
// notify all the document_handlers that we have ended parsing
for (unsigned long i = 0; i < dh_list.size(); ++i)
{
dh_list[i]->end_document();
}
// restore the old exception settings to in
in.exceptions(old_exceptions);
// don't forget to rethrow the exception
throw;
}
// restore the old exception settings to in
in.exceptions(old_exceptions);
}
// ----------------------------------------------------------------------------------------
src/dlib/xml_parser/xml_parser_kernel_1.h view on Meta::CPAN
dh_list.add(dh_list.size(),temp);
}
// ----------------------------------------------------------------------------------------
void xml_parser::add_error_handler (error_handler& item)
{
    // eh_list holds pointers to handlers, so take the address of item into
    // a named variable and add it at the position just past the last entry
    error_handler* handler_ptr = &item;
    const unsigned long end_pos = eh_list.size();
    eh_list.add(end_pos, handler_ptr);
}
// ----------------------------------------------------------------------------------------
void xml_parser::swap (xml_parser& item)
{
    // exchange both handler lists with those of item: document handlers
    // first, then error handlers
    dh_list.swap(item.dh_list);
    eh_list.swap(item.eh_list);
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// private member function definitions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
void xml_parser::
get_next_token(
std::istream& in,
std::string& token_text,
int& token_kind,
unsigned long& line_number
)
{
token_text.erase();
std::istream::int_type ch1 = in.get();
std::istream::int_type ch2;
switch (ch1)
{
// -----------------------------------------
// this is the start of some kind of a tag
case '<':
{
ch2 = in.get();
switch (ch2)
{
// ---------------------------------
// this is a dtd, comment, or chars_cdata token
case '!':
{
// if this is a CDATA section *******************************
if ( in.peek() == '[')
{
token_kind = chars_cdata;
// throw away the '['
in.get();
// make sure the next chars are CDATA[
std::istream::int_type ch = in.get();
if (ch != 'C')
token_kind = error;
ch = in.get();
if (ch != 'D')
token_kind = error;
ch = in.get();
if (ch != 'A')
token_kind = error;
ch = in.get();
if (ch != 'T')
token_kind = error;
ch = in.get();
if (ch != 'A')
token_kind = error;
ch = in.get();
if (ch != '[')
token_kind = error;
// if this is an error token then end
if (token_kind == error)
break;
// get the rest of the chars and put them into token_text
int brackets_seen = 0; // this is the number of ']' chars
// we have seen in a row
bool seen_closing = false; // true if we have seen ]]>
do
{
ch = in.get();
if (ch == '\n')
++line_number;
token_text += ch;
// if this is the closing
if (brackets_seen == 2 && ch == '>')
seen_closing = true;
// if we are seeing a bracket
else if (ch == ']')
++brackets_seen;
// if we didn't see a bracket
else
brackets_seen = 0;
} while ( (!seen_closing) && (ch != EOF) );
// check if this is an error token
if (ch == EOF)
{
token_kind = error;
}
else
( run in 2.562 seconds using v1.01-cache-2.11-cpan-98e64b0badf )