PPI-XS-Tokenizer
CPPTokenizerWrapper.cc
"PPI::Token::Prototype", // Token_Prototype,
"PPI::Token::ArrayIndex", // Token_ArrayIndex,
"PPI::Token::HereDoc", // Token_HereDoc,
"PPI::Token::Attribute", // Token_Attribute,
"PPI::Token::Attribute", // Doesn't exist in PPI: Token_Attribute_Parameterized, (okay to map to PPI::Token::Attribute)
"PPI::Token::Label", // Token_Label,
"PPI::Token::Separator", // Token_Separator,
"PPI::Token::End", // Token_End,
"PPI::Token::Data", // Token_Data,
"PPI::Token::Pod", // Token_Pod,
"PPI::Token::BOM", // Token_BOM,
};
const int CPPTokenizerWrapper::fgSpecialToken[43] = {
eSimple, // Token_NoType = 0,
eSimple, // Token_WhiteSpace,
eSimple, // Token_Symbol,
eSimple, // Token_Comment,
eSimple, // Token_Word,
eSimple, // Token_DashedWord,
eSimple, // Token_Structure,
// ...
eSimple, // Token_Prototype,
eSimple, // Token_ArrayIndex,
eHereDoc, // Token_HereDoc,
eSimple, // Token_Attribute,
eSimple, // Token_Attribute_Parameterized, (PPI::Token::Attribute)
eSimple, // Token_Label,
eSimple, // Token_Separator,
eSimple, // Token_End,
eSimple, // Token_Data,
eSimple, // Token_Pod,
eSimple, // Token_BOM,
};
/*
* special tokens:
* PPI::Token::HereDoc
* all "extended" tokens
*/
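The two parallel tables above drive the conversion back to PPI objects: fgTokenClasses supplies the Perl class name for each native token type, and fgSpecialToken flags the types that need more than a plain content string. A minimal self-contained sketch of that table-driven dispatch (illustrative only; convert and the kTokenClasses/kSpecialToken names are hypothetical, not the wrapper's actual code):

#include <cstdio>

enum TokenTypeNames { Token_WhiteSpace = 0, Token_HereDoc, Token_BOM, Num_Types };
enum SpecialKind { eSimple, eHereDoc };

// one entry per token type, indexed by the enum value
static const char *kTokenClasses[Num_Types] = {
    "PPI::Token::Whitespace", // Token_WhiteSpace
    "PPI::Token::HereDoc",    // Token_HereDoc
    "PPI::Token::BOM",        // Token_BOM
};
static const SpecialKind kSpecialToken[Num_Types] = {
    eSimple,  // Token_WhiteSpace
    eHereDoc, // Token_HereDoc: its body lines must be attached separately
    eSimple,  // Token_BOM
};

static void convert(TokenTypeNames type) { // hypothetical conversion step
    printf("bless into %s%s\n", kTokenClasses[type],
           kSpecialToken[type] == eHereDoc ? " (attach heredoc sections)" : "");
}

int main() {
    convert(Token_BOM);     // bless into PPI::Token::BOM
    convert(Token_HereDoc); // bless into PPI::Token::HereDoc (attach heredoc sections)
    return 0;
}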
src/structure.cpp
t->_new_token(zone);
return done_it_myself;
}
extern const char l_utf32_be[] = "\x00\x00\xfe\xff"; // => 'UTF-32',
extern const char l_utf32_le[] = "\xff\xfe\x00\x00"; // => 'UTF-32',
extern const char l_utf16_be[] = "\xfe\xff"; // => 'UTF-16',
extern const char l_utf16_le[] = "\xff\xfe"; // => 'UTF-16',
extern const char l_utf8[] = "\xef\xbb\xbf"; // => 'UTF-8',
CharTokenizeResults BOMToken::tokenize(Tokenizer *t, Token *token, unsigned char c_char) {
PredicateOr<
PredicateBinaryLiteral< 4, l_utf32_be >,
PredicateBinaryLiteral< 4, l_utf32_le >,
PredicateBinaryLiteral< 2, l_utf16_be >,
PredicateBinaryLiteral< 2, l_utf16_le >
> regex1;
unsigned long pos = 0;
if ( regex1.test( t->c_line, &pos, t->line_length ) ) {
sprintf(t->ErrorMsg, "BOM error: we do not support anything but ascii and utf8 (%02X,%02X)", t->c_line[0], t->c_line[1]);
return error_fail;
}
PredicateBinaryLiteral< 3, l_utf8 > regex2;
if ( regex2.test( t->c_line, &pos, t->line_length ) ) {
// a UTF-8 BOM we can manage: keep its bytes in the token
for (unsigned long ix = 0; ix < pos; ix++ ) {
token->text[ ix ] = t->c_line[ ix ];
}
// move the beginning of the line to after the BOM
t->c_line += pos;
t->line_length -= pos;
token->length = 3;
}
TokenTypeNames zone = t->_finalize_token();
t->_new_token(zone);
return done_it_myself;
}
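PredicateBinaryLiteral itself is defined elsewhere in the source; below is a plausible self-contained sketch of what such a fixed-byte-prefix matcher does (an assumption, not the library's actual definition). Note that the order of the alternatives above matters: the UTF-16 LE BOM (FF FE) is a prefix of the UTF-32 LE BOM (FF FE 00 00), so UTF-32 LE must be tested first.

#include <cstring>

// match a fixed N-byte literal at *pos; advance *pos past it on success
template <unsigned long N, const char *Literal>
struct BinaryLiteral {
    bool test(const char *line, unsigned long *pos, unsigned long length) const {
        if (*pos + N > length) return false;
        if (std::memcmp(line + *pos, Literal, N) != 0) return false;
        *pos += N;
        return true;
    }
};

extern const char utf8_bom[] = "\xef\xbb\xbf";

int main() {
    const char line[] = "\xef\xbb\xbfuse strict;";
    unsigned long pos = 0;
    BinaryLiteral<3, utf8_bom> bom;
    return bom.test(line, &pos, sizeof(line) - 1) ? 0 : 1; // pos is now 3
}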
src/structure.h
CastToken() : AbstractTokenType( Token_Cast, true ) {}
CharTokenizeResults tokenize(Tokenizer *t, Token *token, unsigned char c_char);
};
class PrototypeToken : public AbstractTokenType {
public:
PrototypeToken() : AbstractTokenType( Token_Prototype, true ) {}
CharTokenizeResults tokenize(Tokenizer *t, Token *token, unsigned char c_char);
};
class BOMToken : public AbstractTokenType {
public:
BOMToken() : AbstractTokenType( Token_BOM, false ) {}
CharTokenizeResults tokenize(Tokenizer *t, Token *token, unsigned char c_char);
};
};
#endif
src/tokenizer.cpp
TokenTypeNames_pool[Token_Number_Exp] = new ExpNumberToken;
TokenTypeNames_pool[Token_ArrayIndex] = new ArrayIndexToken;
TokenTypeNames_pool[Token_Label] = new LabelToken;
TokenTypeNames_pool[Token_Attribute] = new AttributeToken;
TokenTypeNames_pool[Token_Attribute_Parameterized] = new ParameterizedAttributeToken;
TokenTypeNames_pool[Token_Pod] = new PodToken;
TokenTypeNames_pool[Token_Cast] = new CastToken;
TokenTypeNames_pool[Token_Prototype] = new PrototypeToken;
TokenTypeNames_pool[Token_DashedWord] = new DashedWordToken;
TokenTypeNames_pool[Token_Number_Version] = new VersionNumberToken;
TokenTypeNames_pool[Token_BOM] = new BOMToken;
TokenTypeNames_pool[Token_Separator] = new SeparatorToken;
TokenTypeNames_pool[Token_End] = new EndToken;
TokenTypeNames_pool[Token_Data] = new DataToken;
TokenTypeNames_pool[Token_HereDoc] = new HereDocToken;
//TokenTypeNames_pool[Token_HereDoc_Body] = new HereDocBodyToken;
for (int ix = 0; ix < NUM_SIGNIFICANT_KEPT; ix++) {
m_LastSignificant[ix] = NULL;
}
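The pool built above is the tokenizer's central dispatch structure: one singleton strategy object per token type, indexed by its enum value, so the main loop reaches the right tokenize() with a single array lookup and a virtual call. A minimal self-contained sketch of the pattern (the real signatures differ, as seen throughout this file):

#include <cstdio>

enum TokenTypeNames { Token_WhiteSpace = 0, Token_BOM, Token_LastTokenType };

struct AbstractTokenType {
    TokenTypeNames type;
    explicit AbstractTokenType(TokenTypeNames t) : type(t) {}
    virtual ~AbstractTokenType() {}
    // the real method takes (Tokenizer *t, Token *token, unsigned char c_char)
    virtual void tokenize() = 0;
};

struct WhiteSpaceToken : public AbstractTokenType {
    WhiteSpaceToken() : AbstractTokenType(Token_WhiteSpace) {}
    void tokenize() { puts("tokenizing whitespace"); }
};

struct BOMToken : public AbstractTokenType {
    BOMToken() : AbstractTokenType(Token_BOM) {}
    void tokenize() { puts("tokenizing BOM"); }
};

int main() {
    AbstractTokenType *pool[Token_LastTokenType];
    pool[Token_WhiteSpace] = new WhiteSpaceToken;
    pool[Token_BOM] = new BOMToken;
    pool[Token_BOM]->tokenize(); // one lookup, one virtual dispatch
    for (int ix = 0; ix < Token_LastTokenType; ix++)
        delete pool[ix];
    return 0;
}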
src/tokenizer.cpp (continued)
if ( ( c_token != NULL ) && ( c_token->type->type == Token_WhiteSpace ) ) {
}
return reached_eol;
}
LineTokenizeResults Tokenizer::tokenizeLine(char *line, unsigned long line_length) {
line_pos = 0;
c_line = line;
this->line_length = line_length;
if (c_token == NULL)
_new_token(Token_BOM);
while ( NULL != tokens_posponded_head ) {
if ( tokens_posponded_head->type->isa( Token_HereDoc ) ) {
ExtendedToken *tkn = (ExtendedToken *)tokens_posponded_head;
AbstractTokenType::VerifySufficientBufferLength(tkn, line_length);
if ( heredocbody_ended == ((HereDocToken*)(tokens_posponded_head->type))->Unpospone( this, tkn, line, line_length ) ) {
// release all postponed tokens, as long as they are not another heredoc token
Token *tkn = tokens_posponded_head;
tokens_posponded_head = tkn->next;
chain_token(tkn, tokens_found_head, tokens_found_tail);
while ( ( NULL != tokens_posponded_head ) && ( ! tokens_posponded_head->type->isa( Token_HereDoc ) ) ) {
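chain_token's body is not part of this excerpt; all the postponed/found lists above need from it is a plain head/tail append on a singly linked list. A minimal sketch under that assumption (a hypothetical reconstruction, not the source's actual definition):

#include <cstddef>

struct Token { Token *next; };

// append tkn to the list described by head/tail (assumed behavior)
static void chain_token(Token *tkn, Token *&head, Token *&tail) {
    tkn->next = NULL;
    if (head == NULL)
        head = tail = tkn; // first element: both ends point at it
    else {
        tail->next = tkn;  // hook onto the old tail
        tail = tkn;        // and advance it
    }
}

int main() {
    Token a = { NULL }, b = { NULL };
    Token *head = NULL, *tail = NULL;
    chain_token(&a, head, tail);
    chain_token(&b, head, tail);
    return (head == &a && head->next == &b && tail == &b) ? 0 : 1;
}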
src/tokenizer.h
Token_Prototype, // done
Token_ArrayIndex, // done
Token_HereDoc, // done
Token_Attribute, // done
Token_Attribute_Parameterized, // done
Token_Label, // done
Token_Separator, // done
Token_End, // done
Token_Data, // done
Token_Pod, // done
Token_BOM, // done
Token_Foreign_Block, // for Perl6 code, unimplemented
Token_LastTokenType, // Marker for the last real types
// Here are abstract markers
isToken_QuoteOrQuotaLike,
isToken_Extended
};
// FIXME: fix the is-a relationship between the tokens
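The isa() calls seen in tokenizer.cpp (e.g. type->isa(Token_HereDoc)) have to answer both concrete types and the abstract markers placed after Token_LastTokenType. A plausible self-contained sketch of that split, assuming each type object carries a flag for the abstract categories (the flag name is hypothetical):

#include <cstdio>

enum TokenTypeNames {
    Token_WhiteSpace = 0,
    Token_HereDoc,
    Token_LastTokenType,      // marker for the last real types
    // abstract markers
    isToken_QuoteOrQuotaLike,
    isToken_Extended
};

struct AbstractTokenType {
    TokenTypeNames type;
    bool is_extended;         // hypothetical flag set per concrete type
    bool isa(TokenTypeNames kind) const {
        if (kind < Token_LastTokenType)
            return kind == type;  // concrete: exact match
        if (kind == isToken_Extended)
            return is_extended;   // abstract category test
        return false;
    }
};

int main() {
    AbstractTokenType heredoc = { Token_HereDoc, true };
    printf("%d %d\n", heredoc.isa(Token_HereDoc),     // 1
                      heredoc.isa(isToken_Extended)); // 1
    return 0;
}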
src/whitespace.cpp
// FIXME: Add the c_char > 127 part?
sprintf(t->ErrorMsg, "Error: charecter rejected: %d at pos %d", c_char, t->line_pos);
return error_fail;
}
extern const char end_pod[] = "=cut";
CharTokenizeResults PodToken::tokenize(Tokenizer *t, Token *token, unsigned char c_char) {
// will enter here only at the start of a line, but not necessarily at byte 0:
// there may be a BOM before it.
PredicateLiteral< 4, end_pod > regex;
unsigned long pos = t->line_pos;
// suck in the rest of the line regardless
for ( unsigned long ix = pos; ix < t->line_length; ix++ ) {
token->text[ token->length++ ] = t->c_line[ t->line_pos++ ];
}
if ( regex.test( t->c_line, &pos, t->line_length ) &&
( ( pos >= t->line_length ) || is_whitespace( t->c_line[ pos ] ) ) ) {
TokenTypeNames zone = t->_finalize_token();
t->_new_token(zone);
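The end-of-POD condition above accepts "=cut" only when it is followed by whitespace or the end of the line, so a line like "=cutlery" does not terminate a POD block. A self-contained sketch of that test (PredicateLiteral's own definition is not in this excerpt):

#include <cstdio>
#include <cstring>

static bool is_whitespace(char c) {
    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}

static bool ends_pod(const char *line, unsigned long length) {
    const unsigned long n = 4; // strlen("=cut")
    if (length < n || std::memcmp(line, "=cut", n) != 0)
        return false;
    return n >= length || is_whitespace(line[n]);
}

int main() {
    printf("%d %d %d\n",
           ends_pod("=cut\n", 5),     // 1
           ends_pod("=cut", 4),       // 1: end of line counts
           ends_pod("=cutlery", 8));  // 0: must be followed by whitespace
    return 0;
}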