PPI-XS-Tokenizer
view release on metacpan or search on metacpan
src/whitespace.cpp view on Meta::CPAN
( t0->type->isa( Token_Structure ) && ( !strcmp(t0->text, "]" ) ) ) ) {
t->_new_token(Token_Operator);
return my_char;
}
if ( t0->type->isa( Token_Structure ) &&
( ( !strcmp(t0->text, "(") ) || ( !strcmp(t0->text, "{") ) || ( !strcmp(t0->text, ";") ) ) ) {
t->_new_token(Token_Regexp_Match_Bare);
return my_char;
}
if ( t0->type->isa( Token_Word ) &&
( ( !strcmp(t0->text, "split") ) ||
( !strcmp(t0->text, "if") ) ||
( !strcmp(t0->text, "unless") ) ||
( !strcmp(t0->text, "grep") ) ) ) {
t->_new_token(Token_Regexp_Match_Bare);
return my_char;
}
unsigned char n_char = t->c_line[ t->line_pos + 1 ];
if ( ( n_char == '^' ) || ( n_char == '[' ) || ( n_char == '\\' ) ) {
t->_new_token(Token_Regexp_Match);
return my_char;
}
t->_new_token(Token_Operator);
return my_char;
}
if ( c_char == 'x' ) {
unsigned char n_char = t->c_line[ t->line_pos + 1 ];
Token *t0 = t->_last_significant_token(1);
if ( ( t0 != NULL ) && ( n_char >= '0' ) && ( n_char <= 9 ) ) {
TokenTypeNames p_type = t0->type->type;
if ( ( p_type == Token_Quote_Single ) || ( p_type == Token_Quote_Double ) ) { // FIXME
t->_new_token(Token_Operator);
return my_char;
}
}
return t->TokenTypeNames_pool[Token_Word]->commit( t );
}
if ( c_char == '-' ) {
if ( t->_opcontext() == ooc_Operator ) {
t->_new_token(Token_Operator);
return my_char;
} else {
t->_new_token(Token_Unknown);
return my_char;
}
}
// FIXME: Add the c_char > 127 part?
sprintf(t->ErrorMsg, "Error: charecter rejected: %d at pos %d", c_char, t->line_pos);
return error_fail;
}
// Literal that terminates a POD section. It is passed as a non-type template
// argument to PredicateLiteral< 4, end_pod > in PodToken::tokenize; the 4 there
// matches the number of characters in "=cut" (without the trailing NUL).
// NOTE(review): presumably declared `extern` so the array has external linkage,
// which older C++ standards require for a template argument - confirm.
extern const char end_pod[] = "=cut";
// Line handler for a POD token.
//
// The rest of the current line is always appended to `token`. Afterwards the
// line is tested, starting from the position at which this handler was entered,
// against the end_pod literal ("=cut"). If that literal matches and is followed
// by either the end of the line or a whitespace character, the current token is
// finalized and a new token is opened with the type _finalize_token() returned.
//
// t       - tokenizer that owns the line buffer and the current position
// token   - token that receives the line's characters
// c_char  - not used by this handler
//
// Always returns done_it_myself.
CharTokenizeResults PodToken::tokenize(Tokenizer *t, Token *token, unsigned char c_char) {
    // We are entered at the beginning of a line, but not necessarily at byte 0:
    // a BOM may come before the text.
    PredicateLiteral< 4, end_pod > cut_marker;
    unsigned long marker_pos = t->line_pos;

    // Swallow the whole line no matter what it contains.
    while ( t->line_pos < t->line_length ) {
        token->text[ token->length ] = t->c_line[ t->line_pos ];
        token->length++;
        t->line_pos++;
    }

    // Not an end-of-POD line: nothing more to do.
    // (marker_pos is passed by address; test() presumably advances it past the match.)
    if ( !cut_marker.test( t->c_line, &marker_pos, t->line_length ) ) {
        return done_it_myself;
    }
    // The literal must be followed by end-of-line or whitespace to count.
    if ( ( marker_pos < t->line_length ) && !is_whitespace( t->c_line[ marker_pos ] ) ) {
        return done_it_myself;
    }

    // Close the POD token and open a token of whatever type finalizing reports.
    TokenTypeNames zone = t->_finalize_token();
    t->_new_token(zone);
    return done_it_myself;
}
// Line handler for an End token.
//
// The line is tested from offset 0 with a predicate built from
// PredicateIsChar< '=' > and PredicateFunc< is_word > (presumably '=' directly
// followed by a word character, i.e. the start of a POD command - confirm
// against the predicate definitions). On a match the current token is finalized
// and a Token_Pod token is started. Otherwise the remainder of the line is
// appended to `token`.
//
// t       - tokenizer that owns the line buffer and the current position
// token   - token that receives the line's characters when it is not POD
// c_char  - not used by this handler
//
// Always returns done_it_myself.
CharTokenizeResults EndToken::tokenize(Tokenizer *t, Token *token, unsigned char c_char) {
    typedef PredicateAnd<
        PredicateIsChar< '=' >,
        PredicateFunc< is_word >
    > PodOpener;

    // This handler is always reached on a fresh line, so scanning starts at 0.
    PodOpener pod_opener;
    unsigned long scan_from = 0;

    if ( !pod_opener.test( t->c_line, &scan_from, t->line_length ) ) {
        // Not POD: the entire rest of the line belongs to this token.
        for ( ; t->line_pos < t->line_length; t->line_pos++ ) {
            token->text[ token->length++ ] = t->c_line[ t->line_pos ];
        }
        return done_it_myself;
    }

    // POD starts here: close this token and hand over to a Pod token.
    t->_finalize_token();
    t->_new_token( Token_Pod );
    return done_it_myself;
}
// Line handler for a Data token.
//
// Unconditionally appends everything from the current line position up to the
// end of the line to `token`, advancing the tokenizer's position as it goes.
//
// t       - tokenizer that owns the line buffer and the current position
// token   - token that receives the characters
// c_char  - not used by this handler
//
// Always returns done_it_myself.
CharTokenizeResults DataToken::tokenize(Tokenizer *t, Token *token, unsigned char c_char) {
    for ( ; t->line_pos < t->line_length; ++t->line_pos ) {
        token->text[ token->length ] = t->c_line[ t->line_pos ];
        ++token->length;
    }
    return done_it_myself;
}
CharTokenizeResults CommentToken::commit(Tokenizer *t) {
if (( t->c_token != NULL ) &&
( t->c_token->type->type == Token_Whitespace ) &&
( t->c_token->length == t->line_pos ) ) {
// This is a whole-line comment, that should own the whitespace before
// and the newline after it.
t->changeTokenType(Token_Comment);
Token *c_token = t->c_token;
while ( ( t->line_pos < t->line_length ) ) {
c_token->text[c_token->length++] = t->c_line[t->line_pos++];
}
} else {
// This is an inline comment - not contains the newline
t->_new_token(Token_Comment);
Token *c_token = t->c_token;
while ( ( t->line_pos < t->line_length ) && ( t->c_line[t->line_pos] != t->local_newline ) ) {
c_token->text[c_token->length++] = t->c_line[t->line_pos++];
( run in 0.549 second using v1.01-cache-2.11-cpan-97f6503c9c8 )