PPI-XS-Tokenizer

 view release on metacpan or  search on metacpan

CPPTokenizerWrapper.cc  view on Meta::CPAN

    "PPI::Token::Prototype", // Token_Prototype,
    "PPI::Token::ArrayIndex", // Token_ArrayIndex,
    "PPI::Token::HereDoc", // Token_HereDoc,
    "PPI::Token::Attribute", // Token_Attribute,
    "PPI::Token::Attribute", // Doesn't exist in PPI: Token_Attribute_Parameterized, (okay to map to PPI::Token::Attribute)
    "PPI::Token::Label", // Token_Label,
    "PPI::Token::Separator", // Token_Separator,
    "PPI::Token::End", // Token_End,
    "PPI::Token::Data", // Token_Data,
    "PPI::Token::Pod", // Token_Pod,
    "PPI::Token::BOM", // Token_BOM,
  };

  const int CPPTokenizerWrapper::fgSpecialToken[43] = {
    eSimple, // Token_NoType = 0,
    eSimple, // Token_WhiteSpace,
    eSimple, // Token_Symbol,
    eSimple, // Token_Comment,
    eSimple, // Token_Word,
    eSimple, // Token_DashedWord,
    eSimple, // Token_Structure,

CPPTokenizerWrapper.cc  view on Meta::CPAN

    eSimple, // Token_Prototype,
    eSimple, // Token_ArrayIndex,
    eHereDoc, // Token_HereDoc,
    eSimple, // Token_Attribute,
    eSimple, // Token_Attribute_Parameterized, (PPI::Token::Attribute)
    eSimple, // Token_Label,
    eSimple, // Token_Separator,
    eSimple, // Token_End,
    eSimple, // Token_Data,
    eSimple, // Token_Pod,
    eSimple, // Token_BOM,
  };



/*
 * special tokens:
 * PPI::Token::HereDoc
 * all "extended" tokens
 */

src/structure.cpp  view on Meta::CPAN

	t->_new_token(zone);
	return done_it_myself;
}

// Byte-order-mark (BOM) signatures that may appear at the very start of the
// input.  Only the UTF-8 mark is tolerated by the tokenizer; the UTF-16 and
// UTF-32 marks are recognized solely so they can be rejected with a clear
// error in BOMToken::tokenize below.
extern const char l_utf32_be[] = "\x00\x00\xfe\xff"; // UTF-32, big-endian
extern const char l_utf32_le[] = "\xff\xfe\x00\x00"; // UTF-32, little-endian
extern const char l_utf16_be[] = "\xfe\xff"; // UTF-16, big-endian
extern const char l_utf16_le[] = "\xff\xfe"; // UTF-16, little-endian (also a prefix of the UTF-32 LE mark)
extern const char l_utf8[] = "\xef\xbb\xbf"; // UTF-8

// Inspect the very beginning of the input for a byte-order mark.
// UTF-16/UTF-32 marks are fatal; a UTF-8 mark is captured into the token
// and stripped from the current line so ordinary tokenization can proceed.
CharTokenizeResults BOMToken::tokenize(Tokenizer *t, Token *token, unsigned char c_char) {
	unsigned long pos = 0;
	// First: any BOM we cannot handle (UTF-16 / UTF-32, either endianness).
	PredicateOr< 
		PredicateBinaryLiteral< 4, l_utf32_be >,
		PredicateBinaryLiteral< 4, l_utf32_le >,
		PredicateBinaryLiteral< 2, l_utf16_be >,
		PredicateBinaryLiteral< 2, l_utf16_le >
	> unsupported_bom;
	if ( unsupported_bom.test( t->c_line, &pos, t->line_length ) ) {
		sprintf(t->ErrorMsg, "BOM error: we do not support anything but ascii and utf8 (%02X,%02X)", t->c_line[0], t->c_line[1]);
		return error_fail; 
	}
	// Second: a UTF-8 BOM, which we accept.
	PredicateBinaryLiteral< 3, l_utf8 > utf8_bom;
	if ( utf8_bom.test( t->c_line, &pos, t->line_length ) ) {
		// Record the BOM bytes in this token...
		for (unsigned long ix = 0; ix < pos; ix++ ) {
			token->text[ ix ] = t->c_line[ ix ];
		}
		// ...and advance the line past them.
		t->c_line += pos;
		t->line_length -= pos;
		token->length = 3;
	}
	// With or without a BOM, finalize this token and open the next one.
	TokenTypeNames zone = t->_finalize_token();
	t->_new_token(zone);
	return done_it_myself;
}

src/structure.h  view on Meta::CPAN

	CastToken() : AbstractTokenType( Token_Cast, true ) {}
	CharTokenizeResults tokenize(Tokenizer *t, Token *token, unsigned char c_char);
};

// Token type for a subroutine prototype (mapped to PPI::Token::Prototype by
// the CPPTokenizerWrapper table).  The recognition logic lives in the
// out-of-line tokenize() definition.
// NOTE(review): the second constructor argument is presumably a
// "significant token" flag — confirm against AbstractTokenType.
class PrototypeToken : public AbstractTokenType {
public:
	PrototypeToken() : AbstractTokenType( Token_Prototype, true ) {}
	CharTokenizeResults tokenize(Tokenizer *t, Token *token, unsigned char c_char);
};

// Token type for a byte-order mark at the start of the input (mapped to
// PPI::Token::BOM by the CPPTokenizerWrapper table).  Constructed with the
// boolean flag false, unlike e.g. PrototypeToken which passes true.
class BOMToken : public AbstractTokenType {
public:
	BOMToken() : AbstractTokenType( Token_BOM, false ) {}
	CharTokenizeResults tokenize(Tokenizer *t, Token *token, unsigned char c_char);
};
};

#endif

src/tokenizer.cpp  view on Meta::CPAN

	TokenTypeNames_pool[Token_Number_Exp] = new ExpNumberToken;
	TokenTypeNames_pool[Token_ArrayIndex] = new ArrayIndexToken;
	TokenTypeNames_pool[Token_Label] = new LabelToken;
	TokenTypeNames_pool[Token_Attribute] = new AttributeToken;
	TokenTypeNames_pool[Token_Attribute_Parameterized] = new ParameterizedAttributeToken;
	TokenTypeNames_pool[Token_Pod] = new PodToken;
	TokenTypeNames_pool[Token_Cast] = new CastToken;
	TokenTypeNames_pool[Token_Prototype] = new PrototypeToken;
	TokenTypeNames_pool[Token_DashedWord] = new DashedWordToken;
	TokenTypeNames_pool[Token_Number_Version] = new VersionNumberToken;
	TokenTypeNames_pool[Token_BOM] = new BOMToken;
	TokenTypeNames_pool[Token_Separator] = new SeparatorToken;
	TokenTypeNames_pool[Token_End] = new EndToken;
	TokenTypeNames_pool[Token_Data] = new DataToken;
	TokenTypeNames_pool[Token_HereDoc] = new HereDocToken;
	//TokenTypeNames_pool[Token_HereDoc_Body] = new HereDocBodyToken;
	

	for (int ix = 0; ix < NUM_SIGNIFICANT_KEPT; ix++) {
		m_LastSignificant[ix] = NULL;
	}

src/tokenizer.cpp  view on Meta::CPAN

	if ( ( c_token != NULL ) && ( c_token->type->type == Token_Whitespace ) ) {
	}
    return reached_eol;
}

LineTokenizeResults Tokenizer::tokenizeLine(char *line, unsigned long line_length) {
	line_pos = 0;
	c_line = line;
	this->line_length = line_length;
	if (c_token == NULL)
		_new_token(Token_BOM);
	while ( NULL != tokens_posponded_head ) {
		if ( tokens_posponded_head->type->isa( Token_HereDoc ) ) {
			ExtendedToken *tkn = (ExtendedToken *)tokens_posponded_head;
			AbstractTokenType::VerifySufficientBufferLength(tkn, line_length);
			if ( heredocbody_ended == ((HereDocToken*)(tokens_posponded_head->type))->Unpospone( this, tkn, line, line_length ) ) {
				// release all posponded tokens, as long as they are not an another heredoc token
				Token *tkn = tokens_posponded_head;
				tokens_posponded_head = tkn->next;
				chain_token(tkn, tokens_found_head, tokens_found_tail);
				while ( ( NULL != tokens_posponded_head ) && ( ! tokens_posponded_head->type->isa( Token_HereDoc ) ) ) {

src/tokenizer.h  view on Meta::CPAN

	Token_Prototype, // done
	Token_ArrayIndex, // done
	Token_HereDoc, // done
	Token_Attribute, // done
	Token_Attribute_Parameterized, // done
	Token_Label, // done
	Token_Separator, // done
	Token_End, // done
	Token_Data, // done
	Token_Pod, // done
	Token_BOM, // done
	Token_Foreign_Block, // for Perl6 code, unimplemented
	Token_LastTokenType, // Marker for the last real types

	// Here are abstract markers
	isToken_QuoteOrQuotaLike,
	isToken_Extended
};

// FIXME: fix the isa-a relationship between the tokens

src/whitespace.cpp  view on Meta::CPAN


	// FIXME: Add the c_char > 127 part?

	sprintf(t->ErrorMsg, "Error: charecter rejected: %d at pos %d", c_char, t->line_pos);
    return error_fail;
}

// Literal that terminates a POD section: a line beginning with "=cut".
extern const char end_pod[] = "=cut";
CharTokenizeResults PodToken::tokenize(Tokenizer *t, Token *token, unsigned char c_char) {
	// Entered only at the start of a line, though not necessarily at byte 0:
	// a BOM may precede it.
	PredicateLiteral< 4, end_pod > regex;
	unsigned long pos = t->line_pos;
	// suck the line anyway
	for ( unsigned long ix = pos; ix < t->line_length; ix++ ) {
		token->text[ token->length++ ] = t->c_line[ t->line_pos++ ];
	}
	if ( regex.test( t->c_line, &pos, t->line_length ) &&
		( ( pos >= t->line_length ) || is_whitespace( t->c_line[ pos ] ) ) ) {
		TokenTypeNames zone = t->_finalize_token();
		t->_new_token(zone);



( run in 0.460 second using v1.01-cache-2.11-cpan-131fc08a04b )