PPI-XS-Tokenizer

 view release on metacpan or  search on metacpan

src/tokenizer.cpp  view on Meta::CPAN

	for ( unsigned long ix = 0; ix < MAGIC_COUNT; ix++ )
		mmap.insert( uPair ( m_list[ix], 1 ) );
}

// Constructs an idle tokenizer.
//
// All cursor and queue members start out empty (NULL / 0), the zone starts
// as Token_Whitespace, and local_newline is '\n'. The body then:
//   1. allocates the token cache,
//   2. fills TokenTypeNames_pool with one handler object per token type,
//   3. clears the ring of remembered significant tokens,
//   4. calls fill_maps() for the operators / magics lookup maps.
// Every handler allocated here is released by the destructor.
Tokenizer::Tokenizer()
	: c_token(NULL)
	, c_line(NULL)
	, line_pos(0)
	, line_length(0)
	, local_newline('\n')
	, tokens_found_head(NULL)
	, tokens_found_tail(NULL)
	, tokens_posponded_head(NULL)
	, tokens_posponded_tail(NULL)
	, zone(Token_Whitespace)
	, m_nLastSignificantPos(0)
{
	m_TokensCache = new TokensCacheMany();

	// Begin with an empty table so that any type without a handler stays NULL.
	for (int slot = 0; slot < Token_LastTokenType; ++slot)
		TokenTypeNames_pool[slot] = NULL;
	TokenTypeNames_pool[Token_NoType] = NULL;

	// Basic token types.
	TokenTypeNames_pool[Token_Whitespace] = new WhiteSpaceToken;
	TokenTypeNames_pool[Token_Comment] = new CommentToken;
	TokenTypeNames_pool[Token_Structure] = new StructureToken;
	TokenTypeNames_pool[Token_Magic] = new MagicToken;
	TokenTypeNames_pool[Token_Operator] = new OperatorToken;
	TokenTypeNames_pool[Token_Unknown] = new UnknownToken;
	TokenTypeNames_pool[Token_Symbol] = new SymbolToken;
	TokenTypeNames_pool[Token_Operator_Attribute] = new AttributeOperatorToken;

	// Simple quotes: each handler receives its own type id and its quote character.
	TokenTypeNames_pool[Token_Quote_Double] = new AbstractSimpleQuote( Token_Quote_Double, true, '"' );
	TokenTypeNames_pool[Token_Quote_Single] = new AbstractSimpleQuote( Token_Quote_Single, true, '\'' );
	TokenTypeNames_pool[Token_QuoteLike_Backtick] = new AbstractSimpleQuote( Token_QuoteLike_Backtick, true, '`' );

	TokenTypeNames_pool[Token_Word] = new WordToken;

	// Quote-like and regexp types share two generic handler classes; the
	// constructor arguments are passed exactly as listed (see the class
	// declarations for their meaning).
	TokenTypeNames_pool[Token_Quote_Literal] = new AbstractQuoteTokenType( Token_Quote_Literal, true, 1, false );
	TokenTypeNames_pool[Token_Quote_Interpolate] = new AbstractQuoteTokenType( Token_Quote_Interpolate, true, 1, false );
	TokenTypeNames_pool[Token_QuoteLike_Words] = new AbstractQuoteTokenType( Token_QuoteLike_Words, true, 1, false );
	TokenTypeNames_pool[Token_QuoteLike_Command] = new AbstractQuoteTokenType( Token_QuoteLike_Command, true, 1, false );
	TokenTypeNames_pool[Token_QuoteLike_Readline] = new AbstractBareQuoteTokenType( Token_QuoteLike_Readline, true, 1, false );
	TokenTypeNames_pool[Token_Regexp_Match] = new AbstractQuoteTokenType( Token_Regexp_Match, true, 1, true );
	TokenTypeNames_pool[Token_Regexp_Match_Bare] = new AbstractBareQuoteTokenType( Token_Regexp_Match_Bare, true, 1, true );
	TokenTypeNames_pool[Token_QuoteLike_Regexp] = new AbstractQuoteTokenType( Token_QuoteLike_Regexp, true, 1, true );
	TokenTypeNames_pool[Token_Regexp_Substitute] = new AbstractQuoteTokenType( Token_Regexp_Substitute, true, 2, true );
	TokenTypeNames_pool[Token_Regexp_Transliterate] = new AbstractQuoteTokenType( Token_Regexp_Transliterate, true, 2, true );

	// Numeric literals.
	TokenTypeNames_pool[Token_Number] = new NumberToken;
	TokenTypeNames_pool[Token_Number_Float] = new FloatNumberToken;
	TokenTypeNames_pool[Token_Number_Hex] = new HexNumberToken;
	TokenTypeNames_pool[Token_Number_Binary] = new BinaryNumberToken;
	TokenTypeNames_pool[Token_Number_Octal] = new OctalNumberToken;
	TokenTypeNames_pool[Token_Number_Exp] = new ExpNumberToken;

	// Remaining token types.
	TokenTypeNames_pool[Token_ArrayIndex] = new ArrayIndexToken;
	TokenTypeNames_pool[Token_Label] = new LabelToken;
	TokenTypeNames_pool[Token_Attribute] = new AttributeToken;
	TokenTypeNames_pool[Token_Attribute_Parameterized] = new ParameterizedAttributeToken;
	TokenTypeNames_pool[Token_Pod] = new PodToken;
	TokenTypeNames_pool[Token_Cast] = new CastToken;
	TokenTypeNames_pool[Token_Prototype] = new PrototypeToken;
	TokenTypeNames_pool[Token_DashedWord] = new DashedWordToken;
	TokenTypeNames_pool[Token_Number_Version] = new VersionNumberToken;
	TokenTypeNames_pool[Token_BOM] = new BOMToken;
	TokenTypeNames_pool[Token_Separator] = new SeparatorToken;
	TokenTypeNames_pool[Token_End] = new EndToken;
	TokenTypeNames_pool[Token_Data] = new DataToken;
	TokenTypeNames_pool[Token_HereDoc] = new HereDocToken;
	// No handler is registered for Token_HereDoc_Body; its registration is disabled:
	//TokenTypeNames_pool[Token_HereDoc_Body] = new HereDocBodyToken;

	// No significant tokens have been seen yet.
	for (int slot = 0; slot < NUM_SIGNIFICANT_KEPT; ++slot)
		m_LastSignificant[slot] = NULL;

	fill_maps( operators, magics );
}

// Releases everything the tokenizer owns: outstanding tokens (through
// Reset()), every registered token-type handler, and the token cache.
Tokenizer::~Tokenizer() {
	// Reset() runs while the handler table is still populated.
	// NOTE(review): presumably finalizing the current token may consult the
	// handlers, which is why this comes first - confirm before reordering.
	Reset();

	for (int slot = 0; slot < Token_LastTokenType; ++slot) {
		if ( TokenTypeNames_pool[slot] == NULL )
			continue;	// no handler was registered for this type
		delete TokenTypeNames_pool[slot];
		TokenTypeNames_pool[slot] = NULL;
	}

	delete m_TokensCache;
}

void Tokenizer::Reset() {
	Token *t;
	EndOfDocument();

	while ( ( t = pop_one_token() ) != NULL ) {
		freeToken( t );
	}
	for (int ix = 0; ix < NUM_SIGNIFICANT_KEPT; ix++) {
		if (m_LastSignificant[ix] != NULL) {
			freeToken(m_LastSignificant[ix]);
			m_LastSignificant[ix] = NULL;
		}
	}
	c_token = NULL;
	c_line = NULL;
	line_pos = 0;
	line_length = 0;
	zone = Token_Whitespace;
	m_nLastSignificantPos = 0;
}

// Counts the tokens in the singly linked list starting at `head`,
// following each token's `next` pointer until NULL.
// Returns 0 for an empty (NULL) list.
unsigned int count_waiting_tokens(Token *head) {
	unsigned int total = 0;
	for ( Token *cursor = head; cursor != NULL; cursor = cursor->next )
		++total;
	return total;
}

void Tokenizer::EndOfDocument() {
	if ( c_token != NULL )
		_finalize_token();
	while ( NULL != tokens_posponded_head ) {

src/tokenizer.cpp  view on Meta::CPAN


// Looks up one of the remembered significant tokens.
//
// `n` must lie in 1 .. NUM_SIGNIFICANT_KEPT; any other value yields NULL.
// n == 1 maps to slot m_nLastSignificantPos, and each larger n steps one
// slot further back, wrapping around the m_LastSignificant ring.
// The returned slot may itself hold NULL.
Token *Tokenizer::_last_significant_token(unsigned int n) {
	const bool in_range = ( n >= 1 ) && ( n <= NUM_SIGNIFICANT_KEPT );
	if ( !in_range )
		return NULL;

	// Adding NUM_SIGNIFICANT_KEPT before subtracting n keeps the value non-negative.
	const unsigned int slot = ( m_nLastSignificantPos + NUM_SIGNIFICANT_KEPT - n + 1 ) % NUM_SIGNIFICANT_KEPT;
	return m_LastSignificant[slot];
}

// Classifies what may follow the previous significant token.
//
// Returns:
//   ooc_Operand  - there is no previous significant token, the previous
//                  token is an operator, or it is a single '(' '{' '[' ';'.
//   ooc_Operator - the previous token is a symbol, a number, a quote or
//                  quote-like, an array index, or a single '}'.
//   ooc_Unknown  - anything else.
OperatorOperandContext Tokenizer::_opcontext() {
	Token *prev = _last_significant_token(1);
	if ( prev == NULL )
		return ooc_Operand;

	const bool is_value_like =
		prev->type->isa( Token_Symbol ) ||
		prev->type->isa( Token_Number ) ||
		prev->type->isa( isToken_QuoteOrQuotaLike ) ||
		( prev->type->type == Token_ArrayIndex );
	if ( is_value_like )
		return ooc_Operator;

	if ( prev->type->isa( Token_Operator ) )
		return ooc_Operand;

	// FIXME: Are we searching for Structure tokens?
	// Only single-character tokens are examined from here on.
	if ( prev->length != 1 )
		return ooc_Unknown;

	const unsigned char only_char = prev->text[0];
	switch ( only_char ) {
		case '(':
		case '{':
		case '[':
		case ';':
			return ooc_Operand;
		case '}':
			return ooc_Operator;
		default:
			return ooc_Unknown;
	}
}

//=====================================

// Feeds the remainder of the current line (c_line[line_pos .. line_length))
// to the handler of the current token, one character at a time.
//
// For each character the current token's type handler decides:
//   my_char        - the character belongs to the current token; it is
//                    appended to the token text and the cursor advances.
//   done_it_myself - nothing is done here.
//                    NOTE(review): presumably the handler has already moved
//                    line_pos and/or switched c_token itself - otherwise
//                    this loop would not make progress; confirm.
//   error_fail     - tokenizing stops at once.
//
// Returns tokenizing_fail on error_fail, otherwise reached_eol once
// line_pos has reached line_length.
LineTokenizeResults Tokenizer::_tokenize_the_rest_of_the_line() {
	// Ensure the current token's buffer can take up to a full line of text.
	AbstractTokenType::VerifySufficientBufferLength(c_token, line_length);
	while (line_length > line_pos) {
		// c_token is re-read on every pass: the handler receives `this`
		// and may replace the current token.
		CharTokenizeResults rv = c_token->type->tokenize(this, c_token, c_line[line_pos]);
		switch (rv) {
			case my_char:
				c_token->text[c_token->length++] = c_line[line_pos++];
				break;
			case done_it_myself:
				break;
			case error_fail:
				return tokenizing_fail;
		}
	}
	return reached_eol;
}

// Tokenizes one input line.
//
// `line` / `line_length` become the current line buffer and its length.
// If there is no current token, a Token_BOM token is started.
//
// Postponed tokens are handled before any character of the line is scanned:
//   * non-heredoc tokens at the front of the postponed queue are moved to
//     the found queue;
//   * if a here-doc token is at the front, the whole line is handed to its
//     Unpospone() and the function returns reached_eol without scanning the
//     line. When Unpospone() reports heredocbody_ended, that token and all
//     following non-heredoc tokens are moved to the found queue as well.
//
// With an empty postponed queue the line is scanned through
// _tokenize_the_rest_of_the_line() and its result is returned.
LineTokenizeResults Tokenizer::tokenizeLine(char *line, unsigned long line_length) {
	this->line_length = line_length;
	c_line = line;
	line_pos = 0;
	if (c_token == NULL)
		_new_token(Token_BOM);

	while ( tokens_posponded_head != NULL ) {
		Token *front = tokens_posponded_head;

		if ( ! front->type->isa( Token_HereDoc ) ) {
			// Ordinary postponed token: move it to the found queue.
			tokens_posponded_head = front->next;
			chain_token(front, tokens_found_head, tokens_found_tail);
			continue;
		}

		// A here-doc is waiting for its body: this line belongs to it.
		ExtendedToken *heredoc = (ExtendedToken *)front;
		AbstractTokenType::VerifySufficientBufferLength(heredoc, line_length);
		HereDocToken *heredoc_type = (HereDocToken*)(front->type);
		if ( heredocbody_ended == heredoc_type->Unpospone( this, heredoc, line, line_length ) ) {
			// Release the finished here-doc, then every postponed token
			// behind it up to (but not including) the next here-doc.
			do {
				Token *released = tokens_posponded_head;
				tokens_posponded_head = released->next;
				chain_token(released, tokens_found_head, tokens_found_tail);
			} while ( ( tokens_posponded_head != NULL ) && ( ! tokens_posponded_head->type->isa( Token_HereDoc ) ) );

			if ( tokens_posponded_head == NULL )
				tokens_posponded_tail = NULL;
		}
		return reached_eol;
	}

	// The postponed queue is empty now.
	tokens_posponded_tail = NULL;
	return _tokenize_the_rest_of_the_line();
}

// Re-types the current token (c_token) as `new_type`.
//
// When the old and new handlers agree on isa(isToken_Extended), only the
// token's type pointer is replaced. When they disagree, a new token is
// obtained from the new handler, the text buffer, its allocated size and
// the length are transferred to it, the old token (now holding the new
// token's unused buffer and length 0) is freed, and the new token becomes
// c_token.
//
// No NULL checks are performed on c_token or on the handler looked up for
// `new_type`.
void Tokenizer::changeTokenType(TokenTypeNames new_type) {
	AbstractTokenType *current = c_token->type;
	AbstractTokenType *target = TokenTypeNames_pool[new_type];

	const bool same_kind = ( current->isa(isToken_Extended) == target->isa(isToken_Extended) );
	if ( !same_kind ) {
		Token *replacement = target->GetNewToken( this, m_TokensCache, line_pos + 1 );

		// Exchange the text buffers ...
		char *spare_text = replacement->text;
		replacement->text = c_token->text;
		c_token->text = spare_text;

		// ... and their recorded capacities, so each token still describes
		// the buffer it now holds.
		unsigned long spare_size = replacement->allocated_size;
		replacement->allocated_size = c_token->allocated_size;
		c_token->allocated_size = spare_size;

		// The text collected so far now lives in the replacement.
		replacement->length = c_token->length;
		c_token->length = 0;

		freeToken( c_token );
		c_token = replacement;
	}

	c_token->type = target;
}



( run in 1.202 second using v1.01-cache-2.11-cpan-97f6503c9c8 )