XML-Fast

 view release on metacpan or  search on metacpan

Fast.xs  view on Meta::CPAN

#define XML_DEVEL 0

#include "xmlfast.h"

/*
 * Debug tracing helper.
 * When XML_DEBUG evaluates to non-zero, debug(...) prints its printf-style
 * arguments to stderr followed by " at FILE line N."; otherwise it expands
 * to nothing. (An undefined XML_DEBUG evaluates to 0 inside #if.)
 */
#if XML_DEBUG
#define WHERESTR    " at %s line %d.\n"
#define WHEREARG    __FILE__, __LINE__
#define debug(...)   do{ fprintf(stderr, __VA_ARGS__); fprintf(stderr, WHERESTR, WHEREARG); } while(0)
#else
#define debug(...)
#endif


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

/*
 * Generic untyped pointer alias.
 * NOTE(review): #ifndef only tests for a macro named ptr_t; it does not
 * detect an earlier typedef of the same name.
 */
#ifndef ptr_t
typedef void * ptr_t;
#endif

/*
 * Define PERL_ARGS_ASSERT_SV_RECODE_TO_UTF8 as empty when nothing has defined
 * it yet.
 * NOTE(review): presumably a shim for Perl headers that lack it - confirm.
 */
#ifndef PERL_ARGS_ASSERT_SV_RECODE_TO_UTF8
#define PERL_ARGS_ASSERT_SV_RECODE_TO_UTF8
#endif

// One entry of the open-tag chain, filled in by on_tag_open for each depth level.
typedef struct {
	char *name;            // tag name as handed to on_tag_open (the pointer is stored, not a copy)
	unsigned int len;      // length of name in bytes
	char *fullname;        // '/'-prefixed path from the root to this tag; allocated only when TAG_MATCH is set
	unsigned int fulllen;  // length of fullname, not counting its terminating NUL
} xml_node;

/*
commit 30866c9f74d890c45e8da27ea855468a314a59cf
xmlbare 1785/s      --    -19%
xmlfast 2209/s     24%      --

*/

/*
 * Values for the `utf8` field of parsestate.
 * _xml2hash assigns UTF8_DECODE when conf "utf8decode" is true and
 * UTF8_UPGRADE otherwise; neither is assigned when conf "bytes" is true.
 */
#define UTF8_BYTES   1
#define UTF8_UPGRADE 2
#define UTF8_DECODE  3
#define UTF8_EDECODE 4

/*
 * Bits of the `flags` field.
 * EMIT_WARNS: set by _xml2hash unless conf "nowarn" is true.
 * TAG_MATCH:  makes on_tag_open build xml_node.fullname paths.
 */
#define EMIT_WARNS   0x0001
#define TAG_MATCH    0x0002

/*
 * Mode bits of the `flags` field, taken from the conf hash.
 * MODE_ORDER:  conf "order"; _xml2hash reports it as not implemented.
 * MODE_TRIM:   conf "trim"; when absent the parser is told to keep whitespace.
 * MODE_ARRAYS: conf "array" given as a true non-reference value.
 */
#define MODE_ORDER   0x1000
#define MODE_TRIM    0x2000
#define MODE_ARRAYS  0x4000

// Working state of one _xml2hash call; the parser callbacks receive it as pctx.
typedef struct {
	// config
	unsigned int flags;   // bitmask of EMIT_WARNS / TAG_MATCH / MODE_* bits
	unsigned int bytes;   // 1 when conf "bytes" is true
	unsigned int utf8;    // UTF8_DECODE or UTF8_UPGRADE; left 0 when bytes is set
	SV  * attr;           // conf "attr" value (only when defined and non-empty); points into conf
	SV  * text;           // hash key under which text content is stored; points into conf
	SV  * join;           // conf "join" value; points into conf
	SV  * cdata;          // hash key under which CDATA sections are stored; points into conf
	SV  * comm;           // conf "comm" value; when set, the comment callback is installed
	HV  * array;          // hash whose keys are the names listed in an arrayref conf "array"

	// state
	char *encoding;       // set to "utf8" by _xml2hash unless bytes is set
	SV   *encode;         // not assigned in the visible code
	int depth;            // index of the innermost open tag in chain; -1 while no tag is open
	unsigned int chainsize; // number of elements allocated in chain and hchain
	xml_node * chain;     // per-depth tag information
	HV ** hchain;         // per-depth hash pointers, allocated with the same size as chain
	HV  * hcurrent; //just a pointer (the hash that currently receives stored values)

	SV  * pi;             // created by on_pi_open, released by on_pi_close
	SV  * attrname;       // when set, the next completed text is stored as this attribute's value
	SV  * textval;        // text accumulated so far by on_bytes_part / on_bytes
	
	SV  * error;          // error message; _xml2hash croaks with it after parsing
	parser_state * state; // back-pointer to the parser state that owns this context
	
} parsestate;

// Working state of one _hash2xml call (hash -> XML serialisation).
typedef struct {
	// config
	unsigned int flags;   // MODE_ORDER / MODE_TRIM bits taken from conf
	unsigned int bytes;   // not assigned by the visible _hash2xml code
	unsigned int utf8;    // not assigned by the visible _hash2xml code
	
	// attr: prefix that marks a hash key as an XML attribute (default "-");
	// attl: strlen(attr), computed once in _hash2xml
	char  * attr; STRLEN attl;
	char  * text;         // key whose value is emitted as escaped text (default "#text")
	char  * join;         // not assigned by the visible _hash2xml code
	char  * cdata;        // key whose value is emitted as a CDATA section; 0 unless configured
	char  * comm;         // key whose value is emitted as a comment; 0 unless configured
	
	
	HV  * array;          // not assigned by the visible _hash2xml code

	// state
	// NOTE(review): the fields from here to `error` mirror parsestate;
	// the visible serialiser code reads only `error`.
	char *encoding;
	SV   *encode;
	unsigned int chainsize;
	xml_node * chain;
	HV ** hchain;
	HV  * hcurrent; //just a pointer

	SV  * pi;
	SV  * attrname;
	SV  * textval;
	
	SV  * error;          // error message; _hash2xml croaks with it after serialising
	
	/// new
	I32 depth;            // nesting level; kv2x increments it around each recursive call
	I32 ix;               // initialised to 0 by _hash2xml
	SV  * result;         // output buffer the generated XML is appended to
	SV  * nested;         // not assigned by the visible code
	
	
} compstate;

/*
 * hv_store_a(hv, key, sv): store sv in hv under the string value of the SV
 * `key`, collecting repeated keys into an array:
 *   - key absent:               sv is stored directly;
 *   - key holds an array ref:   sv is pushed onto that array;
 *   - key holds anything else:  a new array is created holding the previous
 *     value (the same reference with its refcount bumped, or a string copy
 *     of a non-reference value) followed by sv, and a reference to that
 *     array replaces the old entry.
 * sv is stored without incrementing its reference count.
 * The expansion declares locals named exists, kv, kl, av and old, so the
 * macro arguments must not be expressions that use those identifiers.
 */
#define hv_store_a( hv, key, sv ) \
	STMT_START { \
		SV **exists; \
		char *kv = SvPV_nolen(key); \
		int   kl = SvCUR(key); \
		if( exists = hv_fetch(hv, kv, kl, 0) ) { \
			if ( SvROK(*exists) && SvTYPE( SvRV(*exists) ) == SVt_PVAV) { \
				AV *av = (AV *) SvRV( *exists ); \
				av_push( av, sv ); \
			} \
			else { \
				AV *av   = newAV(); \
				if (SvROK(*exists)) { \
					SvREFCNT_inc(*exists); \
					av_push( av, *exists ); \
				} else { \
					SV *old  = newSV(0); \
					sv_copypv(old, *exists); \
					av_push( av, old ); \
				} \
				av_push( av, sv ); \
				(void) hv_store( hv, kv, kl, newRV_noinc( (SV *) av ), 0 ); \
			} \
		} else { \
			(void) hv_store(hv, kv, kl, sv, 0); \
		} \
	} STMT_END

// hv_store to array, create if not exists
#define hv_store_aa( hv, key, sv ) \
	STMT_START { \

Fast.xs  view on Meta::CPAN

		}
		sv_2mortal(ctx->attrname);
		ctx->attrname = 0;
		ctx->textval = 0;
	}
	else {
		hv_store_a(ctx->hcurrent, ctx->text, ctx->textval);
	}
	ctx->textval = 0;
}

void on_bytes_part(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_bytes_part");
#endif
	parsestate *ctx = pctx;
	
	if ( !( ctx->attrname || ctx->text ) ) return;
	
	if (ctx->textval) {
		if (length > 0) { sv_catpvn(ctx->textval, data, length); }
	} else {
		ctx->textval = newSVpvn(data, length);
	}
}

void on_bytes(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_bytes");
#endif
	parsestate *ctx = pctx;
	//if (!ctx->textval && !length) {
	//	my_warn(ctx,"Called on_bytes with empty text and empty body");
	//}
	if ( !( ctx->attrname || ctx->text ) ) return;
	
	if (ctx->textval) {
		if (length > 0) { sv_catpvn(ctx->textval, data, length); }
	} else {
		ctx->textval = newSVpvn(data, length);
	}
	xml_sv_decode(ctx,ctx->textval);
	if (ctx->attrname) {
		if (ctx->pi) {
			on_pi_attr(ctx);
		} else {
			hv_store_a(ctx->hcurrent, ctx->attrname, ctx->textval);
		}
		sv_2mortal(ctx->attrname);
		ctx->attrname = 0;
		ctx->textval = 0;
	}
	else {
		//printf("text close, store %s\n",SvPV_nolen(ctx->textval));
		hv_store_a(ctx->hcurrent, ctx->text, ctx->textval);
	}
	ctx->textval = 0;
}


void on_cdata(void * pctx, char * data,unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_cdata");
#endif
	parsestate *ctx = pctx;
	SV *sv   = newSVpvn(data, length);
	xml_sv_decode(ctx,sv);
	hv_store_a(ctx->hcurrent, ctx->cdata, sv );
}

void on_pi_open(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_pi_open");
#endif
	parsestate *ctx = pctx;
	ctx->pi = newSVpvn(data,length);
}

void on_pi_close(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_pi_close");
#endif
	parsestate *ctx = pctx;
	sv_2mortal(ctx->pi);
	ctx->pi = 0;
}

void on_tag_open(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_tag_open");
#endif
	parsestate *ctx = pctx;
	if (ctx->textval) {
		xml_sv_decode(ctx,ctx->textval);
		hv_store_a(ctx->hcurrent, ctx->text, ctx->textval);
		ctx->textval = 0;
	}
	HV * hv = newHV();
	ctx->depth++;
	if (ctx->depth >= ctx->chainsize) {
		warn("XML depth too high. Consider increasing `_max_depth' to at more than %d to avoid reallocations",ctx->chainsize);
		ctx->chainsize *= 2;
		Renew( ctx->hchain, ctx->chainsize, HV* );
		Renew( ctx->chain, ctx->chainsize, xml_node);
	}
	ctx->chain[ctx->depth].len = length;
	ctx->chain[ctx->depth].name = data;
	if (ctx->flags & TAG_MATCH) {
		if (ctx->depth == 0) {
			ctx->chain[ctx->depth].fulllen = length + 1;
			Newx(ctx->chain[ctx->depth].fullname, ctx->chain[ctx->depth].fulllen + 1, char);
			ctx->chain[ctx->depth].fullname[0] = '/';
			memcpy(ctx->chain[ctx->depth].fullname+1,data,length);
			ctx->chain[ctx->depth].fullname[length+1] = 0;
			//printf("Fullame = %s\n",ctx->chain[ctx->depth].fullname);
		} else {
			ctx->chain[ctx->depth].fulllen = ctx->chain[ctx->depth - 1].fulllen + length + 1;
			Newx(ctx->chain[ctx->depth].fullname, ctx->chain[ctx->depth].fulllen + 1, char);
			memcpy(
				ctx->chain[ctx->depth].fullname,
				ctx->chain[ctx->depth - 1].fullname,
				ctx->chain[ctx->depth - 1].fulllen
			);
			ctx->chain[ctx->depth].fullname[ ctx->chain[ctx->depth - 1].fulllen ] = '/';
			memcpy(
				ctx->chain[ctx->depth].fullname + ctx->chain[ctx->depth - 1].fulllen + 1,
				data,
				length

Fast.xs  view on Meta::CPAN

	//printf ("\n");
}
/*
 * Append the NUL-terminated string s to p->result with the five XML special
 * characters replaced by entity references:
 *     <  ->  &lt;     >  ->  &gt;     "  ->  &quot;
 *     '  ->  &apos;   &  ->  &amp;
 * Runs of ordinary bytes between special characters are appended unchanged.
 *
 * p : serialiser state; only p->result is touched.
 * s : text to escape; must be NUL-terminated.
 */
void h2xpe( compstate *p, char *s ) {
	char *b = s; /* start of the run of ordinary bytes not yet appended */

	for (;; s++) {
		const char *entity = 0;

		switch (*s) {
			case 0:
				/* end of input: flush whatever is still pending */
				if (b < s) sv_catpvf( p->result, "%-.*s", (int)(s - b), b );
				return;
			case '<':  entity = "&lt;";   break;
			case '>':  entity = "&gt;";   break;
			case '"':  entity = "&quot;"; break;
			case '\'': entity = "&apos;"; break;
			case '&':  entity = "&amp;";  break;
			default:
				/* ordinary byte: stays part of the pending run */
				continue;
		}

		/* special character: flush the pending run, then emit the entity */
		if (b < s) sv_catpvf( p->result, "%-.*s", (int)(s - b), b );
		sv_catpvf( p->result, "%s", entity );
		b = s + 1;
	}
}


void kv2x ( char *key, SV *val, compstate *p );
void kv2x ( char *key, SV *val, compstate *p ) {
	char closed;
	
	HE* ent;
	
	char   *nkey;
	STRLEN  i, nlen;
	
	AV  *av;
	SV **avv;
	
	debug("key=%s, val=%s",key, SvPV_nolen(val));
	if ( mystrcmp( key, p->text ) == 0 ) {
		h2xpe(p, SvPV_nolen( val ));
	}
	else
	if ( mystrcmp( key, p->cdata ) == 0 ) {
		h2xp(p, "<![CDATA[");
		h2xp(p, SvPV_nolen( val ));
		h2xp(p, "]]>");
	}
	else
	if ( p->comm && mystrcmp( key, p->comm) == 0 ) {
		debug("comm: %s", SvPV_nolen( val ));
		h2xp(p, "<!-- ");
		h2xpe(p, SvPV_nolen( val ));
		h2xp(p, " -->");
	}
	else {
		if (SvROK( val )) {
			switch ( SvTYPE( SvRV( val ) ) ) {
				case SVt_PVHV:
					debug("%s -> hash inside", key);
					(void) hv_iterinit( (HV *) SvRV( val ) );
					h2xp(p, "<%s", key);
					closed = 0;
					while ((ent = hv_iternext( (HV *) SvRV( val ) ))) {
						nkey = HePV(ent, nlen);
						if ( strncmp( nkey, p->attr, p->attl ) == 0 ) {
							nkey += p->attl;
							h2xp(p, " %s=\"",nkey);
							h2xpe(p, SvPV_nolen(HeVAL(ent)));
							h2xp(p, "\"");
							continue;
						}
					}
					
					(void) hv_iterinit( (HV *) SvRV( val ) );
					while ((ent = hv_iternext( (HV *) SvRV( val ) ))) {
						nkey = HePV(ent, nlen);
						if ( strncmp( nkey, p->attr, p->attl ) == 0 ) {
							continue;
						}
						debug("Nested %s->%s", nkey, SvPV_nolen(HeVAL(ent)));
						if (!closed) {
							closed = 1;
							h2xp(p,">");
						}
						p->depth++;
						kv2x( nkey, HeVAL(ent), p );
						p->depth--;
					}
					
					if (!closed) {
						h2xp(p, "/>");
					} else {
						h2xp(p, "</%s>",key);
					}
					
					
					break;
				case SVt_PVAV:
					debug("array inside (Multinode)");
					
					av = (AV *) SvRV( val );
					nlen = av_len(av) + 1;
					

Fast.xs  view on Meta::CPAN

		SV *encode = find_encoding("windows-1251");
		end = uvchr_to_utf8(utf, chr);
		*end = '\0';
		SV *tmp = sv_2mortal(newSVpvn(utf, end-utf));
		SvUTF8_on(tmp);
		SV *bytes = sv_recode_from_utf8(aTHX_ tmp, encode);
		sv_dump(bytes);
		printf("Created char %s / %s / bytes = %s\n", utf, SvPV_nolen(tmp), SvPV_nolen(bytes));
		//sv_recode_to_utf8(tmp, encode);
		//printf("Recoded %s\n",SvPV_nolen(tmp));
		croak("Force exit");
		

void
_xml2hash(xml_sv,conf)
		SV *xml_sv;
		HV *conf;
	PROTOTYPE: $$
	PPCODE:
		/*
		 * Parse the XML text in xml_sv and return a reference to the resulting hash.
		 *
		 * conf keys read here:
		 *   order       true: ordered mode (reported as not implemented)
		 *   trim        true: do not ask the parser to keep whitespace
		 *   bytes       true: byte mode; otherwise "utf8decode" picks UTF8_DECODE
		 *               over the default UTF8_UPGRADE
		 *   attr, text, join, cdata, comm   key names / settings kept in the context
		 *   array       arrayref of names, or a true scalar for MODE_ARRAYS
		 *   _max_depth  initial size of the tag chain (default 256, must be >= 1)
		 *   nowarn      true: do not install the warning callback
		 *
		 * Croaks with the recorded error message, or with "Unbalanced tags" when
		 * tags are still open after parsing.
		 */
		SvGETMAGIC(xml_sv);
		char *xml = SvPVbyte_nolen(xml_sv);
		SV * RV;
		
		parser_state state;
		memset(&state,0,sizeof(state));
		
		parsestate ctx;
		memset(&ctx,0,sizeof(parsestate));
		
		// parser and context point at each other
		state.ctx = &ctx;
		ctx.state = &state;
		
		SV **key;
		if ((key = hv_fetch(conf, "order", 5, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_ORDER;
		}
		if ((key = hv_fetch(conf, "trim", 4, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_TRIM;
		}
		if ((key = hv_fetch(conf, "bytes", 5, 0)) && SvTRUE(*key)) {
			ctx.bytes = 1;
		} else {
			if ((key = hv_fetch(conf, "utf8decode", 10, 0)) && SvTRUE(*key)) {
				ctx.utf8 = UTF8_DECODE;
			} else {
				ctx.utf8 = UTF8_UPGRADE;
			}
		}
		
		if ((key = hv_fetch(conf, "attr", 4, 0))) {
			if (SvOK(*key)) { // defined ...
				// Ask for the string form to get the length: SvCUR is only
				// meaningful on an SV that already has a string body
				STRLEN attr_len;
				(void) SvPV(*key, attr_len);
				if (attr_len > 0) { // ... and length
					ctx.attr = *key;
				}
			}
		}
		if ((key = hv_fetch(conf, "text", 4, 0)) && SvOK(*key)) {
			ctx.text = *key;
		}
		if ((key = hv_fetch(conf, "join", 4, 0)) && SvPOK(*key)) {
			ctx.join = *key;
		}
		if ((key = hv_fetch(conf, "cdata", 5, 0)) && SvPOK(*key)) {
			ctx.cdata = *key;
		}
		if ((key = hv_fetch(conf, "comm", 4, 0)) && SvPOK(*key)) {
			ctx.comm = *key;
		}
		if ((key = hv_fetch(conf, "array", 5, 0)) && SvOK(*key)) {
			if (SvROK(*key) && SvTYPE( SvRV(*key) ) == SVt_PVAV) {
				// list of names: remember each one as a key of ctx.array
				AV *av = (AV *) SvRV( *key );
				ctx.array = newHV();
				I32 len = 0, avlen = av_len(av) + 1;
				SV **val;
				for ( len = 0; len < avlen; len++ ) {
					if( ( val = av_fetch(av,len,0) ) && SvOK(*val) ) {
						if(SvPOK(*val)) {
							(void) hv_store( ctx.array, SvPV_nolen(*val), SvCUR(*val), newSV(0), 0 );
						} else {
							my_croak(&ctx,"Bad enrty in array entry: %s",SvPV_nolen(*val));
						}
					}
				}
				
				
			}
			else if (!SvROK(*key)) {
				//printf("Remember all should be arrays\n");
				if (SvTRUE(*key)) {
					ctx.flags |= MODE_ARRAYS;
				}
			}
			else {
				my_croak(&ctx,"Bad entry in array: %s",SvPV_nolen(*key));
			}
		}
		
		//ctx.flags |= TAG_MATCH;
		
		
		if ((key = hv_fetch(conf, "_max_depth", 10, 0)) && SvOK(*key)) {
			// Check the value while it is still signed: chainsize is unsigned,
			// so a negative setting would pass a "< 1" test made after the
			// assignment and turn into a huge allocation size
			int max_depth = (int) SvIV(*key);
			if (max_depth < 1) {
				my_croak(&ctx,"_max_depth contains bad value (%d)",max_depth);
			}
			ctx.chainsize = max_depth;
		} else {
			ctx.chainsize = 256;
		}
		
		if (!ctx.bytes) {
			ctx.encoding = "utf8";
		}
		
		if (ctx.flags & MODE_ORDER) {
			my_croak(&ctx,"Ordered mode not implemented yet\n");
		} else{
			ctx.hcurrent = newHV();
			
			Newx(ctx.chain, ctx.chainsize, xml_node);
			Newx(ctx.hchain, ctx.chainsize, HV*);
			ctx.depth    = -1; // no tag open yet
			
			// the mortal reference owns the root hash from here on
			RV  = sv_2mortal(newRV_noinc( (SV *) ctx.hcurrent ));
			
			state.cb.piopen      = on_pi_open;
			state.cb.piclose     = on_pi_close;
			state.cb.tagopen      = on_tag_open;
			state.cb.tagclose     = on_tag_close;
			
			state.cb.attrname     = on_attr_name;
			if ((key = hv_fetch(conf, "nowarn", 6, 0)) && SvTRUE(*key)) {
				//
			} else {
				state.cb.warn         = on_warn;
				ctx.flags |= EMIT_WARNS;
			}
			state.cb.die         = on_die;
			
			if(ctx.comm)
				state.cb.comment      = on_comment;
			
			// without a cdata key, CDATA content is handled like ordinary text
			if(ctx.cdata)
				state.cb.cdata        = on_cdata;
			else if(ctx.text)
				state.cb.cdata        = on_bytes;
			
			state.cb.bytes        = on_bytes;
			state.cb.bytespart    = on_bytes_part;
			state.cb.uchar        = on_uchar;
			
			if (!(ctx.flags & MODE_TRIM))
				state.save_wsp     = 1;
		}
		parse(xml,&state);
		
		// depth returns to -1 only when every opened tag has been closed
		if (ctx.depth > -1 && !ctx.error) {
			ctx.error = sv_2mortal(newSVpv("Unbalanced tags",0));
		}
		
		DESTROY(&ctx);
		
		if (ctx.error) {
			croak("%s", SvPV_nolen(ctx.error));
		}
		ST(0) = RV;
		XSRETURN(1);

void
_hash2xml(hash,conf)
		SV *hash;
		HV *conf;
	PROTOTYPE: $$
	PPCODE:
		compstate ctx;
		memset(&ctx,0,sizeof(parsestate));
		
		SV **key;
		if ((key = hv_fetch(conf, "order", 5, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_ORDER;
		}
		if ((key = hv_fetch(conf, "trim", 4, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_TRIM;
		}
		/*
		if ((key = hv_fetch(conf, "bytes", 5, 0)) && SvTRUE(*key)) {
			ctx.bytes = 1;
		} else {
			if ((key = hv_fetch(conf, "utf8decode", 10, 0)) && SvTRUE(*key)) {
				ctx.utf8 = UTF8_DECODE;
			} else {
				ctx.utf8 = UTF8_UPGRADE;
			}
		}
		*/
		
		if ((key = hv_fetch(conf, "attr", 4, 0)) && SvPOK(*key)) {
			ctx.attr = SvPV_nolen(*key);
			// warn ("Set attr to '%s'", ctx.attr);
		} else {
			ctx.attr = "-";
		}
		if ((key = hv_fetch(conf, "text", 4, 0)) && SvPOK(*key)) {
			ctx.text = SvPV_nolen(*key);
		} else {
			ctx.text = "#text";
		}
		if ((key = hv_fetch(conf, "cdata", 5, 0)) && SvPOK(*key)) {
			ctx.cdata = SvPV_nolen(*key);
		} else {
			ctx.cdata = 0;
		}
		if ((key = hv_fetch(conf, "comm", 4, 0)) && SvPOK(*key)) {
			ctx.comm = SvPV_nolen(*key);
		} else {
			ctx.comm = 0;
		}
		/*
		if ((key = hv_fetch(conf, "array", 5, 0)) && SvOK(*key)) {
			if (SvROK(*key) && SvTYPE( SvRV(*key) ) == SVt_PVAV) {
				AV *av = (AV *) SvRV( *key );
				ctx.array = newHV();
				I32 len = 0, avlen = av_len(av) + 1;
				SV **val;
				for ( len = 0; len < avlen; len++ ) {
					if( ( val = av_fetch(av,len,0) ) && SvOK(*val) ) {
						if(SvPOK(*val)) {
							(void) hv_store( ctx.array, SvPV_nolen(*val), SvCUR(*val), newSV(0), 0 );
						} else {
							//my_croak(&ctx,"Bad enrty in array entry: %s",SvPV_nolen(*val));
						}
					}
				}
				
				
			}
			else if (!SvROK(*key)) {
				//printf("Remember all should be arrays\n");
				if (SvTRUE(*key)) {
					ctx.flags |= MODE_ARRAYS;
				}
			}
			else {
				//my_croak(&ctx,"Bad entry in array: %s",SvPV_nolen(*key));
			}
		}
		
		//ctx.flags |= TAG_MATCH;
		
		if (!ctx.bytes) {
			ctx.encoding = "utf8";
		}
		*/
		
		ctx.depth  = 0;
		ctx.ix     = 0;
		ctx.result = sv_2mortal(newSVpv("",0));
		//ctx.nested = newSVpv("",0);
		ctx.attl   = strlen(ctx.attr);
		
		SvGROW(ctx.result,1024);
		//SvGROW(ctx.nested,1024);
		h2x( hash, &ctx);
		
		//croak("XXX: %s", SvPV_nolen(ctx.result));
		
		if (ctx.error) {
			croak("%s", SvPV_nolen(ctx.error));
		}
		
		ST(0) = ctx.result;



( run in 0.880 second using v1.01-cache-2.11-cpan-39bf76dae61 )