XML-Fast
view release on metacpan or search on metacpan
#define XML_DEVEL 0
#include "xmlfast.h"
/*
 * debug(...) - printf-style trace helper.
 * When XML_DEBUG is non-zero the message is written to stderr followed by
 * " at FILE line N."; otherwise every debug() call expands to nothing.
 */
#if XML_DEBUG
#define WHERESTR " at %s line %d.\n"
#define WHEREARG __FILE__, __LINE__
#define debug(...) do{ fprintf(stderr, __VA_ARGS__); fprintf(stderr, WHERESTR, WHEREARG); } while(0)
#else
#define debug(...)
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#ifndef ptr_t
typedef void * ptr_t;
#endif
#ifndef PERL_ARGS_ASSERT_SV_RECODE_TO_UTF8
#define PERL_ARGS_ASSERT_SV_RECODE_TO_UTF8
#endif
/*
 * One entry of the stack of currently open elements (parsestate.chain).
 * on_tag_open() stores the tag name pointer it receives as-is (no copy) and,
 * when TAG_MATCH is set, builds `fullname` in memory obtained with Newx():
 * the parent's fullname, a '/', then this tag's name.
 */
typedef struct {
char *name; /* tag name as handed to on_tag_open() */
unsigned int len; /* length of name in bytes */
char *fullname; /* '/'-joined path of tag names; filled only with TAG_MATCH */
unsigned int fulllen; /* length of fullname; the buffer is allocated one byte larger */
} xml_node;
/*
commit 30866c9f74d890c45e8da27ea855468a314a59cf
xmlbare 1785/s -- -19%
xmlfast 2209/s 24% --
*/
#define UTF8_BYTES 1
#define UTF8_UPGRADE 2
#define UTF8_DECODE 3
#define UTF8_EDECODE 4
#define EMIT_WARNS 0x0001
#define TAG_MATCH 0x0002
#define MODE_ORDER 0x1000
#define MODE_TRIM 0x2000
#define MODE_ARRAYS 0x4000
/*
 * Context of one xml2hash run. _xml2hash fills the config part from its conf
 * hash; the parser callbacks (on_bytes, on_tag_open, ...) receive this struct
 * as their first argument and update the state part.
 */
typedef struct {
// config
unsigned int flags; /* EMIT_WARNS / TAG_MATCH / MODE_* bits */
unsigned int bytes; /* 1 when the "bytes" option is true */
unsigned int utf8; /* UTF8_* value; left 0 when bytes is set */
/* The five SVs below are taken straight from the conf hash by _xml2hash,
 * which does not increment their reference counts. */
SV * attr; /* "attr" option; set only when defined and non-empty */
SV * text; /* hash key under which text content is stored */
SV * join; /* "join" option */
SV * cdata; /* hash key for CDATA sections; on_cdata is installed only when set */
SV * comm; /* "comm" option; on_comment is installed only when set */
HV * array; /* names listed in the "array" option, stored as hash keys */
// state
char *encoding; /* set to "utf8" by _xml2hash unless bytes is set */
SV *encode;
int depth; /* index of the innermost open element in chain; -1 before the first tag */
unsigned int chainsize; /* allocated element count of chain and hchain */
xml_node * chain; /* stack of open elements */
HV ** hchain;
HV * hcurrent; /* hash that currently receives values; just a pointer */
SV * pi; /* name of the processing instruction being read, 0 outside of one */
SV * attrname; /* attribute name waiting for its value, 0 otherwise */
SV * textval; /* text collected so far by on_bytes_part/on_bytes, 0 when empty */
SV * error; /* when set, _xml2hash croaks with this message after parsing */
parser_state * state; /* back-pointer to the parser state owning this context */
} parsestate;
/*
 * Context of one hash2xml run. _hash2xml fills attr, attl, text, cdata, comm,
 * depth, ix and result; the remaining fields mirror parsestate and are not
 * written by the code visible here.
 */
typedef struct {
// config
unsigned int flags; /* MODE_* bits taken from the "order" and "trim" options */
unsigned int bytes;
unsigned int utf8;
char * attr; STRLEN attl; /* attribute key prefix (default "-") and its strlen() */
char * text; /* key whose value is emitted as escaped text (default "#text") */
char * join;
char * cdata; /* key whose value is emitted as a CDATA section; 0 when not configured */
char * comm; /* key whose value is emitted as a comment; 0 when not configured */
HV * array;
// state
char *encoding;
SV *encode;
unsigned int chainsize;
xml_node * chain;
HV ** hchain;
HV * hcurrent; //just a pointer
SV * pi;
SV * attrname;
SV * textval;
SV * error; /* when set, _hash2xml croaks with this message */
/// new
I32 depth; /* nesting level; kv2x increments it around each nested key */
I32 ix;
SV * result; /* output buffer the generated XML is appended to */
SV * nested;
} compstate;
/*
 * hv_store_a( hv, key, sv ) - store `sv` in the hash `hv` under the string
 * value of the SV `key`, collecting repeated keys into an array:
 *   - key absent:                 sv is stored directly;
 *   - key holds an array ref:     sv is pushed onto that array;
 *   - key holds anything else:    the entry is replaced by a reference to a
 *                                 new array [ old value, sv ]; an old
 *                                 reference is shared (refcount bumped), an
 *                                 old plain scalar is copied as a string.
 * The reference to `sv` is handed over to the hash/array.
 * Every argument is evaluated exactly once, and the macro's locals carry an
 * hvsa_ prefix so they cannot capture names used in the caller's expressions.
 */
#define hv_store_a( hv, key, sv ) \
STMT_START { \
	HV     *hvsa_hv  = (hv); \
	SV     *hvsa_key = (key); \
	SV     *hvsa_val = (sv); \
	STRLEN  hvsa_len; \
	char   *hvsa_kv   = SvPV(hvsa_key, hvsa_len); \
	SV    **hvsa_slot = hv_fetch(hvsa_hv, hvsa_kv, (I32) hvsa_len, 0); \
	if ( hvsa_slot == NULL ) { \
		(void) hv_store(hvsa_hv, hvsa_kv, (I32) hvsa_len, hvsa_val, 0); \
	} \
	else if ( SvROK(*hvsa_slot) && SvTYPE( SvRV(*hvsa_slot) ) == SVt_PVAV ) { \
		av_push( (AV *) SvRV(*hvsa_slot), hvsa_val ); \
	} \
	else { \
		AV *hvsa_av = newAV(); \
		if ( SvROK(*hvsa_slot) ) { \
			SvREFCNT_inc(*hvsa_slot); \
			av_push( hvsa_av, *hvsa_slot ); \
		} else { \
			SV *hvsa_old = newSV(0); \
			sv_copypv(hvsa_old, *hvsa_slot); \
			av_push( hvsa_av, hvsa_old ); \
		} \
		av_push( hvsa_av, hvsa_val ); \
		(void) hv_store( hvsa_hv, hvsa_kv, (I32) hvsa_len, newRV_noinc( (SV *) hvsa_av ), 0 ); \
	} \
} STMT_END
// hv_store to array, create if not exists
#define hv_store_aa( hv, key, sv ) \
STMT_START { \
}
sv_2mortal(ctx->attrname);
ctx->attrname = 0;
ctx->textval = 0;
}
else {
hv_store_a(ctx->hcurrent, ctx->text, ctx->textval);
}
ctx->textval = 0;
}
/*
 * Parser callback for a chunk of character data that is not yet complete.
 * The bytes are appended to the context's textval (created on first use);
 * nothing is stored in the result hash here.
 *
 * pctx   - parsestate of the running parse
 * data   - start of the chunk
 * length - chunk size in bytes
 */
void on_bytes_part(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_bytes_part");
#endif
	parsestate *st = pctx;
	/* Text is collected only for a pending attribute or when a text key is configured. */
	if (!st->attrname && !st->text) return;
	if (!st->textval) {
		st->textval = newSVpvn(data, length);
		return;
	}
	if (length > 0) {
		sv_catpvn(st->textval, data, length);
	}
}
/*
 * Parser callback for the final chunk of character data.
 * The chunk is appended to whatever on_bytes_part() collected, the result is
 * passed through xml_sv_decode() and then stored:
 *   - with an attribute name pending: via on_pi_attr() inside a processing
 *     instruction, otherwise in the current hash under that attribute name;
 *     the attribute name is then released;
 *   - otherwise: in the current hash under the configured text key.
 * In both cases the context's textval is reset afterwards.
 *
 * pctx   - parsestate of the running parse
 * data   - start of the chunk
 * length - chunk size in bytes
 */
void on_bytes(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_bytes");
#endif
	parsestate *ctx = pctx;
	/* Nothing to do unless an attribute is pending or a text key is configured. */
	if (!ctx->attrname && !ctx->text) return;
	if (!ctx->textval) {
		ctx->textval = newSVpvn(data, length);
	}
	else if (length > 0) {
		sv_catpvn(ctx->textval, data, length);
	}
	xml_sv_decode(ctx,ctx->textval);
	if (!ctx->attrname) {
		hv_store_a(ctx->hcurrent, ctx->text, ctx->textval);
	}
	else {
		if (ctx->pi) {
			on_pi_attr(ctx);
		}
		else {
			hv_store_a(ctx->hcurrent, ctx->attrname, ctx->textval);
		}
		sv_2mortal(ctx->attrname);
		ctx->attrname = 0;
	}
	ctx->textval = 0;
}
/*
 * Parser callback for a CDATA section: the content becomes a new SV, goes
 * through xml_sv_decode() and is stored in the current hash under the
 * configured cdata key.
 *
 * pctx   - parsestate of the running parse
 * data   - start of the section content
 * length - content size in bytes
 */
void on_cdata(void * pctx, char * data,unsigned int length) {
#if XML_DEVEL
	if (!pctx) croak("Context not passed to on_cdata");
#endif
	parsestate *ctx = pctx;
	SV *section = newSVpvn(data, length);
	xml_sv_decode(ctx, section);
	hv_store_a(ctx->hcurrent, ctx->cdata, section);
}
void on_pi_open(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
if (!pctx) croak("Context not passed to on_pi_open");
#endif
parsestate *ctx = pctx;
ctx->pi = newSVpvn(data,length);
}
void on_pi_close(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
if (!pctx) croak("Context not passed to on_pi_close");
#endif
parsestate *ctx = pctx;
sv_2mortal(ctx->pi);
ctx->pi = 0;
}
void on_tag_open(void * pctx, char * data, unsigned int length) {
#if XML_DEVEL
if (!pctx) croak("Context not passed to on_tag_open");
#endif
parsestate *ctx = pctx;
if (ctx->textval) {
xml_sv_decode(ctx,ctx->textval);
hv_store_a(ctx->hcurrent, ctx->text, ctx->textval);
ctx->textval = 0;
}
HV * hv = newHV();
ctx->depth++;
if (ctx->depth >= ctx->chainsize) {
warn("XML depth too high. Consider increasing `_max_depth' to at more than %d to avoid reallocations",ctx->chainsize);
ctx->chainsize *= 2;
Renew( ctx->hchain, ctx->chainsize, HV* );
Renew( ctx->chain, ctx->chainsize, xml_node);
}
ctx->chain[ctx->depth].len = length;
ctx->chain[ctx->depth].name = data;
if (ctx->flags & TAG_MATCH) {
if (ctx->depth == 0) {
ctx->chain[ctx->depth].fulllen = length + 1;
Newx(ctx->chain[ctx->depth].fullname, ctx->chain[ctx->depth].fulllen + 1, char);
ctx->chain[ctx->depth].fullname[0] = '/';
memcpy(ctx->chain[ctx->depth].fullname+1,data,length);
ctx->chain[ctx->depth].fullname[length+1] = 0;
//printf("Fullame = %s\n",ctx->chain[ctx->depth].fullname);
} else {
ctx->chain[ctx->depth].fulllen = ctx->chain[ctx->depth - 1].fulllen + length + 1;
Newx(ctx->chain[ctx->depth].fullname, ctx->chain[ctx->depth].fulllen + 1, char);
memcpy(
ctx->chain[ctx->depth].fullname,
ctx->chain[ctx->depth - 1].fullname,
ctx->chain[ctx->depth - 1].fulllen
);
ctx->chain[ctx->depth].fullname[ ctx->chain[ctx->depth - 1].fulllen ] = '/';
memcpy(
ctx->chain[ctx->depth].fullname + ctx->chain[ctx->depth - 1].fulllen + 1,
data,
length
//printf ("\n");
}
/*
 * h2xpe - append the NUL-terminated string `s` to p->result with XML
 * escaping: the characters < > " ' & are replaced by the predefined entity
 * references &lt; &gt; &quot; &apos; &amp;, everything else is copied
 * unchanged.
 *
 * p - serializer context; only p->result is touched
 * s - string to append (must not be NULL)
 */
void h2xpe( compstate *p, char *s ) {
	char *b = s; /* start of the run of ordinary characters not yet written */
	const char *entity;
	STRLEN entity_len;
	for (;; s++) {
		switch (*s) {
			case '\0':
				/* flush the trailing run and stop */
				if (b < s) sv_catpvn( p->result, b, (STRLEN)(s - b) );
				return;
			case '<':  entity = "&lt;";   entity_len = 4; break;
			case '>':  entity = "&gt;";   entity_len = 4; break;
			case '"':  entity = "&quot;"; entity_len = 6; break;
			case '\'': entity = "&apos;"; entity_len = 6; break;
			case '&':  entity = "&amp;";  entity_len = 5; break;
			default:
				/* ordinary character: stays in the pending run */
				continue;
		}
		/* write the run that precedes the special character, then its entity */
		if (b < s) sv_catpvn( p->result, b, (STRLEN)(s - b) );
		sv_catpvn( p->result, entity, entity_len );
		b = s + 1;
	}
}
void kv2x ( char *key, SV *val, compstate *p );
void kv2x ( char *key, SV *val, compstate *p ) {
char closed;
HE* ent;
char *nkey;
STRLEN i, nlen;
AV *av;
SV **avv;
debug("key=%s, val=%s",key, SvPV_nolen(val));
if ( mystrcmp( key, p->text ) == 0 ) {
h2xpe(p, SvPV_nolen( val ));
}
else
if ( mystrcmp( key, p->cdata ) == 0 ) {
h2xp(p, "<![CDATA[");
h2xp(p, SvPV_nolen( val ));
h2xp(p, "]]>");
}
else
if ( p->comm && mystrcmp( key, p->comm) == 0 ) {
debug("comm: %s", SvPV_nolen( val ));
h2xp(p, "<!-- ");
h2xpe(p, SvPV_nolen( val ));
h2xp(p, " -->");
}
else {
if (SvROK( val )) {
switch ( SvTYPE( SvRV( val ) ) ) {
case SVt_PVHV:
debug("%s -> hash inside", key);
(void) hv_iterinit( (HV *) SvRV( val ) );
h2xp(p, "<%s", key);
closed = 0;
while ((ent = hv_iternext( (HV *) SvRV( val ) ))) {
nkey = HePV(ent, nlen);
if ( strncmp( nkey, p->attr, p->attl ) == 0 ) {
nkey += p->attl;
h2xp(p, " %s=\"",nkey);
h2xpe(p, SvPV_nolen(HeVAL(ent)));
h2xp(p, "\"");
continue;
}
}
(void) hv_iterinit( (HV *) SvRV( val ) );
while ((ent = hv_iternext( (HV *) SvRV( val ) ))) {
nkey = HePV(ent, nlen);
if ( strncmp( nkey, p->attr, p->attl ) == 0 ) {
continue;
}
debug("Nested %s->%s", nkey, SvPV_nolen(HeVAL(ent)));
if (!closed) {
closed = 1;
h2xp(p,">");
}
p->depth++;
kv2x( nkey, HeVAL(ent), p );
p->depth--;
}
if (!closed) {
h2xp(p, "/>");
} else {
h2xp(p, "</%s>",key);
}
break;
case SVt_PVAV:
debug("array inside (Multinode)");
av = (AV *) SvRV( val );
nlen = av_len(av) + 1;
SV *encode = find_encoding("windows-1251");
end = uvchr_to_utf8(utf, chr);
*end = '\0';
SV *tmp = sv_2mortal(newSVpvn(utf, end-utf));
SvUTF8_on(tmp);
SV *bytes = sv_recode_from_utf8(aTHX_ tmp, encode);
sv_dump(bytes);
printf("Created char %s / %s / bytes = %s\n", utf, SvPV_nolen(tmp), SvPV_nolen(bytes));
//sv_recode_to_utf8(tmp, encode);
//printf("Recoded %s\n",SvPV_nolen(tmp));
croak("Force exit");
void
_xml2hash(xml_sv,conf)
		SV *xml_sv;
		HV *conf;
	PROTOTYPE: $$
	PPCODE:
		/*
		 * Parse the XML held in xml_sv and return a reference to the root hash.
		 * Options read from conf: order, trim, bytes, utf8decode, attr, text,
		 * join, cdata, comm, array, _max_depth, nowarn.
		 * Invalid options are reported through my_croak(); after parsing, a
		 * recorded error or an unclosed tag ("Unbalanced tags") makes the call croak.
		 */
		SvGETMAGIC(xml_sv);
		char *xml = SvPVbyte_nolen(xml_sv);
		SV * RV;
		parser_state state;
		memset(&state,0,sizeof(state));
		parsestate ctx;
		memset(&ctx,0,sizeof(parsestate));
		state.ctx = &ctx;
		ctx.state = &state;
		SV **key;
		if ((key = hv_fetch(conf, "order", 5, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_ORDER;
		}
		if ((key = hv_fetch(conf, "trim", 4, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_TRIM;
		}
		if ((key = hv_fetch(conf, "bytes", 5, 0)) && SvTRUE(*key)) {
			ctx.bytes = 1;
		} else {
			if ((key = hv_fetch(conf, "utf8decode", 10, 0)) && SvTRUE(*key)) {
				ctx.utf8 = UTF8_DECODE;
			} else {
				ctx.utf8 = UTF8_UPGRADE;
			}
		}
		if ((key = hv_fetch(conf, "attr", 4, 0)) && SvOK(*key)) {
			/* defined and non-empty; the length comes from SvPV() because
			 * SvCUR() may only be used on an SV that holds a string body */
			STRLEN attr_len;
			(void) SvPV(*key, attr_len);
			if (attr_len > 0) {
				ctx.attr = *key;
			}
		}
		if ((key = hv_fetch(conf, "text", 4, 0)) && SvOK(*key)) {
			ctx.text = *key;
		}
		if ((key = hv_fetch(conf, "join", 4, 0)) && SvPOK(*key)) {
			ctx.join = *key;
		}
		if ((key = hv_fetch(conf, "cdata", 5, 0)) && SvPOK(*key)) {
			ctx.cdata = *key;
		}
		if ((key = hv_fetch(conf, "comm", 4, 0)) && SvPOK(*key)) {
			ctx.comm = *key;
		}
		if ((key = hv_fetch(conf, "array", 5, 0)) && SvOK(*key)) {
			if (SvROK(*key) && SvTYPE( SvRV(*key) ) == SVt_PVAV) {
				/* array reference: remember every listed name as a hash key */
				AV *av = (AV *) SvRV( *key );
				ctx.array = newHV();
				I32 len = 0, avlen = av_len(av) + 1;
				SV **val;
				for ( len = 0; len < avlen; len++ ) {
					if( ( val = av_fetch(av,len,0) ) && SvOK(*val) ) {
						if(SvPOK(*val)) {
							(void) hv_store( ctx.array, SvPV_nolen(*val), SvCUR(*val), newSV(0), 0 );
						} else {
							my_croak(&ctx,"Bad enrty in array entry: %s",SvPV_nolen(*val));
						}
					}
				}
			}
			else if (!SvROK(*key)) {
				/* plain true value: array mode for everything */
				if (SvTRUE(*key)) {
					ctx.flags |= MODE_ARRAYS;
				}
			}
			else {
				my_croak(&ctx,"Bad entry in array: %s",SvPV_nolen(*key));
			}
		}
		//ctx.flags |= TAG_MATCH;
		if ((key = hv_fetch(conf, "_max_depth", 10, 0)) && SvOK(*key)) {
			/* Validate as a signed value before storing: chainsize is unsigned,
			 * so a negative number assigned first would wrap to a huge size
			 * and slip past a "< 1" test. */
			IV max_depth = SvIV(*key);
			if (max_depth < 1 || max_depth > (IV) I32_MAX) {
				my_croak(&ctx,"_max_depth contains bad value (%" IVdf ")",max_depth);
			}
			ctx.chainsize = (unsigned int) max_depth;
		} else {
			ctx.chainsize = 256;
		}
		if (!ctx.bytes) {
			ctx.encoding = "utf8";
		}
		if (ctx.flags & MODE_ORDER) {
			my_croak(&ctx,"Ordered mode not implemented yet\n");
		} else{
			ctx.hcurrent = newHV();
			Newx(ctx.chain, ctx.chainsize, xml_node);
			Newx(ctx.hchain, ctx.chainsize, HV*);
			ctx.depth = -1;
			/* the mortal reference keeps the root hash alive until it is returned */
			RV = sv_2mortal(newRV_noinc( (SV *) ctx.hcurrent ));
			state.cb.piopen = on_pi_open;
			state.cb.piclose = on_pi_close;
			state.cb.tagopen = on_tag_open;
			state.cb.tagclose = on_tag_close;
			state.cb.attrname = on_attr_name;
			if ((key = hv_fetch(conf, "nowarn", 6, 0)) && SvTRUE(*key)) {
				//
			} else {
				state.cb.warn = on_warn;
				ctx.flags |= EMIT_WARNS;
			}
			state.cb.die = on_die;
			if(ctx.comm)
				state.cb.comment = on_comment;
			/* without a cdata key, CDATA content is handled like ordinary text */
			if(ctx.cdata)
				state.cb.cdata = on_cdata;
			else if(ctx.text)
				state.cb.cdata = on_bytes;
			state.cb.bytes = on_bytes;
			state.cb.bytespart = on_bytes_part;
			state.cb.uchar = on_uchar;
			if (!(ctx.flags & MODE_TRIM))
				state.save_wsp = 1;
		}
		parse(xml,&state);
		/* depth returns to -1 once every opened tag has been closed */
		if (ctx.depth > -1 && !ctx.error) {
			ctx.error = sv_2mortal(newSVpv("Unbalanced tags",0));
		}
		DESTROY(&ctx);
		if (ctx.error) {
			croak("%s", SvPV_nolen(ctx.error));
		}
		ST(0) = RV;
		XSRETURN(1);
void
_hash2xml(hash,conf)
		SV *hash;
		HV *conf;
	PROTOTYPE: $$
	PPCODE:
		/*
		 * Serialize `hash` to XML with h2x() and return the resulting string.
		 * Options read from conf: order, trim, attr (default "-"),
		 * text (default "#text"), cdata, comm.
		 * The bytes/utf8decode and array options understood by _xml2hash are
		 * not handled here. Croaks when the context carries an error message.
		 */
		compstate ctx;
		/* size taken from the object itself so the whole compstate is zeroed */
		memset(&ctx,0,sizeof(ctx));
		SV **key;
		if ((key = hv_fetch(conf, "order", 5, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_ORDER;
		}
		if ((key = hv_fetch(conf, "trim", 4, 0)) && SvTRUE(*key)) {
			ctx.flags |= MODE_TRIM;
		}
		if ((key = hv_fetch(conf, "attr", 4, 0)) && SvPOK(*key)) {
			ctx.attr = SvPV_nolen(*key);
		} else {
			ctx.attr = "-";
		}
		if ((key = hv_fetch(conf, "text", 4, 0)) && SvPOK(*key)) {
			ctx.text = SvPV_nolen(*key);
		} else {
			ctx.text = "#text";
		}
		if ((key = hv_fetch(conf, "cdata", 5, 0)) && SvPOK(*key)) {
			ctx.cdata = SvPV_nolen(*key);
		} else {
			ctx.cdata = 0;
		}
		if ((key = hv_fetch(conf, "comm", 4, 0)) && SvPOK(*key)) {
			ctx.comm = SvPV_nolen(*key);
		} else {
			ctx.comm = 0;
		}
		ctx.depth = 0;
		ctx.ix = 0;
		/* mortal output buffer, pre-grown to avoid early reallocations */
		ctx.result = sv_2mortal(newSVpv("",0));
		ctx.attl = strlen(ctx.attr);
		SvGROW(ctx.result,1024);
		h2x( hash, &ctx);
		if (ctx.error) {
			croak("%s", SvPV_nolen(ctx.error));
		}
		ST(0) = ctx.result;
		/* a PPCODE body must report how many values it leaves on the stack */
		XSRETURN(1);
( run in 0.880 second using v1.01-cache-2.11-cpan-39bf76dae61 )