SWISH-3

 view release on metacpan or  search on metacpan

libswish3.c  view on Meta::CPAN

/*
 * Typedef aliases for the library's struct types, so that each type can be
 * named without the `struct` keyword and referenced before its definition.
 * swish_MetaStackElementPtr is the only pointer alias in this list.
 */
typedef struct swish_3                  swish_3;
typedef struct swish_StringList         swish_StringList;
typedef struct swish_Config             swish_Config;
typedef struct swish_ConfigFlags        swish_ConfigFlags;
typedef struct swish_ConfigValue        swish_ConfigValue;
typedef struct swish_DocInfo            swish_DocInfo;
typedef struct swish_MetaStackElement   swish_MetaStackElement;
typedef struct swish_MetaStackElement  *swish_MetaStackElementPtr;
typedef struct swish_MetaStack          swish_MetaStack;
typedef struct swish_MetaName           swish_MetaName;
typedef struct swish_Property           swish_Property;
typedef struct swish_Token              swish_Token;
typedef struct swish_TokenList          swish_TokenList;
typedef struct swish_TokenIterator      swish_TokenIterator;
typedef struct swish_ParserData         swish_ParserData;
typedef struct swish_Tag                swish_Tag;
typedef struct swish_TagStack           swish_TagStack;
typedef struct swish_Analyzer           swish_Analyzer;
typedef struct swish_Parser             swish_Parser;
typedef struct swish_NamedBuffer        swish_NamedBuffer;

/*
=head2 Data Structures
*/

/*
 * Top-level handle tying together one config, one analyzer and one parser.
 * The parser callbacks reach the config and analyzer through this struct
 * (parser_data->s3->config, parser_data->s3->analyzer).
 */
struct swish_3
{
    int             ref_cnt;    /* reference count */
    void           *stash;      /* opaque pointer; presumably for bindings, as in swish_Config -- confirm */
    swish_Config   *config;     /* configuration consulted while parsing */
    swish_Analyzer *analyzer;   /* its tokenize member decides whether flushed buffers are tokenized */
    swish_Parser   *parser;
};

/*
 * A list of xmlChar strings.
 * NOTE(review): n looks like the number of entries in use and max the
 * allocated capacity of word[] -- confirm against the stringlist functions.
 */
struct swish_StringList
{
    unsigned int    n;
    unsigned int    max;
    xmlChar**       word;       /* array of string pointers */
};


/*
 * Configuration object: a set of hash tables plus a flags struct.
 * swish_config_init() creates every table except mimes, which stays NULL
 * until swish_config_set_default() assigns it. swish_config_free() releases
 * all tables and the flags.
 */
struct swish_Config
{
    int                          ref_cnt;    /* swish_config_free() warns if this is not 0 */
    void                        *stash;      /* for bindings */
    xmlHashTablePtr              misc;       /* string values; also the source for the shortcut flags */
    xmlHashTablePtr              properties; /* swish_Property values (freed via free_props) */
    xmlHashTablePtr              metanames;  /* swish_MetaName values (freed via free_metas) */
    xmlHashTablePtr              tag_aliases;/* alias => real tag name, string values */
    xmlHashTablePtr              parsers;    /* string values */
    xmlHashTablePtr              mimes;      /* string values; NULL until defaults are set */
    xmlHashTablePtr              index;      /* string values */
    xmlHashTablePtr              stringlists;/* values freed via free_stringlist */
    struct swish_ConfigFlags    *flags;      /* shortcuts for parsing */
};

/*
 * Pre-parsed shortcuts for settings that otherwise live as strings in
 * config->misc. Defaults are assigned in swish_config_init_flags().
 */
struct swish_ConfigFlags
{
    boolean         tokenize;               /* default SWISH_TRUE */
    boolean         cascade_meta_context;   /* add tokens to every metaname in the stack; default SWISH_FALSE */
    boolean         ignore_xmlns;           /* default SWISH_TRUE */
    boolean         follow_xinclude;        /* default SWISH_TRUE */
    int             undef_metas;            /* one of SWISH_UNDEF_METAS_*; default SWISH_UNDEF_METAS_INDEX */
    int             undef_attrs;            /* one of SWISH_UNDEF_ATTRS_*; default SWISH_UNDEF_ATTRS_DISABLE */
    int             max_meta_id;            /* starts at -1 */
    int             max_prop_id;            /* starts at -1 */
    xmlHashTablePtr meta_ids;               /* freed without a value deallocator in swish_config_flags_free() */
    xmlHashTablePtr prop_ids;               /* freed without a value deallocator in swish_config_flags_free() */
    //xmlHashTablePtr contexts;
};

/*
 * A reference-counted wrapper around a hash table.
 * NOTE(review): presumably the hash maps a name (e.g. a metaname) to a text
 * buffer, as suggested by swish_nb_add_buf() usage -- confirm.
 */
struct swish_NamedBuffer
{
    int             ref_cnt;    /* for bindings */
    void           *stash;      /* for bindings */
    xmlHashTablePtr hash;       /* the meat */
};

/*
 * Per-document information record.
 * NOTE(review): the field meanings below are read off the names only; no
 * code using them is visible here -- confirm before relying on them.
 */
struct swish_DocInfo
{
    time_t              mtime;      /* presumably last-modified time */
    off_t               size;       /* presumably document size in bytes */
    xmlChar *           mime;       /* presumably MIME type */
    xmlChar *           encoding;
    xmlChar *           uri;
    unsigned int        nwords;     /* presumably word count */
    xmlChar *           ext;        /* presumably file extension */
    xmlChar *           parser;     /* presumably name of the parser to use */
    xmlChar *           action;
    boolean             is_gzipped;
    int                 ref_cnt;    /* reference count */
};

/*
 * A configured metaname, stored as a value in config->metanames.
 */
struct swish_MetaName
{
    int                 ref_cnt;    /* decremented by free_metas(); object is freed when it drops below 1 */
    int                 id;
    xmlChar            *name;
    int                 bias;       /* NOTE(review): presumably a ranking bias -- confirm */
    xmlChar            *alias_for;  /* when non-NULL, flush_buffer() stores text under this name instead */
};

/*
 * A configured property, stored as a value in config->properties.
 * NOTE(review): field meanings other than ref_cnt are inferred from names
 * only -- confirm against the property functions.
 */
struct swish_Property
{
    int                 ref_cnt;    /* object is freed once this drops below 1 */
    int                 id;
    xmlChar            *name;
    boolean             ignore_case;
    int                 type;
    boolean             verbatim;
    xmlChar            *alias_for;  /* presumably the target name when this property is an alias */
    unsigned int        max;
    boolean             sort;
    boolean             presort;
    unsigned int        sort_length;
};

struct swish_Token
{
    unsigned int        pos;            // this token's position in document

libswish3.c  view on Meta::CPAN

    if (prop->ref_cnt < 1) {
        swish_property_free(prop);
    }
}

/*
 * Deallocator used for config->metanames entries (handed to xmlHashFree()
 * by swish_config_free()). Drops one reference on the metaname and frees
 * the object once no references remain.
 *
 * meta:     the swish_MetaName stored in the hash
 * metaname: the hash key, used only for the debug message
 */
static void
free_metas(
    swish_MetaName *meta,
    xmlChar *metaname
)
{
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG(" freeing config->meta %s", metaname);
        swish_metaname_debug(meta);
    }

    /* give up our reference; someone else may still hold the object */
    if (--meta->ref_cnt >= 1) {
        return;
    }

    swish_metaname_free(meta);
}

/*
 * Free a config object: every hash table (with the deallocator matching its
 * value type), the flags struct, and finally the config itself.
 *
 * Warns, but still frees, when ref_cnt is not 0 or when stash is non-NULL
 * (the stash is never freed here).
 */
void
swish_config_free(
    swish_Config *config
)
{
    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("freeing config");
        /* length modifiers must match the long-sized arguments */
        SWISH_DEBUG_MSG("ptr addr: 0x%lx  %ld", (unsigned long)config, (long int)config);
        swish_mem_debug();
    }

    /* string-valued tables use free_string; object tables use their own deallocators */
    xmlHashFree(config->misc, (xmlHashDeallocator)free_string);
    xmlHashFree(config->properties, (xmlHashDeallocator)free_props);
    xmlHashFree(config->metanames, (xmlHashDeallocator)free_metas);
    xmlHashFree(config->tag_aliases, (xmlHashDeallocator)free_string);
    xmlHashFree(config->parsers, (xmlHashDeallocator)free_string);
    xmlHashFree(config->mimes, (xmlHashDeallocator)free_string);
    xmlHashFree(config->index, (xmlHashDeallocator)free_string);
    xmlHashFree(config->stringlists, (xmlHashDeallocator)free_stringlist);
    swish_config_flags_free(config->flags);

    if (config->ref_cnt != 0) {
        SWISH_WARN("config ref_cnt != 0: %d", config->ref_cnt);
    }

    if (config->stash != NULL) {
        SWISH_WARN("possible memory leak: config->stash was not freed");
    }

    swish_xfree(config);
}

/*
 * Allocate a swish_ConfigFlags and fill in the defaults.
 *
 * Returns the new flags struct; release it with swish_config_flags_free().
 */
swish_ConfigFlags *
swish_config_init_flags(
)
{
    swish_ConfigFlags *defaults = swish_xmalloc(sizeof(swish_ConfigFlags));

    /* boolean switches */
    defaults->tokenize = SWISH_TRUE;
    defaults->cascade_meta_context = SWISH_FALSE;  /* add tokens to every metaname in the stack */
    defaults->ignore_xmlns = SWISH_TRUE;
    defaults->follow_xinclude = SWISH_TRUE;

    /* policy for tags/attributes that have no configured metaname */
    defaults->undef_metas = SWISH_UNDEF_METAS_INDEX;
    defaults->undef_attrs = SWISH_UNDEF_ATTRS_DISABLE;

    /* no ids handed out yet */
    defaults->max_meta_id = -1;
    defaults->max_prop_id = -1;

    /* id lookup tables */
    defaults->meta_ids = swish_hash_init(8);
    defaults->prop_ids = swish_hash_init(8);
    /* TODO: a `contexts` cache hash could save malloc/frees */

    return defaults;
}

/*
 * Release a flags struct created by swish_config_init_flags().
 *
 * The two id tables are destroyed without a value deallocator: they are
 * convenience lookups whose entries are really freed in swish_config_free().
 * When any debug bit is set the scalar flags are dumped before the struct
 * itself is freed.
 */
void
swish_config_flags_free(
    swish_ConfigFlags * flags
)
{
    xmlHashTablePtr id_tables[2];
    int i;

    id_tables[0] = flags->meta_ids;
    id_tables[1] = flags->prop_ids;

    for (i = 0; i < 2; i++) {
        xmlHashFree(id_tables[i], NULL);
    }

    if (SWISH_DEBUG) {
        swish_config_flags_debug(flags);
    }

    swish_xfree(flags);
}

/*
 * Emit one debug message per scalar field of the flags struct.
 * The meta_ids / prop_ids tables are not touched.
 */
void
swish_config_flags_debug(
    swish_ConfigFlags *flags
)
{
    /* boolean switches */
    SWISH_DEBUG_MSG("config->tokenize == %d",
                    flags->tokenize);
    SWISH_DEBUG_MSG("config->cascade_meta_context == %d",
                    flags->cascade_meta_context);
    SWISH_DEBUG_MSG("config->ignore_xmlns == %d",
                    flags->ignore_xmlns);
    SWISH_DEBUG_MSG("config->follow_xinclude == %d",
                    flags->follow_xinclude);

    /* undefined tag/attribute policies */
    SWISH_DEBUG_MSG("config->undef_metas == %d",
                    flags->undef_metas);
    SWISH_DEBUG_MSG("config->undef_attrs == %d",
                    flags->undef_attrs);

    /* id high-water marks */
    SWISH_DEBUG_MSG("config->max_meta_id == %d",
                    flags->max_meta_id);
    SWISH_DEBUG_MSG("config->max_prop_id == %d",
                    flags->max_prop_id);
}

/*
 * Allocate and initialize an empty config object.
 *
 * All hash tables are created except mimes, which is left NULL here and is
 * assigned by swish_config_set_default(). ref_cnt starts at 0 and stash at
 * NULL. Release the result with swish_config_free().
 */
swish_Config *
swish_config_init(
)
{
    swish_Config *config;

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        SWISH_DEBUG_MSG("init config");
    }

/* the hashes will automatically grow as needed so we init with sane starting size */
    config = swish_xmalloc(sizeof(swish_Config));
    config->flags = swish_config_init_flags();
    config->misc = swish_hash_init(8);
    config->metanames = swish_hash_init(8);
    config->properties = swish_hash_init(8);
    config->parsers = swish_hash_init(8);
    config->index = swish_hash_init(8);
    config->tag_aliases = swish_hash_init(8);   /* alias => real */
    config->stringlists = swish_hash_init(8);
    config->mimes = NULL;
    config->ref_cnt = 0;
    config->stash = NULL;

    if (SWISH_DEBUG & SWISH_DEBUG_MEMORY) {
        /* %lx matches the long-sized argument; plain %x does not */
        SWISH_DEBUG_MSG("config ptr 0x%lx", (unsigned long)config);
    }

    return config;

}

void
swish_config_set_default(
    swish_Config *config
)
{
    swish_Property *tmpprop;
    swish_MetaName *tmpmeta;
    xmlChar *tmpbuf;

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG)
        SWISH_DEBUG_MSG("setting default config");

/* we xstrdup a lot in order to consistently free in swish_config_free() */

/* MIME types */
    config->mimes = swish_mime_defaults();

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG)
        SWISH_DEBUG_MSG("mime hash set");

libswish3.c  view on Meta::CPAN

/* values in config2 override and are set in config1 */

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("Merging config2 0x%lx into config1 0x%lx",
            config2, config1);
        swish_config_debug(config2);
        swish_config_debug(config1);
    }


    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge properties");
    }
    merge_properties(config1->properties, config2->properties);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge metanames");
    }
    merge_metanames(config1->metanames, config2->metanames);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge parsers");
    }
    swish_hash_merge(config1->parsers, config2->parsers);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge mimes");
    }
    swish_hash_merge(config1->mimes, config2->mimes);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge index");
    }
    swish_hash_merge(config1->index, config2->index);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge tag_aliases");
    }
    swish_hash_merge(config1->tag_aliases, config2->tag_aliases);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge misc");
    }
    swish_hash_merge(config1->misc, config2->misc);
    
    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge stringlists");
    }
    merge_stringlists(config1->stringlists, config2->stringlists);

    if (SWISH_DEBUG & SWISH_DEBUG_CONFIG) {
        SWISH_DEBUG_MSG("merge complete");
    }

/* set flags */
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_TOKENIZE)) {
        config2->flags->tokenize = swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_TOKENIZE));
    }
    config1->flags->tokenize = config2->flags->tokenize;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_CASCADE_META_CONTEXT)) {
        config2->flags->cascade_meta_context = 
            swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_CASCADE_META_CONTEXT));
    }
    config1->flags->cascade_meta_context = config2->flags->cascade_meta_context;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_IGNORE_XMLNS)) {
        config2->flags->ignore_xmlns = 
            swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_IGNORE_XMLNS));
    }
    config1->flags->ignore_xmlns = config2->flags->ignore_xmlns;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_FOLLOW_XINCLUDE)) {
        config2->flags->follow_xinclude =
            swish_string_to_boolean(swish_hash_fetch(config2->misc, BAD_CAST SWISH_FOLLOW_XINCLUDE));
    }
    config1->flags->follow_xinclude = config2->flags->follow_xinclude;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_UNDEFINED_METATAGS)) {
        v = swish_hash_fetch(config2->misc, BAD_CAST SWISH_UNDEFINED_METATAGS);
        if (xmlStrEqual(v, BAD_CAST "error")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_ERROR;
        }
        else if (xmlStrEqual(v, BAD_CAST "ignore")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_IGNORE;
        }
        else if (xmlStrEqual(v, BAD_CAST "index")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_INDEX;
        }
        else if (xmlStrEqual(v, BAD_CAST "auto")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_AUTO;
        }
        else if (xmlStrEqual(v, BAD_CAST "autoall")) {
            config2->flags->undef_metas = SWISH_UNDEF_METAS_AUTOALL;
        }
        else {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_METATAGS, v);
        }
    }
    config1->flags->undef_metas = config2->flags->undef_metas;
    if (swish_hash_exists(config2->misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES)) {
        v = swish_hash_fetch(config2->misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES);
        if (xmlStrEqual(v, BAD_CAST "error")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_ERROR;
        }
        else if (xmlStrEqual(v, BAD_CAST "ignore")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_IGNORE;
        }
        else if (xmlStrEqual(v, BAD_CAST "index")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_INDEX;
        }
        else if (xmlStrEqual(v, BAD_CAST "auto")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_AUTO;
        }
        else if (xmlStrEqual(v, BAD_CAST "autoall")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_AUTOALL;
        }
        else if (xmlStrEqual(v, BAD_CAST "disable")) {
            config2->flags->undef_attrs = SWISH_UNDEF_ATTRS_DISABLE;
        }
        else {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_XML_ATTRIBUTES, v);
        }
    }
    config1->flags->undef_attrs = config2->flags->undef_attrs;
    
    if (config1->flags->max_meta_id < config2->flags->max_meta_id) {
        config1->flags->max_meta_id = config2->flags->max_meta_id;

libswish3.c  view on Meta::CPAN

 */
    alias = swish_hash_fetch(parser_data->s3->config->tag_aliases, swishtag);
    if (alias) {
        if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
            SWISH_DEBUG_MSG("%s alias -> %s", swishtag, alias); 
        }
        swish_xfree(swishtag);
        swishtag = swish_xstrdup(alias);
    }
    else {
        swishdomtag = flatten_tag_stack(swishtag, parser_data->domstack, SWISH_DOT);
        alias = swish_hash_fetch(parser_data->s3->config->tag_aliases, swishdomtag);
        if (alias) {
            if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
                SWISH_DEBUG_MSG("%s alias -> %s", swishdomtag, alias); 
            }
            swish_xfree(swishtag);
            swishtag = swish_xstrdup(alias);
        }
        swish_xfree(swishdomtag);
    }

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG(" swishtag = %s", swishtag);
    }

    return swishtag;
}

/*
* Flush the text accumulated in parser_data->meta_buf.
*
* The buffer is added to parser_data->metanames under `metaname` (or under
* the alias target when the configured MetaName has alias_for set). If
* config->flags->cascade_meta_context is on, it is also added under every
* other metaname on the metastack. The text is then tokenized when the
* analyzer's tokenize flag is set, and meta_buf is emptied.
*
* metaname: name the buffered text belongs to
* context:  passed through to tokenize() unchanged
*/
static void
flush_buffer(
    swish_ParserData *parser_data,
    xmlChar *metaname,
    xmlChar *context
)
{
    swish_MetaName *meta;
    xmlChar *metaname_stored_as;
    swish_TagStack *s = parser_data->metastack;

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER) {
        SWISH_DEBUG_MSG("buffer is >>%s<< before flush",
                        xmlBufferContent(parser_data->meta_buf));
    }

/*
* add meta_buf as-is to metanames buffer under current tag. this
* gives us both tokens and raw text de-tagged but organized by
* metaname. If the metaname is an alias_for, use the target of the alias.
*
* The fetch yields NULL when the metaname is not in the config; in that
* case there is no alias to follow, so store under the name as given
* instead of dereferencing a NULL pointer.
*/
    meta = swish_hash_fetch(parser_data->s3->config->metanames, metaname);
    if (meta != NULL && meta->alias_for != NULL) {
        metaname_stored_as = meta->alias_for;
    }
    else {
        metaname_stored_as = metaname;
    }
    swish_nb_add_buf(parser_data->metanames, metaname_stored_as, parser_data->meta_buf,
                        (xmlChar *)SWISH_TOKENPOS_BUMPER, 0, 1);

/*
*  if cascade_meta_context is true, add tokens (buffer) to every metaname on the stack.
*/

    if (parser_data->s3->config->flags->cascade_meta_context) {
        for (s->temp = s->head; s->temp != NULL; s->temp = s->temp->next) {
            if (xmlStrEqual(s->temp->baked, metaname_stored_as))  /*  already added */
                continue;

            swish_nb_add_buf(parser_data->metanames, s->temp->baked,
                                parser_data->meta_buf, (xmlChar *)SWISH_TOKENPOS_BUMPER,
                                0, 1);
        }
    }

    if (parser_data->s3->analyzer->tokenize) {
        tokenize(parser_data, (xmlChar *)xmlBufferContent(parser_data->meta_buf),
                 xmlBufferLength(parser_data->meta_buf), metaname_stored_as, context);
    }

    /* buffer has been consumed; start fresh for the next run of text */
    xmlBufferEmpty(parser_data->meta_buf);

}

/*
* SAX2 callback: start of document.
* Does nothing except emit a debug message when parser debugging is on;
* the user data pointer is not used.
*/
static void
mystartDocument(
    void *data
)
{
    (void)data;     /* unused */

    if (!(SWISH_DEBUG & SWISH_DEBUG_PARSER)) {
        return;
    }

    SWISH_DEBUG_MSG("startDocument()");
}

/* 
* SAX2 callback 
*/
static void
myendDocument(
    void *parser_data
)
{

    if (SWISH_DEBUG & SWISH_DEBUG_PARSER)
        SWISH_DEBUG_MSG("endDocument()");

/*
* whatever's left 
*/
    flush_buffer(parser_data, (xmlChar *)SWISH_DEFAULT_METANAME,
                 (xmlChar *)SWISH_DEFAULT_METANAME);

}

/* 
* SAX1 callback 
*/

libswish3.c  view on Meta::CPAN

/*
 * Forward declarations for file-local helpers.
 *
 * NOTE(review): the write_* functions appear to come in pairs -- a plural
 * form taking (writer, hash) and a singular form taking (value, data, key),
 * which looks like the shape of a per-entry hash scan callback -- confirm
 * against the definitions. write_mime takes a temp_things pointer where the
 * other singular forms take the writer directly.
 */
static void write_property(
    swish_Property *prop,
    xmlTextWriterPtr writer,
    xmlChar *name
);
static void write_properties(
    xmlTextWriterPtr writer,
    xmlHashTablePtr properties
);
static void write_parser(
    xmlChar *val,
    xmlTextWriterPtr writer,
    xmlChar *key
);
static void write_parsers(
    xmlTextWriterPtr writer,
    xmlHashTablePtr parsers
);
static void write_mime(
    xmlChar *type,
    temp_things *things,
    xmlChar *ext
);
static void write_mimes(
    xmlTextWriterPtr writer,
    xmlHashTablePtr mimes
);
static void write_index(
    xmlTextWriterPtr writer,
    xmlHashTablePtr index
);
static void write_tag_aliases(
    xmlTextWriterPtr writer,
    xmlHashTablePtr tag_aliases
);
static void write_misc(
    xmlTextWriterPtr writer,
    xmlHashTablePtr hash
);
/* copies special settings from config->misc into config->flags */
static void handle_special_misc_flags(
    headmaker *h
);

/*
 * Copy the "special" settings found in h->config->misc into the pre-parsed
 * shortcut fields of h->config->flags.
 *
 * Only keys present in misc are applied; absent keys leave the current flag
 * value untouched. Boolean settings go through swish_string_to_boolean().
 * The undefined-metatag / undefined-attribute settings are matched against a
 * fixed set of keywords and an unknown keyword is reported via SWISH_CROAK.
 *
 * follow_xinclude is handled here as well, so that the same set of flags is
 * kept in sync as in the config merge code.
 */
static void
handle_special_misc_flags(
    headmaker *h
)
{
    xmlChar *v;

    /* ---- boolean switches ---- */
    if (swish_hash_exists(h->config->misc, BAD_CAST SWISH_TOKENIZE)) {
        h->config->flags->tokenize = 
            swish_string_to_boolean(swish_hash_fetch(h->config->misc, BAD_CAST SWISH_TOKENIZE));
    }
    if (swish_hash_exists(h->config->misc, BAD_CAST SWISH_CASCADE_META_CONTEXT)) {
        h->config->flags->cascade_meta_context = 
            swish_string_to_boolean(swish_hash_fetch(h->config->misc, BAD_CAST SWISH_CASCADE_META_CONTEXT));
    }
    if (swish_hash_exists(h->config->misc, BAD_CAST SWISH_IGNORE_XMLNS)) {
        h->config->flags->ignore_xmlns = 
            swish_string_to_boolean(swish_hash_fetch(h->config->misc, BAD_CAST SWISH_IGNORE_XMLNS));
    }
    if (swish_hash_exists(h->config->misc, BAD_CAST SWISH_FOLLOW_XINCLUDE)) {
        h->config->flags->follow_xinclude =
            swish_string_to_boolean(swish_hash_fetch(h->config->misc, BAD_CAST SWISH_FOLLOW_XINCLUDE));
    }

    /* ---- policy for tags with no configured metaname ---- */
    if (swish_hash_exists(h->config->misc, BAD_CAST SWISH_UNDEFINED_METATAGS)) {
        v = swish_hash_fetch(h->config->misc, BAD_CAST SWISH_UNDEFINED_METATAGS);
        if (xmlStrEqual(v, BAD_CAST "error")) {
            h->config->flags->undef_metas = SWISH_UNDEF_METAS_ERROR;
        }
        else if (xmlStrEqual(v, BAD_CAST "ignore")) {
            h->config->flags->undef_metas = SWISH_UNDEF_METAS_IGNORE;
        }
        else if (xmlStrEqual(v, BAD_CAST "index")) {
            h->config->flags->undef_metas = SWISH_UNDEF_METAS_INDEX;
        }
        else if (xmlStrEqual(v, BAD_CAST "auto")) {
            h->config->flags->undef_metas = SWISH_UNDEF_METAS_AUTO;
        }
        else if (xmlStrEqual(v, BAD_CAST "autoall")) {
            h->config->flags->undef_metas = SWISH_UNDEF_METAS_AUTOALL;
        }
        else {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_METATAGS, v);
        }
    }

    /* ---- policy for XML attributes with no configured metaname ---- */
    if (swish_hash_exists(h->config->misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES)) {
        v = swish_hash_fetch(h->config->misc, BAD_CAST SWISH_UNDEFINED_XML_ATTRIBUTES);
        if (xmlStrEqual(v, BAD_CAST "error")) {
            h->config->flags->undef_attrs = SWISH_UNDEF_ATTRS_ERROR;
        }
        else if (xmlStrEqual(v, BAD_CAST "ignore")) {
            h->config->flags->undef_attrs = SWISH_UNDEF_ATTRS_IGNORE;
        }
        else if (xmlStrEqual(v, BAD_CAST "index")) {
            h->config->flags->undef_attrs = SWISH_UNDEF_ATTRS_INDEX;
        }
        else if (xmlStrEqual(v, BAD_CAST "auto")) {
            h->config->flags->undef_attrs = SWISH_UNDEF_ATTRS_AUTO;
        }
        else if (xmlStrEqual(v, BAD_CAST "autoall")) {
            h->config->flags->undef_attrs = SWISH_UNDEF_ATTRS_AUTOALL;
        }
        else if (xmlStrEqual(v, BAD_CAST "disable")) {
            h->config->flags->undef_attrs = SWISH_UNDEF_ATTRS_DISABLE;
        }
        else {
            SWISH_CROAK("Unknown value for %s: %s", SWISH_UNDEFINED_XML_ATTRIBUTES, v);
        }
    }

}

static void
read_metaname_aliases(



( run in 1.288 second using v1.01-cache-2.11-cpan-5837b0d9d2c )