HTML-Content-Extractor

View release on MetaCPAN or search on MetaCPAN

Extractor.xs  view on Meta::CPAN

        return_list->real_count = -1;
        return_list->list = (struct mem_tag *)malloc(sizeof(struct mem_tag) * return_list->count);
    }
    
    while ((tag = get_next_element_in_level(my_r))) {
        if(my_r->tags->ai[ tag->tag_id ] == AI_IMG) {
            struct mem_params * param = find_param_by_key_in_element(&my_r->my[tag->my_id], "src");
            if(param == NULL)
                continue;
            
            if(tag->my_id > 0) {
                if( find_stop_word_param(stop_words, &my_r->my[tag->my_id]) )
                    continue;
            }
            
            struct mem_params * width = find_param_by_key_in_element(&my_r->my[tag->my_id], "width");
            if( width == NULL || min_width == 0 || (min_width > 0 && _check_img_size(width->value) >= min_width) ) {
                if(param->lvalue > -1) {
                    return_list->real_count++;
                    return_list->list[ return_list->real_count ] = my_r->my[tag->my_id];
                }
            }
        }
    }
    
    if(inc < 1) {
        struct html_tree *curr_pos = get_curr_element(my_r);
        get_prev_element_curr_level(my_r);
        get_text_images_href(my_r, return_list, ++inc, stop_words, min_width);
        set_position(my_r, curr_pos);
    }
    
    if(inc == 1) {
        my_r->nco_pos = save_nco_pos;
        get_next_element_curr_level(my_r);
        get_text_images_href(my_r, return_list, ++inc, stop_words, min_width);
    }
    
    my_r->nco_pos = save_nco_pos;
    
    return return_list;
}

struct html_tree * check_html(struct tree_list *my_r, struct max_element *max) {
    struct html_tree * tag;
    long i = -1;
    
    size_t istags= 26;
    int skip_tags[istags];
    skip_tags[0]  = get_tag_id(my_r->tags, "address");
    skip_tags[1]  = get_tag_id(my_r->tags, "applet");
    skip_tags[2]  = get_tag_id(my_r->tags, "audio");
    skip_tags[3]  = get_tag_id(my_r->tags, "video");
    skip_tags[4]  = get_tag_id(my_r->tags, "source");
    skip_tags[5]  = get_tag_id(my_r->tags, "track");
    skip_tags[6]  = get_tag_id(my_r->tags, "bgsound");
    skip_tags[7]  = get_tag_id(my_r->tags, "canvas");
    skip_tags[8]  = get_tag_id(my_r->tags, "datalist");
    skip_tags[9]  = get_tag_id(my_r->tags, "button");
    skip_tags[10] = get_tag_id(my_r->tags, "fieldset");
    skip_tags[11] = get_tag_id(my_r->tags, "legend");
    skip_tags[12] = get_tag_id(my_r->tags, "input");
    skip_tags[13] = get_tag_id(my_r->tags, "keygen");
    skip_tags[14] = get_tag_id(my_r->tags, "textarea");
    skip_tags[15] = get_tag_id(my_r->tags, "frameset");
    skip_tags[16] = get_tag_id(my_r->tags, "noframes");
    skip_tags[17] = get_tag_id(my_r->tags, "label");
    skip_tags[18] = get_tag_id(my_r->tags, "link");
    skip_tags[19] = get_tag_id(my_r->tags, "map");
    skip_tags[20] = get_tag_id(my_r->tags, "object");
    skip_tags[21] = get_tag_id(my_r->tags, "progress");
    skip_tags[22] = get_tag_id(my_r->tags, "time");
    skip_tags[23] = get_tag_id(my_r->tags, "xmp");
    skip_tags[24] = get_tag_id(my_r->tags, "footer");
    skip_tags[25] = get_tag_id(my_r->tags, "noindex");
    
    struct html_tree *curr_element = get_curr_element(my_r);
    
    if(curr_element->tag_id != get_tag_id(my_r->tags, "form"))
    {
        long count_words = curr_element->count_word;
        long count_link = curr_element->counts[AI_LINK];
        
        while((tag = get_child_n(my_r, ++i))) {
            count_link += tag->counts[AI_LINK];
            
            if(my_r->tags->ai[ tag->tag_id ] == AI_TEXT) {
                count_words += tag->count_word;
            }
        }
        
        if((count_words > 0 && count_link > 0 && ((count_words / count_link) > 1) && max->count_words < count_words) || (max->count_words < count_words && count_link == 0))
        {
            max->count_words = count_words;
            max->element = get_curr_element(my_r);
        }
    }
    
    int n = 0, t;
    i = -1;
    
    while((tag = get_child_n(my_r, ++i))) {
        if(my_r->tags->ai[ tag->tag_id ] == AI_LINK) {
            continue;
        }
        
        n = 0;
        for (t = 0; t < istags; t++) {
            if(tag->tag_id == skip_tags[t]) {
                n = 1;
                break;
            }
        }
        
        if (n == 1)
            continue;
        
        if(my_r->tags->family[tag->tag_id] == FAMILY_SELECT) {
            continue;
        }
        

Extractor.xs  view on Meta::CPAN

    
    add_tag_R(tags, "bgsound", 7, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    
    add_tag_R(tags, "big", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "blink", 5, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "blockquote", 10, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    
    add_tag_R(tags, "br", 2, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    
    add_tag_R(tags, "canvas", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "center", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "cite", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "code", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "comment", 7, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
    
    // ++ datalist ++
    add_tag_R(tags, "datalist", 8, 20, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    //add_tag_R(tags, "option", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    // -- datalist --
    
    add_tag_R(tags, "del", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    
    // ++ details ++
    add_tag_R(tags, "details", 7, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "summary", 7, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    // -- details --
    
    add_tag_R(tags, "dfn", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    
    // ++ dir ++
    add_tag_R(tags, "dir", 3, 20, FAMILY_LIST, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL); // out of test
    //add_tag_R(tags, "li", 2, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_IF_SELF, OPTION_NULL, AI_NULL);
    // -- dir --
    
    add_tag_R(tags, "div", 3, 50, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    
    // ++ dl ++
    add_tag_R(tags, "dl", 2, 20, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "dt", 2, 19, 0, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_PRIORITY_FAMILY, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "dd", 2, 19, 0, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_PRIORITY_FAMILY, OPTION_NULL, AI_NULL);
    // -- dl --
    
    add_tag_R(tags, "em", 2, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    
    add_tag_R(tags, "embed", 5, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    
    // ++ figure ++
    add_tag_R(tags, "figure", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "figcaption", 10, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    // -- figure --
    
    add_tag_R(tags, "font", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "footer", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    
    // ++ form ++
    add_tag_R(tags, "form", 4, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "button", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    
    // ++ form: fieldset ++
    add_tag_R(tags, "fieldset", 8, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "legend", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    // -- form: fieldset --
    
    // ++ form: select ++
    add_tag_R(tags, "select", 6, 20, FAMILY_SELECT, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_PRIORITY, OPTION_CLEAN_TAGS, AI_NULL);
    add_tag_R(tags, "optgroup", 8, 19, FAMILY_SELECT, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_PRIORITY, OPTION_CLEAN_TAGS_SAVE, AI_NULL);
    add_tag_R(tags, "option", 6, 18, FAMILY_SELECT, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_PRIORITY, OPTION_CLEAN_TAGS_SAVE, AI_NULL);
    // -- form: select --
    
    add_tag_R(tags, "input", 5, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "keygen", 6, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "textarea", 8, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_SIMPLE, OPTION_NULL, AI_NULL);
    // -- form --
    
    // ++ frameset ++
    add_tag_R(tags, "frameset", 8, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "frame", 5, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "noframes", 8, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_SIMPLE_TREE, OPTION_NULL, AI_NULL);
    // -- frameset --
    
    // ++ isindex ++ // hm, crazy tag
    add_tag_R(tags, "isindex", 7, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    // -- isindex --
    
    add_tag_R(tags, "h1", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "h2", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "h3", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "h4", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "h5", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
    add_tag_R(tags, "h6", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
    
    add_tag_R(tags, "header", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "hgroup", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    
    add_tag_R(tags, "hr", 2, 0, 0, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_NOW, OPTION_NULL, AI_NULL); // TYPE_TAG_ONE :)
    
    add_tag_R(tags, "i", 1, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
    
    add_tag_R(tags, "iframe", 6, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_SIMPLE, OPTION_NULL, AI_NULL);
    
    add_tag_R(tags, "img", 3, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_IMG);
    
    add_tag_R(tags, "ins", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "kbd", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "label", 5, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    
    // ++ link ++
    add_tag_R(tags, "link", 4, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    // -- link --
    
    // ++ map ++
    add_tag_R(tags, "map", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "area", 4, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
    // -- map --
    
    add_tag_R(tags, "mark", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "marquee", 7, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
    
    // ++ menu ++
    add_tag_R(tags, "menu", 4, 20, FAMILY_LIST, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
    add_tag_R(tags, "command", 7, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);



( run in 1.277 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )