HTML-Content-Extractor
view release on metacpan or search on metacpan
libextractor/libextractor.c view on Meta::CPAN
return_list->real_count = -1;
return_list->list = (struct mem_tag *)malloc(sizeof(struct mem_tag) * return_list->count);
}
while ((tag = get_next_element_in_level(my_r))) {
if(my_r->tags->ai[ tag->tag_id ] == AI_IMG) {
struct mem_params * param = find_param_by_key_in_element(&my_r->my[tag->my_id], "src");
if(param == NULL)
continue;
if(tag->my_id > 0) {
if( find_stop_word_param(stop_words, &my_r->my[tag->my_id]) )
continue;
}
struct mem_params * width = find_param_by_key_in_element(&my_r->my[tag->my_id], "width");
if( width == NULL || min_width == 0 || (min_width > 0 && _check_img_size(width->value) >= min_width) ) {
if(param->lvalue > -1) {
return_list->real_count++;
return_list->list[ return_list->real_count ] = my_r->my[tag->my_id];
}
}
}
}
if(inc < 1) {
struct html_tree *curr_pos = get_curr_element(my_r);
get_prev_element_curr_level(my_r);
get_text_images_href(my_r, return_list, ++inc, stop_words, min_width);
set_position(my_r, curr_pos);
}
if(inc == 1) {
my_r->nco_pos = save_nco_pos;
get_next_element_curr_level(my_r);
get_text_images_href(my_r, return_list, ++inc, stop_words, min_width);
}
my_r->nco_pos = save_nco_pos;
return return_list;
}
struct html_tree * check_html(struct tree_list *my_r, struct max_element *max) {
struct html_tree * tag;
long i = -1;
size_t istags= 26;
int skip_tags[istags];
skip_tags[0] = get_tag_id(my_r->tags, "address");
skip_tags[1] = get_tag_id(my_r->tags, "applet");
skip_tags[2] = get_tag_id(my_r->tags, "audio");
skip_tags[3] = get_tag_id(my_r->tags, "video");
skip_tags[4] = get_tag_id(my_r->tags, "source");
skip_tags[5] = get_tag_id(my_r->tags, "track");
skip_tags[6] = get_tag_id(my_r->tags, "bgsound");
skip_tags[7] = get_tag_id(my_r->tags, "canvas");
skip_tags[8] = get_tag_id(my_r->tags, "datalist");
skip_tags[9] = get_tag_id(my_r->tags, "button");
skip_tags[10] = get_tag_id(my_r->tags, "fieldset");
skip_tags[11] = get_tag_id(my_r->tags, "legend");
skip_tags[12] = get_tag_id(my_r->tags, "input");
skip_tags[13] = get_tag_id(my_r->tags, "keygen");
skip_tags[14] = get_tag_id(my_r->tags, "textarea");
skip_tags[15] = get_tag_id(my_r->tags, "frameset");
skip_tags[16] = get_tag_id(my_r->tags, "noframes");
skip_tags[17] = get_tag_id(my_r->tags, "label");
skip_tags[18] = get_tag_id(my_r->tags, "link");
skip_tags[19] = get_tag_id(my_r->tags, "map");
skip_tags[20] = get_tag_id(my_r->tags, "object");
skip_tags[21] = get_tag_id(my_r->tags, "progress");
skip_tags[22] = get_tag_id(my_r->tags, "time");
skip_tags[23] = get_tag_id(my_r->tags, "xmp");
skip_tags[24] = get_tag_id(my_r->tags, "footer");
skip_tags[25] = get_tag_id(my_r->tags, "noindex");
struct html_tree *curr_element = get_curr_element(my_r);
if(curr_element->tag_id != get_tag_id(my_r->tags, "form"))
{
long count_words = curr_element->count_word;
long count_link = curr_element->counts[AI_LINK];
while((tag = get_child_n(my_r, ++i))) {
count_link += tag->counts[AI_LINK];
if(my_r->tags->ai[ tag->tag_id ] == AI_TEXT) {
count_words += tag->count_word;
}
}
if((count_words > 0 && count_link > 0 && ((count_words / count_link) > 1) && max->count_words < count_words) || (max->count_words < count_words && count_link == 0))
{
max->count_words = count_words;
max->element = get_curr_element(my_r);
}
}
int n = 0, t;
i = -1;
while((tag = get_child_n(my_r, ++i))) {
if(my_r->tags->ai[ tag->tag_id ] == AI_LINK) {
continue;
}
n = 0;
for (t = 0; t < istags; t++) {
if(tag->tag_id == skip_tags[t]) {
n = 1;
break;
}
}
if (n == 1)
continue;
if(my_r->tags->family[tag->tag_id] == FAMILY_SELECT) {
continue;
}
libextractor/libextractor.c view on Meta::CPAN
add_tag_R(tags, "bgsound", 7, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "big", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "blink", 5, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "blockquote", 10, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "br", 2, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "canvas", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "center", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "cite", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "code", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "comment", 7, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
// ++ datalist ++
add_tag_R(tags, "datalist", 8, 20, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
//add_tag_R(tags, "option", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// -- datalist --
add_tag_R(tags, "del", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// ++ details ++
add_tag_R(tags, "details", 7, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "summary", 7, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// -- details --
add_tag_R(tags, "dfn", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// ++ dir ++
add_tag_R(tags, "dir", 3, 20, FAMILY_LIST, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL); // out of test
//add_tag_R(tags, "li", 2, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_IF_SELF, OPTION_NULL, AI_NULL);
// -- dir --
add_tag_R(tags, "div", 3, 50, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
// ++ dl ++
add_tag_R(tags, "dl", 2, 20, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "dt", 2, 19, 0, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_PRIORITY_FAMILY, OPTION_NULL, AI_NULL);
add_tag_R(tags, "dd", 2, 19, 0, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_PRIORITY_FAMILY, OPTION_NULL, AI_NULL);
// -- dl --
add_tag_R(tags, "em", 2, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "embed", 5, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
// ++ figure ++
add_tag_R(tags, "figure", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "figcaption", 10, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
// -- figure --
add_tag_R(tags, "font", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "footer", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
// ++ form ++
add_tag_R(tags, "form", 4, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "button", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// ++ form: fieldset ++
add_tag_R(tags, "fieldset", 8, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "legend", 6, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// -- form: fieldset --
// ++ form: select ++
add_tag_R(tags, "select", 6, 20, FAMILY_SELECT, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_PRIORITY, OPTION_CLEAN_TAGS, AI_NULL);
add_tag_R(tags, "optgroup", 8, 19, FAMILY_SELECT, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_PRIORITY, OPTION_CLEAN_TAGS_SAVE, AI_NULL);
add_tag_R(tags, "option", 6, 18, FAMILY_SELECT, TYPE_TAG_NORMAL, EXTRA_TAG_CLOSE_PRIORITY, OPTION_CLEAN_TAGS_SAVE, AI_NULL);
// -- form: select --
add_tag_R(tags, "input", 5, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "keygen", 6, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "textarea", 8, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_SIMPLE, OPTION_NULL, AI_NULL);
// -- form --
// ++ frameset ++
add_tag_R(tags, "frameset", 8, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "frame", 5, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "noframes", 8, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_SIMPLE_TREE, OPTION_NULL, AI_NULL);
// -- frameset --
// ++ isindex ++ // obsolete tag (deprecated in HTML 4, removed from HTML5)
add_tag_R(tags, "isindex", 7, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
// -- isindex --
add_tag_R(tags, "h1", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "h2", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "h3", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "h4", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "h5", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "h6", 2, 0, FAMILY_H, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_IF_SELF_FAMILY, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "header", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "hgroup", 6, 0, 0, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "hr", 2, 0, 0, TYPE_TAG_BLOCK, EXTRA_TAG_CLOSE_NOW, OPTION_NULL, AI_NULL); // TYPE_TAG_ONE :)
add_tag_R(tags, "i", 1, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_TEXT);
add_tag_R(tags, "iframe", 6, 0, 0, TYPE_TAG_NORMAL, EXTRA_TAG_SIMPLE, OPTION_NULL, AI_NULL);
add_tag_R(tags, "img", 3, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_IMG);
add_tag_R(tags, "ins", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "kbd", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "label", 5, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// ++ link ++
add_tag_R(tags, "link", 4, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
// -- link --
// ++ map ++
add_tag_R(tags, "map", 3, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "area", 4, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
// -- map --
add_tag_R(tags, "mark", 4, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "marquee", 7, 0, 0, TYPE_TAG_NORMAL, 0, OPTION_NULL, AI_NULL);
// ++ menu ++
add_tag_R(tags, "menu", 4, 20, FAMILY_LIST, TYPE_TAG_BLOCK, 0, OPTION_NULL, AI_NULL);
add_tag_R(tags, "command", 7, 0, 0, TYPE_TAG_ONE, 0, OPTION_NULL, AI_NULL);
( run in 1.805 second using v1.01-cache-2.11-cpan-39bf76dae61 )