WWW-BookBot
view release on metacpan or search on metacpan
get_trunk_fresh_size => 5000, #if get size > xxxBytes, then refresh trunk display
get_visited_url_num => 0, #statistics of visted urls, to be used in get_from_file/get_save_file
language_decode => "utf8", #to read with encoding
language_encode => "utf8", #to save with encoding
process_all => 0, #process all pages of catalog
result_no_crlf => 1, #0-with crlf, 1-no crlf
space_leading_remove => 1, #remove leading spaces
space_leading_max => 20, #max leading spaces
space_inner_remove => 1, #remove inner spaces
space_inner_min_words => 5, #minimal length of word with inner spaces
text_paragraph_type => 'br', #type of paragraph split methods
# br one br as end of paragraph
# brbr two br as end of paragraph
# cr one cr as end of paragraph
# crcr two cr as end of paragraph
# crandspace one cr and followed with space as end of paragraph
screen_limit_trunk => 25, #max trunks to be displayed
screen_limit_title => 14, #max title to be displayed
};
}
sub initialize {
    # Works directly on $_[1], which aliases the caller's variable, so the
    # caller's string is edited in place. At most one newline is removed,
    # and only when it is the very last character of the string.
    $_[1] =~ s{\n\z}{};
}
sub parse_title {
$_[0]->normalize_space($_[1]);
$_[0]->remove_html($_[1]);
$_[0]->decode_entity($_[1]);
$_[1]=~s/\n+/ /sg; # CRLF as space
$_[0]->normalize_paragraph_1($_[1]);
$_[1]=~s/ +/ /sg; #remove extra spaces
#remove ending space or wordsplit mark
my $p1=$_[0]->{patterns}->{mark_wordsplit};
$p1=~s/(?:^\[|\]$)//sg;
$p1="[".$p1." ]";
$_[1]=~s/$p1+$//os;
#remove parentheses
$_[1]=~s/(?:^ +| +$)//sg;
while($_[1]=~/^(?:$_[0]->{patterns}->{parentheses})$/os) {
$_[1]=$^N;
$_[1]=~s/(?:^ +| +$)//sg;
( run in 0.553 second using v1.01-cache-2.11-cpan-5511b514fd6 )