WWW-BookBot
view release on metacpan or search on metacpan
# $bot->initialize => N/A
# $bot->work_dir($dir) => $work_dir
#-------------------------------------------------------------
sub new {
my $class = ref($_[0]) || $_[0];
my $pargs = ref($_[1]) ? $_[1] : {} ;
my $self = $class->default_settings;
bless($self, $class);
# Set default work dir
my $dirname=$self->{classname};
$dirname=~s/^WWW:://;
$self->{work_dir}=catfile($ENV{HOME}, split(/::/,$dirname));
# Set user defined args
foreach (keys %$pargs) {
$self->{$_} = $pargs->{$_};
}
# initialize and return
$self->initialize;
return $self;
}
sub default_settings {
{
classname => shift, #cureent classname
book_get_num => 0, #statistics of books, to be used in file title
book_has_chapters => 1, #0-only 1 chapter, 1-multiple chapters
book_max_levels => 5, #max levels of book - chapters - chapters - chapters ..
book_max_chapters => 500, #max chapters in 1 book
catalog_max_pages => 500, #max catalog pages
get_agent_name => "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0; AIRF)",
get_agent_proxy => "Default",
#Default Use default IE proxy
#No Don't use proxy
#196.23.147.34:80 Use given proxy
#Default;202.105.138.19:8080;202.110.220.14:80;...
# Use multiple proxy, one visit use one proxy in turn
get_delay_second => 0, #if get_delay_second>0 then delay get_delay_second+rand seconds
get_delay_second_rand => 2,
get_fail_showtype => '', #''-show simplified info, 'Detail'-show detailed info
get_file_directory => './saved', #debug save and read file from this directory
get_from_file => 0, #0-normal operation, 1-get from file only
get_language => "en", #get headers: language
get_max_retries => 5, #max retries of 1 get
get_save_file => 0, #0-normal operation, 1-save to file for latter debug
get_skip_zip => 1, #skip fetch zip files
get_skip_media => 1, #skip fetch media files
get_timeout => 40, #get timeout
get_trunk_size => 50000, #define 1 trunk = xxxBytes for display
get_trunk_fresh_size => 5000, #if get size > xxxBytes, then refresh trunk display
get_visited_url_num => 0, #statistics of visted urls, to be used in get_from_file/get_save_file
language_decode => "utf8", #to read with encoding
language_encode => "utf8", #to save with encoding
process_all => 0, #process all pages of catalog
result_no_crlf => 1, #0-with crlf, 1-no crlf
space_leading_remove => 1, #remove leading spaces
space_leading_max => 20, #max leading spaces
space_inner_remove => 1, #remove inner spaces
space_inner_min_words => 5, #minimal length of word with inner spaces
text_paragraph_type => 'br', #type of paragraph split methods
# br one br as end of paragraph
# brbr two br as end of paragraph
# cr one cr as end of paragraph
# crcr two cr as end of paragraph
# crandspace one cr and followed with space as end of paragraph
screen_limit_trunk => 25, #max trunks to be displayed
screen_limit_title => 14, #max title to be displayed
};
}
sub initialize {
my $self = shift;
# Initialize languages
$self->{lang_encode}=find_encoding($self->{language_encode})
if $self->{language_encode} ne '';
$self->{lang_decode}=find_encoding($self->{language_decode})
if $self->{language_decode} ne '';
# Initialize messages
$self->trandict_init;
$self->msg_init;
# Create work directory
$self->work_dir( $self->{work_dir} );
eval {mkpath($self->{work_dir})};
$self->fatal_error("FailMkDir", dir=>$self->{work_dir}, errmsg=>$@) if $@;
# Check debug directory
$self->{get_file_directory_save}=catfile($self->{get_file_directory}, $self->get_alias());
$self->{get_file_directory_read}=$self->{get_file_directory_save};
$self->{get_file_directory_read}=~s,\\,/,sg;
$self->{get_file_directory_read}=~s,/+$,,sg;
eval {mkpath($self->{get_file_directory_save})}
if $self->{get_from_file} or $self->{get_save_file};
# Initialize patterns
$self->{patterns} = {};
foreach ($self->getpattern_lists) {
my $sub="getpattern_".$_;
my $sub_data=$sub."_data";
$self->{patterns}->{$_} = $self->can($sub) ?
$self->$sub : $self->parse_patterns($self->$sub_data);
}
# Content Type Initialize
$self->contenttype_init;
# Initialize LWP user agents
$self->agent_init;
# Initialize DB
$self->db_init;
$self->db_load;
# Try to login
$self->go_login;
}
sub work_dir {
my ($self, $work_dir) = @_;
return $self->{work_dir} if $work_dir eq '';
code => $_[0]->code,
req_content => $_[0]->request->as_string,
status_line => $_[0]->status_line,
res_content => $_[0]->as_string,
url_real => $url_real,
});
}
#-------------------------------------------------------------
# Parser utility functions
# $bot->normalize_space($content_dein_deout) => N/A
# $bot->remove_html($content_dein_deout) => N/A
# $bot->decode_entity($content_dein_deout) => N/A
# $bot->normalize_paragraph_1($content_dein_deout) => N/A
# $bot->parse_title($content_dein_deout) => $content_deout
# $bot->parse_titleen($content_dein_deout) => $content_enout
# $bot->normalize_paragraph($content_dein_deout) => N/A
# $bot->remove_line_by_end($content_dein_deout)
# $bot->parse_paragraph_br($content_dein_deout)
# $bot->parse_paragraph_brbr($content_dein_deout)
# $bot->parse_paragraph_brandspace($content_dein_deout)
# $bot->parse_paragraph_brbr_or_brandspace($content_dein_deout)
# $bot->parse_paragraph_cr($content_dein_deout)
# $bot->parse_paragraph_crcr($content_dein_deout)
# $bot->parse_paragraph_crandspace($content_dein_deout)
# $bot->remove_leadingspace($content_dein_deout)
# $bot->remove_innerspace($content_dein_deout)
#-------------------------------------------------------------
sub normalize_space {
$_[1]=~s/$_[0]->{patterns}->{space}/ /osg;
$_[1]=~s/$_[0]->{patterns}->{space2}/ /osg;
}
sub remove_html {
$_[1]=~s/$_[0]->{patterns}->{remove_html}//osg;
$_[1]=~s/<[^<>]*>//osg;
}
sub decode_entity {
$_[1]=~s/(?:&\#(\d{1,5});?)/chr($1)/esg;
$_[1]=~s/(?:&\#[xX]([0-9a-fA-F]{1,5});?)/chr(hex($1))/esg;
$_[1]=~s/(&([0-9a-zA-Z]{1,9});?)/$entity2char{$2} or $1/esg;
}
sub normalize_paragraph_1 {
$_[1]=~s/^ +/ /mg; #normalize spaces before paragraph
$_[1]=~s/ +$//mg; #remove spaces after paragraph
$_[1]=~s/^ ?(?:$_[0]->{patterns}->{mark_dash} *){2,}/ ---/omg;
#normalize repeated dash
$_[1]=~s/\n{2,}/\n/sg; #remove empty paragraph
$_[1]=~s/(?: ---\n?){2,}/ ---\n/sg; #remove too much dash line
$_[1]=~s/(?:^\n|\n$)//s; #remove leading and ending \n
$_[1]=~s/$_[0]->{patterns}->{word_finish}//os; #remove finish words
$_[1]=~s/\n$//s; #remove ending \n
}
sub parse_title {
$_[0]->normalize_space($_[1]);
$_[0]->remove_html($_[1]);
$_[0]->decode_entity($_[1]);
$_[1]=~s/\n+/ /sg; # CRLF as space
$_[0]->normalize_paragraph_1($_[1]);
$_[1]=~s/ +/ /sg; #remove extra spaces
#remove ending space or wordsplit mark
my $p1=$_[0]->{patterns}->{mark_wordsplit};
$p1=~s/(?:^\[|\]$)//sg;
$p1="[".$p1." ]";
$_[1]=~s/$p1+$//os;
#remove paraentheses
$_[1]=~s/(?:^ +| +$)//sg;
while($_[1]=~/^(?:$_[0]->{patterns}->{parentheses})$/os) {
$_[1]=$^N;
$_[1]=~s/(?:^ +| +$)//sg;
}
$_[1];
}
sub parse_titleen {
$_[0]->en_code($_[0]->parse_title($_[1]));
}
sub normalize_paragraph {
$_[0]->normalize_space($_[1]);
$_[0]->parse_paragraph_begin($_[1]);
my $sub="parse_paragraph_".$_[0]->{text_paragraph_type};
$_[0]->$sub($_[1]);
$_[0]->remove_html($_[1]);
$_[0]->decode_entity($_[1]);
$_[0]->normalize_paragraph_1($_[1]);
$_[0]->remove_line_by_end($_[1]);
$_[0]->normalize_paragraph_1($_[1]);
$_[0]->parse_paragraph_end($_[1]);
$_[1]=~s/ ?\$BOOKBOTRETURN\$//sg; #remove for reserved return
$sub='$_[1]=~s/^ /'.$_[0]->{patterns}->{line_head}.'/mg;'; #normalize with 4 spaces
eval $sub;
}
sub remove_line_by_end {
$_[1]=~s/(?:---\n|\n).*(?:$_[0]->{patterns}->{remove_line_by_end})$_[0]->{patterns}->{parentheses_right}?$//omg;
$_[1]=~s/\n $_[0]->{patterns}->{remove_line_by_end_special}$//osg;
}
sub parse_paragraph_br {
$_[1]=~s/\n//sg;
$_[1]=~s/<[bB][rR]> */\n /sg;
}
sub parse_paragraph_brbr {
$_[1]=~s/\n//sg;
$_[1]=~s/(?:<[bB][rR]> *){2,}/\n /sg;
}
sub parse_paragraph_brandspace {
$_[1]=~s/\n//sg;
$_[1]=~s/<[bB][rR]>(?=[^ ])//sg;
$_[1]=~s/<[bB][rR]> */\n /sg;
}
sub parse_paragraph_brbr_or_brandspace {
$_[1]=~s/\n//sg;
$_[1]=~s/<[bB][rR]>(?=[^ <])//sg;
$_[1]=~s/(?:<[bB][rR]> *){1,}/\n /sg;
}
sub parse_paragraph_br_or_p {
$_[1]=~s/\n/ /sg;
$_[1]=~s/<[bB\/][rRpP]>/\n/sg;
}
sub parse_paragraph_cr {
}
( run in 1.776 second using v1.01-cache-2.11-cpan-71847e10f99 )