WWW-BookBot

 view release on metacpan or  search on metacpan

BookBot.pm  view on Meta::CPAN

#	$bot->initialize								=> N/A
#	$bot->work_dir($dir)							=> $work_dir
#-------------------------------------------------------------
sub new {
	my $class = ref($_[0]) || $_[0];
	my $pargs = ref($_[1]) ? $_[1] : {} ;
	my $self = $class->default_settings;
	bless($self, $class);

	# Set default work dir
	my $dirname=$self->{classname};
	$dirname=~s/^WWW:://;
	$self->{work_dir}=catfile($ENV{HOME}, split(/::/,$dirname));

	# Set user defined args
	foreach (keys %$pargs) {
		$self->{$_} = $pargs->{$_};
	}

	# initialize and return
	$self->initialize;
	return $self;
}
sub default_settings {
	{
		classname				=> shift,	#cureent classname
		book_get_num			=> 0,		#statistics of books, to be used in file title
		book_has_chapters		=> 1,		#0-only 1 chapter, 1-multiple chapters
		book_max_levels			=> 5,		#max levels of book - chapters - chapters - chapters ..
		book_max_chapters		=> 500,		#max chapters in 1 book
		catalog_max_pages		=> 500,		#max catalog pages
		get_agent_name			=> "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0; AIRF)",
		get_agent_proxy			=> "Default",
			#Default			Use default IE proxy
			#No					Don't use proxy
			#196.23.147.34:80	Use given proxy
			#Default;202.105.138.19:8080;202.110.220.14:80;...
			#					Use multiple proxy, one visit use one proxy in turn
		get_delay_second		=> 0,		#if get_delay_second>0 then delay get_delay_second+rand seconds
		get_delay_second_rand	=> 2,
		get_fail_showtype		=> '',		#''-show simplified info, 'Detail'-show detailed info
		get_file_directory		=> './saved',	#debug save and read file from this directory
		get_from_file			=> 0,		#0-normal operation, 1-get from file only
		get_language			=> "en",	#get headers: language
		get_max_retries			=> 5,		#max retries of 1 get
		get_save_file			=> 0,		#0-normal operation, 1-save to file for latter debug
		get_skip_zip			=> 1,		#skip fetch zip files
		get_skip_media			=> 1,		#skip fetch media files
		get_timeout				=> 40,		#get timeout
		get_trunk_size			=> 50000,	#define 1 trunk = xxxBytes for display
		get_trunk_fresh_size	=> 5000,	#if get size > xxxBytes, then refresh trunk display
		get_visited_url_num		=> 0,		#statistics of visted urls, to be used in get_from_file/get_save_file
		language_decode			=> "utf8",	#to read with encoding
		language_encode			=> "utf8",	#to save with encoding
		process_all				=> 0,		#process all pages of catalog
		result_no_crlf			=> 1,		#0-with crlf, 1-no crlf
		space_leading_remove 	=> 1,		#remove leading spaces
		space_leading_max		=> 20,		#max leading spaces
		space_inner_remove		=> 1,		#remove inner spaces
		space_inner_min_words	=> 5,		#minimal length of word with inner spaces
		text_paragraph_type		=> 'br',	#type of paragraph split methods
			# br		 one br as end of paragraph
			# brbr		 two br as end of paragraph
			# cr		 one cr as end of paragraph
			# crcr		 two cr as end of paragraph
			# crandspace one cr and followed with space as end of paragraph
		screen_limit_trunk		=> 25,		#max trunks to be displayed
		screen_limit_title		=> 14,		#max title to be displayed
	};
}
sub initialize {
	my $self = shift;

	# Initialize languages
	$self->{lang_encode}=find_encoding($self->{language_encode})
		if $self->{language_encode} ne '';
	$self->{lang_decode}=find_encoding($self->{language_decode})
		if $self->{language_decode} ne '';

	# Initialize messages
	$self->trandict_init;
	$self->msg_init;

	# Create work directory
	$self->work_dir( $self->{work_dir} );
	eval {mkpath($self->{work_dir})};
	$self->fatal_error("FailMkDir", dir=>$self->{work_dir}, errmsg=>$@) if $@;

	# Check debug directory
	$self->{get_file_directory_save}=catfile($self->{get_file_directory}, $self->get_alias());
	$self->{get_file_directory_read}=$self->{get_file_directory_save};
	$self->{get_file_directory_read}=~s,\\,/,sg;
	$self->{get_file_directory_read}=~s,/+$,,sg;
	eval {mkpath($self->{get_file_directory_save})}
		if $self->{get_from_file} or $self->{get_save_file};

	# Initialize patterns
	$self->{patterns} = {};
	foreach ($self->getpattern_lists) {
		my $sub="getpattern_".$_;
		my $sub_data=$sub."_data";
		$self->{patterns}->{$_} = $self->can($sub) ?
			$self->$sub : $self->parse_patterns($self->$sub_data);
	}

	# Content Type Initialize
	$self->contenttype_init;

	# Initialize LWP user agents
	$self->agent_init;

	# Initialize DB
	$self->db_init;
	$self->db_load;
	
	# Try to login
	$self->go_login;
}
sub work_dir {
	my ($self, $work_dir) = @_;
	return $self->{work_dir} if $work_dir eq '';

BookBot.pm  view on Meta::CPAN

		code		=>	$_[0]->code,
		req_content	=>	$_[0]->request->as_string,
		status_line	=>	$_[0]->status_line,
		res_content	=>	$_[0]->as_string,
		url_real	=>	$url_real,
	});
}

#-------------------------------------------------------------
# Parser utility functions
#	$bot->normalize_space($content_dein_deout)			=> N/A
#	$bot->remove_html($content_dein_deout)				=> N/A
#	$bot->decode_entity($content_dein_deout)			=> N/A
#	$bot->normalize_paragraph_1($content_dein_deout)	=> N/A
#	$bot->parse_title($content_dein_deout)				=> $content_deout
#	$bot->parse_titleen($content_dein_deout)			=> $content_enout
#	$bot->normalize_paragraph($content_dein_deout)		=> N/A
#		$bot->remove_line_by_end($content_dein_deout)
#		$bot->parse_paragraph_br($content_dein_deout)
#		$bot->parse_paragraph_brbr($content_dein_deout)
#		$bot->parse_paragraph_brandspace($content_dein_deout)
#		$bot->parse_paragraph_brbr_or_brandspace($content_dein_deout)
#		$bot->parse_paragraph_cr($content_dein_deout)
#		$bot->parse_paragraph_crcr($content_dein_deout)
#		$bot->parse_paragraph_crandspace($content_dein_deout)
#		$bot->remove_leadingspace($content_dein_deout)
#		$bot->remove_innerspace($content_dein_deout)
#-------------------------------------------------------------
sub normalize_space {
	$_[1]=~s/$_[0]->{patterns}->{space}/ /osg;
	$_[1]=~s/$_[0]->{patterns}->{space2}/  /osg;
}
sub remove_html {
	$_[1]=~s/$_[0]->{patterns}->{remove_html}//osg;
	$_[1]=~s/<[^<>]*>//osg;
}
sub decode_entity {
	$_[1]=~s/(?:&\#(\d{1,5});?)/chr($1)/esg;
	$_[1]=~s/(?:&\#[xX]([0-9a-fA-F]{1,5});?)/chr(hex($1))/esg;
	$_[1]=~s/(&([0-9a-zA-Z]{1,9});?)/$entity2char{$2} or $1/esg;
}
sub normalize_paragraph_1 {
	$_[1]=~s/^ +/ /mg;								#normalize spaces before paragraph
	$_[1]=~s/ +$//mg;								#remove spaces after paragraph
	$_[1]=~s/^ ?(?:$_[0]->{patterns}->{mark_dash} *){2,}/ ---/omg;
													#normalize repeated dash
	$_[1]=~s/\n{2,}/\n/sg;							#remove empty paragraph
	$_[1]=~s/(?: ---\n?){2,}/ ---\n/sg;				#remove too much dash line
	$_[1]=~s/(?:^\n|\n$)//s;						#remove leading and ending \n
	$_[1]=~s/$_[0]->{patterns}->{word_finish}//os;	#remove finish words
	$_[1]=~s/\n$//s;								#remove ending \n
}
sub parse_title {
	$_[0]->normalize_space($_[1]);
	$_[0]->remove_html($_[1]);
	$_[0]->decode_entity($_[1]);
	$_[1]=~s/\n+/ /sg;	# CRLF as space
	$_[0]->normalize_paragraph_1($_[1]);
	$_[1]=~s/ +/ /sg;	#remove extra spaces

	#remove ending space or wordsplit mark
	my $p1=$_[0]->{patterns}->{mark_wordsplit};
	$p1=~s/(?:^\[|\]$)//sg;
	$p1="[".$p1." ]";
	$_[1]=~s/$p1+$//os;

	#remove paraentheses
	$_[1]=~s/(?:^ +| +$)//sg;
	while($_[1]=~/^(?:$_[0]->{patterns}->{parentheses})$/os) {
		$_[1]=$^N;
		$_[1]=~s/(?:^ +| +$)//sg;
	}
	
	$_[1];
}
sub parse_titleen {
	$_[0]->en_code($_[0]->parse_title($_[1]));
}
sub normalize_paragraph {
	$_[0]->normalize_space($_[1]);
	$_[0]->parse_paragraph_begin($_[1]);
	my $sub="parse_paragraph_".$_[0]->{text_paragraph_type};
	$_[0]->$sub($_[1]);
	$_[0]->remove_html($_[1]);
	$_[0]->decode_entity($_[1]);
	$_[0]->normalize_paragraph_1($_[1]);
	$_[0]->remove_line_by_end($_[1]);
	$_[0]->normalize_paragraph_1($_[1]);
	$_[0]->parse_paragraph_end($_[1]);
	$_[1]=~s/ ?\$BOOKBOTRETURN\$//sg;		#remove for reserved return
	$sub='$_[1]=~s/^ /'.$_[0]->{patterns}->{line_head}.'/mg;'; #normalize with 4 spaces
	eval $sub;
}
sub remove_line_by_end {
	$_[1]=~s/(?:---\n|\n).*(?:$_[0]->{patterns}->{remove_line_by_end})$_[0]->{patterns}->{parentheses_right}?$//omg;
	$_[1]=~s/\n $_[0]->{patterns}->{remove_line_by_end_special}$//osg;
}
sub parse_paragraph_br {
	$_[1]=~s/\n//sg;
	$_[1]=~s/<[bB][rR]> */\n /sg;
}
sub parse_paragraph_brbr {
	$_[1]=~s/\n//sg;
	$_[1]=~s/(?:<[bB][rR]> *){2,}/\n /sg;
}
sub parse_paragraph_brandspace {
	$_[1]=~s/\n//sg;
	$_[1]=~s/<[bB][rR]>(?=[^ ])//sg;
	$_[1]=~s/<[bB][rR]> */\n /sg;
}
sub parse_paragraph_brbr_or_brandspace {
	$_[1]=~s/\n//sg;
	$_[1]=~s/<[bB][rR]>(?=[^ <])//sg;
	$_[1]=~s/(?:<[bB][rR]> *){1,}/\n /sg;
}
sub parse_paragraph_br_or_p {
	$_[1]=~s/\n/ /sg;
	$_[1]=~s/<[bB\/][rRpP]>/\n/sg;
}
sub parse_paragraph_cr {
}



( run in 1.776 second using v1.01-cache-2.11-cpan-71847e10f99 )