WWW-BookBot

 view release on metacpan or  search on metacpan

BookBot.pm  view on Meta::CPAN

		get_trunk_fresh_size	=> 5000,	#if get size > xxxBytes, then refresh trunk display
		get_visited_url_num		=> 0,		#statistics of visited urls, to be used in get_from_file/get_save_file
		language_decode			=> "utf8",	#to read with encoding
		language_encode			=> "utf8",	#to save with encoding
		process_all				=> 0,		#process all pages of catalog
		result_no_crlf			=> 1,		#0-with crlf, 1-no crlf
		space_leading_remove 	=> 1,		#remove leading spaces
		space_leading_max		=> 20,		#max leading spaces
		space_inner_remove		=> 1,		#remove inner spaces
		space_inner_min_words	=> 5,		#minimal length of word with inner spaces
		text_paragraph_type		=> 'br',	#type of paragraph split methods
			# br		 one br as end of paragraph
			# brbr		 two br as end of paragraph
			# cr		 one cr as end of paragraph
			# crcr		 two cr as end of paragraph
			# crandspace one cr and followed with space as end of paragraph
		screen_limit_trunk		=> 25,		#max trunks to be displayed
		screen_limit_title		=> 14,		#max title to be displayed
	};
}
sub initialize {

BookBot.pm  view on Meta::CPAN

	$_[1]=~s/\n$//s;								#remove ending \n
}
sub parse_title {
	$_[0]->normalize_space($_[1]);
	$_[0]->remove_html($_[1]);
	$_[0]->decode_entity($_[1]);
	$_[1]=~s/\n+/ /sg;	# CRLF as space
	$_[0]->normalize_paragraph_1($_[1]);
	$_[1]=~s/ +/ /sg;	#remove extra spaces

	#remove ending space or wordsplit mark
	my $p1=$_[0]->{patterns}->{mark_wordsplit};
	$p1=~s/(?:^\[|\]$)//sg;
	$p1="[".$p1." ]";
	$_[1]=~s/$p1+$//os;

	#remove parentheses wrapping the whole title
	$_[1]=~s/(?:^ +| +$)//sg;
	while($_[1]=~/^(?:$_[0]->{patterns}->{parentheses})$/os) {
		$_[1]=$^N;
		$_[1]=~s/(?:^ +| +$)//sg;



( run in 1.814 second using v1.01-cache-2.11-cpan-5511b514fd6 )