Aion-Format

 view release on metacpan or  search on metacpan

lib/Aion/Format/Html.pm  view on Meta::CPAN

	$make_page->() if @page;

	# Если размер последней страницы меньше чем 2/3, то добавляем её к предпоследней
	$pages[$#pages - 1] .= pop @pages if @pages > 1 and length($pages[$#pages]) < $symbols_on_page * 2 / 3;

	#my $len = 0; $len += length for @pages;
	#die "Суммарный размер страниц не изменился: " . length($html) . " == $len pages=" . @pages . " ->\n\n$html" if $len == length $html;

	my ($end1) = $html =~ m!([^<>\s]{1,13})\s*(</?\w[^<>]*>\s*)*$!a;
	my ($end2) = $pages[$#pages] =~ m!([^<>\s]{1,13})\s*(</?\w[^<>]*>\s*)*$!a;
	die "Концы текста и последней страницы не сходятся! `$end1` <> `$end2`" if $end1 ne $end2;

	return @pages;
}

our %TAG2SPACE = (
	"br" => "\n",
	"dd" => "\n  ",
	"table" => "\n",
	"tr" => "\n| ",
	"td" => "\t| ",
	"th" => "\t| ",
	"ol" => "\n",
	"ul" => "\n",
	"li" => "\n* ",
	"img" => " ",
	"p" => "\n",
	"div" => "\n",
);

our %CLOSE_TAG2SPACE = (
	"div" => "\n",
	"p" => "\n",
	"table" => "\n",
	"ol" => "\n",
	"ul" => "\n",
);

our %WITH_CLOSE_TAG2SPACE = (
	"p" => "\n",
	"br" => "\n",
	"img" => " ",
);


# переводит html в text
sub from_html (;$) {
	local ($_) = @_? @_: $_;

	# 1. Убираем энтитиес:
	my $ent = sub {
		exists $+{word}? (exists $ENTITIES{$+{word}}? chr $ENTITIES{$+{word}}: $&):
		exists $+{num}? chr $+{num}:
		exists $+{hex}? chr hex $+{hex}: ""
	};
	s{&(
		(?<word>\w+)
		|\#(?<num>\d+)
		|\#x(?<hex>[a-f\d]+)
	);?
	}{$ent->()}genix;

	my $pre;

	my $to = sub {
		my $s1 = $pre? $+{s1}: ($+{s1} eq ""? "": " ");
		
		my $x =
		exists $+{space}? ($pre? $+{space}: " "):
		exists $+{nbsp}? " ":
		exists $+{xhr}? $+{xhr}:
		exists $+{tag}? do {
			my $tag = lc $+{tag};
			$pre = 1 if $tag eq "pre";
			exists $+{close}? $WITH_CLOSE_TAG2SPACE{$tag}: $TAG2SPACE{$tag} 
		}:
		exists $+{ctag}? do {
			my $tag = lc $+{ctag};
			$pre = 0 if $tag eq "pre";
			
			$CLOSE_TAG2SPACE{$tag}
		}:
		"";
		
		my $s2 = $pre? $+{s2}: ($+{s2} eq "" || $s1? "": " ");
		
		$x =~ /\n/ ? $x: join "", $s1, $x, $s2
	};

	s{
		(?<s1> \s*) (
		
			  <(script|style|template)\b [^<>]*> .*? </ \g1 \s* >
			| <xhr \b [^<>]*> (?<xhr> .*? ) </xhr \s* >
			| < (?<tag> [a-z]\w* ) [^<>]*? (?<close> / )? \s*>
			| </ (?<ctag> [a-z]\w* ) \s*>
			| <!--.*?-->
		
		) (?<s2> \s*)
		| (?<space> [\ \t\n\r\f]+)
		| (?<nbsp> \xa0)
	}{$to->()}genisx;

	$_
}

# Все, кроме запрещённых:
#  applet, script, style, embed, object, param,
#  video, audio, source, track, frame, frameset, iframe, comment
#  html, head, body, title, meta, base, basefont, bgsound, link
#  form, keygen, output, textarea, select, option, optgroup, legend, label, input
#  plaintext, xmp
# А так же удаляет атрибуты начинающиеся на "on", name, for, formaction и др..
my %SAFE_TAG = map {$_=>1} qw/
a
abbr
acronym
address

area
article



( run in 0.740 second using v1.01-cache-2.11-cpan-df04353d9ac )