Aion-Format
view release on metacpan or search on metacpan
lib/Aion/Format/Html.pm view on Meta::CPAN
$make_page->() if @page;
# ÐÑли ÑÐ°Ð·Ð¼ÐµÑ Ð¿Ð¾Ñледней ÑÑÑаниÑÑ Ð¼ÐµÐ½ÑÑе Ñем 2/3, Ñо добавлÑем ÐµÑ Ðº пÑедпоÑледней
$pages[$#pages - 1] .= pop @pages if @pages > 1 and length($pages[$#pages]) < $symbols_on_page * 2 / 3;
#my $len = 0; $len += length for @pages;
#die "СÑммаÑнÑй ÑÐ°Ð·Ð¼ÐµÑ ÑÑÑÐ°Ð½Ð¸Ñ Ð½Ðµ изменилÑÑ: " . length($html) . " == $len pages=" . @pages . " ->\n\n$html" if $len == length $html;
my ($end1) = $html =~ m!([^<>\s]{1,13})\s*(</?\w[^<>]*>\s*)*$!a;
my ($end2) = $pages[$#pages] =~ m!([^<>\s]{1,13})\s*(</?\w[^<>]*>\s*)*$!a;
die "ÐонÑÑ ÑекÑÑа и поÑледней ÑÑÑаниÑÑ Ð½Ðµ ÑÑ
одÑÑÑÑ! `$end1` <> `$end2`" if $end1 ne $end2;
return @pages;
}
our %TAG2SPACE = (
"br" => "\n",
"dd" => "\n ",
"table" => "\n",
"tr" => "\n| ",
"td" => "\t| ",
"th" => "\t| ",
"ol" => "\n",
"ul" => "\n",
"li" => "\n* ",
"img" => " ",
"p" => "\n",
"div" => "\n",
);
our %CLOSE_TAG2SPACE = (
"div" => "\n",
"p" => "\n",
"table" => "\n",
"ol" => "\n",
"ul" => "\n",
);
our %WITH_CLOSE_TAG2SPACE = (
"p" => "\n",
"br" => "\n",
"img" => " ",
);
# пеÑÐµÐ²Ð¾Ð´Ð¸Ñ html в text
sub from_html (;$) {
local ($_) = @_? @_: $_;
# 1. УбиÑаем ÑнÑиÑиеÑ:
my $ent = sub {
exists $+{word}? (exists $ENTITIES{$+{word}}? chr $ENTITIES{$+{word}}: $&):
exists $+{num}? chr $+{num}:
exists $+{hex}? chr hex $+{hex}: ""
};
s{&(
(?<word>\w+)
|\#(?<num>\d+)
|\#x(?<hex>[a-f\d]+)
);?
}{$ent->()}genix;
my $pre;
my $to = sub {
my $s1 = $pre? $+{s1}: ($+{s1} eq ""? "": " ");
my $x =
exists $+{space}? ($pre? $+{space}: " "):
exists $+{nbsp}? " ":
exists $+{xhr}? $+{xhr}:
exists $+{tag}? do {
my $tag = lc $+{tag};
$pre = 1 if $tag eq "pre";
exists $+{close}? $WITH_CLOSE_TAG2SPACE{$tag}: $TAG2SPACE{$tag}
}:
exists $+{ctag}? do {
my $tag = lc $+{ctag};
$pre = 0 if $tag eq "pre";
$CLOSE_TAG2SPACE{$tag}
}:
"";
my $s2 = $pre? $+{s2}: ($+{s2} eq "" || $s1? "": " ");
$x =~ /\n/ ? $x: join "", $s1, $x, $s2
};
s{
(?<s1> \s*) (
<(script|style|template)\b [^<>]*> .*? </ \g1 \s* >
| <xhr \b [^<>]*> (?<xhr> .*? ) </xhr \s* >
| < (?<tag> [a-z]\w* ) [^<>]*? (?<close> / )? \s*>
| </ (?<ctag> [a-z]\w* ) \s*>
| <!--.*?-->
) (?<s2> \s*)
| (?<space> [\ \t\n\r\f]+)
| (?<nbsp> \xa0)
}{$to->()}genisx;
$_
}
# ÐÑе, кÑоме запÑеÑÑннÑÑ
:
# applet, script, style, embed, object, param,
# video, audio, source, track, frame, frameset, iframe, comment
# html, head, body, title, meta, base, basefont, bgsound, link
# form, keygen, output, textarea, select, option, optgroup, legend, label, input
# plaintext, xmp
# Ð Ñак же ÑдалÑÐµÑ Ð°ÑÑибÑÑÑ Ð½Ð°ÑинаÑÑиеÑÑ Ð½Ð° "on", name, for, formaction и дÑ..
my %SAFE_TAG = map {$_=>1} qw/
a
abbr
acronym
address
area
article
( run in 0.740 second using v1.01-cache-2.11-cpan-df04353d9ac )