Aion-Format
view release on metacpan or search on metacpan
lib/Aion/Format/Html.pm view on Meta::CPAN
le => 8804,
# ⥠болÑÑе или Ñавно = is greater or equal to ≥ ≥
ge => 8805,
# â подмножеÑÑво = is subset of ⊂ ⊂
sub => 8834,
# â вклÑÑÐ°ÐµÑ Ð² ÑÐµÐ±Ñ = is superset of ⊃ ⊃
sup => 8835,
# â не ÑвлÑеÑÑÑ Ð¿Ð¾Ð´Ð¼Ð½Ð¾Ð¶ÐµÑÑвом = is not subset of ⊄ ⊄
nsub => 8836,
# â подмножеÑÑво или ÑквиваленÑно = is subset of or equivalent to ⊆ ⊆
sube => 8838,
# â вклÑÑÐ°ÐµÑ Ð² ÑÐµÐ±Ñ Ð¸Ð»Ð¸ ÑквиваленÑно = is superset of or equivalent to ⊇ ⊇
supe => 8839,
# â плÑÑ Ð² кÑÑге = пÑÑÐ¼Ð°Ñ ÑÑмма ⊕ ⊕
oplus => 8853,
# â Ñмножение [коÑÑм] кÑеÑÑом в кÑÑге = ÑензоÑное пÑоизведение ⊗ ⊗
otimes => 8855,
# ⥠up tack = оÑÑогоналÑно к = пеÑпендикÑлÑÑно ⊥ ⊥
perp => 8869,
# â
Ñмножение ÑоÑкой = ÑкалÑÑное пÑоизведение ⋅ ⋅
sdot => 8901,
#
#
# ТеÑ
ниÑеÑкие ÑимволÑ
# Символ ÐазнаÑение Ðнемоника Ðод
#
# â Ð»ÐµÐ²Ð°Ñ Ñкобка окÑÑÐ³Ð»ÐµÐ½Ð¸Ñ Ð²Ð²ÐµÑÑ
= apl upstile ⌈ ⌈
lceil => 8968,
# â пÑÐ°Ð²Ð°Ñ Ñкобка окÑÑÐ³Ð»ÐµÐ½Ð¸Ñ Ð²Ð²ÐµÑÑ
⌉ ⌉
rceil => 8969,
# â Ð»ÐµÐ²Ð°Ñ Ñкобка окÑÑÐ³Ð»ÐµÐ½Ð¸Ñ Ð²Ð½Ð¸Ð· = apl downstile ⌊ ⌊
lfloor => 8970,
# â пÑÐ°Ð²Ð°Ñ Ñкобка окÑÑÐ³Ð»ÐµÐ½Ð¸Ñ Ð²Ð½Ð¸Ð· ⌋ ⌋
rfloor => 8971,
# ⨠ÑÐ³Ð»Ð¾Ð²Ð°Ñ Ñкобка влево = бÑа ⟨ 〈
lang => 9001,
# â© ÑÐ³Ð»Ð¾Ð²Ð°Ñ Ñкобка впÑаво = ÐºÐµÑ ⟩ 〉
rang => 9002,
#
#
# ÐеомеÑÑиÑеÑкие ÑигÑÑÑ
# Символ ÐазнаÑение Ðнемоника Ðод
#
# â Ñомб ◊ ◊
loz => 9674,
#
#
# РазлиÑнÑе ÑимволÑ
# Символ ÐазнаÑение Ðнемоника Ðод
#
# â «пики» (каÑÑоÑÐ½Ð°Ñ Ð¼Ð°ÑÑÑ) ♠ ♠
spades => 9824,
# ⣠«ÑÑеÑÑ» (каÑÑоÑÐ½Ð°Ñ Ð¼Ð°ÑÑÑ) ♣ ♣
clubs => 9827,
# ⥠«ÑеÑвÑ» (каÑÑоÑÐ½Ð°Ñ Ð¼Ð°ÑÑÑ) ♥ ♥
hearts => 9829,
# ⦠«бÑбнÑ» (каÑÑоÑÐ½Ð°Ñ Ð¼Ð°ÑÑÑ) ♦ ♦
diams => 9830,
);
sub _set(@) { +{ map { $_ => 1 } @_ } }
# Теги не имеÑÑие закÑÑваÑÑего Ñега
our %SINGLE_TAG = %{ _set qw/area base br col embed hr img input link meta param source track wbr/ };
# <li> закÑÑваеÑÑÑ, еÑли пÑиÑ
Ð¾Ð´Ð¸Ñ </ol> или </ul>
our %TOP_CLOSE_TAG = (
li => _set(qw/ol ul/),
caption => _set(qw/table/),
thead => _set(qw/table/),
tbody => _set(qw/table/),
tfoot => _set(qw/table/),
tr => _set(qw/table thead tbody tfoot caption/),
td => _set(qw/table thead tbody tfoot caption/),
th => _set(qw/table thead tbody tfoot caption/),
dt => _set(qw/dl/),
dd => _set(qw/dl/),
rt => _set(qw/ruby/),
rp => _set(qw/ruby/),
option => _set(qw/optgroup select/),
optgroup => _set(qw/select/),
);
# <tr> закÑÑÐ²Ð°ÐµÑ Ð¾ÑкÑÑÑÑе <td> и <th> и <tr>
our %TOP_NEW_TAG = (
head => _set(qw/body/),
tr => _set(qw/tr thead tbody tfoot/),
td => _set(qw/tr thead tbody tfoot td th/),
th => _set(qw/tr thead tbody tfoot td th/),
col => _set(qw/tr thead tbody tfoot td th colgroup/),
colgroup => _set(qw/thead tbody tfoot caption colgroup/),
caption => _set(qw/thead tbody tfoot caption/),
thead => _set(qw/thead tbody tfoot caption/),
tbody => _set(qw/thead tbody tfoot caption/),
tfoot => _set(qw/thead tbody tfoot caption/),
li => _set(qw/li/),
dt => _set(qw/dt dd/),
dd => _set(qw/dt dd/),
rt => _set(qw/rt rp/),
rp => _set(qw/rt rp/),
option => _set(qw/option optgroup/),
optgroup => _set(qw/optgroup/),
);
# ÐÑовеÑÑеÑ, ÑÑо Ñег â одиноÑнÑй
sub is_single_tag(;$) {
exists $SINGLE_TAG{$_[0] // $_}
}
# ÐабÑаÑÑÐ²Ð°ÐµÑ Ñег в ÑÑек и возвÑаÑÐ°ÐµÑ Ð·Ð°ÐºÑÑÑÑе
sub in_tag(\@$$) {
my ($S, $tag, $atag) = @_;
# ÐÑбÑаÑÑваем из ÑÑека пÑедÑдÑÑий Ñег
my @ret;
push @ret, pop @$S while @$S and
($TOP_NEW_TAG{ $S->[$#$S][0] })->{$tag};
push @$S, [$tag, $atag] unless exists $SINGLE_TAG{$tag};
@ret
}
# ÐÑбÑаÑÑÐ²Ð°ÐµÑ Ñег из ÑÑека. ÐозвÑаÑÐ°ÐµÑ Ð²ÑдавленнÑе им и вÑбÑаÑÑÐ²Ð°ÐµÑ Ð¸ÑклÑÑение, еÑли Ñакого в ÑÑеке неÑ
sub out_tag(\@$) {
my ($S, $tag) = @_;
# ÐÑо одиноÑнÑй Ñег - он не Ð¼Ð¾Ð¶ÐµÑ Ð±ÑÑÑ Ð·Ð°ÐºÑÑваÑÑим
die "</$tag> is a single tag - it cannot be a closing tag" if exists $SINGLE_TAG{$tag};
# закÑÑваем пÑедÑдÑÑий, еÑли нÑжно
my @ret;
push @ret, pop @$S while @$S and
($TOP_CLOSE_TAG{$S->[$#$S][0]})->{$tag};
die "</$tag>, but stack is empty!" unless @$S;
# Ñег Ñавен закÑÑваÑÑемÑ
die "<$S->[$#$S][0]> in stack ne </$tag>!" if $S->[$#$S][0] ne $tag;
push @ret, pop @$S;
@ret
}
# Ð Ð°Ð·Ð±Ð¸Ð²Ð°ÐµÑ ÑекÑÑ Ð½Ð° ÑÑÑаниÑÑ Ñ ÑÑÑÑом html-Ñегов
#
# 1. html-Ñег должен бÑÑÑ Ñак же ÑазнеÑÑн на ÑÑÑаниÑÑ.
#
sub split_on_pages(@) {
my ($html, $symbols_on_page, $by) = @_;
# Ðа какое ÑаÑÑÑоÑние ÑÑÑаниÑа Ð¼Ð¾Ð¶ÐµÑ Ð±ÑÑÑ Ð±Ð¾Ð»ÑÑе
$by //= $symbols_on_page / 3 < 1000 ? int($symbols_on_page / 3): 1000;
my $max = $symbols_on_page + $by;
my @pages; # маÑÑив ÑÑÑаниÑ
my @page; # маÑÑив ÑлеменÑов ÑекÑÑа и Ñегов ÑекÑÑей ÑÑÑаниÑÑ
my $c = 0; # колиÑеÑÑво Ñимволов в ÑекÑÑей ÑÑÑаниÑе
my $i_page = 0; # Ð¸Ð½Ð´ÐµÐºÑ ÑлеменÑа @page коÑоÑÑй пÑивÑÑил ÑÐ°Ð·Ð¼ÐµÑ ÑÑÑаниÑÑ
my $c_page = 0; # колиÑеÑÑво Ñимволов @page по $i_page
my $is_proposal = 0; # РконÑе ÑекÑÑей ÑÑÑаниÑÑ Ð¾Ð±Ð½Ð°ÑÑжен ÐºÐ¾Ð½ÐµÑ Ð¿ÑедложениÑ
my $re_proposal = qr/[.?!â¦]/;
my @S; # маÑÑив оÑкÑÑваÑÑиÑ
Ñегов [tag, '<tag ...>']
# ФÑнкÑÐ¸Ñ ÑикÑиÑÑÐµÑ ÑÑÑаниÑÑ Ð¸ ÑбÑаÑÑÐ²Ð°ÐµÑ ÑÑÑÑÑики
my $make_page = sub {
push @pages, join "", @page, map { "</$_->[0]>" } reverse @S;
$i_page = $c = $is_proposal = 0;
@page = map $_->[1], @S;
};
for(grep length, split m{(
<[a-z] [^<>]* >
| </ \s* [a-z]\w* \s* >
| &(?: [a-z]\w* | \# \d+ | \#x[0-9a-f]+ ) ;?
| \n # ÐбзаÑ
| $re_proposal+ # ÐÑедложение
| \b # Слово
)}xiu, $html) {
if(/^&/) {$c++} # html-Ñимвол
elsif(/^<\/\s*([a-z]\w*)/) { # закÑÑваÑÑий Ñег
my $tag = lc $1;
eval { out_tag @S, $tag };
next if $@;
# </p> пÑевÑаÑаем в <p></p>
$_ = "<p></p>" if $tag eq "p";
}
elsif(/^<([a-z]\w*)/) { in_tag @S, lc $1, $_ } # Ñег
else {$c += length} # ÑекÑÑ
push @page, $_; # накапливаем ÑÐ¸Ð¼Ð²Ð¾Ð»Ñ Ð² маÑÑиве @page
next if $c < $symbols_on_page; # ÑÑÑаниÑÑ Ð½Ðµ набÑали - Ñогда на next
$c_page = $c, $i_page = @page if !$i_page;
# ÐÑоÑмаÑÑиваем впеÑÑд пока не найдÑм гÑаниÑÑ Ð¸Ð»Ð¸ не доÑÑигнем огÑаниÑениÑ
if(/^\n/) { $make_page->() } # ÐбзаÑ
elsif(!$is_proposal && /^$re_proposal/) { $i_page = @page; $c_page = $c; $is_proposal = 1 }
elsif($c >= $max) {
# ÐÑли ÑледÑÑÑий за пÑедложением или Ñловом ÑÐ»ÐµÐ¼ÐµÐ½Ñ - пÑобелÑ, Ñо добавлÑем иÑ
к ÑÑÑаниÑе
$c_page -= length $page[$i_page++] if $page[$i_page] =~ /^\s/;
my @x = splice @page, $i_page;
$make_page->();
push @page, @x;
$c -= $c_page;
}
}
$make_page->() if @page;
# ÐÑли ÑÐ°Ð·Ð¼ÐµÑ Ð¿Ð¾Ñледней ÑÑÑаниÑÑ Ð¼ÐµÐ½ÑÑе Ñем 2/3, Ñо добавлÑем ÐµÑ Ðº пÑедпоÑледней
$pages[$#pages - 1] .= pop @pages if @pages > 1 and length($pages[$#pages]) < $symbols_on_page * 2 / 3;
#my $len = 0; $len += length for @pages;
#die "СÑммаÑнÑй ÑÐ°Ð·Ð¼ÐµÑ ÑÑÑÐ°Ð½Ð¸Ñ Ð½Ðµ изменилÑÑ: " . length($html) . " == $len pages=" . @pages . " ->\n\n$html" if $len == length $html;
my ($end1) = $html =~ m!([^<>\s]{1,13})\s*(</?\w[^<>]*>\s*)*$!a;
my ($end2) = $pages[$#pages] =~ m!([^<>\s]{1,13})\s*(</?\w[^<>]*>\s*)*$!a;
die "ÐонÑÑ ÑекÑÑа и поÑледней ÑÑÑаниÑÑ Ð½Ðµ ÑÑ
одÑÑÑÑ! `$end1` <> `$end2`" if $end1 ne $end2;
return @pages;
}
our %TAG2SPACE = (
"br" => "\n",
"dd" => "\n ",
"table" => "\n",
lib/Aion/Format/Html.pm view on Meta::CPAN
exists $+{hex}? chr hex $+{hex}: ""
};
s{&(
(?<word>\w+)
|\#(?<num>\d+)
|\#x(?<hex>[a-f\d]+)
);?
}{$ent->()}genix;
my $pre;
my $to = sub {
my $s1 = $pre? $+{s1}: ($+{s1} eq ""? "": " ");
my $x =
exists $+{space}? ($pre? $+{space}: " "):
exists $+{nbsp}? " ":
exists $+{xhr}? $+{xhr}:
exists $+{tag}? do {
my $tag = lc $+{tag};
$pre = 1 if $tag eq "pre";
exists $+{close}? $WITH_CLOSE_TAG2SPACE{$tag}: $TAG2SPACE{$tag}
}:
exists $+{ctag}? do {
my $tag = lc $+{ctag};
$pre = 0 if $tag eq "pre";
$CLOSE_TAG2SPACE{$tag}
}:
"";
my $s2 = $pre? $+{s2}: ($+{s2} eq "" || $s1? "": " ");
$x =~ /\n/ ? $x: join "", $s1, $x, $s2
};
s{
(?<s1> \s*) (
<(script|style|template)\b [^<>]*> .*? </ \g1 \s* >
| <xhr \b [^<>]*> (?<xhr> .*? ) </xhr \s* >
| < (?<tag> [a-z]\w* ) [^<>]*? (?<close> / )? \s*>
| </ (?<ctag> [a-z]\w* ) \s*>
| <!--.*?-->
) (?<s2> \s*)
| (?<space> [\ \t\n\r\f]+)
| (?<nbsp> \xa0)
}{$to->()}genisx;
$_
}
# ÐÑе, кÑоме запÑеÑÑннÑÑ
:
# applet, script, style, embed, object, param,
# video, audio, source, track, frame, frameset, iframe, comment
# html, head, body, title, meta, base, basefont, bgsound, link
# form, keygen, output, textarea, select, option, optgroup, legend, label, input
# plaintext, xmp
# Ð Ñак же ÑдалÑÐµÑ Ð°ÑÑибÑÑÑ Ð½Ð°ÑинаÑÑиеÑÑ Ð½Ð° "on", name, for, formaction и дÑ..
my %SAFE_TAG = map {$_=>1} qw/
a
abbr
acronym
address
area
article
aside
b
bdi
bdo
blockquote
big
blink
br
button
canvas
caption
center
cite
code
col
colgroup
command
datalist
dd
del
details
dfn
dir
div
dl
dt
em
figcaption
figure
font
footer
h1
h2
h3
h4
h5
h6
header
hgroup
hr
i
img
ins
isindex
kbd
keygen
li
main
map
marquee
mark
menu
meter
nav
nobr
noembed
noframes
noscript
ol
p
pre
progress
q
rp
rt
ruby
s
samp
section
small
span
strike
strong
sub
summary
sup
table
tbody
td
tfoot
th
thead
time
tr
tt
u
ul
var
wbr
/;
my %SAFE_ATTR = map {$_=>1} qw/
pubdate datetime
open optimum
dir lang language style tabindex title high low hreflang icon
max min
href media ping rel rev name type
class
src
alt crossorigin decoding height width importance intrinsicsize loading sizes srcset
align border hspace vspace longdesc axis char charoff summary
colspan rowspan
border cite bgcolor color
coords
/;
# ÑÑÐµÐ·Ð°ÐµÑ Ñ html-Ñ Ð¾Ð¿Ð°ÑнÑе, а Ñак же неведомÑе Ñеги
sub safe_html($;$) {
(local $_, my $link) = @_;
my $f = sub {
return "" if !exists $SAFE_TAG{lc $2};
return "</$2>" if $1 ne "";
my $tag = $2;
my $x = $3;
my @attrs;
while($x =~ /
\b (?<attr> [a-z][a-z\d]*) ( \s*=\s* ( (?<quot> ") (?<val> [^"]*)" | (?<quot> ') (?<val> [^']*)' | (?<val> \S*) ) )?
/gixn) {
push @attrs, $+{val} eq ""? " $+{attr}"
: join "", " ", $+{attr}, "=", $+{quot},
lc $+{attr} ~~ [qw/src href/]
? Aion::Format::Url::normalize_url($+{val}, $link)
: $+{val},
$+{quot}
if exists $SAFE_ATTR{lc $+{attr}};
}
push @attrs, " target=_blank" if lc $tag eq "a";
"<$tag@attrs>"
};
s{<(/\s*)?([a-z][a-z\d:-]*)([^<>]*)>|<!--(?:.*?)-->}{ $f->() }igse;
$_
}
1;
__END__
( run in 1.305 second using v1.01-cache-2.11-cpan-39bf76dae61 )