Aion-Format

 view release on metacpan or  search on metacpan

lib/Aion/Format/Html.pm  view on Meta::CPAN


	# 1. Убираем энтитиес:
	my $ent = sub {
		exists $+{word}? (exists $ENTITIES{$+{word}}? chr $ENTITIES{$+{word}}: $&):
		exists $+{num}? chr $+{num}:
		exists $+{hex}? chr hex $+{hex}: ""
	};
	s{&(
		(?<word>\w+)
		|\#(?<num>\d+)
		|\#x(?<hex>[a-f\d]+)
	);?
	}{$ent->()}genix;

	my $pre;

	my $to = sub {
		my $s1 = $pre? $+{s1}: ($+{s1} eq ""? "": " ");
		
		my $x =
		exists $+{space}? ($pre? $+{space}: " "):
		exists $+{nbsp}? " ":
		exists $+{xhr}? $+{xhr}:
		exists $+{tag}? do {
			my $tag = lc $+{tag};
			$pre = 1 if $tag eq "pre";
			exists $+{close}? $WITH_CLOSE_TAG2SPACE{$tag}: $TAG2SPACE{$tag} 
		}:
		exists $+{ctag}? do {
			my $tag = lc $+{ctag};
			$pre = 0 if $tag eq "pre";
			
			$CLOSE_TAG2SPACE{$tag}
		}:
		"";
		
		my $s2 = $pre? $+{s2}: ($+{s2} eq "" || $s1? "": " ");
		
		$x =~ /\n/ ? $x: join "", $s1, $x, $s2
	};

	s{
		(?<s1> \s*) (
		
			  <(script|style|template)\b [^<>]*> .*? </ \g1 \s* >
			| <xhr \b [^<>]*> (?<xhr> .*? ) </xhr \s* >
			| < (?<tag> [a-z]\w* ) [^<>]*? (?<close> / )? \s*>
			| </ (?<ctag> [a-z]\w* ) \s*>
			| <!--.*?-->
		
		) (?<s2> \s*)
		| (?<space> [\ \t\n\r\f]+)
		| (?<nbsp> \xa0)
	}{$to->()}genisx;

	$_
}

# Все, кроме запрещённых:
#  applet, script, style, embed, object, param,
#  video, audio, source, track, frame, frameset, iframe, comment
#  html, head, body, title, meta, base, basefont, bgsound, link
#  form, keygen, output, textarea, select, option, optgroup, legend, label, input
#  plaintext, xmp
# А так же удаляет атрибуты начинающиеся на "on", name, for, formaction и др..
my %SAFE_TAG = map {$_=>1} qw/
a
abbr
acronym
address

area
article
aside

b
bdi
bdo

blockquote
big

blink
br
button

canvas
caption
center
cite
code
col
colgroup
command

datalist
dd
del
details
dfn
dir
div
dl
dt

em

figcaption
figure
font
footer

h1
h2
h3
h4
h5
h6
header
hgroup
hr

i
img

ins
isindex

kbd
keygen

li

main
map
marquee
mark
menu

meter

nav
nobr
noembed
noframes
noscript

ol

p
pre
progress

q

rp
rt
ruby

s
samp
section
small
span
strike
strong
   sub
summary
sup

table
tbody
td
tfoot
th
thead
time

tr
tt

u
ul

var

wbr
/;

my %SAFE_ATTR = map {$_=>1} qw/
pubdate datetime
open optimum

dir lang language style tabindex title high low hreflang icon

max min

href media ping rel rev name type

class

src

alt crossorigin decoding height width importance  intrinsicsize loading sizes srcset



( run in 2.264 seconds using v1.01-cache-2.11-cpan-e1769b4cff6 )