Aion-Format

 view release on metacpan or  search on metacpan

lib/Aion/Format/Html.pm  view on Meta::CPAN

	"p" => "\n",
	"table" => "\n",
	"ol" => "\n",
	"ul" => "\n",
);

our %WITH_CLOSE_TAG2SPACE = (
	"p" => "\n",
	"br" => "\n",
	"img" => " ",
);


# переводит html в text
sub from_html (;$) {
	local ($_) = @_? @_: $_;

	# 1. Убираем энтитиес:
	my $ent = sub {
		exists $+{word}? (exists $ENTITIES{$+{word}}? chr $ENTITIES{$+{word}}: $&):
		exists $+{num}? chr $+{num}:
		exists $+{hex}? chr hex $+{hex}: ""
	};
	s{&(
		(?<word>\w+)
		|\#(?<num>\d+)
		|\#x(?<hex>[a-f\d]+)
	);?
	}{$ent->()}genix;

	my $pre;

	my $to = sub {
		my $s1 = $pre? $+{s1}: ($+{s1} eq ""? "": " ");
		
		my $x =
		exists $+{space}? ($pre? $+{space}: " "):
		exists $+{nbsp}? " ":
		exists $+{xhr}? $+{xhr}:
		exists $+{tag}? do {
			my $tag = lc $+{tag};
			$pre = 1 if $tag eq "pre";
			exists $+{close}? $WITH_CLOSE_TAG2SPACE{$tag}: $TAG2SPACE{$tag} 
		}:
		exists $+{ctag}? do {
			my $tag = lc $+{ctag};
			$pre = 0 if $tag eq "pre";
			
			$CLOSE_TAG2SPACE{$tag}
		}:
		"";
		
		my $s2 = $pre? $+{s2}: ($+{s2} eq "" || $s1? "": " ");
		
		$x =~ /\n/ ? $x: join "", $s1, $x, $s2
	};

	s{
		(?<s1> \s*) (
		
			  <(script|style|template)\b [^<>]*> .*? </ \g1 \s* >
			| <xhr \b [^<>]*> (?<xhr> .*? ) </xhr \s* >
			| < (?<tag> [a-z]\w* ) [^<>]*? (?<close> / )? \s*>
			| </ (?<ctag> [a-z]\w* ) \s*>
			| <!--.*?-->
		
		) (?<s2> \s*)
		| (?<space> [\ \t\n\r\f]+)
		| (?<nbsp> \xa0)
	}{$to->()}genisx;

	$_
}

# Все, кроме запрещённых:
#  applet, script, style, embed, object, param,
#  video, audio, source, track, frame, frameset, iframe, comment
#  html, head, body, title, meta, base, basefont, bgsound, link
#  form, keygen, output, textarea, select, option, optgroup, legend, label, input
#  plaintext, xmp
# А так же удаляет атрибуты начинающиеся на "on", name, for, formaction и др..
my %SAFE_TAG = map {$_=>1} qw/
a
abbr
acronym
address

area
article
aside

b
bdi
bdo

blockquote
big

blink
br
button

canvas
caption
center
cite
code
col
colgroup
command

datalist
dd
del
details
dfn
dir
div
dl
dt

em

figcaption
figure
font
footer

h1
h2
h3
h4
h5
h6
header
hgroup

lib/Aion/Format/Html.pm  view on Meta::CPAN

main
map
marquee
mark
menu

meter

nav
nobr
noembed
noframes
noscript

ol

p
pre
progress

q

rp
rt
ruby

s
samp
section
small
span
strike
strong
   sub
summary
sup

table
tbody
td
tfoot
th
thead
time

tr
tt

u
ul

var

wbr
/;

my %SAFE_ATTR = map {$_=>1} qw/
pubdate datetime
open optimum

dir lang language style tabindex title high low hreflang icon

max min

href media ping rel rev name type

class

src

alt crossorigin decoding height width importance  intrinsicsize loading sizes srcset

align border hspace vspace longdesc  axis  char charoff summary

colspan rowspan

border cite bgcolor color

coords
/;

# срезает у html-я опасные, а так же неведомые теги
sub safe_html($;$) {
	(local $_, my $link) = @_;

	my $f = sub {
		return "" if !exists $SAFE_TAG{lc $2};
		return "</$2>" if $1 ne "";
		my $tag = $2;
		my $x = $3;
		my @attrs;
		while($x =~ /
			\b (?<attr> [a-z][a-z\d]*) ( \s*=\s* ( (?<quot> ") (?<val> [^"]*)" | (?<quot> ') (?<val> [^']*)' | (?<val> \S*) ) )?
		/gixn) {
			push @attrs, $+{val} eq ""? " $+{attr}"
				: join "", " ", $+{attr}, "=", $+{quot},
					lc $+{attr} ~~ [qw/src href/]
						? Aion::Format::Url::normalize_url($+{val}, $link)
						: $+{val},
					$+{quot}
				if exists $SAFE_ATTR{lc $+{attr}};
		}

		push @attrs, " target=_blank" if lc $tag eq "a";

		"<$tag@attrs>"
	};

	s{<(/\s*)?([a-z][a-z\d:-]*)([^<>]*)>|<!--(?:.*?)-->}{ $f->() }igse;

	$_
}

1;

__END__

=encoding utf-8

=head1 NAME



( run in 0.725 second using v1.01-cache-2.11-cpan-39bf76dae61 )