Text-HTML-Turndown
view release on metacpan or search on metacpan
lib/Text/HTML/CollapseWhitespace.pm view on Meta::CPAN
package Text::HTML::CollapseWhitespace 0.12;
use 5.020;
use experimental 'signatures';
use stable 'postderef';
use Exporter 'import';
require Text::HTML::Turndown::Node;
our @EXPORT_OK = ('collapseWhitespace');
=head1 NAME
Text::HTML::CollapseWhitespace - remove extraneous whitespace from a fragment
=head1 SYNOPSIS
my $tree = XML::LibXML->new->parse_html_string(
$input,
{ recover => 2, encoding => 'UTF-8' }
);
collapseWhitespace( element => $tree ); # modifies $tree in-place
=cut
=head1 FUNCTIONS
=head2 C<< collapseWhitespace (%options) >>
collapseWhitespace( element => $tree,
isVoid => \&_isVoid,
)
our @voidElements = (
'AREA', 'BASE', 'BR', 'COL', 'COMMAND', 'EMBED', 'HR', 'IMG', 'INPUT',
'KEYGEN', 'LINK', 'META', 'PARAM', 'SOURCE', 'TRACK', 'WBR'
);
our %voidElements = map { $_ => 1, lc $_ => 1 } @voidElements;
sub _isVoid( $element ) {
$voidElements{ $element->nodeName }
}
This function modifies the tree in-place and removes extraneous whitespace
from the elements. The C<isPre>, C<isVoid> and C<isBlock> predicates allow you
to customize which elements are recognized as pre, void or block HTML elements
if needed. Each predicate is called with a single node and should return a
true value if the node belongs to that category.
=cut
sub collapseWhitespace (%options) {
my $element = $options{ element }; # should be XML::LibXML
my $isBlock = $options{ isBlock } // \&Text::HTML::Turndown::Node::_isBlock;
my $isVoid = $options{ isVoid } // \&Text::HTML::Turndown::Node::_isVoid;
my $isPre = $options{ isPre } || sub ($node) {
return uc($node->nodeName) eq 'PRE'
};
return
if (!$element->firstChild || $isPre->($element));
my $prevText;
my $keepLeadingWs;
my $prev;
my $node = _next($prev, $element, $isPre);
while (! $node->isEqual($element)) {
if ($node->nodeType == 3 || $node->nodeType == 4) { # Node.TEXT_NODE or Node.CDATA_SECTION_NODE
my $text = $node->data =~ s/[ \r\n\t]+/ /gr; # we only want to fold ASCII whitespace here
if ((!$prevText || $prevText->data =~ / $/) &&
!$keepLeadingWs && substr($text,0,1) eq ' ') {
$text = substr($text, 1);
}
# `text` might be empty at this point.
if (!$text) {
$node = remove($node);
next;
}
$node->setData( $text );
$prevText = $node
} elsif ($node->nodeType == 1) { # Node.ELEMENT_NODE
if ($isBlock->($node) || uc $node->nodeName eq 'BR') {
if ($prevText) {
$prevText->setData( $prevText->data =~ s/ $//r );
}
undef $prevText;
undef $keepLeadingWs;
} elsif ($isVoid->($node) || $isPre->($node)) {
# Avoid trimming space around non-block, non-BR void elements and inline PRE.
undef $prevText;
$keepLeadingWs = 1;
( run in 2.061 seconds using v1.01-cache-2.11-cpan-71847e10f99 )