Text-HTML-Turndown

 view release on metacpan or  search on metacpan

lib/Text/HTML/CollapseWhitespace.pm  view on Meta::CPAN

package Text::HTML::CollapseWhitespace 0.12;
use 5.020;
use experimental 'signatures';
use stable 'postderef';
use Exporter 'import';

require Text::HTML::Turndown::Node;

our @EXPORT_OK = ('collapseWhitespace');

=head1 NAME

Text::HTML::CollapseWhitespace - remove extraneous whitespace from a fragment

=head1 SYNOPSIS

  use Text::HTML::CollapseWhitespace 'collapseWhitespace';

  my $tree = XML::LibXML->new->parse_html_string(
      $input,
      { recover => 2, encoding => 'UTF-8' }
  );
  collapseWhitespace( element => $tree );

=cut

=head1 FUNCTIONS

=head2 C<< collapseWhitespace (%options) >>

  collapseWhitespace( element => $tree,
      isVoid  => \&_isVoid,
  )

  our @voidElements = (
    'AREA', 'BASE', 'BR', 'COL', 'COMMAND', 'EMBED', 'HR', 'IMG', 'INPUT',
    'KEYGEN', 'LINK', 'META', 'PARAM', 'SOURCE', 'TRACK', 'WBR'
  );
  our %voidElements = map { $_ => 1, lc $_ => 1 } @voidElements;
  sub _isVoid( $element ) {
      $voidElements{ $element->nodeName }
  }

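This function modifies the tree passed as C<element> in-place and removes
extraneous whitespace from its text nodes. The C<isPre>, C<isVoid> and
C<isBlock> predicates allow you to customize which elements are recognized as
pre, void or block HTML elements if needed. Each predicate receives a node and
returns a true value if the node belongs to that category. If omitted,
C<isBlock> and C<isVoid> default to the C<_isBlock> and C<_isVoid> functions
from L<Text::HTML::Turndown::Node>, and C<isPre> defaults to a check for
elements named C<PRE> (case-insensitively).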

=cut

sub collapseWhitespace (%options) {
  my $element = $options{ element }; # should be XML::LibXML
  my $isBlock = $options{ isBlock } // \&Text::HTML::Turndown::Node::_isBlock;
  my $isVoid  = $options{ isVoid  } // \&Text::HTML::Turndown::Node::_isVoid;
  my $isPre   = $options{ isPre } || sub ($node) {
    return uc($node->nodeName) eq 'PRE'
  };

  return
      if (!$element->firstChild || $isPre->($element));

  my $prevText;
  my $keepLeadingWs;

  my $prev;
  my $node = _next($prev, $element, $isPre);

  while (! $node->isEqual($element)) {
    if ($node->nodeType == 3 || $node->nodeType == 4) { # Node.TEXT_NODE or Node.CDATA_SECTION_NODE
      my $text = $node->data =~ s/[ \r\n\t]+/ /gr; # we only want to fold ASCII whitespace here

      if ((!$prevText || $prevText->data =~ / $/) &&
          !$keepLeadingWs && substr($text,0,1) eq ' ') {
        $text = substr($text, 1);
      }

      # `text` might be empty at this point.
      if (!$text) {
        $node = remove($node);
        next;
      }

      $node->setData( $text );

      $prevText = $node
    } elsif ($node->nodeType == 1) { # Node.ELEMENT_NODE
      if ($isBlock->($node) || uc $node->nodeName eq 'BR') {
        if ($prevText) {
            $prevText->setData( $prevText->data =~ s/ $//r );
        }
        undef $prevText;
        undef $keepLeadingWs;
      } elsif ($isVoid->($node) || $isPre->($node)) {
        # Avoid trimming space around non-block, non-BR void elements and inline PRE.
        undef $prevText;
        $keepLeadingWs = 1;



( run in 2.061 seconds using v1.01-cache-2.11-cpan-71847e10f99 )