Test.*Differences results from the CPAN

HTML-Toc
view release on metacpan or search on metacpan
ï»¿#!/usr/bin/perl -w
use strict;
use HTML::Toc;
use HTML::TocInsertor;
use HTML::Entities;
use Encode;

use Test::More tests => 7;
use Test::Differences;


# http://search.cpan.org/dist/HTML-Toc/Toc.pod#templateAnchorName
sub assembleAnchorName() {
    my ($aFile, $aGroupId, $aLevel, $aNode, $text, $children) = @_;

    if ($text !~ /^\s*$/) {
        # generate a SEO-friendly anchor right from the token content
        # The allowed character set is limited first by the URI specification for fragments, http://tools.ietf.org/html/rfc3986#section-2: characters
        # then by the limitations of the values of 'id' and 'name' attributes: http://www.w3.org/TR/REC-html40/types.html#type-name
        # Eventually, the only punctuation allowed in id values is [_.:-]
        # Unicode characters with code points > 0x7E (e.g. Chinese characters) are allowed (test "<h1 id="è¡Œæ”¿åŒºåŸŸ">header</h1>" at http://validator.w3.org/#validate_by_input+with_options), except for smart quotes (!), see http://www.w3.org/Searc...
        # However, that contradicts the HTML 4.01 spec: "Anchor names should be restricted to ASCII characters." - http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
        # ...and the [A-Za-z] class of letters mentioned at http://www.w3.org/TR/REC-html40/types.html#type-name
        # Finally, note that pod2html fails miserably to generate XHTML-compliant anchor links. See http://validator.w3.org/check?uri=http%3A%2F%2Fsearch.cpan.org%2Fdist%2FCatalyst-Runtime%2Flib%2FCatalyst%2FRequest.pm&charset=(detect+automatically)&...
        $text =~ s/\s/_/g;
        decode_entities($text);  # we need to replace [#&;] only when they are NOT part of an HTML entity. decode_entities saves us from crafting a nasty regexp
        $text = encode('utf-8', $text);  # convert to UTF-8 because we need to output the UTF-8 bytes
        $text =~ s/([^A-Za-z0-9_:.-])/sprintf('.%02X', ord($1))/eg;  # MediaWiki also uses the period, see http://en.wikipedia.org/wiki/Hierarchies#Ethics.2C_behavioral_psychology.2C_philosophies_of_identity
        $text = 'L'.$text if $text =~ /\A\W/; # "ID and NAME tokens must begin with a letter ([A-Za-z])" -- http://www.w3.org/TR/html4/types.html#type-name
    }
    $text = 'id' if $text eq '';

    # check if the anchor already exists; if so, add a number
    # NOTE: there is no way currently to do this easily in HTML-Toc-1.10.

    #my $anch_num = 1;
    #my $word_name = $name;
    ## Reference: http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
    ## Anchor names must be unique within a document. Anchor names that differ only in case may not appear in the same document.
    #while (grep {lc $_ eq lc $name} keys %{$args{anchors}}) {
    #    # FIXME (in caller sub): to avoid the grep above, the $args{anchors} hash
    #    # should have as key the lowercased anchor name, and as value its actual value (instead of '1')
    #    $name = $word_name . "_$anch_num";
    #    $anch_num++;
    #}

    return $text;
}


my $toc = HTML::Toc->new();
my $tocInsertor = HTML::TocInsertor->new();
my $output;

$toc->setOptions({
    #header => '',  # by default, \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
    insertionPoint => 'replace {{toc}}',
    doLinkToId => 0,
    levelToToc => "[1-6]",
    templateAnchorName => \&assembleAnchorName,
});



#--- Basic functionality --------------------------------------------
my $content = <<HTML;
{{toc}}<br />
<h1>Chapter 1</h1>
Some text here
( run in 1.160 second using v1.01-cache-2.11-cpan-e1769b4cff6 )