HTML-Toc
view release on metacpan or search on metacpan
t/anchors.t view on Meta::CPAN
#!/usr/bin/perl -w
use strict;
use HTML::Toc;
use HTML::TocInsertor;
use HTML::Entities;
use Encode;
use Test::More tests => 7;
use Test::Differences;
# http://search.cpan.org/dist/HTML-Toc/Toc.pod#templateAnchorName
sub assembleAnchorName() {
my ($aFile, $aGroupId, $aLevel, $aNode, $text, $children) = @_;
if ($text !~ /^\s*$/) {
# generate a SEO-friendly anchor right from the token content
# The allowed character set is limited first by the URI specification for fragments, http://tools.ietf.org/html/rfc3986#section-2: characters
# then by the limitations of the values of 'id' and 'name' attributes: http://www.w3.org/TR/REC-html40/types.html#type-name
# Eventually, the only punctuation allowed in id values is [_.:-]
# Unicode characters with code points > 0x7E (e.g. Chinese characters) are allowed (test "<h1 id="è¡æ¿åºå">header</h1>" at http://validator.w3.org/#validate_by_input+with_options), except for smart quotes (!), see http://www.w3.org/Searc...
# However, that contradicts the HTML 4.01 spec: "Anchor names should be restricted to ASCII characters." - http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
# ...and the [A-Za-z] class of letters mentioned at http://www.w3.org/TR/REC-html40/types.html#type-name
# Finally, note that pod2html fails miserably to generate XHTML-compliant anchor links. See http://validator.w3.org/check?uri=http%3A%2F%2Fsearch.cpan.org%2Fdist%2FCatalyst-Runtime%2Flib%2FCatalyst%2FRequest.pm&charset=(detect+automatically)&...
$text =~ s/\s/_/g;
decode_entities($text); # we need to replace [#&;] only when they are NOT part of an HTML entity. decode_entities saves us from crafting a nasty regexp
$text = encode('utf-8', $text); # convert to UTF-8 because we need to output the UTF-8 bytes
$text =~ s/([^A-Za-z0-9_:.-])/sprintf('.%02X', ord($1))/eg; # MediaWiki also uses the period, see http://en.wikipedia.org/wiki/Hierarchies#Ethics.2C_behavioral_psychology.2C_philosophies_of_identity
$text = 'L'.$text if $text =~ /\A\W/; # "ID and NAME tokens must begin with a letter ([A-Za-z])" -- http://www.w3.org/TR/html4/types.html#type-name
}
$text = 'id' if $text eq '';
# check if the anchor already exists; if so, add a number
# NOTE: there is no way currently to do this easily in HTML-Toc-1.10.
#my $anch_num = 1;
#my $word_name = $name;
## Reference: http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
## Anchor names must be unique within a document. Anchor names that differ only in case may not appear in the same document.
#while (grep {lc $_ eq lc $name} keys %{$args{anchors}}) {
# # FIXME (in caller sub): to avoid the grep above, the $args{anchors} hash
# # should have as key the lowercased anchor name, and as value its actual value (instead of '1')
# $name = $word_name . "_$anch_num";
# $anch_num++;
#}
return $text;
}
my $toc = HTML::Toc->new();
my $tocInsertor = HTML::TocInsertor->new();
my $output;
$toc->setOptions({
#header => '', # by default, \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
insertionPoint => 'replace {{toc}}',
doLinkToId => 0,
levelToToc => "[1-6]",
templateAnchorName => \&assembleAnchorName,
});
#--- Basic functionality --------------------------------------------
my $content = <<HTML;
{{toc}}<br />
<h1>Chapter 1</h1>
Some text here
( run in 1.160 second using v1.01-cache-2.11-cpan-e1769b4cff6 )