App-DocKnot
view release on metacpan or search on metacpan
lib/App/DocKnot/Spin/Text.pm view on Meta::CPAN
use 5.024;
use autodie;
use warnings FATAL => 'utf8';
use vars qw($INDENT @INDENT);
use App::DocKnot;
use App::DocKnot::Util qw(print_fh);
use Path::Tiny qw(path);
use POSIX qw(strftime);
# Replace with the month names you want to use, if you don't want English.
our @MONTHS = qw(January February March April May June July August September
October November December);
##############################################################################
# Utility functions
##############################################################################
# Turns section numbers at the beginning of lines in a paragraph into links.
#
# $text - Text to format
#
# Returns: Text formatted as links to section numbers given by the numbers at
# the start of each line.
sub _format_contents {
my ($text) = @_;
$text =~ s{
^
(\s* ([\d.]+) [.\)] \s+ )
(.*?)
([ \t]*\n)
}{$1<a href="#S$2">$3</a>$4}xmsg;
return $text;
}
# Turns *some text* into <strong>some text</strong>, while trying to be
# careful to avoid other uses of wildcards.
#
# $string - Text to format
#
# Returns: Text with bold replaced with HTML markup.
sub _format_bold {
my ($text) = @_;
$text =~ s{
(^|\s) [*] ( \w .*? \S ) [*] ([,.!?;\s])
}{$1<strong>$2</strong>$3}xmsg;
return $text;
}
# Format a link. All whitespace in the link is treated as insignficant.
#
# $link - Link to format
#
# Returns: Link formatted as an HTML link, with the link anchor being the same
# as the link with any mailto: or news: removed.
sub _format_url {
my ($link) = @_;
my $text = $link;
$link = _smash(_unescape($link));
$text =~ s{ \A (?: mailto | news ): }{}xms;
return '<<a href="' . $link . '">' . $text . '</a>>';
}
# Looks for URLs in <> or <URL:...> form and wraps a link around it. Assumes
# that < and > have already been escaped.
#
# $text - Text to format
#
# Returns: Text with any embedded links turned into proper HTML links.
sub _format_urls {
my ($text) = @_;
$text =~ s{
< (?:URL:)? ([a-z]{2,}:.+?) >
}{
_format_url($1)
}xmsge;
return $text;
}
# Remove an initial bullet from a paragraph, replacing it with a space.
#
# $string - Input string
#
# Returns: String with the bullet replaced with spaces.
sub _remove_bullet {
my ($string) = @_;
$string =~ s{ \A (\s*) [-*o] (\s) }{$1 $2}xms;
return $string;
}
# Removes an initial number on a paragraph, replacing it with spaces.
#
# $string - Input string
#
# Returns: String with the number replaced with spaces.
sub _remove_number {
my ($string) = @_;
$string =~ s{
\A (\s*) (\d\d?[.\)]) (\s)
}{
$1 . q{ } x length($2) . $3
}xmse;
return $string;
}
# Remove a constant prefix at the beginning of each line of a paragraph.
#
# $string - Input string
#
# Returns: String with the prefix removed from each line.
sub _remove_prefix {
my ($string, $prefix) = @_;
$string =~ s{
( (?:\A|\n) \s* ) ( \Q$prefix\E \s+ )
}{
$1 . q{ } x length($2)
}xmsge;
return $string;
}
# Remove ASCII underlining from a section heading.
#
# $string - Input string
#
# Returns: String with the underlining removed.
sub _remove_rule {
my ($string) = @_;
$string =~ s{ \A [-=~]+ \n }{}xms;
return $string;
}
# Remove all whitespace in a string.
#
# $string - Input string
#
# Returns: String with all whitespace removed.
sub _smash {
my ($string) = @_;
$string =~ s{ \s }{}xmsg;
return $string;
}
# Unescape &, <, and > characters.
#
# $text - Text to remove HTML escapes from.
#
# Returns: Text with HTML escapes changed back to their regular characters.
sub _unescape {
my ($text) = @_;
$text =~ s{ > }{>}xmsg;
$text =~ s{ < }{<}xmsg;
$text =~ s{ & }{&}xmsg;
return $text;
}
# Escapes &, <, and > characters found in a string.
sub escape { local $_ = shift; s/&/&/g; s/</</g; s/>/>/g; $_ }
# Returns the length of the indentation of a line or paragraph.
sub indent { $_[0] =~ /^(\s*)/; length $1 }
# Returns the number of lines in a paragraph, not counting trailing blanks.
sub lines { local $_ = shift; s/\s+$/\n/; tr/\n// }
# Returns a nicely formatted "Last modified" string from an RCS/CVS Id.
sub modified_id {
my $id = shift;
my ($version, $date) = (split (' ', $id))[2,3];
my ($year, $month, $day) = split (m%[/-]%, $date);
$day =~ s/^0//;
my $revision = ($version =~ /\./) ? " (revision $version)" : '';
'Last modified '. $MONTHS[$month - 1] . ' ' . $day . ', ' . $year
. $revision;
}
# The same, but from a UNIX timestamp.
sub modified_timestamp {
my $timestamp = shift;
my ($year, $month, $day) = (localtime $timestamp)[5, 4, 3];
$year += 1900;
'Last modified ' . $MONTHS[$month] . ' ' . $day . ', ' . $year;
}
# Strip a number of characters of indentation from a line that's given by the
# second argument, returning the result. Used to strip leading indentation
# off of <pre> text so that it isn't indented excessively just because in the
# text version it had to be indented relative to the surrounding text.
sub strip_indent {
local $_ = shift;
my $indent = shift;
if (defined $indent && $indent > 0) {
s/^ {$indent}//gm;
}
$_;
}
# Replace tabs with spaces.
sub untabify {
local $_ = shift;
1 while s/^(.*?)(\t+)/' ' x (length ($2) * 8 - length ($1) % 8)/me;
$_;
}
# Remove whitespace at the beginning and end of a string.
sub whitechomp { local $_ = shift; s/^\s+//; s/\s+$//; $_ }
##############################################################################
# Classification functions
##############################################################################
# Whether a paragram is composed entirely of bullet items. Take some care to
# avoid returning true for paragraphs that consist of a single bullet entry,
# since we want to handle those separately to wrap them in paragraph tags.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise
lib/App/DocKnot/Spin/Text.pm view on Meta::CPAN
# multiple titles.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise
sub _is_description {
my ($paragraph) = @_;
return if $paragraph !~ m{
\A
(\s*) \S \N* \n # title (1 is indent)
(?: \1 \S \N* \n)* # possibly more than one
(\s+) \S \N* \n # first line of description (2 is indent)
(?: \2 \S \N* \n)* # subsequent lines
\s* \z
}xms;
return length($1) < length($2);
}
# Whether a line is a digest divider.
#
# $line - Line to classify
#
# Returns: True if so, false otherwise
sub _is_divider {
my ($line) = @_;
return $line =~ m{ \A -{30} \s* \z }xms;
}
# Whether a line is an RFC 2822 header.
#
# $line - Line to classify
#
# Returns: True if so, false otherwise
sub _is_header {
my ($line) = @_;
return if $line =~ m{ \A [\w-]+: \s+ \N }xms;
}
# Whether a paragraph is a heading. This is all about heuristics and guesses,
# and there are a number of other things we could confuse for headings, so we
# have to be careful.
#
# If it's a single line and outdented from the baseline, it's probably a
# heading.
#
# If it's at the baseline, check to see if it looks like a heading and either
# it's in all caps or there is a rule underneath it. If we haven't seen a
# baseline, be more accepting about headers.
#
# If we're inside a contents block, be even more careful and disallow numbered
# things that look like a heading unless they're outdented.
#
# Unlike most of the classification functions, this is a regular method, since
# it needs access to the parsing state.
#
# $paragraph - Paragraph to classify
#
# Returns: True if a heading, false otherwise
sub _is_heading {
my ($self, $paragraph) = @_;
$paragraph = _unescape($paragraph);
my $indent = indent($paragraph);
my $nobase = !defined($self->{baseline});
my $outdented = defined($self->{baseline}) && $indent < $self->{baseline};
# Numbered lines inside the contents section are definitely not headings.
my $numbered = $paragraph =~ m{ \A [\d.]+[.\)] \s }xms;
return if !$outdented && $self->{contents} && $numbered;
# Outdented single lines are headings as long as they're either short or
# contain at least two words.
if ($outdented && lines($paragraph) == 1) {
return 1 if $paragraph =~ m{ \S \s \S }xms;
return 1 if length($paragraph) < 30;
}
# Indented lines are never headings.
return if defined($INDENT) && $indent > $INDENT;
# Lines of at most 31 characters ending in a word character or closing
# quote or paren are headings if they're underlined.
return 1 if $paragraph =~ m{
\A \s*
[ \w\"\(\),:./&-]{0,30} [\w\"\)] \s* \n
[-=~]+ \s*
\z
}xms;
# All-uppercase lines of at most 31 characters ending in an uppercase
# character, digit, or closing quote or paren are headings.
return 1 if $paragraph =~ m{
\A \s*
[ [:upper:]\d\"\(\),:./&-]{0,30} [[:upper:]\d\"\)]
\s* \n
\z
}xms;
# If there is no baseline, assume single lines of at most 34 characters
# with no unexpected characters are headings.
return $nobase && $paragraph =~ m{
\A \s*
[ \w\"\(\),:./&-]{0,33} [\w\"\)]
\s* \n
\z
}xms;
}
# Whether a line is an RCS/CVS Id string that has been expanded.
#
# $line - Line to classify
#
# Returns: True if so, false otherise
sub _is_id {
my ($line) = @_;
return $line =~ m{ \A \s* [\$]Id: \N+ [\$] \s* \z }xms;
}
# Whether a paragraph should be a literal paragraph, decided based on whether
# it has internal whitespace.
#
# $paragraph - Paragraph to classify
lib/App/DocKnot/Spin/Text.pm view on Meta::CPAN
$line = $self->_next_line();
if (_is_centered($line)) {
$header{heading} = whitechomp($line);
if (!defined($header{title})) {
$header{title} = $header{heading};
if (_is_allcaps($header{title})) {
$header{title} =~ s{ \b ([A-Z]+) \b }{\L\u$1}xmsg;
}
}
$self->_skip_blank_lines_and_rules();
} else {
$self->_buffer_line($line);
$header{heading} = $header{title} // $self->{title};
}
# Return the parsed header.
return \%header;
}
# Parse the subheaders of a text document and generate the subheaders for the
# output document. The author information from the headers will be included,
# as will the last modified date if configured. Existing subheadings that
# look like they're just Revision or Date strings will be replaced by a
# nicely-formatted string.
#
# $header_ref - Main headers of the text document
#
# Returns: List of lists of subheaders to put at the top of the output
# document
sub _parse_subheaders {
my ($self, $header_ref) = @_;
my (@subheaders, $modified);
# Generate a last modified date if we have an RCS/CVS Id string or if a
# last modified subheader from the file modification time was requested.
# We'll set $modified back to undef if we push it into the subheaders at
# any point; otherwise, we'll add it at the end.
if ($header_ref->{id}) {
$modified = modified_id($header_ref->{id});
} elsif ($self->{modified} && defined($self->{in_path})) {
$modified = modified_timestamp($self->{in_path}->stat()->[9]);
}
# Parse subheaders. The first must be centered; after that, assume
# everything is a subheading until a blank line.
my $line;
while (defined($line = $self->_next_line())) {
next if _is_rule($line);
last if $line =~ m{ \A \s* \z }xms;
# For cases other than a rule or blank line, we have to either be in a
# subheading or the line must be centered.
last if !(@subheaders || _is_centered($line));
# A subheading to add. Replace Revision and Date keywords with our
# modified timestamp if we have one.
if ($modified && $line =~ m{ [\$] (?: Revision | Date ) }xms) {
push(@subheaders, $modified);
$modified = undef;
} else {
push(@subheaders, _format_urls(escape(whitechomp($line))));
}
}
$self->_buffer_line($line);
$self->_skip_blank_lines_and_rules();
# If there is no subheading, but we have an author from the file headings,
# create a subheading with that information.
if (!@subheaders && $header_ref->{author}) {
push(@subheaders, escape($header_ref->{author}));
if ($header_ref->{original}) {
push(
@subheaders,
'(originally by ' . escape($header_ref->{original}) . ')',
)
}
}
# If we have modification information and haven't output it yet, add that
# to the subheading.
if (defined($modified)) {
push(@subheaders, $modified);
}
# Return what we have.
return @subheaders;
}
##############################################################################
# Document conversion
##############################################################################
# Convert a document from text to HTML.
#
# $in_fh - Input file handle
# $in_path - Input path
# $out_fh - Output file handle
# $out_path - Output path
sub _convert_document {
my ($self, $in_fh, $in_path, $out_fh, $out_path) = @_;
# Initialize object state for a new document.
#<<<
$self->{baseline} = undef; # Baseline indentation of text
$self->{buffer} = undef; # Buffered input line not yet converted
$self->{contents} = 0; # Whether inside a contents section
$self->{in_fh} = $in_fh; # Input file handle
$self->{in_path} = $in_path; # Path to input file
$self->{h2} = undef; # Indentation level for h2 headings
$self->{out_fh} = $out_fh; # Output file handle
$self->{out_path} = $out_path; # Path to the output file
$self->{pre} = 0; # Whether inside a preformatted block
$self->{whitespace} = q{}; # Pending whitespace
#>>>
# Parse the document headers.
my $header_ref = $self->_parse_headers();
# Generate the header of the HTML file.
$self->_output_header($header_ref);
# Open the body of the document, print the navigation links if possible,
# and print out the heading if we found one.
$self->_output("<body>\n\n");
if ($self->{sitemap} && defined($self->{output}) && defined($out_path)) {
my $page = $out_path->relative($self->{output});
my @navbar = $self->{sitemap}->navbar($page);
if (@navbar) {
$self->_output(@navbar, "\n");
}
}
if ($header_ref->{heading}) {
$self->_output(h1($header_ref->{heading}), "\n");
}
# Parse and output the subheaders, if any.
my @subheaders = $self->_parse_subheaders($header_ref);
if (@subheaders) {
$self->_output(qq(<p class="subheading">\n));
$self->_output(q{ }, join("<br />\n ", @subheaders), "\n</p>\n\n");
}
# Scan the actual body of the text. We don't use paragraph mode, since it
# doesn't work with blank lines that contain whitespace; instead, we
# cobble together our own paragraph mode that does. Note that $_ already
# has a non-blank line of input coming into this loop.
my $space;
while (defined($_ = $self->_next_paragraph())) {
last if _is_signature($_);
# If we just hit a digest divider, the next thing will likely be a
# Subject: line that we want to turn into a section header. Digest
# section titles are always level 2 headers currently.
if (_is_divider $_) {
$self->{pre} = 0;
$self->_output(start(-1));
undef $INDENT;
($self->{whitespace}) = /\n(\s*)$/;
$_ = $self->_next_paragraph();
s/\n(\s*)$/\n/;
$space = $1;
if (s/^Subject:\s+//) {
$self->{contents} = /\bcontents\b/i;
$_ = escape $_;
if (/^([\d.]+)[.\)]\s/) {
$self->_output(
h2(container(qq(a name="S$1" id="S$1"), $_))
);
} else {
$self->_output(h2($_));
}
next;
}
}
# Treat lines of dash-type characters as rules.
if (_is_rule $_) {
$self->{pre} = 0;
($space) = /\n(\s*)$/;
$self->_output(start(-1), "<hr />\n");
undef $INDENT;
next
}
# Everything else needs to have special characters escaped. We don't
# do this earlier because if we want to allow < and > in rules, the
# escaping would make our lives miserable.
$_ = escape $_;
# Do this before untabification and stashing of trailing whitespace,
# but after escaping. Check to see if this paragraph looks like
# literal text. If so, we wrap it in <pre> and output it as is. As a
# special exception to our normal paragraph handling, this paragraph
# doesn't end until we find a literal blank line; this hack lets full
# diffs be included in a FAQ without confusing the parser.
if (_is_literal $_) {
if (/\n[ \t]+$/) { $_ .= $self->_next_paragraph(1) }
$self->_output(pre(strip_indent($_, $INDENT)));
s/\n(\n\s*)$/\n/;
$space = $1;
$self->{pre} = 1;
next;
}
# Not literal text, so untabify it and stash whitespace.
$_ = untabify $_;
s/\n(\s*)$/\n/;
$space = $1;
my $indent = indent $_;
# If the paragraph has inconsistent indentation, or is indented
# relative to the baseline *and* the last paragraph we emitted was
# enclosed in <pre>, assume that this paragraph belongs in <pre> as
# well.
if ($self->{pre}) {
if (_is_offset ($_) || (defined $INDENT && $indent > $INDENT)) {
$self->_output(pre(strip_indent($_, $INDENT)));
next;
} else {
$self->{pre} = 0;
}
}
# Check for a heading. We distinguish between level 2 headings and
# level 3 headings as follows: The first heading we encounter is
# assumed to be a level 2 heading, and any further headers at that
# same indentation level are also level 2 headings. If we detect any
# other headings at a greater indent, they're marked as level 3.
if ($self->_is_heading ($_)) {
s/^\s+//;
$self->{contents} = /\bcontents\b/i;
my $h;
if (defined $self->{h2}) {
if ($indent <= $self->{h2}) { $h = \&h2 }
else { $h = \&h3 }
} else {
$self->{h2} = $indent;
$h = \&h2;
}
$_ = _remove_rule($_);
if (/^([\d.]+)[.\)]\s/) {
my $anchor = qq(a name="S$1" id="S$1");
$self->_output(start(), $h->(container($anchor, $_)));
} else {
$self->_output(start(), $h->($_));
}
$INDENT = $self->{baseline};
next;
( run in 1.180 second using v1.01-cache-2.11-cpan-5837b0d9d2c )