App-DocKnot

 view release on metacpan or  search on metacpan

lib/App/DocKnot/Spin/Text.pm  view on Meta::CPAN

# Convert some particular text formats into HTML.
#
# This program is an ad hoc set of heuristics and tricks, attempting to
# convert a few text file formats that I commonly use into reasonable HTML.
# General text to XHTML conversion is impossible due to the wildly differing
# formats used by people when writing text, so this module doesn't try to
# solve the general problem.  It's good enough to turn the FAQs I maintain
# into HTML documents, which is all that I need of it.
#
# SPDX-License-Identifier: MIT

##############################################################################
# Modules and declarations
##############################################################################

package App::DocKnot::Spin::Text v8.0.1;

use 5.024;
use autodie;
use warnings FATAL => 'utf8';

use vars qw($INDENT @INDENT);

use App::DocKnot;
use App::DocKnot::Util qw(print_fh);
use Path::Tiny qw(path);
use POSIX qw(strftime);

# English month names, January first.  This is a package variable (our) so
# that it can be replaced with the month names you want to use, if you don't
# want English.  NOTE(review): presumably indexed by a zero-based month number
# when formatting dates; the code that reads it is not visible here.
our @MONTHS = qw(January February March April May June July August September
                 October November December);

##############################################################################
# Utility functions
##############################################################################

# Link the entries of a table of contents to their sections.
#
# Every line that starts (after optional whitespace) with a section number
# made of digits and periods, followed by a period or closing parenthesis and
# then whitespace, has the remainder of the line wrapped in a link to the
# anchor "#S" plus that section number.  Trailing spaces and tabs at the end
# of the line are left outside of the link.
#
# $text - Text to format
#
# Returns: Text with the title on each numbered line turned into a link to
#          the section given by the number at the start of that line.
sub _format_contents {
    my ($text) = @_;

    # One table of contents entry: the numbered prefix, the title, and the
    # end of the line.  The title match is non-greedy so that it stops at the
    # first line ending.
    my $entry_regex = qr{
        ^
        (?<prefix> \s* (?<section> [\d.]+ ) [.\)] \s+ )
        (?<title>  .*? )
        (?<eol>    [ \t]* \n )
    }xms;

    return $text
      =~ s{$entry_regex}{$+{prefix}<a href="#S$+{section}">$+{title}</a>$+{eol}}gr;
}

# Turns *some text* into <strong>some text</strong>, while trying to be
# careful to avoid other uses of asterisks such as wildcards.
#
# The opening asterisk must be at the start of a line or preceded by
# whitespace, and must be followed by a word character.  The closing asterisk
# must follow a non-whitespace character and be followed by whitespace, one of
# the punctuation characters ",.!?;", or the end of the text.  The marked text
# must be at least two characters long and may span lines.
#
# $text - Text to format
#
# Returns: Text with bold replaced with HTML markup.
sub _format_bold {
    my ($text) = @_;

    # The character after the closing asterisk is only checked with a
    # lookahead, not consumed.  Otherwise, the single space between two
    # adjacent bold phrases (*one thing* *another thing*) would be eaten by
    # the first match, leaving nothing to satisfy the leading whitespace
    # requirement of the second phrase, which would then stay unformatted.
    # The \z alternative handles bold text at the very end of the input when
    # there is no trailing newline.
    $text =~ s{
        (^|\s) [*] ( \w .*? \S ) [*] (?= [,.!?;\s] | \z )
    }{$1<strong>$2</strong>}xmsg;
    return $text;
}

# Format a link.  All whitespace in the link is treated as insignificant.
#
# The link target is the given link passed through _unescape and then _smash.
# The visible anchor text is the link exactly as given, except that a leading
# "mailto:" or "news:" is dropped.
#
# $link - Link to format
#
# Returns: Link formatted as an HTML link wrapped in escaped angle brackets,
#          with the link anchor being the same as the link with any mailto: or
#          news: removed.
sub _format_url {
    my ($link) = @_;

    # Derive the anchor text from the untouched input before building the
    # target, so that the anchor never depends on the helper calls.
    my $anchor = $link =~ s{ \A (?: mailto | news ): }{}xmsr;
    my $href   = _smash(_unescape($link));

    return qq{&lt;<a href="$href">$anchor</a>&gt;};
}

# Finds URLs written as <scheme:...> or <URL:scheme:...> and replaces each one
# with the HTML link produced by _format_url.  Assumes that < and > have
# already been escaped to &lt; and &gt;.  A URL must start with a scheme of at
# least two lowercase letters followed by a colon, and may span lines.
#
# $text - Text to format
#
# Returns: Text with any embedded links turned into proper HTML links.
sub _format_urls {
    my ($text) = @_;

    # The optional URL: prefix is matched but not captured, so only the URL
    # itself is handed to _format_url.
    my $bracketed_url = qr{ &lt; (?:URL:)? ( [a-z]{2,} : .+? ) &gt; }xms;

    return $text =~ s{$bracketed_url}{ _format_url($1) }ger;
}

# Blank out the bullet at the start of a paragraph.
#
# A bullet is a single "-", "*", or "o" that follows any leading whitespace
# and is itself followed by whitespace.  It is overwritten with a space so
# that the indentation of the remaining text is unchanged.  A string that does
# not start with a bullet is returned as is.
#
# $string - Input string
#
# Returns: String with the bullet replaced with a space.
sub _remove_bullet {
    my ($string) = @_;

    if ($string =~ m{ \A (\s*) [-*o] \s }xms) {
        # The bullet is the one character right after the leading whitespace
        # captured in $1.
        my $bullet_offset = length($1);
        substr($string, $bullet_offset, 1, q{ });
    }
    return $string;
}

# Removes an initial number on a paragraph, replacing it with spaces.
#
# $string - Input string
#
# Returns: String with the number replaced with spaces.
sub _remove_number {
    my ($string) = @_;
    $string =~ s{
        \A (\s*) (\d\d?[.\)]) (\s)
    }{
        $1 . q{ } x length($2) . $3
    }xmse;

lib/App/DocKnot/Spin/Text.pm  view on Meta::CPAN

        # Check for a heading.  We distinguish between level 2 headings and
        # level 3 headings as follows: The first heading we encounter is
        # assumed to be a level 2 heading, and any further headers at that
        # same indentation level are also level 2 headings.  If we detect any
        # other headings at a greater indent, they're marked as level 3.
        if ($self->_is_heading ($_)) {
            s/^\s+//;
            $self->{contents} = /\bcontents\b/i;
            my $h;
            if (defined $self->{h2}) {
                if ($indent <= $self->{h2}) { $h = \&h2 }
                else                        { $h = \&h3 }
            } else {
                $self->{h2} = $indent;
                $h = \&h2;
            }
            $_ = _remove_rule($_);
            if (/^([\d.]+)[.\)]\s/) {
                my $anchor = qq(a name="S$1" id="S$1");
                $self->_output(start(), $h->(container($anchor, $_)));
            } else {
                $self->_output(start(), $h->($_));
            }
            $INDENT = $self->{baseline};
            next;
        }

        # A sudden change to an indentation of 0 when that's less than our
        # indentation baseline is also a sign of literal text.
        if ($INDENT && $indent == 0 && $INDENT > 0 && defined($self->{baseline})
            && $self->{baseline} > 0) {
            $self->_output(pre(strip_indent($_, $INDENT)));
            $self->{pre} = 1;
            next;
        }

        # We're dealing with a normal paragraph of some sort, so go ahead and
        # turn URLs into links.  Check whether the paragraph is broken first,
        # though, and stash that information, since turning URLs into links
        # can artificially lengthen lines.
        my $broken = _is_broken $_;
        $_ = _format_urls($_);

        # Check to see if we're in a contents section and this paragraph looks
        # like a table of contents.  If so, turn all of the section headings
        # into links.
        if ($self->{contents} && _is_contents($_)) {
            $_ = _format_contents($_)
        }

        # Check for paragraphs that are entirely bulleted lines, and turn
        # them into unordered lists without <p> tags.
        if (_is_allbullet $_) {
            my $last;
            my @lines = split (/\n/, $_);
            for (@lines) {
                next unless /\S/;
                if (_is_bullet $_) {
                    if (defined $last) {
                        $self->_output(start($INDENT, 'ul'));
                        $self->_output(li($INDENT, _format_bold($last)));
                    }
                    $last = _remove_bullet($_);
                    $INDENT = indent $last;
                } else {
                    $last .= "\n$_";
                }
            }
            if (defined $last) {
                $self->_output(start($INDENT, 'ul'));
                $self->_output(li($INDENT, _format_bold($last)));
            }
            next;
        }

        # Check for paragraphs that are entirely numbered lines, and turn them
        # into ordered lists without <p> tags.
        if (_is_allnumbered $_) {
            my @lines = split (/\n/, $_);
            for (@lines) {
                next unless /\S/;
                my ($number) = /^(\d+)/;
                $_ = _remove_number($_);
                $INDENT = indent $_;
                $self->_output(start($INDENT, 'ol'));
                $self->_output(li($INDENT, _format_bold($_), $number));
            }
            next;
        }

        # Check for bulleted paragraphs and turn them into lists.
        if (_is_bullet $_) {
            $_ = _remove_bullet($_);
            $INDENT = indent $_;
            $self->_output(start($INDENT, 'ul'));
            $self->_output(li($INDENT, p(_format_bold($_))));
            next;
        }

        # Check for paragraphs quoted with some character and turn them into
        # blockquotes provided they don't have inconsistent indentation.
        my $quote = _is_quoted ($_);
        if ($quote && !$broken) {
            $_ = _remove_prefix($_, $quote);
            $INDENT = indent $_;
            $self->_output(start($INDENT, 'blockquote', p(_format_bold($_))));
            next;
        }

        # Check for numbered paragraphs and turn them into lists.
        my $number = _is_numbered ($_);
        if (defined $number) {
            my $contents = _is_contents ($_);
            $_ = _remove_number($_);
            $INDENT = indent $_;
            s%(\n\s*\S)%<br />$1%g if ($broken || $contents);
            $self->_output(start($INDENT, 'ol'));
            $self->_output(li($INDENT, p(_format_bold($_)), $number));
            next;
        }

        # Check for things that look like description lists and handle them.
        # Note that we don't allow indented description lists, because they're
        # usually something we actually want to make <pre>.  This is another
        # fairly fragile heuristic.
        if (_is_description ($_) && defined $INDENT) {
            my (@title, $body);
            ($title[0], $body) = split ("\n", $_, 2);
            my ($space) = ($title[0] =~ /^(\s*)/);
            while ($body =~ /^$space\S/) {
                my $title;
                ($title, $body) = split ("\n", $body, 2);
                push (@title, $title);
            }
            if ($indent == $INDENT || indent ($body) == $INDENT) {
                @title = map { _format_bold($_) } @title;
                my $title = join ("<br />\n", @title) . "\n";
                $INDENT = indent $body;
                $body =~ s%(\n\s*\S)%<br />$1%g if _is_broken $body;
                $self->_output(start($indent, 'dl', dt($title)));
                $self->_output(start($INDENT, 'dd', p(_format_bold($body))));
                next;
            }
        }

        # If the paragraph has inconsistent indentation, we should output it
        # in <pre>.
        if (_is_offset $_) {
            $self->_output(pre(strip_indent($_, $INDENT)));
            $self->{pre} = 1;
            next;
        }

        # A sudden indentation change also means the paragraph should be
        # blockquoted.  We render broken blockquoted text in <pre>, which may
        # not be what's wanted for things like quotes of poetry... this is
        # probably worth looking at in more detail.
        if (defined $INDENT && $indent > $INDENT) {
            if ($broken || (lines ($_) == 1 && !_is_sentence $_)) {
                $self->_output(pre(strip_indent($_, $INDENT)));
                $self->{pre} = 1;
            } else {
                $INDENT = $indent;
                my $paragraph = p(_format_bold($_));
                $self->_output(start($INDENT, 'blockquote', $paragraph));
            }
            next;
        }

        # Close multiparagraph structure if we've outdented again.
        if ($INDENT && $indent < $INDENT) { $self->_output(start($indent)) }

        # Looks like a normal paragraph.  Establish our indentation baseline
        # if we haven't already.
        if (!defined $self->{baseline} && !$INDENT) {
            $self->{baseline} = $indent;
        }
        $INDENT = $indent;
        s%(\n\s*\S)%<br />$1%g if $broken;
        $self->_output(p(_format_bold($_)));

    } continue {
        $self->{whitespace} = $space;
    }

    # All done.  Print out our closing tags.
    $self->_output(start(-1));
    if ($self->{sitemap} && defined($self->{output}) && defined($out_path)) {
        my $page = $out_path->relative($self->{output});
        my @navbar = $self->{sitemap}->navbar($page);
        if (@navbar) {
            $self->_output("\n", @navbar);
        }
    }
    $self->_output("\n</body>\n</html>\n");
}

##############################################################################
# Public interface
##############################################################################

# Create a new text to HTML converter.
#
# Only the recognized keys listed below are copied from the arguments into the
# new object.  Keys that are not supplied are stored as undef, and any other
# keys in the arguments are ignored.
#
# $args_ref - Anonymous hash of arguments with the following keys:
#   output    - Root of the output tree (for sitemap information)
#   modified  - Whether to get last-modified date from source file
#   sitemap   - App::DocKnot::Spin::Sitemap object
#   style     - URL to the style sheet
#   title     - Document title
#
# Returns: Newly created object
sub new {
    my ($class, $args_ref) = @_;

    # Copy each supported setting into the object under the same name.
    my @settings = qw(output modified sitemap style title);
    my %object   = map { $_ => $args_ref->{$_} } @settings;

    return bless(\%object, $class);
}

# Convert text to HTML.
#
# $input  - Input file (if not given, assumes standard input)
# $output - Output file (if not given, assumes standard output)
sub spin_text_file {
    my ($self, $input, $output) = @_;
    my ($in_fh, $out_fh);

    # Figure out what file we're going to be processing.  We can function as a
    # filter if so desired.
    if (defined($input)) {
        $input = path($input)->realpath();
        $in_fh = $input->openr_utf8();
    } else {

lib/App/DocKnot/Spin/Text.pm  view on Meta::CPAN

their text to intuit document structure from it.  This is therefore a
converter that will translate documents written the way I write.  It may or
may not work for you.  The chances that it will work for you are directly
proportional to how much your writing looks like mine.

App::DocKnot::Spin::Text understands digest separators (lines of exactly
thirty hyphens, from the minimal digest standard) and will treat a C<Subject>
header immediately after them as a section header.  Beyond that, headings must
either be outdented, underlined on the following line, or in all caps to be
recognized as section headers.  (Outdenting means that the regular text is
indented by a few spaces, but headers start in column 0, or at least in a
column farther to the left than the regular text.)

Section headers that begin with numbers (with any number of periods) will be
given C<< <a id> >> tags containing that number prepended with C<S>.  As a
special case of the parsing, any section with a header containing C<contents>
will have lines beginning with numbers turned into links to the appropriate
C<< <a id> >> tags in the same document.  You can use this to turn the table
of contents of your minimal digest format FAQ into a real table of contents
with links in the HTML version.

Text with embedded whitespace more than a single space or a couple of spaces
at a sentence boundary or after a colon (and any text with literal tabs) will
be wrapped in C<< <pre> >> tags.  So will any indented text that doesn't look
like English paragraphs.  URLs surrounded by C<< <...> >> or C<< <URL:...> >>
will be turned into links.  Other URLs will not be turned into links, nor is
any effort made to turn random body text into links because it happens to look
like a link.

Bullet lists and numbered lists will be turned into the appropriate HTML
structures.  Some attempt is also made to recognize description lists, but
App::DocKnot::Spin::Text was written by someone who writes a lot of technical
documentation and therefore tends to prefer C<< <pre> >> if unsure whether
something is a description list or preformatted text.  Description lists are
therefore only going to work if the description titles aren't indented
relative to the surrounding text.

Regular indented paragraphs or paragraphs quoted with a consistent
non-alphanumeric quote character are recognized and turned into HTML block
quotes.

It's worthwhile paying attention to the headers at the top of your document so
that App::DocKnot::Spin::Text can get a few things right.  If you use RCS or
CVS, put the RCS C<Id> keyword as the first line of your document; it will be
stripped out of the resulting output and App::DocKnot::Spin::Text will use it
to determine the document revision.  This should be followed by regular
message headers and news.answers subheaders if the document is an actual FAQ,
and App::DocKnot::Spin::Text will use the C<From> and C<Subject> headers to
figure out a title and headings to use.  As a special case, an HTML-title
header in the subheaders will override any other title that
App::DocKnot::Spin::Text thinks it should use for the document.

App::DocKnot::Spin::Text expects your document to have an C<< <h1> >> title,
and will add one from the Subject header if it doesn't find one.  It will also
add subheaders (C<class="subheading">) giving the author (from the C<From>
header) and the last modified time and revision (from the RCS C<Id> string) if
there are no subheadings already.  If there's a subheading that contains RCS
identifiers, it will be replaced by a nicely formatted heading generated from
the RCS C<Id> information in the HTML output.

Text marked as C<*bold*> using the standard asterisk notation will be
surrounded by C<< <strong> >> tags, if the asterisks appear to be marking bold
text rather than serving as wildcards or some other function.

App::DocKnot::Spin::Text produces output (at least in the absence of any
lurking bugs) which complies with the XHTML 1.0 Transitional standard.  The
input and output character set is assumed to be UTF-8.

=head1 CLASS METHODS

=over 4

=item new(ARGS)

Create a new App::DocKnot::Spin::Text object.  A single converter object can
be reused to convert multiple files provided that they have the same options.
ARGS should be a hash reference with one or more of the following keys, all of
which are optional:

=over 4

=item output

The path to the root of the output tree when converting a tree of files.  This
will be used to calculate relative path names for generating inter-page links
using the provided C<sitemap> argument.  If C<sitemap> is given, this option
should also always be given.

=item modified

Add a last modified subheader to the document.  This will always be done if an
RCS C<Id> string is present in the input.  Otherwise, a last modified
subheader based on the last modification date of the input file will be added
if the input is a file and this option is set to a true value.  The default is
false.

=item sitemap

An App::DocKnot::Spin::Sitemap object.  This will be used to create inter-page
links.  For inter-page links, the C<output> argument must also be provided.

=item style

The URL to the style sheet to use.  The appropriate HTML will be added to the
C<< <head> >> section of the resulting document.

=item title

The HTML page title to use.  This will also be used as the C<< <h1> >> heading
if the document doesn't contain one, but will not override a heading found in
the document (only the HTML C<< <title> >> element).

=back

=back

=head1 INSTANCE METHODS

=over 4

=item spin_text_file([INPUT[, OUTPUT]])



( run in 1.296 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )