App-DocKnot

 view release on metacpan or  search on metacpan

lib/App/DocKnot/Spin/Text.pm  view on Meta::CPAN


use 5.024;
use autodie;
use warnings FATAL => 'utf8';

use vars qw($INDENT @INDENT);

use App::DocKnot;
use App::DocKnot::Util qw(print_fh);
use Path::Tiny qw(path);
use POSIX qw(strftime);

# Replace with the month names you want to use, if you don't want English.
our @MONTHS = qw(January February March April May June July August September
                 October November December);

##############################################################################
# Utility functions
##############################################################################

# Turns section numbers at the beginning of lines in a paragraph into links.
#
# $text - Text to format
#
# Returns: Text formatted as links to section numbers given by the numbers at
#          the start of each line.
sub _format_contents {
    my ($text) = @_;
    $text =~ s{
        ^
        (\s* ([\d.]+) [.\)] \s+ )
        (.*?)
        ([ \t]*\n)
    }{$1<a href="#S$2">$3</a>$4}xmsg;
    return $text;
}

# Turns *some text* into <strong>some text</strong>, while trying to be
# careful to avoid other uses of wildcards.
#
# $string - Text to format
#
# Returns: Text with bold replaced with HTML markup.
sub _format_bold {
    my ($text) = @_;
    $text =~ s{
        (^|\s) [*] ( \w .*? \S ) [*] ([,.!?;\s])
    }{$1<strong>$2</strong>$3}xmsg;
    return $text;
}

# Format a link.  All whitespace in the link is treated as insignficant.
#
# $link - Link to format
#
# Returns: Link formatted as an HTML link, with the link anchor being the same
#          as the link with any mailto: or news: removed.
sub _format_url {
    my ($link) = @_;
    my $text = $link;
    $link = _smash(_unescape($link));
    $text =~ s{ \A (?: mailto | news ): }{}xms;
    return '&lt;<a href="' . $link . '">' . $text . '</a>&gt;';
}

# Looks for URLs in <> or <URL:...> form and wraps a link around it.  Assumes
# that < and > have already been escaped.
#
# $text - Text to format
#
# Returns: Text with any embedded links turned into proper HTML links.
sub _format_urls {
    my ($text) = @_;
    $text =~ s{
        &lt; (?:URL:)? ([a-z]{2,}:.+?) &gt;
    }{
        _format_url($1)
    }xmsge;
    return $text;
}

# Remove an initial bullet from a paragraph, replacing it with a space.
#
# $string - Input string
#
# Returns: String with the bullet replaced with spaces.
sub _remove_bullet {
    my ($string) = @_;
    $string =~ s{ \A (\s*) [-*o] (\s) }{$1 $2}xms;
    return $string;
}

# Removes an initial number on a paragraph, replacing it with spaces.
#
# $string - Input string
#
# Returns: String with the number replaced with spaces.
sub _remove_number {
    my ($string) = @_;
    $string =~ s{
        \A (\s*) (\d\d?[.\)]) (\s)
    }{
        $1 . q{ } x length($2) . $3
    }xmse;
    return $string;
}

# Remove a constant prefix at the beginning of each line of a paragraph.
#
# $string - Input string
#
# Returns: String with the prefix removed from each line.
sub _remove_prefix {
    my ($string, $prefix) = @_;
    $string =~ s{
        ( (?:\A|\n) \s* ) ( \Q$prefix\E \s+ )
    }{
        $1 . q{ } x length($2)
    }xmsge;
    return $string;
}

# Remove ASCII underlining from a section heading.
#
# $string - Input string
#
# Returns: String with the underlining removed.
sub _remove_rule {
    my ($string) = @_;
    $string =~ s{ \A [-=~]+ \n }{}xms;
    return $string;
}

# Remove all whitespace in a string.
#
# $string - Input string
#
# Returns: String with all whitespace removed.
sub _smash {
    my ($string) = @_;
    $string =~ s{ \s }{}xmsg;
    return $string;
}

# Unescape &, <, and > characters.
#
# $text - Text to remove HTML escapes from.
#
# Returns: Text with HTML escapes changed back to their regular characters.
sub _unescape {
    my ($text) = @_;
    $text =~ s{ &gt; }{>}xmsg;
    $text =~ s{ &lt; }{<}xmsg;
    $text =~ s{ &amp; }{&}xmsg;
    return $text;
}

# Escapes &, <, and > characters found in a string.
sub escape { local $_ = shift; s/&/&amp;/g; s/</&lt;/g; s/>/&gt;/g; $_ }

# Returns the length of the indentation of a line or paragraph.
sub indent { $_[0] =~ /^(\s*)/; length $1 }

# Returns the number of lines in a paragraph, not counting trailing blanks.
sub lines { local $_ = shift; s/\s+$/\n/; tr/\n// }

# Returns a nicely formatted "Last modified" string from an RCS/CVS Id.
sub modified_id {
    my $id = shift;
    my ($version, $date) = (split (' ', $id))[2,3];
    my ($year, $month, $day) = split (m%[/-]%, $date);
    $day =~ s/^0//;
    my $revision = ($version =~ /\./) ? " (revision $version)" : '';
    'Last modified '. $MONTHS[$month - 1] . ' ' . $day . ', ' . $year
        . $revision;
}

# The same, but from a UNIX timestamp.
sub modified_timestamp {
    my $timestamp = shift;
    my ($year, $month, $day) = (localtime $timestamp)[5, 4, 3];
    $year += 1900;
    'Last modified ' . $MONTHS[$month] . ' ' . $day . ', ' . $year;
}

# Strip a number of characters of indentation from a line that's given by the
# second argument, returning the result.  Used to strip leading indentation
# off of <pre> text so that it isn't indented excessively just because in the
# text version it had to be indented relative to the surrounding text.
sub strip_indent {
    local $_ = shift;
    my $indent = shift;
    if (defined $indent && $indent > 0) {
        s/^ {$indent}//gm;
    }
    $_;
}

# Replace tabs with spaces.
sub untabify {
    local $_ = shift;
    1 while s/^(.*?)(\t+)/' ' x (length ($2) * 8 - length ($1) % 8)/me;
    $_;
}

# Remove whitespace at the beginning and end of a string.
sub whitechomp { local $_ = shift; s/^\s+//; s/\s+$//; $_ }

##############################################################################
# Classification functions
##############################################################################

# Whether a paragram is composed entirely of bullet items.  Take some care to
# avoid returning true for paragraphs that consist of a single bullet entry,
# since we want to handle those separately to wrap them in paragraph tags.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise

lib/App/DocKnot/Spin/Text.pm  view on Meta::CPAN

# multiple titles.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise
sub _is_description {
    my ($paragraph) = @_;
    return if $paragraph !~ m{
        \A
        (\s*) \S \N* \n         # title (1 is indent)
        (?: \1 \S \N* \n)*      # possibly more than one
        (\s+) \S \N* \n         # first line of description (2 is indent)
        (?: \2 \S \N* \n)*      # subsequent lines
        \s* \z
    }xms;
    return length($1) < length($2);
}

# Whether a line is a digest divider.
#
# $line - Line to classify
#
# Returns: True if so, false otherwise
sub _is_divider {
    my ($line) = @_;
    return $line =~ m{ \A -{30} \s* \z }xms;
}

# Whether a line is an RFC 2822 header.
#
# $line - Line to classify
#
# Returns: True if so, false otherwise
sub _is_header {
    my ($line) = @_;
    return if $line =~ m{ \A [\w-]+: \s+ \N }xms;
}

# Whether a paragraph is a heading.  This is all about heuristics and guesses,
# and there are a number of other things we could confuse for headings, so we
# have to be careful.
#
# If it's a single line and outdented from the baseline, it's probably a
# heading.
#
# If it's at the baseline, check to see if it looks like a heading and either
# it's in all caps or there is a rule underneath it.  If we haven't seen a
# baseline, be more accepting about headers.
#
# If we're inside a contents block, be even more careful and disallow numbered
# things that look like a heading unless they're outdented.
#
# Unlike most of the classification functions, this is a regular method, since
# it needs access to the parsing state.
#
# $paragraph - Paragraph to classify
#
# Returns: True if a heading, false otherwise
sub _is_heading {
    my ($self, $paragraph) = @_;
    $paragraph = _unescape($paragraph);
    my $indent = indent($paragraph);
    my $nobase = !defined($self->{baseline});
    my $outdented = defined($self->{baseline}) && $indent < $self->{baseline};

    # Numbered lines inside the contents section are definitely not headings.
    my $numbered = $paragraph =~ m{ \A [\d.]+[.\)] \s }xms;
    return if !$outdented && $self->{contents} && $numbered;

    # Outdented single lines are headings as long as they're either short or
    # contain at least two words.
    if ($outdented && lines($paragraph) == 1) {
        return 1 if $paragraph =~ m{ \S \s \S }xms;
        return 1 if length($paragraph) < 30;
    }

    # Indented lines are never headings.
    return if defined($INDENT) && $indent > $INDENT;

    # Lines of at most 31 characters ending in a word character or closing
    # quote or paren are headings if they're underlined.
    return 1 if $paragraph =~ m{
        \A \s*
        [ \w\"\(\),:./&-]{0,30} [\w\"\)] \s* \n
        [-=~]+ \s*
        \z
    }xms;

    # All-uppercase lines of at most 31 characters ending in an uppercase
    # character, digit, or closing quote or paren are headings.
    return 1 if $paragraph =~ m{
        \A \s*
        [ [:upper:]\d\"\(\),:./&-]{0,30} [[:upper:]\d\"\)]
        \s* \n
        \z
    }xms;

    # If there is no baseline, assume single lines of at most 34 characters
    # with no unexpected characters are headings.
    return $nobase && $paragraph =~ m{
        \A \s*
        [ \w\"\(\),:./&-]{0,33} [\w\"\)]
        \s* \n
        \z
    }xms;
}

# Whether a line is an RCS/CVS Id string that has been expanded.
#
# $line - Line to classify
#
# Returns: True if so, false otherise
sub _is_id {
    my ($line) = @_;
    return $line =~ m{ \A \s* [\$]Id: \N+ [\$] \s* \z }xms;
}

# Whether a paragraph should be a literal paragraph, decided based on whether
# it has internal whitespace.
#
# $paragraph - Paragraph to classify

lib/App/DocKnot/Spin/Text.pm  view on Meta::CPAN

    $line = $self->_next_line();
    if (_is_centered($line)) {
        $header{heading} = whitechomp($line);
        if (!defined($header{title})) {
            $header{title} = $header{heading};
            if (_is_allcaps($header{title})) {
                $header{title} =~ s{ \b ([A-Z]+) \b }{\L\u$1}xmsg;
            }
        }
        $self->_skip_blank_lines_and_rules();
    } else {
        $self->_buffer_line($line);
        $header{heading} = $header{title} // $self->{title};
    }

    # Return the parsed header.
    return \%header;
}

# Parse the subheaders of a text document and generate the subheaders for the
# output document.  The author information from the headers will be included,
# as will the last modified date if configured.  Existing subheadings that
# look like they're just Revision or Date strings will be replaced by a
# nicely-formatted string.
#
# $header_ref - Main headers of the text document
#
# Returns: List of lists of subheaders to put at the top of the output
#          document
sub _parse_subheaders {
    my ($self, $header_ref) = @_;
    my (@subheaders, $modified);

    # Generate a last modified date if we have an RCS/CVS Id string or if a
    # last modified subheader from the file modification time was requested.
    # We'll set $modified back to undef if we push it into the subheaders at
    # any point; otherwise, we'll add it at the end.
    if ($header_ref->{id}) {
        $modified = modified_id($header_ref->{id});
    } elsif ($self->{modified} && defined($self->{in_path})) {
        $modified = modified_timestamp($self->{in_path}->stat()->[9]);
    }

    # Parse subheaders.  The first must be centered; after that, assume
    # everything is a subheading until a blank line.
    my $line;
    while (defined($line = $self->_next_line())) {
        next if _is_rule($line);
        last if $line =~ m{ \A \s* \z }xms;

        # For cases other than a rule or blank line, we have to either be in a
        # subheading or the line must be centered.
        last if !(@subheaders || _is_centered($line));

        # A subheading to add.  Replace Revision and Date keywords with our
        # modified timestamp if we have one.
        if ($modified && $line =~ m{ [\$] (?: Revision | Date ) }xms) {
            push(@subheaders, $modified);
            $modified = undef;
        } else {
            push(@subheaders, _format_urls(escape(whitechomp($line))));
        }
    }
    $self->_buffer_line($line);
    $self->_skip_blank_lines_and_rules();

    # If there is no subheading, but we have an author from the file headings,
    # create a subheading with that information.
    if (!@subheaders && $header_ref->{author}) {
        push(@subheaders, escape($header_ref->{author}));
        if ($header_ref->{original}) {
            push(
                @subheaders,
                '(originally by ' . escape($header_ref->{original}) . ')',
            )
        }
    }

    # If we have modification information and haven't output it yet, add that
    # to the subheading.
    if (defined($modified)) {
        push(@subheaders, $modified);
    }

    # Return what we have.
    return @subheaders;
}

##############################################################################
# Document conversion
##############################################################################

# Convert a document from text to HTML.
#
# $in_fh    - Input file handle
# $in_path  - Input path
# $out_fh   - Output file handle
# $out_path - Output path
sub _convert_document {
    my ($self, $in_fh, $in_path, $out_fh, $out_path) = @_;

    # Initialize object state for a new document.
    #<<<
    $self->{baseline}   = undef;      # Baseline indentation of text
    $self->{buffer}     = undef;      # Buffered input line not yet converted
    $self->{contents}   = 0;          # Whether inside a contents section
    $self->{in_fh}      = $in_fh;     # Input file handle
    $self->{in_path}    = $in_path;   # Path to input file
    $self->{h2}         = undef;      # Indentation level for h2 headings
    $self->{out_fh}     = $out_fh;    # Output file handle
    $self->{out_path}   = $out_path;  # Path to the output file
    $self->{pre}        = 0;          # Whether inside a preformatted block
    $self->{whitespace} = q{};        # Pending whitespace
    #>>>

    # Parse the document headers.
    my $header_ref = $self->_parse_headers();

    # Generate the header of the HTML file.
    $self->_output_header($header_ref);

    # Open the body of the document, print the navigation links if possible,
    # and print out the heading if we found one.
    $self->_output("<body>\n\n");
    if ($self->{sitemap} && defined($self->{output}) && defined($out_path)) {
        my $page = $out_path->relative($self->{output});
        my @navbar = $self->{sitemap}->navbar($page);
        if (@navbar) {
            $self->_output(@navbar, "\n");
        }
    }
    if ($header_ref->{heading}) {
        $self->_output(h1($header_ref->{heading}), "\n");
    }

    # Parse and output the subheaders, if any.
    my @subheaders = $self->_parse_subheaders($header_ref);
    if (@subheaders) {
        $self->_output(qq(<p class="subheading">\n));
        $self->_output(q{  }, join("<br />\n  ", @subheaders), "\n</p>\n\n");
    }

    # Scan the actual body of the text.  We don't use paragraph mode, since it
    # doesn't work with blank lines that contain whitespace; instead, we
    # cobble together our own paragraph mode that does.  Note that $_ already
    # has a non-blank line of input coming into this loop.
    my $space;
    while (defined($_ = $self->_next_paragraph())) {
        last if _is_signature($_);

        # If we just hit a digest divider, the next thing will likely be a
        # Subject: line that we want to turn into a section header.  Digest
        # section titles are always level 2 headers currently.
        if (_is_divider $_) {
            $self->{pre} = 0;
            $self->_output(start(-1));
            undef $INDENT;
            ($self->{whitespace}) = /\n(\s*)$/;
            $_ = $self->_next_paragraph();
            s/\n(\s*)$/\n/;
            $space = $1;
            if (s/^Subject:\s+//) {
                $self->{contents} = /\bcontents\b/i;
                $_ = escape $_;
                if (/^([\d.]+)[.\)]\s/) {
                    $self->_output(
                        h2(container(qq(a name="S$1" id="S$1"), $_))
                    );
                } else {
                    $self->_output(h2($_));
                }
                next;
            }
        }

        # Treat lines of dash-type characters as rules.
        if (_is_rule $_) {
            $self->{pre} = 0;
            ($space) = /\n(\s*)$/;
            $self->_output(start(-1), "<hr />\n");
            undef $INDENT;
            next
        }

        # Everything else needs to have special characters escaped.  We don't
        # do this earlier because if we want to allow < and > in rules, the
        # escaping would make our lives miserable.
        $_ = escape $_;

        # Do this before untabification and stashing of trailing whitespace,
        # but after escaping.  Check to see if this paragraph looks like
        # literal text.  If so, we wrap it in <pre> and output it as is.  As a
        # special exception to our normal paragraph handling, this paragraph
        # doesn't end until we find a literal blank line; this hack lets full
        # diffs be included in a FAQ without confusing the parser.
        if (_is_literal $_) {
            if (/\n[ \t]+$/) { $_ .= $self->_next_paragraph(1) }
            $self->_output(pre(strip_indent($_, $INDENT)));
            s/\n(\n\s*)$/\n/;
            $space = $1;
            $self->{pre} = 1;
            next;
        }

        # Not literal text, so untabify it and stash whitespace.
        $_ = untabify $_;
        s/\n(\s*)$/\n/;
        $space = $1;
        my $indent = indent $_;

        # If the paragraph has inconsistent indentation, or is indented
        # relative to the baseline *and* the last paragraph we emitted was
        # enclosed in <pre>, assume that this paragraph belongs in <pre> as
        # well.
        if ($self->{pre}) {
            if (_is_offset ($_) || (defined $INDENT && $indent > $INDENT)) {
                $self->_output(pre(strip_indent($_, $INDENT)));
                next;
            } else {
                $self->{pre} = 0;
            }
        }

        # Check for a heading.  We distinguish between level 2 headings and
        # level 3 headings as follows: The first heading we encounter is
        # assumed to be a level 2 heading, and any further headers at that
        # same indentation level are also level 2 headings.  If we detect any
        # other headings at a greater indent, they're marked as level 3.
        if ($self->_is_heading ($_)) {
            s/^\s+//;
            $self->{contents} = /\bcontents\b/i;
            my $h;
            if (defined $self->{h2}) {
                if ($indent <= $self->{h2}) { $h = \&h2 }
                else                        { $h = \&h3 }
            } else {
                $self->{h2} = $indent;
                $h = \&h2;
            }
            $_ = _remove_rule($_);
            if (/^([\d.]+)[.\)]\s/) {
                my $anchor = qq(a name="S$1" id="S$1");
                $self->_output(start(), $h->(container($anchor, $_)));
            } else {
                $self->_output(start(), $h->($_));
            }
            $INDENT = $self->{baseline};
            next;



( run in 1.180 second using v1.01-cache-2.11-cpan-5837b0d9d2c )