looks results from the CPAN

App-DocKnot

# Returns: True if so, false otherwise
sub _is_allcaps {
    my $text = shift;

    # The text counts as all caps when no character falls outside the set of
    # uppercase letters, digits, whitespace, and the listed punctuation.
    my $has_other = $text =~ m{ [^[:upper:]\d\s\"\(\),:.!/?-] }xms;
    return !$has_other;
}

# Whether a paragraph consists of deliberately short lines, or of lines that
# each hold a single run of non-whitespace.  The final line is ignored when
# measuring line lengths, and a one-line paragraph is never broken.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise
sub _is_broken {
    my ($paragraph) = @_;

    # Collapse all trailing whitespace into a single newline before splitting.
    $paragraph =~ s{ \s* \z }{\n}xms;
    my @rows = split(m{ \n }xms, $paragraph);
    return if scalar(@rows) == 1;

    # Drop the last line; its length says nothing about line breaking.
    pop(@rows);

    # Count the lines under each length threshold in a single pass.
    my ($under_40, $under_60) = (0, 0);
    for my $row (@rows) {
        my $width = length($row);
        $under_40++ if $width < 40;
        $under_60++ if $width < 60;
    }

    # Any line under 40 characters, or a strict majority under 60, marks the
    # paragraph as broken.
    return 1 if $under_40 > 0;
    return 1 if $under_60 > int(@rows / 2);

    # Otherwise it is broken only if every line is one whitespace-free token.
    return $paragraph =~ m{ \A (?: \s* \S+ [ \t]* \n )+ \z }xms;
}

# Whether a paragraph starts with a bullet marker: optional leading
# whitespace, then one of "-", "o", or "*", then a whitespace character.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise
sub _is_bullet {
    my $candidate = shift;
    return $candidate =~ m{ \A \s* [-o*] \s }xms;
}

# Whether a line is centered (in 74 columns).  The leading whitespace must
# also be at least 8 columns wide once passed through untabify(), so that
# ordinary paragraph lines that happen to be centered are not caught by
# mistake.
#
# $line - Line to classify
#
# Returns: True if so, false otherwise
sub _is_centered {
    my ($line) = @_;

    if ($line =~ m{ \A (\s+) (.+) }xms) {
        my $lead = $1;
        my $body = $2;

        # Centered text has equal margins, so the text plus twice the leading
        # whitespace should come within one column of the 74-column width.
        my $slack = 74 - length($body) - 2 * length($lead);
        return if abs($slack) >= 2;

        return length(untabify($lead)) >= 8;
    }

    # No leading whitespace (or nothing after it), so not centered.
    return;
}

# Whether a paragraph is a content listing: every line starts (after optional
# whitespace) with a run of digits and periods ending in "." or ")", followed
# by a space or tab.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise
sub _is_contents {
    my $listing = shift;
    return $listing =~ m{ \A (?: \s* [\d.]+[.\)] [ \t] \N* \n)+ \s* \z }xms;
}

# Whether a paragraph looks like one or more title lines followed by a
# description that is indented further than the titles.
#
# $paragraph - Paragraph to classify
#
# Returns: True if so, false otherwise
sub _is_description {
    my ($paragraph) = @_;

    if (
        $paragraph =~ m{
        \A
        (\s*) \S \N* \n         # title (1 is indent)
        (?: \1 \S \N* \n)*      # possibly more than one
        (\s+) \S \N* \n         # first line of description (2 is indent)
        (?: \2 \S \N* \n)*      # subsequent lines
        \s* \z
    }xms
      )
    {
        # The shape matched; it is a description only if the body is indented
        # more deeply than the title lines.
        my ($title_indent, $body_indent) = ($1, $2);
        return length($title_indent) < length($body_indent);
    }
    return;
}

# Whether a line is a digest divider: exactly thirty hyphens, optionally
# followed by whitespace, and nothing else.
#
# $line - Line to classify
#
# Returns: True if so, false otherwise
sub _is_divider {
    my $candidate = shift;
    return $candidate =~ m{ \A -{30} \s* \z }xms;
}

# Whether a line is an RFC 2822 header: a field name made of word characters
# and hyphens, a colon, whitespace, and then at least one non-newline
# character of content.
#
# $line - Line to classify
#
# Returns: True if so, false otherwise
sub _is_header {
    my ($line) = @_;

    # Return the match result itself.  Guarding a bare return with the match
    # (return if ...) yields a false value whether or not the line matched,
    # which would make this predicate always false.
    return $line =~ m{ \A [\w-]+: \s+ \N }xms;
}

# Whether a paragraph is a heading.  This is all about heuristics and guesses,
# and there are a number of other things we could confuse for headings, so we
# have to be careful.
#
# If it's a single line and outdented from the baseline, it's probably a
# heading.
#
# If it's at the baseline, check to see if it looks like a heading and either
# it's in all caps or there is a rule underneath it.  If we haven't seen a
# baseline, be more accepting about headers.
#
# If we're inside a contents block, be even more careful and disallow numbered
# things that look like a heading unless they're outdented.
#
# Unlike most of the classification functions, this is a regular method, since
# it needs access to the parsing state.
#
# $paragraph - Paragraph to classify
#
# Returns: True if a heading, false otherwise
sub _is_heading {
    my ($self, $paragraph) = @_;
    # Classify the result of _unescape() rather than the text as passed in.
    # NOTE(review): presumably callers pass already-escaped text and this
    # keeps escapes from distorting the length and character-class checks
    # below -- confirm against _unescape().
    $paragraph = _unescape($paragraph);
    my $indent = indent($paragraph);
    # $nobase: no baseline has been recorded in the parsing state yet.
    # $outdented: a baseline is known and this paragraph is indented less
    # than it.
    my $nobase = !defined($self->{baseline});
    my $outdented = defined($self->{baseline}) && $indent < $self->{baseline};

    # Numbered lines inside the contents section are definitely not headings.
    my $numbered = $paragraph =~ m{ \A [\d.]+[.\)] \s }xms;
    return if !$outdented && $self->{contents} && $numbered;

    # Outdented single lines are headings as long as they're either short or
    # contain at least two words.  A long single word falls through to the
    # remaining checks instead of being rejected here.
    if ($outdented && lines($paragraph) == 1) {
        return 1 if $paragraph =~ m{ \S \s \S }xms;
        return 1 if length($paragraph) < 30;
    }

    # Indented lines are never headings.  $INDENT is not declared in this
    # sub; it comes from the enclosing file scope.
    return if defined($INDENT) && $indent > $INDENT;

    # Lines of at most 31 characters ending in a word character or closing
    # quote or paren are headings if they're underlined.  The underline is a
    # second line made only of "-", "=", or "~" characters.
    return 1 if $paragraph =~ m{
        \A \s*
        [ \w\"\(\),:./&-]{0,30} [\w\"\)] \s* \n
        [-=~]+ \s*
        \z
    }xms;

    # All-uppercase lines of at most 31 characters ending in an uppercase
    # character, digit, or closing quote or paren are headings.
    return 1 if $paragraph =~ m{
        \A \s*
        [ [:upper:]\d\"\(\),:./&-]{0,30} [[:upper:]\d\"\)]
        \s* \n
        \z
    }xms;

    # If there is no baseline, assume single lines of at most 34 characters
    # with no unexpected characters are headings.  When a baseline is known,
    # $nobase is false and this returns false without testing the text.
    return $nobase && $paragraph =~ m{
        \A \s*
        [ \w\"\(\),:./&-]{0,33} [\w\"\)]
        \s* \n
        \z
    }xms;
}

lib/App/DocKnot/Spin/Text.pm view on Meta::CPAN

    }
    return $output unless $tag;
    if (!@INDENT || $indent > $INDENT[0][1]) {
        $output .= "<$tag>\n";
        unshift (@INDENT, [ $tag, $indent ]);
    }
    $output . $data;
}

# Build a <title> element from the given text.  Each newline, together with
# the whitespace around it, becomes a single space, and leading and trailing
# whitespace is removed.
sub title {
    my ($text) = @_;

    # Join wrapped lines, then trim both ends.
    $text =~ s/\s*\n\s*/ /g;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    return '<title>' . $text . '</title>';
}

# Element wrappers.  Each one delegates to paragraph() or container() with
# its tag name prepended to the caller's arguments and returns whatever that
# call returns.
sub blockquote { return paragraph('blockquote', @_); }
sub dt         { return container('dt',         @_); }
sub h1         { return container('h1',         @_); }
sub h2         { return container('h2',         @_); }
sub h3         { return container('h3',         @_); }
sub p          { return paragraph('p',          @_); }
sub pre        { return container('pre',        @_); }

##############################################################################
# Header parsing
##############################################################################

# Parse a block of RFC 2822 headers from the input, folding continuation
# lines into the header they follow.  The first line that is not part of the
# header block is handed back via _buffer_line().
#
# Returns: Reference to a hash of lower-cased header names to contents, or a
#          reference to an empty hash if no headers were seen
sub _parse_rfc2822_headers {
    my ($self) = @_;
    my %fields;

    my $current = $self->_next_line();
  HEADER:
    while (defined($current)) {
        last HEADER if $current !~ m{ \A ([\w-]+): \s+ (.*) }xms;
        my ($name, $value) = ($1, $2);

        # Absorb continuation lines: any following line that starts with
        # whitespace and has some non-whitespace content.
        while (1) {
            $current = $self->_next_line();
            last if !defined($current);
            last if $current !~ m{ \A \s+ \S }xms;
            $value .= $current;
        }

        # Store the value without its final newline.
        chomp($value);
        $fields{ lc($name) } = $value;
    }

    # Whatever stopped the loop belongs to the caller.
    $self->_buffer_line($current);

    return \%fields;
}

# Parse the headers of a FAQ: an optional mbox "From " line, a block of
# top-level headers, and a block of subheaders, with blank lines skipped
# after each block.
#
# $header_ref - Hash into which to store the parse results.  The author,
#               original, and title keys are set.
sub _handle_faq_headers {
    my ($self, $header_ref) = @_;

    # Consume a leading mbox "From " line.  Any other defined line is pushed
    # back so that it is parsed as part of the headers.
    my $first = $self->_next_line();
    my $discard = !defined($first) || $first =~ m{ \A From [ ] }xms;
    if (!$discard) {
        $self->_buffer_line($first);
    }

    # Top-level headers come first, then the FAQ subheaders.
    my $top_ref = $self->_parse_rfc2822_headers();
    $self->_skip_blank_lines();
    my $sub_ref = $self->_parse_rfc2822_headers();
    $self->_skip_blank_lines();

    # Keep only the fields of interest.  An HTML-title subheader takes
    # precedence over the Subject header for the title.
    my $title = $sub_ref->{'html-title'} // $top_ref->{subject};
    @{$header_ref}{qw(author original title)}
      = ($top_ref->{from}, $sub_ref->{'original-author'}, $title);
    return;
}

# Parse the leading header material of a text document: an optional RCS/CVS
# Id line, optional mail/news headers for a FAQ, and an optional centered
# title line.
#
# Returns: Reference to a hash of data from headers, which may contain the
#          following keys:
#            author   - Author of document
#            id       - RCS Id string
#            heading  - Main document heading
#            original - Original author of document
#            title    - Document title
sub _parse_headers {
    my ($self) = @_;
    my %parsed;

    # A version identifier, if present, is on the first line.  Record it
    # without its newline and move past any blank lines that follow.
    my $line = $self->_next_line();
    if (_is_id($line)) {
        chomp($line);
        $parsed{id} = $line;
        $self->_skip_blank_lines();
        $line = $self->_next_line();
    }

    # Push the current line back before deciding what kind of document this
    # is.  Unless a title is already set on the object, a header-like line or
    # a line starting with "From" means FAQ-style headers, which are parsed
    # into the result.  Then skip leading blank lines and rules.
    $self->_buffer_line($line);
    my $faq_like = !$self->{title}
      && (_is_header($line) || $line =~ m{ \A From }xms);
    if ($faq_like) {
        $self->_handle_faq_headers(\%parsed);
    }
    $self->_skip_blank_lines_and_rules();

    # Without a centered line at the top, hand the line back and use the
    # title parsed above, or failing that the object's own title, as the
    # heading.
    my $candidate = $self->_next_line();
    if (!_is_centered($candidate)) {
        $self->_buffer_line($candidate);
        $parsed{heading} = $parsed{title} // $self->{title};
        return \%parsed;
    }

    # A centered line is the heading.  It also becomes the title when the
    # headers did not provide one, converted from all caps to capitalized
    # words since titles shouldn't be in all caps.
    $parsed{heading} = whitechomp($candidate);
    if (!defined($parsed{title})) {
        my $title = $parsed{heading};
        if (_is_allcaps($title)) {
            $title =~ s{ \b ([A-Z]+) \b }{\L\u$1}xmsg;
        }
        $parsed{title} = $title;
    }
    $self->_skip_blank_lines_and_rules();

    # Return the parsed header.
    return \%parsed;
}

# Parse the subheaders of a text document and generate the subheaders for the
# output document.  The author information from the headers will be included,
# as will the last modified date if configured.  Existing subheadings that
# look like they're just Revision or Date strings will be replaced by a
# nicely-formatted string.
#
# $header_ref - Main headers of the text document
#
# Returns: List of lists of subheaders to put at the top of the output
#          document
sub _parse_subheaders {
    my ($self, $header_ref) = @_;
    my (@subheaders, $modified);

    # Generate a last modified date if we have an RCS/CVS Id string or if a
    # last modified subheader from the file modification time was requested.
    # We'll set $modified back to undef if we push it into the subheaders at
    # any point; otherwise, we'll add it at the end.
    if ($header_ref->{id}) {
        $modified = modified_id($header_ref->{id});
    } elsif ($self->{modified} && defined($self->{in_path})) {
        $modified = modified_timestamp($self->{in_path}->stat()->[9]);
    }

    # Parse subheaders.  The first must be centered; after that, assume
    # everything is a subheading until a blank line.
    my $line;
    while (defined($line = $self->_next_line())) {
        next if _is_rule($line);
        last if $line =~ m{ \A \s* \z }xms;

lib/App/DocKnot/Spin/Text.pm view on Meta::CPAN

    }
    if ($header_ref->{heading}) {
        $self->_output(h1($header_ref->{heading}), "\n");
    }

    # Parse and output the subheaders, if any.
    my @subheaders = $self->_parse_subheaders($header_ref);
    if (@subheaders) {
        $self->_output(qq(<p class="subheading">\n));
        $self->_output(q{  }, join("<br />\n  ", @subheaders), "\n</p>\n\n");
    }

    # Scan the actual body of the text.  We don't use paragraph mode, since it
    # doesn't work with blank lines that contain whitespace; instead, we
    # cobble together our own paragraph mode that does.  Note that $_ already
    # has a non-blank line of input coming into this loop.
    my $space;
    while (defined($_ = $self->_next_paragraph())) {
        last if _is_signature($_);

        # If we just hit a digest divider, the next thing will likely be a
        # Subject: line that we want to turn into a section header.  Digest
        # section titles are always level 2 headers currently.
        if (_is_divider $_) {
            $self->{pre} = 0;
            $self->_output(start(-1));
            undef $INDENT;
            ($self->{whitespace}) = /\n(\s*)$/;
            $_ = $self->_next_paragraph();
            s/\n(\s*)$/\n/;
            $space = $1;
            if (s/^Subject:\s+//) {
                $self->{contents} = /\bcontents\b/i;
                $_ = escape $_;
                if (/^([\d.]+)[.\)]\s/) {
                    $self->_output(
                        h2(container(qq(a name="S$1" id="S$1"), $_))
                    );
                } else {
                    $self->_output(h2($_));
                }
                next;
            }
        }

        # Treat lines of dash-type characters as rules.
        if (_is_rule $_) {
            $self->{pre} = 0;
            ($space) = /\n(\s*)$/;
            $self->_output(start(-1), "<hr />\n");
            undef $INDENT;
            next
        }

        # Everything else needs to have special characters escaped.  We don't
        # do this earlier because if we want to allow < and > in rules, the
        # escaping would make our lives miserable.
        $_ = escape $_;

        # Do this before untabification and stashing of trailing whitespace,
        # but after escaping.  Check to see if this paragraph looks like
        # literal text.  If so, we wrap it in <pre> and output it as is.  As a
        # special exception to our normal paragraph handling, this paragraph
        # doesn't end until we find a literal blank line; this hack lets full
        # diffs be included in a FAQ without confusing the parser.
        if (_is_literal $_) {
            if (/\n[ \t]+$/) { $_ .= $self->_next_paragraph(1) }
            $self->_output(pre(strip_indent($_, $INDENT)));
            s/\n(\n\s*)$/\n/;
            $space = $1;
            $self->{pre} = 1;
            next;
        }

        # Not literal text, so untabify it and stash whitespace.
        $_ = untabify $_;
        s/\n(\s*)$/\n/;
        $space = $1;
        my $indent = indent $_;

        # If the paragraph has inconsistent indentation, or is indented
        # relative to the baseline *and* the last paragraph we emitted was
        # enclosed in <pre>, assume that this paragraph belongs in <pre> as
        # well.
        if ($self->{pre}) {
            if (_is_offset ($_) || (defined $INDENT && $indent > $INDENT)) {
                $self->_output(pre(strip_indent($_, $INDENT)));
                next;
            } else {
                $self->{pre} = 0;
            }
        }

        # Check for a heading.  We distinguish between level 2 headings and
        # level 3 headings as follows: The first heading we encounter is
        # assumed to be a level 2 heading, and any further headers at that
        # same indentation level are also level 2 headings.  If we detect any
        # other headings at a greater indent, they're marked as level 3.
        if ($self->_is_heading ($_)) {
            s/^\s+//;
            $self->{contents} = /\bcontents\b/i;
            my $h;
            if (defined $self->{h2}) {
                if ($indent <= $self->{h2}) { $h = \&h2 }
                else                        { $h = \&h3 }
            } else {
                $self->{h2} = $indent;
                $h = \&h2;
            }
            $_ = _remove_rule($_);
            if (/^([\d.]+)[.\)]\s/) {
                my $anchor = qq(a name="S$1" id="S$1");
                $self->_output(start(), $h->(container($anchor, $_)));
            } else {
                $self->_output(start(), $h->($_));
            }
            $INDENT = $self->{baseline};
            next;
        }

        # A sudden change to an indentation of 0 when that's less than our
        # indentation baseline is also a sign of literal text.
        if ($INDENT && $indent == 0 && $INDENT > 0 && defined($self->{baseline})
            && $self->{baseline} > 0) {
            $self->_output(pre(strip_indent($_, $INDENT)));
            $self->{pre} = 1;
            next;
        }

        # We're dealing with a normal paragraph of some sort, so go ahead and
        # turn URLs into links.  Check whether the paragraph is broken first,
        # though, and stash that information, since turning URLs into links
        # can artificially lengthen lines.
        my $broken = _is_broken $_;
        $_ = _format_urls($_);

        # Check to see if we're in a contents section and this paragraph looks
        # like a table of contents.  If so, turn all of the section headings
        # into links.
        if ($self->{contents} && _is_contents($_)) {
            $_ = _format_contents($_)
        }

        # Check for paragraphs that are entirely bulletted lines, and turn
        # them into unordered lists without <p> tags.
        if (_is_allbullet $_) {
            my $last;
            my @lines = split (/\n/, $_);
            for (@lines) {
                next unless /\S/;
                if (_is_bullet $_) {
                    if (defined $last) {
                        $self->_output(start($INDENT, 'ul'));
                        $self->_output(li($INDENT, _format_bold($last)));
                    }
                    $last = _remove_bullet($_);
                    $INDENT = indent $last;
                } else {
                    $last .= "\n$_";
                }
            }
            if (defined $last) {
                $self->_output(start($INDENT, 'ul'));
                $self->_output(li($INDENT, _format_bold($last)));
            }
            next;
        }

        # Check for paragraphs that are entirely numbered lines, and turn them
        # into ordered lists without <p> tags.
        if (_is_allnumbered $_) {
            my @lines = split (/\n/, $_);
            for (@lines) {
                next unless /\S/;
                my ($number) = /^(\d+)/;
                $_ = _remove_number($_);
                $INDENT = indent $_;
                $self->_output(start($INDENT, 'ol'));
                $self->_output(li($INDENT, _format_bold($_), $number));
            }
            next;
        }

        # Check for bulletted paragraphs and turn them into lists.
        if (_is_bullet $_) {
            $_ = _remove_bullet($_);
            $INDENT = indent $_;
            $self->_output(start($INDENT, 'ul'));
            $self->_output(li($INDENT, p(_format_bold($_))));
            next;
        }

        # Check for paragraphs quoted with some character and turn them into
        # blockquotes provided they don't have inconsistent indentation.
        my $quote = _is_quoted ($_);
        if ($quote && !$broken) {
            $_ = _remove_prefix($_, $quote);

lib/App/DocKnot/Spin/Text.pm view on Meta::CPAN

    # filter if so desired.
    if (defined($input)) {
        $input = path($input)->realpath();
        $in_fh = $input->openr_utf8();
    } else {
        open($in_fh, '<&:raw:encoding(utf-8)', 'STDIN');
    }

    # Open the output file.
    if (defined($output)) {
        $output = path($output)->absolute();
        $out_fh = $output->openw_utf8();
    } else {
        open($out_fh, '>&:raw:encoding(utf-8)', 'STDOUT');
    }

    # Do the work.
    $self->_convert_document($in_fh, $input, $out_fh, $output);

    # Close input and output.
    close($in_fh);
    close($out_fh);
}

##############################################################################
# Module return value and documentation
##############################################################################

1;
__END__

=for stopwords
Allbery DocKnot MERCHANTABILITY NONINFRINGEMENT sublicense outdenting RCS
documentable outdented subheaders preformatted XHTML

=head1 NAME

App::DocKnot::Spin::Text - Convert some particular text formats into HTML

=head1 SYNOPSIS

    use App::DocKnot::Spin::Text;

    my $text = App::DocKnot::Spin::Text->new({style => '/styles/faq.css'});
    $text->spin_text_file('/path/to/input', '/path/to/output.html');

=head1 REQUIREMENTS

Perl 5.24 or later and the modules List::SomeUtils, Path::Tiny, and
Sort::Versions, available from CPAN.

=head1 DESCRIPTION

This is another of those odd breed of partially functional beasts, a text to
HTML converter.

This is not truly possible in general; people do too many varied things with
their text to intuit document structure from it.  This is therefore a
converter that will translate documents written the way I write.  It may or
may not work for you.  The chances that it will work for you are directly
proportional to how much your writing looks like mine.

App::DocKnot::Spin::Text understands digest separators (lines of exactly
thirty hyphens, from the minimal digest standard) and will treat a C<Subject>
header immediately after them as a section header.  Beyond that, headings must
either be outdented, underlined on the following line, or in all caps to be
recognized as section headers.  (Outdenting means that the regular text is
indented by a few spaces, but headers start in column 0, or at least in a
column farther to the left than the regular text.)

Section headers that begin with numbers (with any number of periods) will be
given C<< <a id> >> tags containing that number prepended with C<S>.  As a
special case of the parsing, any section with a header containing C<contents>
will have lines beginning with numbers turned into links to the appropriate
C<< <a id> >> tags in the same document.  You can use this to turn the table
of contents of your minimal digest format FAQ into a real table of contents
with links in the HTML version.

Text with embedded whitespace more than a single space or a couple of spaces
at a sentence boundary or after a colon (and any text with literal tabs) will
be wrapped in C<< <pre> >> tags.  So will any indented text that doesn't look
like English paragraphs.  URLs surrounded by C<< <...> >> or C<< <URL:...> >>
will be turned into links.  Other URLs will not be turned into links, nor is
any effort made to turn random body text into links because it happens to look
like a link.

Bullet lists and numbered lists will be turned into the appropriate HTML
structures.  Some attempt is also made to recognize description lists, but
App::DocKnot::Spin::Text was written by someone who writes a lot of technical
documentation and therefore tends to prefer C<< <pre> >> if unsure whether
something is a description list or preformatted text.  Description lists are
therefore only going to work if the description titles aren't indented
relative to the surrounding text.

Regular indented paragraphs or paragraphs quoted with a consistent
non-alphanumeric quote character are recognized and turned into HTML block
quotes.

It's worthwhile paying attention to the headers at the top of your document so
that App::DocKnot::Spin::Text can get a few things right.  If you use RCS or
CVS, put the RCS C<Id> keyword as the first line of your document; it will be
stripped out of the resulting output and App::DocKnot::Spin::Text will use it
to determine the document revision.  This should be followed by regular
message headers and news.answers subheaders if the document is an actual FAQ,
and App::DocKnot::Spin::Text will use the C<From> and C<Subject> headers to
figure out a title and headings to use.  As a special case, an HTML-title
header in the subheaders will override any other title that
App::DocKnot::Spin::Text thinks it should use for the document.

App::DocKnot::Spin::Text expects your document to have an C<< <h1> >> title,
and will add one from the Subject header if it doesn't find one.  It will also
add subheaders (C<class="subheading">) giving the author (from the C<From>
header) and the last modified time and revision (from the RCS C<Id> string) if
there are no subheadings already.  If there's a subheading that contains RCS
identifiers, it will be replaced by a nicely formatted heading generated from
the RCS C<Id> information in the HTML output.

Text marked as C<*bold*> using the standard asterisk notation will be
surrounded by C<< <strong> >> tags, if the asterisks appear to be marking bold
text rather than serving as wildcards or some other function.

( run in 0.509 second using v1.01-cache-2.11-cpan-39bf76dae61 )