HTML-Quoted

 view release on metacpan or  search on metacpan

lib/HTML/Quoted.pm  view on Meta::CPAN


Various MUAs use quite different approaches for quoting in mails.

Some use the I<blockquote> tag, which is quite easy to parse.

Some wrap text in I<p> tags and add '>' at the beginning of the
paragraphs.

Things get messier when it's an HTML reply to a plain-text mail
thread.

If you B<find a format> that is not supported, please file a bug report
via rt.cpan.org with an example that is as short as possible. A B<test file>
is even better. A test file with a patch is best. Non-obvious patches
without tests suck.

=head1 METHODS

=head2 extract

    my $struct = HTML::Quoted->extract( $html );

Takes a string with HTML and returns an array reference. Each element
in the array is either an array reference or a hash reference. For example:


    [
        { 'raw' => 'Hi,' },
        { 'raw' => '<div><br><div>On date X wrote:<br>' },
        [
             { 'raw' => '<blockquote>' },
             { 'raw' => 'Hello,' },
             { 'raw' => '<div>How are you?</div>' },
             { 'raw' => '</blockquote>' }
        ],
        ...
    ]

Hashes represent a part of the html. The following keys are
meaningful at the moment:

=over 4

=item * raw - raw HTML

=item * quoter_raw, quoter - raw and decoded (entities are converted) quoter if block is prefixed with quoting characters

=back

=cut

sub extract {
    my $self = shift;

    # Event name => [ handler method name, argspec string ] pairs handed to
    # the HTML::Parser-derived parser. The handler methods are looked up on
    # the parser object itself (HTML::Quoted::Parser).
    my %handlers = (
        start_document => [ 'handle_doc_start', 'self' ],
        end_document   => [ 'handle_doc_end',   'self' ],
        start          => [ 'handle_start',     'self, tagname, attr, attrseq, text' ],
        end            => [ 'handle_end',       'self, tagname, text' ],
        text           => [ 'handle_text',      'self, text, is_cdata' ],
        default        => [ 'handle_default',   'self, event, text' ],
    );

    my $parser = HTML::Quoted::Parser->new(
        api_version => 3,
        handlers    => \%handlers,
    );
    $parser->empty_element_tags(1);

    # After the shift above, $_[0] is the HTML string. It is passed straight
    # through (no copy) as a single chunk, then end-of-input is signalled.
    $parser->parse( $_[0] );
    $parser->eof;

    # The handlers accumulate their output under the parser's private
    # 'html_quoted_parser' slot; 'result' is the top-level list of hunks.
    return $parser->{'html_quoted_parser'}{'result'};
}

=head2 combine_hunks

  my $html = HTML::Quoted->combine_hunks( $arrayref_of_hunks );

Takes the output of C<extract> and turns it back into HTML.

=cut

sub combine_hunks {
    my ($self, $hunks) = @_;

    # Each element is either a hash hunk (contributes its 'raw' HTML) or
    # anything else, which is treated as a nested list of hunks and
    # flattened recursively.
    my @pieces = map {
        my $hunk = $_;
        ref $hunk eq 'HASH'
            ? $hunk->{raw}
            : $self->combine_hunks($hunk);
    } @$hunks;

    return join '', @pieces;
}

package HTML::Quoted::Parser;
use base "HTML::Parser";

sub handle_doc_start {
    my ($self) = @_;

    # The result list starts with a single empty hunk, which is also the
    # hunk currently being filled.
    my $first_hunk = {};
    my $result     = [$first_hunk];

    # (Re)initialise all per-document state in one go; any state left over
    # from a previous document is discarded.
    $self->{'html_quoted_parser'} = {
        result  => $result,
        current => $first_hunk,
        stack   => [$result],       # the top-level list is the only entry
        # presumably nesting counters for quotes/blocks -- they are only
        # initialised here; their use is outside this sub
        in      => { quote => 0, block => [0] },
    };

    # Hand back the freshly created 'in' hash.
    return $self->{'html_quoted_parser'}{'in'};
}

sub handle_doc_end {
    my ($self) = @_;

    my $state = $self->{'html_quoted_parser'};

    # Drop a trailing hunk that never received any content (an empty hash).
    my $tail = $state->{'result'}[-1];
    if ( ref $tail eq 'HASH' && !keys %$tail ) {
        pop @{ $state->{'result'} };
    }

    # Post-process the collected hunks in place; whatever organize returns
    # is passed back to the caller.
    $self->organize( $state->{'result'} );
}

sub organize {
    my ($self, $list) = @_;

    my $prev = undef;
    foreach my $e ( splice @$list ) {
        if ( ref $e eq 'ARRAY' ) {
            push @$list, $self->organize($e);
            $prev = undef;
        }
        elsif ( $e->{'block'} ) {
            push @$list, $e;
            $prev = undef;
        }
        elsif ( defined $e->{'quoter'} ) {
            if ( !$prev || $self->combine( $prev, $e ) ) {



( run in 0.531 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )