HTML-Quoted
view release on metacpan or search on metacpan
lib/HTML/Quoted.pm view on Meta::CPAN
Various MUAs use quite different approaches to quoting in mail.
Some use I<blockquote> tag and it's quite easy to parse.
Some wrap text into I<p> tags and add '>' in the beginning of the
paragraphs.
Things get messier when it's an HTML reply in a plain text mail
thread.
If B<you found a format> that is not supported, then file a bug report
via rt.cpan.org with as short an example as possible. A B<test file>
is even better. A test file with a patch is the best. Non-obvious patches
without tests suck.
=head1 METHODS
=head2 extract
my $struct = HTML::Quoted->extract( $html );
Takes a string with HTML and returns an array reference. Each element
in the array is either an array reference or a hash reference. For example:
[
{ 'raw' => 'Hi,' },
{ 'raw' => '<div><br><div>On date X wrote:<br>' },
[
{ 'raw' => '<blockquote>' },
{ 'raw' => 'Hello,' },
{ 'raw' => '<div>How are you?</div>' },
{ 'raw' => '</blockquote>' }
],
...
]
Hashes represent a part of the html. The following keys are
meaningful at the moment:
=over 4
=item * raw - raw HTML
=item * quoter_raw, quoter - raw and decoded (entities are converted) quoter if block is prefixed with quoting characters
=back
=cut
sub extract {
    my $self = shift;

    # Map each HTML::Parser event to the HTML::Quoted::Parser method that
    # handles it, together with the argspec describing what that method
    # receives.
    my %handlers = (
        start_document => [ handle_doc_start => 'self' ],
        end_document   => [ handle_doc_end   => 'self' ],
        start          => [ handle_start     => 'self, tagname, attr, attrseq, text' ],
        end            => [ handle_end       => 'self, tagname, text' ],
        text           => [ handle_text      => 'self, text, is_cdata' ],
        default        => [ handle_default   => 'self, event, text' ],
    );

    my $parser = HTML::Quoted::Parser->new(
        api_version => 3,
        handlers    => \%handlers,
    );
    $parser->empty_element_tags(1);

    # Feed the caller's HTML straight from @_ (no copy), then signal end of
    # input so the end_document handler runs.
    $parser->parse( $_[0] );
    $parser->eof;

    # The handlers keep their working state, including the finished
    # structure, under the 'html_quoted_parser' key of the parser object.
    return $parser->{'html_quoted_parser'}{'result'};
}
=head2 combine_hunks
my $html = HTML::Quoted->combine_hunks( $arrayref_of_hunks );
Takes the output of C<extract> and turns it back into HTML.
=cut
sub combine_hunks {
    my ($self, $hunks) = @_;

    # Turn every hunk into its HTML text: a hash contributes its 'raw'
    # value, anything else is treated as a nested list of hunks and is
    # flattened recursively.
    my @pieces = map {
        my $hunk = $_;
        ref($hunk) eq 'HASH'
            ? $hunk->{raw}
            : $self->combine_hunks($hunk);
    } @$hunks;

    return join "", @pieces;
}
package HTML::Quoted::Parser;
use base "HTML::Parser";
sub handle_doc_start {
    my ($self) = @_;

    # The result list starts with one empty hunk, which is also the hunk
    # currently being filled.
    my $first_hunk = {};
    my $result     = [$first_hunk];

    # Fresh per-document state, stored on the parser object itself.
    my $state = {
        'result'  => $result,
        'current' => $first_hunk,
        'stack'   => [$result],    # begins with the top-level list only
    };
    $self->{'html_quoted_parser'} = $state;

    # Counters start at zero; presumably they track quote/block nesting
    # while parsing -- confirm against the start/end handlers.
    $state->{'in'} = { quote => 0, block => [0] };
}
sub handle_doc_end {
    my ($self) = @_;

    my $meta   = $self->{'html_quoted_parser'};
    my $result = $meta->{'result'};

    # Drop a trailing hunk that never received any content.
    my $tail = $result->[-1];
    if ( ref($tail) eq 'HASH' && !%$tail ) {
        pop @$result;
    }

    # Hand the collected list to organize() for post-processing.
    $self->organize($result);
}
sub organize {
my ($self, $list) = @_;
my $prev = undef;
foreach my $e ( splice @$list ) {
if ( ref $e eq 'ARRAY' ) {
push @$list, $self->organize($e);
$prev = undef;
}
elsif ( $e->{'block'} ) {
push @$list, $e;
$prev = undef;
}
elsif ( defined $e->{'quoter'} ) {
if ( !$prev || $self->combine( $prev, $e ) ) {
( run in 0.531 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )