Foorum
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/Foorum/Scraper/MailMan.pm view on Meta::CPAN
package Foorum::Scraper::MailMan;
# directly copied from mailman-archive-to-rss
# http://taint.org/mmrss/
# Thanks, Adam Shand
use strict;
use warnings;
our $VERSION = '1.001000';
use HTML::TokeParser;
use LWP::Simple;
use Encode qw/from_to/;
use Encode::Guess qw/euc-cn/; # XXX? can't explain
# Constructor.
#
# Parameters:
#   $class - the class name (or invocant) to bless the new object into.
#
# Returns a new object backed by an empty hash reference; no constructor
# arguments are consumed.
sub new {
    my ($class) = @_;
    return bless {}, $class;
}
# Scrape the thread index page at $url together with every message page it
# links to.
#
# Parameters:
#   $url - address of the index page to download.
#
# Returns the value produced by extract_from_thread() for the index page
# (iterated here as an array reference of hash references), with the 'when'
# and 'text' keys filled in for each entry whose message page could be
# downloaded. Returns nothing (bare return) when the index page cannot be
# fetched or comes back as a false value.
#
# Side effect: stores the derived URL prefix in $self->{url_base}.
sub scraper {
    my ( $self, $url ) = @_;

    my $html = get($url) or return;

    # Replace the final path segment with '/' to obtain the prefix for links.
    # This has to be set before extract_from_thread() runs, because that
    # method prepends $self->{url_base} to every link it collects.
    ( my $urlbase = $url ) =~ s,/[^/]+$,/,gs;
    $self->{url_base} = $urlbase;

    my $ret = $self->extract_from_thread($html);

    for my $post ( @{$ret} ) {
        my $details = get( $post->{url} );

        # Best effort: an entry whose page cannot be fetched is left as-is.
        next unless $details;

        @{$post}{qw/when text/} = $self->extract_from_message($details);
    }

    return $ret;
}
sub extract_from_thread {
my ( $self, $html ) = @_;
my $stream = HTML::TokeParser->new( \$html ) or die $!;
my @posts = ();
my $nest = 0;
while ( my $tag = $stream->get_tag( 'li', 'ul', '/ul' ) ) {
$tag = $stream->get_tag('a');
my $url = $tag->[1]{href} || '--';
# only follow Mailman-style numeric links
next unless ( $url =~ /(\d+|msg\d+)\.html$/ );
my $msg_id = $1;
$msg_id =~ s/\D+//isg;
$url = $self->{url_base} . $url;
my $headline = $stream->get_trimmed_text('/a');
$headline =~ s/&/&/g;
$headline =~ s/</</g;
$headline =~ s/>/>/g;
$headline =~ s/^\s*\[\w+\]\s*//;
$tag = $stream->get_tag('i');
my $who = $stream->get_trimmed_text('/i');
$who =~ s/<.*?>//g;
$who =~ s/\<.*?\>//ig;
$who =~ s/\&/\&/g;
$who =~ s/</\</g;
$who =~ s/>/\>/g;
view all matches for this distribution
view release on metacpan - search on metacpan
( run in 0.966 second using v1.00-cache-2.02-grep-82fe00e-cpan-1925d2aa809 )