Foorum

 view release on metacpan or  search on metacpan

lib/Foorum/Scraper/MailMan.pm  view on Meta::CPAN

package Foorum::Scraper::MailMan;

# directly copied from mailman-archive-to-rss
# http://taint.org/mmrss/
# Thanks, Adam Shand

use strict;
use warnings;
our $VERSION = '1.001000';
use HTML::TokeParser;
use LWP::Simple;
use Encode qw/from_to/;
use Encode::Guess qw/euc-cn/;    # XXX? can't explain

# Constructor. Takes no arguments besides the class name and returns a new,
# empty hash-based object blessed into that class.
sub new {
    my ($class) = @_;

    my %state;
    return bless \%state, $class;
}

# Fetch a Mailman archive index page and every message page it links to.
#
# Arguments:
#   $url - address of the archive index page to download.
#
# Returns the value produced by extract_from_thread() for the index page,
# after adding 'when' and 'text' to each entry whose message page could be
# downloaded. Entries whose page could not be fetched are left untouched.
# Returns nothing (bare return) when the index page itself cannot be fetched.
#
# Side effect: stores $url with its trailing path segment removed in
# $self->{url_base}; extract_from_thread() prepends it to the links it finds.
sub scraper {
    my ( $self, $url ) = @_;

    my $html = get($url);
    return unless $html;

    # Drop the last path segment (normally the index file name), keeping
    # the trailing slash, so links from the index page can be appended.
    ( my $url_base = $url ) =~ s{/[^/]+$}{/};
    $self->{url_base} = $url_base;

    my $posts = $self->extract_from_thread($html);

    # Use a named lexical rather than the global $_: the loop body calls
    # get() and extract_from_message(), and a callee that assigns to $_
    # without localizing it would otherwise overwrite the aliased entry.
    foreach my $post ( @{$posts} ) {
        my $details = get( $post->{url} );
        next unless $details;

        ( $post->{when}, $post->{text} )
            = $self->extract_from_message($details);
    }

    return $posts;
}

sub extract_from_thread {
    my ( $self, $html ) = @_;

    my $stream = HTML::TokeParser->new( \$html ) or die $!;

    my @posts = ();
    my $nest  = 0;
    while ( my $tag = $stream->get_tag( 'li', 'ul', '/ul' ) ) {

        $tag = $stream->get_tag('a');
        my $url = $tag->[1]{href} || '--';

        # only follow Mailman-style numeric links
        next unless ( $url =~ /(\d+|msg\d+)\.html$/ );
        my $msg_id = $1;
        $msg_id =~ s/\D+//isg;

        $url = $self->{url_base} . $url;

        my $headline = $stream->get_trimmed_text('/a');
        $headline =~ s/&/&/g;
        $headline =~ s/</&lt;/g;
        $headline =~ s/>/&gt;/g;
        $headline =~ s/^\s*\[\w+\]\s*//;

        $tag = $stream->get_tag('i');
        my $who = $stream->get_trimmed_text('/i');
        $who =~ s/<.*?>//g;
        $who =~ s/\&lt;.*?\&gt;//ig;
        $who =~ s/\&/\&amp;/g;
        $who =~ s/</\&lt;/g;
        $who =~ s/>/\&gt;/g;

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 0.966 second using v1.00-cache-2.02-grep-82fe00e-cpan-1925d2aa809 )