Mediawiki-Spider
view release on metacpan or search on metacpan
lib/Mediawiki/Spider.pm view on Meta::CPAN
}
# Accessor for the "sorted" index stored in $self->{_sortedwikiindex}
# (buildmenu stores its inverted category => { page => 1 } hash here).
#
# Arguments: an optional key/value list. When non-empty it replaces the
#            contents of the stored hash.
# Returns:   the stored hash as a flat key/value list, or an empty list
#            when nothing has been stored yet.
sub sortedwikiindex {
    my ( $self, %sortedwikiindex ) = @_;

    # Assigning through the dereference autovivifies the slot and reuses the
    # same underlying hash on later calls.
    %{ $self->{_sortedwikiindex} } = %sortedwikiindex if %sortedwikiindex;

    # defined(%hash) is a fatal error from Perl 5.22 on, so test the slot
    # itself. Returning an explicit empty list keeps "my %h = $obj->..."
    # from receiving a single false value when nothing has been stored.
    my $stored = $self->{_sortedwikiindex};
    return $stored ? %{$stored} : ();
}
# Accessor for the page index stored in $self->{_wikiindex}.
# makeflatpages fills it as page => { category => 1 }, i.e. a hash of hashes.
#
# Arguments: an optional key/value list. When non-empty it replaces the
#            contents of the stored hash.
# Returns:   the stored hash as a flat key/value list, or an empty list
#            when nothing has been stored yet.
sub wikiindex {
    my ( $self, %wikiindex ) = @_;

    # Assigning through the dereference autovivifies the slot and reuses the
    # same underlying hash on later calls.
    %{ $self->{_wikiindex} } = %wikiindex if %wikiindex;

    # defined(%hash) is a fatal error from Perl 5.22 on, so test the slot
    # itself and hand back an explicit empty list when it is unset.
    my $stored = $self->{_wikiindex};
    return $stored ? %{$stored} : ();
}
# Accessor for the list of page names stored in $self->{_wikiwords}.
#
# Arguments: an optional list of words. When non-empty it replaces the
#            stored list.
# Returns:   the stored list, or an empty list when nothing has been
#            stored yet.
sub wikiwords {
    my ( $self, @wikiwords ) = @_;

    # Assigning through the dereference autovivifies the slot and reuses the
    # same underlying array on later calls.
    @{ $self->{_wikiwords} } = @wikiwords if @wikiwords;

    # defined(@array) is a fatal error from Perl 5.22 on, so test the slot
    # itself and hand back an explicit empty list when it is unset.
    my $stored = $self->{_wikiwords};
    return $stored ? @{$stored} : ();
}
# Invert the page index into a category index.
#
# Reads the page => { category => 1 } hash from wikiindex(), turns it into
# category => { page => 1 }, stores the result through sortedwikiindex() and
# returns it as a flat key/value list. Entries whose page name or category
# name is the empty string are left out. %addedhash is accepted but unused.
sub buildmenu {
    my ( $self, %addedhash ) = @_;

    my %pages = $self->wikiindex();
    my %by_category;

    foreach my $page ( keys %pages ) {
        foreach my $category ( keys %{ $pages{$page} } ) {
            next if $category eq "" || $page eq "";
            $by_category{$category}->{$page} = 1;
        }
    }

    # print "Inversion: ".Data::Dumper->Dump([%by_category]);
    $self->sortedwikiindex(%by_category);
    return %by_category;
}
# Turn a wiki page name into display text by replacing every underscore
# with a space. Returns the converted copy; the caller's value is untouched.
sub makepretty {
    my $self   = shift;
    my $pretty = shift;

    $pretty =~ s/\_/\ /g;

    return $pretty;
}
# Write the category menu page to the file named by $page.
#
# Arguments:
#   $page       - path of the HTML file to create (overwritten if present)
#   $extratitle - optional heading for an additional section
#   @extras     - page names to list under $extratitle
#
# The menu is built from sortedwikiindex(): one <div> per category, each
# listing its pages as links to "<page>.<extension>". Categories matching
# /Exclude/ or exactly "Category", and pages matching /rint_All/ or
# /_Context/, are left out. The contents of header.html in the current
# directory are inserted near the top of the body when that file is readable.
#
# Dies if $page cannot be opened for writing or cannot be closed cleanly.
sub printmenu {
    # also get it to put %extras in -- extras should be a hash similar to %inverted
    my ( $self, $page, $extratitle, @extras ) = @_;
    my %sortedindex = $self->sortedwikiindex();

    # header.html is optional: when it cannot be read the page is still
    # produced, just without the extra header markup.
    my $header = '';
    if ( open( my $header_fh, '<', 'header.html' ) ) {
        local $/;
        $header = <$header_fh>;
        $header = '' unless defined $header;
        close($header_fh);
    }

    # Three-argument open with a lexical handle: $page can no longer alter
    # the open mode, and no package-global handle is shared with other subs.
    open( my $out, '>', $page ) || die( "($page): cannot open file: " . $! );

    print {$out} "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
    print {$out} "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n";
    print {$out} "<title>Index</title>\n<link rel=stylesheet href=\"style.css\" type=\"text/css\"> \n</head>\n<body>\n";
    print {$out} "<?php include('header.inc');?>";
    print {$out} "$header\n";
    print {$out} "<div id=\"column-content\">";

    my $incremental = 0;
    for my $key ( sort keys %sortedindex ) {

        # The counter advances even for skipped categories, so the ids of
        # the emitted <div>s may have gaps.
        $incremental++;

        # put in categories you wish to exclude
        next if $key =~ /Exclude/;
        next if $key =~ /^Category$/;

        my $keytoshow = $key;
        $keytoshow =~ s/Category\://g;
        $keytoshow = $self->makepretty($keytoshow);
        print {$out} "\n<div id=\"inc$incremental\"><h3>$keytoshow</h3>\n <p id=\"incs$incremental\">";

        for my $key2 ( sort keys %{ $sortedindex{$key} } ) {
            next if $key2 =~ /rint_All/;
            next if $key2 =~ /_Context/;

            my $key2toshow = $self->makepretty($key2);
            print {$out} "<a href=\"" . $key2 . "." . $self->extension() . "\">$key2toshow</a>\n";
            print {$out} "<br/>\n";
        }
        print {$out} "</p></div>\n";
    }

    if ( $extratitle && @extras ) {
        $incremental++;
        print {$out} "\n<div id=\"inc$incremental\"><h3>$extratitle</h3>\n<p id=\"incs$incremental\">";
        foreach my $key3 (@extras) {
            print {$out} "\n<a href=\"" . $key3 . "." . $self->extension() . "\">" . $self->makepretty( $self->urldecode($key3) ) . "</a>\n";
            print {$out} "<br/>\n";
        }
        print {$out} "\n</p></div>\n";
    }

    print {$out} "\n</div>\n<?php include('footer.inc'); ?></body></html>";

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($out) || die( "($page): cannot close file: " . $! );
    return;
}
sub getwikiwords {
my ($self,$uri)=@_;
$self->seturi($uri) if defined($uri);
my @wikiwords;
my $browser=LWP::UserAgent->new();
my $content = $browser->get($uri);
if($content->{_rc} eq "200"){
my $theuri= $content->{_request}->{_uri};
$theuri=~/^(.*)\//;
$theuri=$1."/";
$self->seturi($theuri);
#print "URI: $theuri";
$content=$browser->get($theuri."Special:Allpages");
my @lines=split(/\/Special:Allpages\//,$content->{_content});
lib/Mediawiki/Spider.pm view on Meta::CPAN
$temptest=$word;
$temptest=~s/\:/\-/g;
if($is_wikiword{$word} || $is_wikiword{$temptest}){ # we already did this word (we got the wikiwords from special::allpages)!
print "Ignoring $word (already done)\n";
# return;
} elsif($word=~/http\:\/\/(.*)/){ # no sucking the whole interweb, please!
print "Ignoring $word (inappropriate) \n";
} elsif($word=~/(.*)\.(\w\w|\w\w\w)\/(.*)/){ # thingy.wossname.ac.uk/something?
print "Ignoring $word (inappropriate) \n";
} elsif($word=~/Mediawiki\:/){
} elsif($word=~/Special\:/){
} else {
sleep 3;
print "Looking at $word (do)\n";
push(@wikiwords,$word); # is this the right way round?
$is_wikiword{$word}=1;
$self->wikiwords(@wikiwords); # add that back to the collective 'dealt with' list
my $text=$extractor->gethtml($uri.$word,"tagid=content");
$text=~s/\<table class="wikitable"(.*?)\<\/table\>//;
#$text=~s/<div class="printfooter"(.*?)\<\/div\>//;
my @rawcategories;
if($self->extension()!=""){
my $ext=$self->extension();
$text=~s/\"\/$uriextension\/([0-9A-z\-\_\:\%\&\.\,\;\+\#]+)/\"$1\.$ext/g;
@rawcategories=split(/href=\"([0-9A-z\-\_\:\%\&\.\,\;\+\#]+)\.$ext/,$text);
} else {
$text=~s/\"\/$uriextension\/([0-9A-z\-\_\:\%\&\.\,\;\+\#]+)/\"$1\.html/g;
@rawcategories=split(/href=\"([0-9A-z\-\_\:\%\&\.\,\;\+\#]+)\.html/,$text);
}
if(!$#rawcategories<1){
foreach my $category (@rawcategories) {
# in page $word we found categories @rawcategories
$category=~/(^[0-9A-Za-z\-\_\:\%\&\.\,\;\+\#]+)$/;
if(!$1 eq ""){
#print "Considering category $1\n";
push(@categories,$1);
my $topush=$1;
if($topush=~/Category/ && !$word=~/Category/){
print "Pushing $topush\n";
$wikiindex{$word}->{$topush}=1;
$self->wikiindex(%wikiindex);
}
} # check this bit for safety - it may well be possible to craft dangerous wikiwords...
}
$text=~s/href=\"Category:([0-9A-z\-\_\%\&\.\,\;\+\#]+)/href=\"Category-$1/g;
$word=~s/\:/\-/g;
# if page content contains noinclude tag, don't include it
if($text=~/Category:Exclude/){
print "Not printing $word (excluded)\n";
} else {
$text=~s/\[<a href=(.*?)\W+>edit<\/a>\]//g;
#$text=~s/\<table class="wikitable"(.*?)\<\/table\>//;
$text=~s/<div id="catlinks"(.*?)\<\/div\>//;
$text=~s/<div id="jump-to-nav">(.*?)\<\/div\>//;
open(FILEHANDLE, ">$folder/".$self->urldecode($word).".".$self->extension()) || die("($word): cannot open file: ". $!);
open (FILE2,"<header.html");
my @rawheader=<FILE2>;
my $header=join('',@rawheader);
close(FILE2);
print FILEHANDLE "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
print FILEHANDLE "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<title>$word</title>\n<link rel=stylesheet href=\"style.css\" type=\"text/css\"/>\n </head>\n<body>\n";
print FILEHANDLE "\n<?php include('header.inc'); ?>\n";
print FILEHANDLE "$header\n$text\n";
print FILEHANDLE "\n<?php include('footer.inc'); ?>\n";
print FILEHANDLE "</body></html>"; #sleep 7; #don't go mad if not using this on own site!
close(FILEHANDLE);
}
}
}
}
my %saw;
undef %saw;
my @out = grep(!$saw{$_}++, @categories);
@categories=@out;
my @finalcategories;
%is_wikiword= ();
for (@wikiwords) { $is_wikiword{$_} = 1 }
for(@categories){
if(!$is_wikiword{$_}){
push(@finalcategories,$_);
}
}
# have to compare @categories and @wikiwords
if($#finalcategories>0){
# $self->do_wikisuck($folder,$makecategories,@finalcategories);
# no need to actually recurse for this task, it appears... but nonetheless
print "Left to do:".Data::Dumper->Dump([@finalcategories])."\n";
print Data::Dumper->Dump([%wikiindex]);
}
}
# Fetch every page named by wikiwords() and write it to $folder as a flat
# HTML file called "<word>.<extension>".
#
# Arguments:
#   $folder         - directory the page files are written into
#   $makecategories - when true, the link targets collected from the pages
#                     are handed on to do_wikisuck() afterwards
#
# For each word the "content" element of <base uri><word> is fetched through
# HTML::Extract, internal links are rewritten to "<name>.html", and the
# link targets found are collected. Targets containing "Category" are also
# recorded per page and stored through wikiindex(). Redirect pages and pages
# mentioning "Category:Exclude" are not written. The contents of header.html
# in the current directory are inserted into each page when readable.
#
# Dies if a page file cannot be opened for writing or closed cleanly.
sub makeflatpages {
    ## make this thing recursive tomorrow...
    my ( $self, $folder, $makecategories ) = @_;
    my @wikiwords = $self->wikiwords();

    # print "Wikiwords".Data::Dumper->Dump([@wikiwords])."\n";
    my $extractor = HTML::Extract->new();
    my $uri       = $self->seturi();

    # Last path segment of the base URI; it prefixes internal links in the
    # fetched HTML. Only use the capture when the match actually succeeded.
    my $uriextension = '';
    if ( $uri =~ /(.*)\/(.*)\// ) {
        $uriextension = $2;
    }

    # header.html does not change during a run, so read it once. It is
    # optional: pages are written without it when it cannot be read.
    my $header = '';
    if ( open( my $header_fh, '<', 'header.html' ) ) {
        local $/;
        $header = <$header_fh>;
        $header = '' unless defined $header;
        close($header_fh);
    }

    my @categories;
    my %wikiindex;

    # @wikiwords=('Technical_Frameworks_Context');
    foreach my $word (@wikiwords) {
        if ( $word =~ /http\:\/\/(.*)/ ) {    # no sucking the whole interweb, please!
            print "Looking at $word (ignore) \n";
            next;
        }

        # get page, collect categories...
        sleep 3;    # be polite to the server between requests
        print "Looking at $word (get page) \n";
        my $text = $extractor->gethtml( $uri . $word, "tagid=content" );
        if ( $text =~ /\<div\ id=\"contentSub\">\(Redirected from/ ) {
            print "Don't want this word (Is redirect)\n";
            next;
        }

        $text =~ s/\<table class="wikitable"(.*?)\<\/table\>//;

        # \Q...\E: the path segment is literal text, not a pattern.
        # NOTE(review): the A-z range also matches [ \ ] ^ and `; left as
        # found, since narrowing it would change which links are rewritten.
        $text =~ s/\"\/\Q$uriextension\E\/([0-9A-z\-\_\:\%\&\.\,\;\+\#]+)/\"$1\.html/g;
        my @rawcategories = split( /href=\"([0-9A-z\-\_\:\%\&\.\,\;\+]+)\.html\"/, $text );

        # this buggers up when there are 0 categories.
        if ( $#rawcategories < 1 ) {
            print "Raw categories: " . $#rawcategories . "\n";
        }
        else {
            foreach my $category (@rawcategories) {

                # Use the capture only when this match succeeded; after a
                # failed match $1 still holds an earlier, unrelated capture.
                next unless $category =~ /(^[0-9A-Za-z\-\_\:\%\&\.\,\;\+\#]+)$/;
                my $topush = $1;

                #print "Category is $topush\n";
                push( @categories, $topush );
                if ( $topush =~ /Category/ ) {

                    #print "Pushing $topush\n";
                    $wikiindex{$word}->{$topush} = 1;
                }
            }
        }

        if ( $text =~ /Category:Exclude/ ) {
            print "Not printing $word (excluded)\n";
            next;
        }

        # Do not have category: files... : in files is bad
        $text =~ s/href=\"Category:([0-9A-z\-\_\%\&\.\,\;\+\#]+)/href=\"Category-$1/g;

        # squelch the '[edit]' links
        $text =~ s/\[<a href=(.*?)\W+>edit<\/a>\]//g;
        $text =~ s/<div id="catlinks"(.*?)\<\/div\>//;
        $text =~ s/<div id="jump-to-nav">(.*?)\<\/div\>//;

        # Three-argument open with a lexical handle: the remotely supplied
        # page name cannot alter the open mode, and no package-global handle
        # is shared with other subs.
        my $outfile = "$folder/$word." . $self->extension();
        open( my $out, '>', $outfile ) || die( "cannot open file: " . $! );

        #print {$out} "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" />";
        print {$out} "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
        print {$out} "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>$word</title><link rel=stylesheet href=\"style.css\" type=\"text/css\"> </head><body>";
        print {$out} "\n<?php include('header.inc'); ?>\n";
        print {$out} "$header\n$text";
        print {$out} "\n<?php include('footer.inc'); ?>\n";
        print {$out} "</body></html>";

        # Buffered write errors only surface at close time.
        close($out) || die( "cannot close file: " . $! );

        #sleep 7; #don't go mad, eh?
    }

    # De-duplicate once, keeping the first occurrence of each name.
    my %saw;
    @categories = grep { !$saw{$_}++ } @categories;

    # print Data::Dumper->Dump([@categories])."\n";
    print Data::Dumper->Dump( [%wikiindex] );
    $self->wikiindex(%wikiindex);
    if ($makecategories) {
        $self->do_wikisuck( $folder, $makecategories, @categories );
    }
}
1;
__END__
# Below is stub documentation for your module. You'd better edit it!
=head1 NAME
Mediawiki::Spider - Perl extension for flat mirror of mediawikis
=head1 SYNOPSIS
use Mediawiki::Spider;
=head1 DESCRIPTION
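Essentially pretty simple: the spider fetches the pages of a MediaWiki site and writes each one to a flat HTML file, rewriting internal links so that they point at the mirrored copies. It can also build a category menu page from the categories it finds along the way.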
=head2 EXPORT
None by default.
=head1 SEE ALSO
There are many ways to achieve this aim; this is one of them. Others (such as applying XSL stylesheets to the MediaWiki XML export) would probably be cleaner.
=head1 AUTHOR
Emma Tonkin, E<lt>cselt@sourceforge.netE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2006 by Emma Tonkin
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.
( run in 1.612 second using v1.01-cache-2.11-cpan-39bf76dae61 )