Alvis-Convert
view release on metacpan or search on metacpan
bin/alvis_wikipedia_add_cats view on Meta::CPAN
my $ScoreType='wikipedia Fundamental two top levels';
my $ListF=undef;
my $OutDir='.';
my $Root='fundamental';
GetOptions('help|?'=>\$PrintHelp,
'man'=>\$PrintManual,
'warnings!'=>\$Warnings,
'out-dir=s'=>\$OutDir,
'alvis-suffix=s'=>\$Suffix,
'dump-file=s'=>\$DumpF,
'root=s'=>\$Root,
'score-type=s'=>\$ScoreType,
'method=s'=>\$MethodName,
'category-list-file=s'=>\$ListF
) or
pod2usage(2);
pod2usage(1) if $PrintHelp;
pod2usage(-exitstatus => 0, -verbose => 2) if $PrintManual;
pod2usage(1) if (@ARGV!=1);
if ($MethodName eq '2toplevels')
{
$Method=$Alvis::Wikipedia::CatGraph::TWO_TOP_LEVELS;
}
elsif ($MethodName eq 'given')
{
$Method=$Alvis::Wikipedia::CatGraph::GIVEN_LIST;
}
my $Dir=shift @ARGV;
my %Seen=();
my $Scores;
$|=1;
my $Parser=Alvis::Wikipedia::WikitextParser->new();
if (!defined($Parser))
{
die("Instantiating Alvis::Wikipedia::WikitextParser failed.\n");
}
my $G=Alvis::Wikipedia::CatGraph->new(method=>$Method,
root=>$Root);
if (!defined($G))
{
die("Instantiating Alvis::Wikipedia::CatGraph failed.\n");
}
print STDERR "Loading the graph....\r";
if (!$G->load_graph($DumpF))
{
die("Loading the graph dump failed: " . $G->errmsg());
}
print STDERR "\n";
my $List;
if ($ListF)
{
print STDERR "Getting the list of categories....\r";
open(L,"<:utf8",$ListF) || die("Unable to open \"$ListF\"");;
while (my $l=<L>)
{
chomp $l;
push(@$List,$l);
}
close(L);
print STDERR "\n";
}
print STDERR "Building the path length map....\r";
if (!$G->build_path_length_map($List))
{
die("Building the path length map failed. " .
$G->errmsg());
}
print STDERR "\n";
system("mkdir -p $OutDir");
if (!&_add_cats_to_collection($Dir,{alvisSuffix=>$Suffix}))
{
die("Adding categories to the collection failed. " . $G->errmsg());
}
sub _parse_entries
{
my $entries=shift;
my $options=shift;
my $alvis_entries=shift;
for my $e (@$entries)
{
if ($Seen{$e})
{
next;
}
$Seen{$e}=1;
if (-d $e)
{
my @entries=glob("$e/*");;
&_parse_entries(\@entries,$options,$alvis_entries);
next;
}
my ($basename,$suffix);
if ($e=~/^(.*)\.([^\.]+)$/)
{
$basename=$1;
$suffix=$2;
}
else
{
warn "Skipping non-suffixed non-directory entry \"$e\"." if
$Warnings;
next;
}
if ($suffix eq $options->{alvisSuffix})
{
$alvis_entries->{$basename}{alvisF}=$e;
}
}
}
sub _add_cats_to_collection
{
my $root_dir=shift;
my $options=shift;
my @entries=glob("$root_dir/*");
my %alvis_entries=();
%Seen=();
print "Parsing the source directory entries...\r";
&_parse_entries(\@entries,$options,\%alvis_entries);
print " \r";
for my $base_name (keys %alvis_entries)
{
my $alvisXML;
if (exists($alvis_entries{$base_name}{alvisF}))
{
my $f=$alvis_entries{$base_name}{alvisF};
open(W,"<:utf8",$f) || die("Unable to open \"$f\"");
my $out=$f;
$out=~s/^.*\///sgo;
open(OUT,">:utf8","$OutDir/$out") ||
die("Unable to open \"$OutDir/$out\"");
my $new_rec="";
my $N=1;
while (my $record=&_get_next_rec(*W))
{
my $cats=&_get_cats($record);
if (!defined($cats))
{
warn "Getting the categories of record #$N in file " .
"\"$f\" failed.";
next;
}
$Scores=$G->get_relative_scores($cats);
if (!defined($Scores))
{
warn "Getting the relative scores of record #$N in file " .
"\"$f\" failed.";
next;
}
$new_rec=&_output_rec_with_new_scores($record,$Scores);
if (!defined($new_rec))
{
warn "Getting the new, category-added version " .
"of record #$N in file " .
"\"$f\" failed.";
next;
}
print OUT $new_rec;
print STDERR "$N\r";
$N++;
}
close(W);
print STDERR "\n";
close(OUT);
}
}
return 1;
}
sub _output_rec_with_new_scores
{
my $rec=shift;
my $new_rec="";
my @lines=split(/\n/,$rec);
for my $l (@lines)
{
if ($l=~/<\/relevance>/)
{
$new_rec.=" <scoreset type=\"$ScoreType\">\n";
for my $c (sort _c_score_cmp keys %$Scores)
{
my $score=sprintf("%.1f",$Scores->{$c});
$new_rec.=" <score topicId=\"$c\">$score</score>\n";
}
( run in 1.025 second using v1.01-cache-2.11-cpan-140bd7fdf52 )