Alvis-Convert

 view release on metacpan or  search on metacpan

bin/alvis_wikipedia_add_cats  view on Meta::CPAN

my $ScoreType='wikipedia Fundamental two top levels';
my $ListF=undef;
my $OutDir='.';
my $Root='fundamental';

GetOptions('help|?'=>\$PrintHelp, 
	   'man'=>\$PrintManual,
	   'warnings!'=>\$Warnings,
	   'out-dir=s'=>\$OutDir,
	   'alvis-suffix=s'=>\$Suffix,
	   'dump-file=s'=>\$DumpF,
	   'root=s'=>\$Root,
	   'score-type=s'=>\$ScoreType,
	   'method=s'=>\$MethodName,
	   'category-list-file=s'=>\$ListF
	   ) or 
    pod2usage(2);

pod2usage(1) if $PrintHelp;
pod2usage(-exitstatus => 0, -verbose => 2) if $PrintManual;
pod2usage(1) if (@ARGV!=1); 

if ($MethodName eq '2toplevels')
{
    $Method=$Alvis::Wikipedia::CatGraph::TWO_TOP_LEVELS;
}
elsif ($MethodName eq 'given')
{
    $Method=$Alvis::Wikipedia::CatGraph::GIVEN_LIST;
}

my $Dir=shift @ARGV;
my %Seen=();
my $Scores;

$|=1;

my $Parser=Alvis::Wikipedia::WikitextParser->new();
if (!defined($Parser))
{
    die("Instantiating Alvis::Wikipedia::WikitextParser failed.\n");
}
my $G=Alvis::Wikipedia::CatGraph->new(method=>$Method,
				      root=>$Root);
if (!defined($G))
{
    die("Instantiating Alvis::Wikipedia::CatGraph failed.\n");
}

print STDERR "Loading the graph....\r";
if (!$G->load_graph($DumpF))
{
    die("Loading the graph dump failed: " . $G->errmsg());
}
print STDERR "\n";

my $List;
if ($ListF)
{
    print STDERR "Getting the list of categories....\r";
    open(L,"<:utf8",$ListF) || die("Unable to open \"$ListF\"");;
    while (my $l=<L>)
    {
	chomp $l;
	push(@$List,$l);
    }
    close(L);
    print STDERR "\n";
}

print STDERR "Building the path length map....\r";
if (!$G->build_path_length_map($List))
{
    die("Building the path length map failed. " . 
	$G->errmsg());
}
print STDERR "\n";

system("mkdir -p $OutDir");
if (!&_add_cats_to_collection($Dir,{alvisSuffix=>$Suffix}))
{
    die("Adding categories to the collection failed. " . $G->errmsg());
}

sub _parse_entries
{
    my $entries=shift;
    my $options=shift;
    my $alvis_entries=shift;
    
    for my $e (@$entries)
    {
	if ($Seen{$e})
	{
	    next;
	}
	
	$Seen{$e}=1;
	if (-d $e)
	{
	    my @entries=glob("$e/*");;
	    &_parse_entries(\@entries,$options,$alvis_entries);
	    next;
	}

	my ($basename,$suffix);
	if ($e=~/^(.*)\.([^\.]+)$/)
	{
	    $basename=$1;
	    $suffix=$2;
	}
	else
	{
	    warn "Skipping non-suffixed non-directory entry \"$e\"." if 
		$Warnings;
	    next;
	}
	
	if ($suffix eq $options->{alvisSuffix})
	{
	    $alvis_entries->{$basename}{alvisF}=$e;
	}
    }
}

sub _add_cats_to_collection
{
    my $root_dir=shift;
    my $options=shift;

    my @entries=glob("$root_dir/*");
    my %alvis_entries=();
    %Seen=();
    print "Parsing the source directory entries...\r";
    &_parse_entries(\@entries,$options,\%alvis_entries);	
    print "                                       \r";

    for my $base_name (keys %alvis_entries)
    {
	my $alvisXML;
	
	if (exists($alvis_entries{$base_name}{alvisF}))
	{
	    my $f=$alvis_entries{$base_name}{alvisF};
	    open(W,"<:utf8",$f) || die("Unable to open \"$f\"");
	    my $out=$f;
	    $out=~s/^.*\///sgo;
	    open(OUT,">:utf8","$OutDir/$out") || 
		die("Unable to open \"$OutDir/$out\"");

	    my $new_rec="";
	    my $N=1;
	    while (my $record=&_get_next_rec(*W))
	    {
		my $cats=&_get_cats($record);
		if (!defined($cats))
		{
		    warn "Getting the categories of record #$N in file " .
			"\"$f\" failed."; 
		    next;
		}

		$Scores=$G->get_relative_scores($cats);
		if (!defined($Scores))
		{
		    warn "Getting the relative scores of record #$N in file " .
			"\"$f\" failed."; 
		    next;
		}
		
		$new_rec=&_output_rec_with_new_scores($record,$Scores);
		if (!defined($new_rec))
		{
		     warn "Getting the new, category-added version " .
			 "of record #$N in file " .
			 "\"$f\" failed."; 
		    next;
		}

		print OUT $new_rec;
		print STDERR "$N\r";
		$N++;
	    }
	    close(W);
	    print STDERR "\n";
	    close(OUT);
	}
    }

    return 1;
}

sub _output_rec_with_new_scores
{
    my $rec=shift;

    my $new_rec="";
    my @lines=split(/\n/,$rec);
    for my $l (@lines)
    {
	if ($l=~/<\/relevance>/)
	{
	    $new_rec.="      <scoreset type=\"$ScoreType\">\n";
	    for my $c (sort _c_score_cmp keys %$Scores)
	    {
		my $score=sprintf("%.1f",$Scores->{$c});
		$new_rec.="          <score topicId=\"$c\">$score</score>\n";
	    }



( run in 1.025 second using v1.01-cache-2.11-cpan-140bd7fdf52 )