DDG

 view release on metacpan or  search on metacpan

lib/DDG/Meta/Data.pm  view on Meta::CPAN

package DDG::Meta::Data;
our $AUTHORITY = 'cpan:DDG';
# ABSTRACT: Metadata functions for instant answers
$DDG::Meta::Data::VERSION = '1018';
use JSON::XS qw'decode_json encode_json';
use Path::Class;
use File::ShareDir 'dist_file';
use LWP::UserAgent;
use File::Copy::Recursive 'pathmk';
use List::Util qw( all any );
use File::Copy;

use strict;

sub debug { 0 }
use if debug, 'Data::Printer';

no warnings 'uninitialized';

# $ia_metadata => {
#    id => ...
#    module => ...
#    sharedir => ...
# }
my %ia_metadata;

# Only build metadata once.
unless(%ia_metadata){

    my $tmpdir = $ENV{METADATA_TMP_DIR} || '/var/tmp/ddg-metadata';

    my $mdir = "$tmpdir-$>";
    unless(-d $mdir){
        pathmk $mdir or die "Failed to mkdir $mdir: $!";
    }

    debug && warn "Processing metadata";

    my $f = "$mdir/metadata.json.bz2";
    my $tmp_bak = "$f.bak";
    my @timestamps = (stat $f)[8,9];
    if(-e $f){
        copy $f, $tmp_bak or die "Failed to copy $f to $tmp_bak: $!";;
    }
    utime @timestamps, $tmp_bak;
    my $restore_backup = sub {
        move $tmp_bak, $f or die "Failed to move backup $tmp_bak to $f: $!";
        utime @timestamps, $f;
    };

    unless($ENV{NO_METADATA_DOWNLOAD}){
        my $ua = LWP::UserAgent->new;
        $ua->timeout(5);
        $ua->default_header('Accept-Encoding' => scalar HTTP::Message::decodable());
        my $res = $ua->mirror('http://ddg-community.s3.amazonaws.com/metadata/repo_all.json.bz2', $f);
        unless($res->is_success || $res->code == 304){
            debug && warn "Failed to download metdata: " . $res->status_line . " .  Restoring backup from $tmp_bak";
            $restore_backup->();
        }
    }

    my $metadata;
    while(!$metadata){
        eval {
            # Decompress to command-line
            open my $fh, "bzip2 -dc $f |" or die "Failed to open file $f: $!";
            # slurp into a single string
            my $json = do { local $/;  <$fh> };
            $metadata = decode_json($json);
        }
        or do {
            if(-e $tmp_bak){
                debug && warn "Failed to process metadata $f: $@. Restoring backup from $tmp_bak";
                $restore_backup->();
            }
            else{
                die "Failed to to process metadata from $f: $@";
            }
        };
    }
    unlink $tmp_bak if -e $tmp_bak;

    # { "<id>": {
    #     "id": " "
    #     "signal" : " "
    #     ....
    # }

    IA: while (my ($id, $ia) = each %{ $metadata }) {

        # 20150502 (zt) Can't filter like this yet as some tests depend on non-live IA metadata
        #next unless $ia->{status} eq 'live';

        # warn if we run into a duplicate id.  These should be unique within *and*
        # across all IA types
        if( $ia_metadata{id}{$id} ){
            warn "Duplicate ID for IA with ID: $id";
        }

        # Perl modules aren't strictly required, e.g. Fatheads and Longtails
        # It's ok not to have every IA represented in the by_module view
        # We may need to update the parameter to _js_callback_name in the future
        if(my $perl_module = $ia->{perl_module}){
            #add new ia to ia_metadata{module}. Multiple ias per module possible
            push @{$ia_metadata{module}{$perl_module}}, $ia;
            $ia->{js_callback_name} = _js_callback_name($perl_module);
        }

        # Clean up/set some values
        $ia->{signal_from} ||= $ia->{id};

        #add new ia to ia_metadata{id}
        $ia_metadata{id}{$id} = $ia;
        # by language for multilang wiki
        if($ia->{repo} eq 'fathead'){
            my $source = $ia->{src_id};
            # by source number for fatheads
            $ia_metadata{fathead_source}{$source} = $ia;
            # by language for multi language wiki sources
            # check that language is set since most fatheads don't have a language
            if( my $lang = $ia->{src_options}{language}){
                # By default use current source
                my $want_src = $source;
                if(exists $ia_metadata{fathead_lang}{$lang}){
                    my $prev_ia = $ia_metadata{fathead_source}{$ia_metadata{fathead_lang}{$lang}};
                    # Skip setting lang to source if existing src_id is lower
                    $want_src = 0 if $prev_ia->{src_id} < $ia->{src_id};
                }
                $ia_metadata{fathead_lang}{$lang} = $want_src if $want_src;
            }
            my $min_length = $ia->{src_options}{min_abstract_length};
            $ia_metadata{fathead_min_length}{$source} = $min_length if $min_length;
        }
    }



( run in 0.877 second using v1.01-cache-2.11-cpan-99c4e6809bf )