Bio-MUST-Apps-Physeter

 view release on metacpan or  search on metacpan

bin/physeter.pl  view on Meta::CPAN

    }

    return \@subsets;
}

sub parse_report {
    my $infile    = shift;
    my $query_org = shift;
    my $taxids    = shift // ();

    # setup boolean filter for current database subset (if any)
    my %unwanted = map { $_ => 1 } @{$taxids};

    my $report = Table->new( file => $infile );

    tie my %lca_for, 'Tie::IxHash';

    my $curr_query = q{};
    my $best_score;
    my @relatives;

    my $method = 'next_hit';

    HIT:
    while (my $hit = $report->$method) {

        # at each new query...
        if ($hit->query_id ne $curr_query) {

            # store query LCA if enough relatives
            # TODO: make Taxonomy:: class for collections of lineages
            if (@relatives >= $ARGV_tax_min_hits) {
                $lca_for{$curr_query} = {
                    taxon => scalar $tax->compute_lca(
                        $ARGV_tax_min_lca_freq, @relatives),
                    rel_n => scalar @relatives,
                };
            }

            # prepare analysis of new query
            $curr_query = q{};
            $best_score = undef;
            @relatives = ();

            $method = 'next_hit';
        }

        # skip extra (unused) relatives
        if (@relatives >= $ARGV_tax_max_hits) {
            $method = 'next_query';
            next HIT;
        }

        # skip weak hits (classical mode)
        next HIT if $hit->hsp_length       < $ARGV_tax_min_len;
        next HIT if $hit->percent_identity < $ARGV_tax_min_ident;
        next HIT if $hit->bit_score        < $ARGV_tax_min_score;

        # fetch hit taxonomy and org
        # optimized code (requires taxon_id|accession seq_ids)
        my $taxon_id = ( split m{\|}xms, $hit->hit_id )[0];

        # k-folds mode (skip hits from current database subset)
        next HIT if $ARGV_kfold && $unwanted{$taxon_id};

        my @taxonomy = $tax->get_taxonomy($taxon_id);
        # regular code (more general)
#       my @taxonomy = $tax->get_taxonomy_from_seq_id( $hit->hit_id );
        my $hit_org = $taxonomy[-1];

        # skip self hits (corresponding to query_org)
        next HIT if $hit_org eq $query_org;

        # skip weak hits (MEGAN-like mode)
        $best_score //= $hit->bit_score;
        next HIT if $hit->bit_score < $ARGV_tax_score_mul * $best_score;

        # retain hit taxonomy as relative
        $curr_query = $hit->query_id;
        push @relatives, \@taxonomy;
    }

    # store last query LCA if enough relatives
    if ($curr_query && @relatives >= $ARGV_tax_min_hits) {
        $lca_for{$curr_query} = {
            taxon => scalar $tax->compute_lca(
                $ARGV_tax_min_lca_freq, @relatives),
            rel_n => scalar @relatives,
        };
    }

    return \%lca_for;
}

sub auto_detect {
    my $lca_for = shift;

    # TODO: implement and compare the opposite approach where we first label
    # then determine the exp_tax...

    # count LCA lineage occurrences (i.e., sort | uniq -c)
    my %count_for = count_by { join q{;}, @{ $_->{taxon} } } values %{$lca_for};

    # compute LCA proportions
    my @taxa = sort {
        $count_for{$b} <=> $count_for{$a}
    } keys %count_for;

    my $exp_tax = $labeler->classify(
        $taxa[0], { greedy => $ARGV_greedy_taxa }
    );

    return $exp_tax;
}

sub tabulate_lcas {
    my $lca_for = shift;
    my $exp_tax = shift;
    my $seq_n   = shift;

    my $self_n    = 0;



( run in 1.872 second using v1.01-cache-2.11-cpan-483215c6ad5 )