EdgeExpressDB
view release on metacpan or search on metacpan
scripts/eedb_sync_entrezgene.pl view on Meta::CPAN
$entrez_source->url("http://www.ncbi.nlm.nih.gov/sites/entrez?db=gene");
$entrez_source->store($eeDB);
printf("Needed to create:: %s\n", $entrez_source->display_desc);
}
# Main flow: make sure both feature sources exist, report the configuration,
# run the synchronization, then print the summary statistics.

# Without an Entrez feature source there is nothing to sync against:
# report the problem and bail out through usage() (which exits).
unless($entrez_source) { printf("error Entrez feature_source [%s]\n\n", $fsrc_name); usage(); }

# Look up the "deprecated_entrez_gene" feature source (category "gene");
# create and store it when the lookup returns nothing.
my $deprecate_source = EEDB::FeatureSource->fetch_by_category_name($eeDB, "gene", "deprecated_entrez_gene");
unless($deprecate_source) {
  # Direct method call instead of the ambiguous indirect-object form
  # "new EEDB::FeatureSource".
  $deprecate_source = EEDB::FeatureSource->new;
  $deprecate_source->category("gene");
  $deprecate_source->name("deprecated_entrez_gene");
  $deprecate_source->import_source("NCBI Entrez Gene");
  $deprecate_source->url("http://www.ncbi.nlm.nih.gov/sites/entrez?db=gene");
  $deprecate_source->store($eeDB);
  printf("Needed to create:: %s\n", $deprecate_source->display_desc);
}
unless($deprecate_source) { printf("error making [deprecated_entrez_gene] feature_source\n\n"); usage(); }

# Show which database, assembly and feature sources this run operates on.
printf("============\n");
printf("eeDB:: %s\n", $eeDB->url);
$assembly->display_info;
$entrez_source->display_info;
$deprecate_source->display_info;
printf("============\n");

# A specific entrez ID restricts the run to that one gene; otherwise the
# full update_from_webservice() pass is run.
if(defined($entrez_id)) {
  fetch_gene_from_webservice($entrez_id);
} else {
  update_from_webservice();
}
#fetch_gene_from_webservice(100128520); #19
#100128520

# Calling without an ID flushes whatever is still buffered.
fetch_gene_from_webservice();#flush

# Guard the percentage: when no genes were processed $genecount is 0 and the
# unguarded division would abort the script with "Illegal division by zero"
# right before the final report.
printf("MOVED stats : %d / %d = %1.2f%%\n", $locmove_count, $genecount,
       ($genecount ? 100.0*$locmove_count/$genecount : 0));

# Successful completion: exit status 0. Status 1 is what usage() returns on
# error, so a clean run must not share it.
exit(0);
#########################################################################################
# usage
#
# Prints the command-line help for this script to STDOUT and terminates the
# process with exit status 1. Takes no arguments and never returns.
sub usage {
  my @help_lines = (
    "eedb_sync_entrezgene.pl [options]\n",
    " -help : print this help\n",
    " -url <url> : URL to database\n",
    " -assembly <name> : name of species/assembly (eg hg18 or mm9)\n",
    " -entrezID <id> : synchronize specific entrez gene\n",
    "eedb_sync_entrezgene.pl v1.0\n",
  );

  # One print call per help line, in order.
  foreach my $help_line (@help_lines) {
    print $help_line;
  }

  exit(1);
}
##################################################################
#
# new XML webservice based methods
#
##################################################################
sub update_from_webservice {
my $url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?".
"db=gene&retmax=100000";
$url .= sprintf("&term=%d[taxid]%%20AND%%20gene_all[filter]", $assembly->taxon_id);
printf("URL: %s\n", $url);
my $tpp = XML::TreePP->new();
my $tree = $tpp->parsehttp( GET => $url );
#print $tree, "\n";
my $search_count = $tree->{'eSearchResult'}->{'Count'};
printf("search returned %d genes\n", $search_count);
my $id_list = $tree->{'eSearchResult'}->{'IdList'}->{'Id'};
#printf("idList %s\n", $id_list);
my $geneIDs =[];
if($id_list =~ /ARRAY/) { $geneIDs = $id_list; }
else { $geneIDs = [$id_list]; }
#should maybe do something here to filter the list
#into:: new, deprecated, and update
my $sql = "select sym_value from symbol join feature_2_symbol using (symbol_id) ".
"JOIN feature using(feature_id) ".
"WHERE sym_type='EntrezID' AND feature_source_id=?";
my $loadedEntrezIDs = MQdb::MappedQuery->fetch_col_array($eeDB, $sql, $entrez_source->id);
my $eIDhash = {};
my $newCount=0;
my $updateCount=0;
my $deprecateCount=0;
foreach my $geneID (@$loadedEntrezIDs) { $eIDhash->{$geneID}='dbonly'; }
foreach my $geneID (@$id_list) {
if($eIDhash->{$geneID} and ($eIDhash->{$geneID} eq 'dbonly')) {
$updateCount++;
$eIDhash->{$geneID}='update';
}
else { $newCount++; $eIDhash->{$geneID}='new'; }
}
for my $geneID (keys(%$eIDhash)) {
$deprecateCount++ if($eIDhash->{$geneID} eq 'dbonly');
}
printf("%d new genes to add\n", $newCount);
printf("%d genes to update check\n", $updateCount);
printf("%d genes to deprecate\n", $deprecateCount);
sleep(5);
#first add new genes
for my $geneID (keys(%$eIDhash)) {
next unless($eIDhash->{$geneID} eq 'new');
fetch_gene_from_webservice($geneID);
}
fetch_gene_from_webservice(); #flushes the buffer
#then the deprecates
for my $geneID (keys(%$eIDhash)) {
next unless($eIDhash->{$geneID} eq 'dbonly');
deprecate_geneID($geneID);
}
#then the updates
unless($skip_update) {
for my $geneID (keys(%$eIDhash)) {
( run in 1.519 second using v1.01-cache-2.11-cpan-39bf76dae61 )