EdgeExpressDB

 view release on metacpan or  search on metacpan

scripts/eedb_sync_entrezgene.pl  view on Meta::CPAN

  $entrez_source->url("http://www.ncbi.nlm.nih.gov/sites/entrez?db=gene");
  $entrez_source->store($eeDB);
  printf("Needed to create:: %s\n", $entrez_source->display_desc);
}
# Without an Entrez feature_source there is nothing to synchronize against:
# report the requested source name and bail out through usage(), which exits.
unless($entrez_source) {
  printf("error Entrez feature_source [%s]\n\n", $fsrc_name);
  usage();
}

# Fetch the feature_source with category "gene" and name
# "deprecated_entrez_gene" from $eeDB; when the lookup returns nothing,
# build one and store it.
# NOTE(review): presumably this is the source that deprecated genes are moved
# into -- confirm against deprecate_geneID().
my $deprecate_source = EEDB::FeatureSource->fetch_by_category_name($eeDB, "gene", "deprecated_entrez_gene");
unless($deprecate_source) {
  # Explicit class-method call rather than the indirect-object form
  # "new EEDB::FeatureSource", whose parsing depends on what else is visible
  # in the current package.
  $deprecate_source = EEDB::FeatureSource->new;
  $deprecate_source->category("gene");
  $deprecate_source->name("deprecated_entrez_gene");
  $deprecate_source->import_source("NCBI Entrez Gene");
  $deprecate_source->url("http://www.ncbi.nlm.nih.gov/sites/entrez?db=gene");
  $deprecate_source->store($eeDB);
  printf("Needed to create:: %s\n", $deprecate_source->display_desc);
}
# Safety net in case neither the fetch nor the creation produced an object.
unless($deprecate_source) {
  printf("error making [deprecated_entrez_gene] feature_source\n\n");
  usage();
}


# Banner: show the database URL and call display_info on the assembly and on
# both feature sources before any synchronization work starts.
printf("============\n");
printf("eeDB:: %s\n", $eeDB->url);
$assembly->display_info;
$entrez_source->display_info;
$deprecate_source->display_info;
printf("============\n");

# A specific entrez ID restricts the run to that single gene; otherwise the
# full update_from_webservice() pass is executed.
if(defined($entrez_id)) {
  fetch_gene_from_webservice($entrez_id);
} else {
  update_from_webservice();
}
#fetch_gene_from_webservice(100128520); #19
#100128520

# Calling without a gene ID flushes whatever is still buffered.
fetch_gene_from_webservice();#flush

# Guard the percentage: when no genes were counted the original expression
# died with "Illegal division by zero" before the stats line was printed.
printf("MOVED stats : %d / %d = %1.2f%%\n",
       $locmove_count, $genecount,
       ($genecount ? 100.0*$locmove_count/$genecount : 0));

# NOTE(review): the script exits with status 1 even after a normal run; kept
# as-is because existing wrappers may rely on it -- confirm whether 0 is meant.
exit(1);

#########################################################################################

# usage()
#   Writes the command-line help text (synopsis, one line per option, and the
#   version line) to STDOUT, then terminates the script with exit status 1.
#   Takes no arguments and never returns.
sub usage {
  my @help_lines = (
    'eedb_sync_entrezgene.pl [options]',
    '  -help              : print this help',
    '  -url <url>         : URL to database',
    '  -assembly <name>   : name of species/assembly (eg hg18 or mm9)',
    '  -entrezID <id>     : synchronize specific entrez gene',
    'eedb_sync_entrezgene.pl v1.0',
  );

  # One print per line, each terminated by a newline.
  foreach my $help_line (@help_lines) {
    print $help_line, "\n";
  }

  exit(1);
}


##################################################################
#
# new XML webservice based methods
#
##################################################################

sub update_from_webservice {
  my $url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?".
            "db=gene&retmax=100000";
  $url .= sprintf("&term=%d[taxid]%%20AND%%20gene_all[filter]", $assembly->taxon_id);
  printf("URL: %s\n", $url);
  my $tpp = XML::TreePP->new();
  my $tree = $tpp->parsehttp( GET => $url );
  #print $tree, "\n";

  my $search_count = $tree->{'eSearchResult'}->{'Count'};
  printf("search returned %d genes\n", $search_count);
  my $id_list = $tree->{'eSearchResult'}->{'IdList'}->{'Id'};
  #printf("idList %s\n", $id_list);

  my $geneIDs =[];
  if($id_list =~ /ARRAY/) { $geneIDs = $id_list; } 
  else { $geneIDs = [$id_list]; }

  #should maybe do something here to filter the list 
  #into:: new, deprecated, and update
  my $sql = "select sym_value from symbol join feature_2_symbol using (symbol_id) ".
     "JOIN feature using(feature_id) ".
     "WHERE sym_type='EntrezID' AND feature_source_id=?";
  my $loadedEntrezIDs = MQdb::MappedQuery->fetch_col_array($eeDB, $sql, $entrez_source->id);
  my $eIDhash = {};
  my $newCount=0;
  my $updateCount=0; 
  my $deprecateCount=0;

  foreach my $geneID (@$loadedEntrezIDs) { $eIDhash->{$geneID}='dbonly'; }
  foreach my $geneID (@$id_list) {
    if($eIDhash->{$geneID} and ($eIDhash->{$geneID} eq 'dbonly')) { 
      $updateCount++; 
      $eIDhash->{$geneID}='update'; 
    }
    else { $newCount++; $eIDhash->{$geneID}='new'; }
  }
  for my $geneID (keys(%$eIDhash)) {
    $deprecateCount++ if($eIDhash->{$geneID} eq 'dbonly');
  }
  printf("%d new genes to add\n", $newCount);
  printf("%d genes to update check\n", $updateCount);
  printf("%d genes to deprecate\n", $deprecateCount);
  sleep(5);

  #first add new genes
  for my $geneID (keys(%$eIDhash)) {
    next unless($eIDhash->{$geneID} eq 'new');
    fetch_gene_from_webservice($geneID);
  }
  fetch_gene_from_webservice();  #flushes the buffer

  #then the deprecates
  for my $geneID (keys(%$eIDhash)) {
    next unless($eIDhash->{$geneID} eq 'dbonly');
    deprecate_geneID($geneID);
  }

  #then the updates
  unless($skip_update) {
    for my $geneID (keys(%$eIDhash)) {



( run in 1.519 second using v1.01-cache-2.11-cpan-39bf76dae61 )