From d0e556a2681525cdc31ea791d1243428fae430fd Mon Sep 17 00:00:00 2001
From: Monika Komorowska <mk8@sanger.ac.uk>
Date: Mon, 16 Jan 2012 23:20:16 +0000
Subject: [PATCH] use gene names from core instead if vega to populate
 alt_allele table

---
 misc-scripts/alt_alleles/alt_alleles.pl | 269 ++++++------------------
 1 file changed, 66 insertions(+), 203 deletions(-)

diff --git a/misc-scripts/alt_alleles/alt_alleles.pl b/misc-scripts/alt_alleles/alt_alleles.pl
index 36315d71d3..0d5d510a35 100644
--- a/misc-scripts/alt_alleles/alt_alleles.pl
+++ b/misc-scripts/alt_alleles/alt_alleles.pl
@@ -1,145 +1,92 @@
+#!/usr/bin/env perl
+
 use strict;
 use Bio::EnsEMBL::DBSQL::DBAdaptor;
 use Getopt::Long qw(:config pass_through);
 
 # (make sure api version is correct
 # Usage:
-# perl alt_alleles.pl -cpass XXXX > & human_release_63_alt_alleles
+# perl alt_alleles.pl -pass XXXX > & human_release_63_alt_alleles
 #
 #
 # long way
-# perl alt_alleles.pl -vhost ens-staging1 -vport 3306 -vdbname homo_sapiens_vega_63_37 -cdbname homo_sapiens_core_63_37 -chost ens-staging1 -cpass XXXX > & human_release_63_alt_alleles
+# perl alt_alleles.pl -dbname homo_sapiens_core_63_37 -host ens-staging1 -pass XXXX > & human_release_63_alt_alleles
 #
 
-my ($vhost, $vpass, $vport, $vdbname, $vuser, $chost, $cpass, $cport, $cdbname, $cuser);
+my ($host, $pass, $port, $dbname, $user);
 
 GetOptions(
-    'vuser=s'  => \$vuser,
-    'vpass=s'  => \$vpass,
-    'vhost=s'  => \$vhost,
-    'vport=i'  => \$vport,
-    'vdbname=s'       => \$vdbname,
-    'cuser=s'  => \$cuser,
-    'cpass=s'  => \$cpass,
-    'chost=s'  => \$chost,
-    'cport=i'  => \$cport,
-    'cdbname=s'       => \$cdbname);
-#
-# Connect to the vgea databse to get the alt allele data.
-#
+    'user=s'  => \$user,
+    'pass=s'  => \$pass,
+    'host=s'  => \$host,
+    'port=i'  => \$port,
+    'dbname=s'       => \$dbname);
 
-my $api_version = Bio::EnsEMBL::ApiVersion->software_version();
 
-if(!defined($vdbname)){
-  $vdbname = "homo_sapiens_vega_".$api_version."_37";
-}
 
-if(!defined($cdbname)){
-  $cdbname = "homo_sapiens_core_".$api_version."_37";
-}
+my $api_version = Bio::EnsEMBL::ApiVersion->software_version();
 
+if(!defined($dbname)){
+  $dbname = "homo_sapiens_core_".$api_version."_37";
+}
 
-#
 # Connect to the core database 
-#
-
-my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $chost||'ens-staging1',
-                                          -user => $cuser||'ensadmin',
-					  -pass => $cpass,
-                                          -species => "test",
-                                          -dbname => $cdbname||"homo_sapiens_core_63_37");
-
-#
-# get ensembl gene ids and vega stable ids from the core database
-# 
 
-my %vega_to_ens_id;
-my ($vega_stable_id, $gene_id);
+my $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $host||'ens-staging1',
+					  -port => $port || '3306',
+                                          -user => $user||'ensadmin',
+					  -pass => $pass,
+                                          -dbname => $dbname);
 
-my $sth =  $core_dba->dbc->prepare("select ensembl_id, display_label from object_xref join xref using(xref_id) join external_db using(external_db_id) where db_name = 'OTTG' and ensembl_object_type = 'Gene'");
-$sth->execute;
+my $sa = $core_dba->get_adaptor("slice");
+my $ga = $core_dba->get_adaptor("gene");
 
-$sth->bind_columns(\$gene_id, \$vega_stable_id);
+my @top_slices = @{$sa->fetch_all("toplevel",undef,1)}; #returns all slices including non-reference ("toplevel",undef,1)
+print "Looping over chromosomes...\n";
 
-while ($sth->fetch){
-  $vega_to_ens_id{$vega_stable_id} = $gene_id;
-}
-$sth->finish;
+my %alt_alleles;
+my %reference_genes;
 
+my %logic_names = ("havana" => 1,
+		   "ensembl_havana_gene" => 1,
+		   "ensembl_havana_lincrna" => 1,);
 
+my %skip_sources = ("Clone_based_vega_gene" => 1,
+		    "Clone_based_ensembl_gene" => 1);
 
-my $vega_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-host => $vhost||'ens-staging1',
-                                          -user => $vuser||'ensro',
-                                          -port => $vport||3306,
-                                          -dbname => $vdbname||"homo_sapiens_vega_63_37");
+foreach my $slice (@top_slices) {
+  my $slice_name = $slice->seq_region_name;
+ GENE:
+  foreach my $gene (@{$ga->fetch_all_by_Slice($slice)}) {
+     
+    next GENE if (exists($skip_sources{$gene->external_db}) );
 
-#
-# SQL to get alt_allele data from vega
-#
+    next GENE unless (exists($logic_names{$gene->analysis->logic_name}) );
 
-my $sql =(<<EOS);
-select aa.alt_allele_id, g.stable_id
- from alt_allele aa, gene g 
- where aa.gene_id = g.gene_id
-EOS
+    my $gene_id = $gene->dbID;
 
+    #get name and synonyms
+    my ($syns,$display_id);
+    eval { $display_id = $gene->display_xref->display_id }; 
+    if ($@) {
+      print "Can't get name for gene $gene_id on slice $slice_name\n";
+    } else {
+	push @{$alt_alleles{$display_id}}, $gene_id ;
+	if ($slice->is_reference) {
+	    $reference_genes{$gene_id} = 1;
+	}
 
-#
-# Store data in a hash where the key is the alt_id and the ensembl gene ids 
-# stored in an anonymous array (value of the hash).
-#
-
-my $sth = $vega_dba->dbc->prepare($sql);
-$sth->execute;
-my ($alt_id, $vega_stable_id);
-my %alt_to_gene;
-$sth->bind_columns(\$alt_id, \$vega_stable_id);
-
-my $max_alt_id = 0;
-
-my %no_gene_id;
+    }
 
-while($sth->fetch()){
-  my $gene_id = $vega_to_ens_id{$vega_stable_id};
-  if ( defined($gene_id) ) {
-      push @{$alt_to_gene{$alt_id}}, $gene_id ;
-  } else {
-      push @{$no_gene_id{$alt_id}}, $vega_stable_id;
-      print STDERR "no ensembl gene_id found for vega stable id $vega_stable_id in core\n";
-  }
-  if($alt_id > $max_alt_id){
-    $max_alt_id = $alt_id;
   }
-}
-$sth->finish;
-
-$max_alt_id += 1000;
-
-
-my %non_reference; # $non_reference{$gene_id} = 1;
-
-
-$sql = (<<SEQ);
-SELECT g.gene_id 
-  FROM gene g, seq_region_attrib sra, attrib_type at
-    WHERE g.seq_region_id = sra.seq_region_id AND
-          at.attrib_type_id = sra.attrib_type_id AND
-          at.code = 'non_ref'
-SEQ
-
-$sth = $core_dba->dbc->prepare($sql);
-$sth->execute;
-$sth->bind_columns(\$gene_id);
-while($sth->fetch()){
-  $non_reference{$gene_id} = 1;
-}
 
+}   
 
 #
 # Delete the old data
 #
 
-$sth = $core_dba->dbc->prepare("delete from alt_allele");
+my $sth = $core_dba->dbc->prepare("delete from alt_allele");
 $sth->execute;
 
 
@@ -148,12 +95,10 @@ $sth->execute;
 # NOTE: if an alt allele has only one gene_id delete it from the hash 
 #
 
-foreach my $key (keys %alt_to_gene){
-  my @arr = @{$alt_to_gene{$key}};
+foreach my $key (keys %alt_alleles){
+  my @arr = @{$alt_alleles{$key}};
   if(scalar(@arr) <= 1){
-      delete $alt_to_gene{$key};
-      my @unmapped_vega = $no_gene_id{$key};
-      print STDERR "ignoring alt allele $key: unmapped vega stable id(s): ". join(", ",@unmapped_vega) ."\n";    
+      delete $alt_alleles{$key};
   }
 }
 
@@ -161,116 +106,34 @@ foreach my $key (keys %alt_to_gene){
 # Then store the alt_allele and gene_id in the alt_allele table.
 #
 
-$sql = "insert into alt_allele (alt_allele_id, gene_id, is_ref) values(?, ?, ?)";
+my $sql = "insert into alt_allele (alt_allele_id, gene_id, is_ref) values(?, ?, ?)";
 my $insert_sth = $core_dba->dbc->prepare($sql);
 
 my $count=0;
 my $alt_id_count = 0;
 
 
-#
-# gene_id_to_alt_id needed for LRG stuff
-#
-#my %gene_id_to_alt_id;
 
-foreach my $key (keys %alt_to_gene){
-  my @arr = @{$alt_to_gene{$key}};
+foreach my $key (keys %alt_alleles){
+  my @arr = @{$alt_alleles{$key}};
   if(scalar(@arr) > 1){
-    my $sanity_check =0; # should be 1 refernce gene only if 0 or more than 1 we may have a problem
+    my $sanity_check =0; # should be 1 reference gene only if 0 or more than 1 we may have a problem
+    $alt_id_count++;
     foreach my $gene_id (@arr){
-      my $ref = 1;
-      if(defined( $non_reference{$gene_id})){
-	$ref = 0;
-      }      
-      else{
+      my $ref = 0;
+      if(exists($reference_genes{$gene_id})){
+	$ref = 1;
 	$sanity_check++;
       }
-      $insert_sth->execute($key, $gene_id, $ref);
-#      $gene_id_to_alt_id{$gene_id}= $key;
+      $insert_sth->execute($alt_id_count, $gene_id, $ref);
       $count++;
     }
     if($sanity_check !=1){
-      print STDERR "Problem we have $sanity_check genes on the reference sequence for alt_id $key\n";
+      print STDERR "Problem we have $sanity_check genes on the reference sequence for alt_id $alt_id_count gene display_id $key\n";
     }
-    $alt_id_count++;
+   
   }
-  elsif(scalar(@arr) == 0){ # been removed due to stable id error
-  }
-}
 
+}
 
 print "Added $alt_id_count alt_allele ids for $count genes\n";
-
-#
-# useful sql to look at sanity check problems
-#
-#select g.stable_id,g.seq_region_start, g.seq_region_end, s.name, x.display_label from gene g, seq_region s, alt_allele aa, xref x where g.display_xref_id = x.xref_id and g.gene_id = aa.gene_id and g.seq_region_id = s.seq_region_id and aa.alt_allele_id =37;
-
-
-
-exit;
-
-
-
-
-
-
-
-
-
-## LRG SQL. How to fit this in?
-##select ox.ensembl_id, g.gene_id from xref x, object_xref ox, external_db e, gene g where x.xref_id = ox.xref_id and e.external_db_id = x.external_db_id and e.db_name like "Ens_Hs_gene" and ox.ensembl_object_type = "Gene" and x.display_label = g.stable_id ;
-
-
-
-##
-## Use $max_alt_id for new ones.
-##
-
-#$sql =(<<LRG);
-#SELECT  ox.ensembl_id, g.gene_id 
-#  FROM xref x, object_xref ox, external_db e, gene g
-#    WHERE x.xref_id = ox.xref_id AND
-#          e.external_db_id = x.external_db_id AND
-#          e.db_name like "Ens_Hs_gene" AND
-#          ox.ensembl_object_type = "Gene" AND
-#           x.display_label = g.stable_id
-#LRG
-
-#$sth = $core_dba->dbc->prepare($sql);
-#my ($core_gene_id, $lrg_gene_id);
-#$sth->execute();
-#$sth->bind_columns(\$lrg_gene_id, \$core_gene_id);
-
-#$count =0;
-
-#my $old_count = 0;
-#my $new_count = 0;
-#my $lrg_count = 0;
-##
-## If the core gene is already in an alt_allele set then use that alt_id for the LRG gene only.
-## Else use a new one and add both core and LRG.
-##
-
-
-#while ($sth->fetch()){
-#  if(defined($gene_id_to_alt_id{$core_gene_id})){
-#    $insert_sth->execute($gene_id_to_alt_id{$core_gene_id}, $lrg_gene_id);
-#    $old_count++;
-#  }
-#  elsif(defined($gene_id_to_alt_id{$lrg_gene_id})){
-#    $insert_sth->execute($gene_id_to_alt_id{$lrg_gene_id}, $core_gene_id);
-#    print "LRG perculiarity\t$core_gene_id\t$lrg_gene_id\n";
-#    $lrg_count++;
-#  }
-#  else{ # new one.
-#    $max_alt_id++;
-#    $insert_sth->execute($max_alt_id, $lrg_gene_id);
-#    $insert_sth->execute($max_alt_id, $core_gene_id);
-#    $new_count++;
-#  }
-#  $count++;
-#}
-
-#print "Added $count alt_allels for the lrgs. $old_count added to previous alt_alleles and $new_count new ones\n";
-#print "LRG problem count = $lrg_count\n";
-- 
GitLab