From 7f161c1e23350d5a69e0907c4a4e1faa8c9994a6 Mon Sep 17 00:00:00 2001
From: Monika Komorowska <mk8@sanger.ac.uk>
Date: Mon, 9 Jan 2012 15:19:59 +0000
Subject: [PATCH] Store Clone_based_ensembl_gene descriptions last, to avoid an
 accession name conflict with Clone_based_vega_gene descriptions. Assign an
 HGNC name which hasn't been used if possible.

---
 .../xref_mapping/XrefMapper/OfficialNaming.pm | 140 +++++++++++++-----
 1 file changed, 102 insertions(+), 38 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm b/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm
index efd2dcd726..ca26fb1ada 100644
--- a/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm
+++ b/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm
@@ -191,7 +191,10 @@ SQ0
   my %xref_added; # store those added  $xref_added{$accession:$source_id} = $xref_id;
   my %seen_gene;
 
-  foreach my $gene_id (@sorted_gene_ids){
+  my %ens_clone_genes;
+  my %official_name_used;
+
+  while ( my $gene_id = shift @sorted_gene_ids){
 
     my $tran_source = $dbname;
 
@@ -213,6 +216,8 @@ SQ0
 				     cbvt          => $dbname_to_source_id->{"Clone_based_vega_transcript"}
 				    });
 
+    if (!defined($ens_clone_genes{$gene_id})) { #we're processing this gene for the first time
+
     ################################
     # Get offical name if it has one
     ################################
@@ -220,7 +225,13 @@ SQ0
       $self->get_official_domain_name({gene_id       => $gene_id, 
 				       gene_to_tran  => \%gene_to_transcripts,
 				       tran_to_vega_name => $tran_to_vega_name,
-				       gene_id_to_stable_id => \%gene_id_to_stable_id});
+				       gene_id_to_stable_id => \%gene_id_to_stable_id,
+                                       official_name_used => \%official_name_used
+				      });
+
+    if (defined($gene_symbol_xref_id)) {
+	$official_name_used{$gene_symbol_xref_id} = 1;
+    }
 
     ############################################
     # If not found see if there is an LRG entry
@@ -252,6 +263,8 @@ SQ0
       }
     }
 
+    } #if (!exists($ens_clone_genes{$gene_id}))
+
     ##############################################
     # Finally if all else fails use the clone name
     ##############################################
@@ -271,7 +284,7 @@ SQ0
       next;
     }
 
-    if(defined($gene_symbol)){
+    if(defined($gene_symbol) && !defined($ens_clone_genes{$gene_id})){
       my $desc = $display_label_to_desc{$gene_symbol};
 
       if(!defined($gene_symbol_xref_id)){
@@ -292,11 +305,14 @@ SQ0
                                             xref_added       => \%xref_added, 
                                             seen_gene        => \%seen_gene, 
                                             gene_to_tran     => \%gene_to_transcripts, 
-                                            tran_to_vega_ext => $tran_to_vega_ext });
+                                            tran_to_vega_ext => $tran_to_vega_ext,
+					    ens_clone_genes  => \%ens_clone_genes,
+					   });
     }
-    else{ # use clone name
 
-      $self->set_transcript_and_gene_display_xref_via_clone_name({vega_clone_name => $vega_clone_name,
+    if (!defined($gene_symbol)) { # use clone name 
+
+      my $keep_gene = $self->set_transcript_and_gene_display_xref_via_clone_name({vega_clone_name => $vega_clone_name,
                                                          clone_name => $clone_name,
 							 dbname_to_source => $dbname_to_source_id,
 							 gene_id          =>  $gene_id,
@@ -304,8 +320,12 @@ SQ0
 							 max_object       => \$max_object_xref_id,
 							 xref_added       => \%xref_added,
 							 gene_to_tran     => \%gene_to_transcripts,
-							 tran_to_vega_ext => $tran_to_vega_ext
+							 tran_to_vega_ext => $tran_to_vega_ext, 
+							 ens_clone_genes  => \%ens_clone_genes,
 							});
+      if ($keep_gene) {
+	  push @sorted_gene_ids, $gene_id;
+      }
 
     }
   } # for each gene
@@ -344,6 +364,7 @@ sub get_official_domain_name{
   my $gene_id_to_stable_id = $arg_ref->{gene_id_to_stable_id};
   my $tran_to_vega_name    = $arg_ref->{tran_to_vega_name};
   my $gene_to_transcripts  = $arg_ref->{gene_to_tran};
+  my $official_name_used   = $arg_ref->{official_name_used};
 
 
   my $dbname = $self->get_official_name();
@@ -424,6 +445,9 @@ sub get_official_domain_name{
       $best_list{$xref_id_to_display{$xref_id}} = 1;
     }
 
+    #print "Multiple best ".$dbname."'s using vega gene description to find the best name for ".$gene_id_to_stable_id->{$gene_id}."\n";
+    #add this section when OTTG xrefs have gene name in description
+
     my %name_count;
     foreach my $tran_id (@{$gene_to_transcripts->{$gene_id}}){
       if(defined($tran_to_vega_name->{$tran_id}) and defined($best_list{$tran_to_vega_name->{$tran_id}})){
@@ -441,9 +465,9 @@ sub get_official_domain_name{
 	  $gene_symbol = $name;
 	}
       }
-      foreach my $xref_id (keys %ODN){
-	if($gene_symbol eq $xref_id_to_display{$xref_id}){
-	  $gene_symbol_xref_id = $xref_id;
+      foreach my $x (keys %ODN){
+	if($gene_symbol eq $xref_id_to_display{$x}){
+	  $gene_symbol_xref_id = $x;
 	}
       }
       print "\t$gene_symbol chosen from vega\n";
@@ -461,19 +485,43 @@ sub get_official_domain_name{
 	  } 
       }
       
-      # take the first one ??
-      my $i = 0;
+      # take the name which hasn't been already assigned to another gene, if possible
+      
+      my $xref_not_used;
       foreach my $x (keys %ODN){
-	print "\t".$xref_id_to_display{$x};
-	if(!$i){
-	  print "  (chosen as first)\n";
-	  $gene_symbol =  $xref_id_to_display{$x};
-	  $gene_symbol_xref_id =  $x;
-	}
-	else{
-	  print "  (left as $dbname reference but not gene symbol)\n";
-	}
-	$i++;
+	  if (!defined($official_name_used->{$x}) ) {
+	      $xref_not_used = $x;
+	  }
+      }
+      if ($xref_not_used) {
+	  foreach my $x (keys %ODN){
+	      print "\t".$xref_id_to_display{$x};
+	      if ($x == $xref_not_used) {
+		  print "    chosen\n";
+		  $gene_symbol =  $xref_id_to_display{$x};
+		  $gene_symbol_xref_id =  $x;		  
+	      } else {
+		  print "  (left as $dbname reference but not gene symbol)\n";
+	      }
+	  }
+
+      } else {
+
+	  my $i=0;
+	  foreach my $x (keys %ODN){
+	      print "\t".$xref_id_to_display{$x};
+	      if(!$i){
+		  print "  (chosen as first)\n";
+		  $gene_symbol =  $xref_id_to_display{$x};
+		  $gene_symbol_xref_id =  $x;
+	      }
+	      else{
+		  print "  (left as $dbname reference but not gene symbol)\n";
+	      }
+	      $i++;
+	  }
+
+
       }
     }
   }
@@ -495,6 +543,7 @@ sub set_transcript_and_gene_display_xref_via_clone_name{
   my $tran_to_vega_ext =    $arg_ref->{tran_to_vega_ext};
   my $vega_clone_name  =    $arg_ref->{vega_clone_name};
   my $clone_name =          $arg_ref->{clone_name};
+  my $ens_clone_names =     $arg_ref->{ens_clone_genes};
 
   my $ins_xref_sth =              $self->get_ins_xref_sth();
   my $ins_dep_ix_sth =            $self->get_ins_dep_ix_sth();
@@ -502,35 +551,50 @@ sub set_transcript_and_gene_display_xref_via_clone_name{
   my $ins_object_xref_sth =       $self->get_ins_object_xref_sth();
   my $set_gene_display_xref_sth = $self->get_set_gene_display_xref_sth();
 
+  my $keep_gene;
 
   my $t_source_id;
   my $g_source_id;
   my $desc;
   my $name;
-  if(defined($vega_clone_name)){
+  if(defined($vega_clone_name) && !defined($ens_clone_names->{$gene_id})){
     $name = $vega_clone_name;
     $t_source_id = $dbname_to_source_id->{"Clone_based_vega_transcript"};
     $g_source_id = $dbname_to_source_id->{"Clone_based_vega_gene"};
+    $name = $vega_clone_name;
     $desc = "via havana clone name";
-  }
-  else{
-    if(defined($clone_name)){
-      $name = $clone_name;
-      $t_source_id = $dbname_to_source_id->{"Clone_based_ensembl_transcript"};
-      $g_source_id = $dbname_to_source_id->{"Clone_based_ensembl_gene"};
-      $desc = "via ensembl clone name";
-    }
-    else{
-      croak "No name";
-    }
     my $num = 1;
     my $unique_name = $name.".".$num;
-    while(defined($xref_added->{$unique_name.":".$g_source_id})){
-      $num++;
-      $unique_name = $name.".".$num;
+    while(defined($xref_added->{$unique_name.":".$g_source_id}) ){
+	 $num++;
+	 $unique_name = $name.".".$num;
     }
     $name = $unique_name;
   }
+  if (!defined($vega_clone_name) ) {
+      if (defined($ens_clone_names->{$gene_id})) {
+	  if(defined($clone_name)){
+	      $name = $clone_name;
+	      $t_source_id = $dbname_to_source_id->{"Clone_based_ensembl_transcript"};
+	      $g_source_id = $dbname_to_source_id->{"Clone_based_ensembl_gene"};
+	      $desc = "via ensembl clone name";
+	  }
+	  else{
+	      croak "No name";
+	  }
+	  my $num = 1;
+	  my $unique_name = $name.".".$num;
+	  while(defined($xref_added->{$unique_name.":".$g_source_id}) || defined($xref_added->{$unique_name.":".$dbname_to_source_id->{"Clone_based_vega_gene"}})){
+	      $num++;
+	      $unique_name = $name.".".$num;
+	  }
+	  $name = $unique_name;
+      } else {	  
+	  $ens_clone_names->{$gene_id} = 1;
+	  $keep_gene = 1;
+	  return $keep_gene;
+      }
+  }
   
   # first add the gene xref and set display_xref_id
   # store the data
@@ -579,7 +643,7 @@ sub set_transcript_and_gene_display_xref_via_clone_name{
       $set_tran_display_xref_sth->execute($$max_xref_id, $tran_id);
     }
   }
-  return;
+  return 0;
 }
 
 ###########################################################
-- 
GitLab