From 9bdbb41e3d950e9e979991e6223f5adc93f80ec2 Mon Sep 17 00:00:00 2001
From: Ian Longden <ianl@sanger.ac.uk>
Date: Mon, 5 Sep 2011 15:35:38 +0000
Subject: [PATCH] More docs added

---
 .../xref_mapping/XrefMapper/OfficialNaming.pm | 208 +++++++++++++-----
 1 file changed, 148 insertions(+), 60 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm b/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm
index 57008201d8..01a6dfaa39 100644
--- a/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm
+++ b/misc-scripts/xref_mapping/XrefMapper/OfficialNaming.pm
@@ -51,6 +51,11 @@ use base qw( XrefMapper::BasicMapper);
 #
 ##############################################################################################
 
+
+####################################
+# Create OfficialNaming object
+# Get some info from the BasicMapper
+####################################
 sub new {
   my($class, $mapper) = @_;
 
@@ -66,6 +71,11 @@ sub new {
   return $self;
 }
 
+
+##################################################
+# This will be the official database name
+# HGNC, MGI or ZFIN_ID, comes from BasicMapper
+#################################################
 sub get_official_name {
  my ($self, $arg) = @_;
 
@@ -74,6 +84,11 @@ sub get_official_name {
   return $self->{_official_name};
 }
 
+
+
+##################################################
+# This is the main subroutine that does everything
+##################################################
 sub run {
   my $self = shift;
 
@@ -117,10 +132,8 @@ sub run {
   ######################################################
   my ($max_object_xref_id, $max_xref_id) = $self->find_max_ids();
 
-
-  my %display_label_to_id;
   my %display_label_to_desc;
-  $self->get_display_label_data(\%display_label_to_id,\%display_label_to_desc);
+  $self->get_display_label_data(\%display_label_to_desc);
 
   my %synonym;
   $self->get_synonyms(\%synonym);
@@ -192,6 +205,9 @@ SQ0
     my $vega_clone_name = undef;
     my $clone_name = undef;
 
+    ########################################
+    # Get the vega data needed for this gene
+    ########################################
     my ($tran_to_vega_ext, $tran_to_vega_name, $tran_to_vega_xref_id) = 
       $self->get_tran_to_vega_data({ gene_id       => $gene_id, 
 				     gene_to_tran  => \%gene_to_transcripts, 
@@ -201,20 +217,36 @@ SQ0
 				     cbvt          => $dbname_to_source_id->{"Clone_based_vega_transcript"}
 				    });
 
-    ($gene_symbol, $gene_symbol_xref_id) = $self->get_official_domain_name({gene_id       => $gene_id, 
-									    gene_to_tran  => \%gene_to_transcripts,
-									    tran_to_vega_name => $tran_to_vega_name,
-									    gene_id_to_stable_id => \%gene_id_to_stable_id});
-
+    ################################
+    # Get official name if it has one
+    ################################
+    ($gene_symbol, $gene_symbol_xref_id) = 
+      $self->get_official_domain_name({gene_id       => $gene_id, 
+				       gene_to_tran  => \%gene_to_transcripts,
+				       tran_to_vega_name => $tran_to_vega_name,
+				       gene_id_to_stable_id => \%gene_id_to_stable_id});
+
+    ############################################
+    # If not found see if there is an LRG entry
+    ############################################
     if(!defined($gene_symbol)){ # look for LRG
       ($gene_symbol, $gene_symbol_xref_id) = $self->find_lrg_hgnc($gene_id);
     }
 
-    if(!defined($gene_symbol)){ # try the other database source (should be RFAM and miRBase only)
-      ($gene_symbol, $gene_symbol_xref_id) = $self->find_from_other_sources({gene_id       => $gene_id, 
-									     tran_source   => \$tran_source});
+    ####################################################
+    # If not found look for other valid database sources
+    # At present RFAM and miRBase are the only ones.
+    ####################################################
+    if(!defined($gene_symbol)){ 
+      ($gene_symbol, $gene_symbol_xref_id) = 
+	$self->find_from_other_sources({gene_id       => $gene_id, 
+					label_to_desc => \%display_label_to_desc,
+					tran_source   => \$tran_source});
     }
 
+    ###################################################
+    # If still no joy see if there is a vega clone name
+    ###################################################
     if(!defined($gene_symbol)){
       foreach my $tran_id  (@{$gene_to_transcripts{$gene_id}}){
 	my $source_id = $dbname_to_source_id->{"Clone_based_vega_transcript"};
@@ -224,26 +256,26 @@ SQ0
       }
     }
 
-    if(!defined($gene_symbol) ){   # No HGNC or other so look for vega clone names
-      if(!defined($vega_clone_name)){ #if no vega clone name use the ensembl clone name
-	$clone_name = $self->get_clone_name($gene_id, $ga, $dbname);
-	if(defined($clone_name)){
-          $clone_name =~ s/[.]\d+//;    #remove .number
-	}
+    ##############################################
+    # Finally if all else fails use the clone name
+    ##############################################
+    if((!defined($gene_symbol)) and (!defined($vega_clone_name))){
+      $clone_name = $self->get_clone_name($gene_id, $ga, $dbname);
+      if(defined($clone_name)){
+	$clone_name =~ s/[.]\d+//;    #remove .number
       }
     }
 
-    #
+    ######################################
     # Check we have a suitable name to use.
-    #
+    # Else give error message and goto next
+    ######################################
     if( !(defined($clone_name) or defined($vega_clone_name) or defined($gene_symbol)) ){
       carp "Problem gene ".$gene_id_to_stable_id{$gene_id}." could not get a clone name or ".$dbname." symbol\n";
       next;
     }
 
     if(defined($gene_symbol)){
-      #gene symbol already set as it is HGNC or MGI so do not need add a new xref anything for the gene;
-
       my $desc = $display_label_to_desc{$gene_symbol};
 
       if(!defined($gene_symbol_xref_id)){
@@ -294,6 +326,21 @@ SQ0
   return;
 }
 
+
+
+
+####################################################################
+# Get official name if it has one
+#
+# Search gene for dbname entries.
+# dbname (HGNC||MGI||ZFIN_ID) dependent on species
+#
+# Find the "best" one
+# Remove the lesser ones (set status to MULTI_DELETE for object_xref)
+#
+# returns the gene_symbol and xref_id of the best one
+######################################################################
+
 sub get_official_domain_name{
   my ($self, $arg_ref) = @_;
 
@@ -353,20 +400,6 @@ sub get_official_domain_name{
   }
   if(scalar(@ODN) > 1){ # try to use vega to find the most common one
 
-    if($gene_id_to_stable_id->{$gene_id} eq "ENSG00000213030"){ #start debug
-      print "----ODN\n";
-      foreach my $xref_id (@ODN){
-	print "---\t".$xref_id."\t".$xref_id_to_display{$xref_id}."\n";
-      }
-      foreach my $tran_id (@{$gene_to_transcripts->{$gene_id}}){
-	print "--".$tran_id;
-	if(defined($tran_to_vega_name->{$tran_id})){
-	  print "\t".$tran_to_vega_name->{$tran_id};
-	}
-	print "\n";
-      }
-    }  ## end debug
- 
     my %best_list;
     foreach my $xref_id (@ODN){
       $best_list{$xref_id_to_display{$xref_id}} = 1;
@@ -517,10 +550,15 @@ sub set_transcript_and_gene_display_xref_via_clone_name{
   return;
 }
 
-
+###########################################################
+# Set the transcript display xrefs
+#
+# Use the gene symbol to create a transcript display xref
+# Add the Vega ext if it exists else start at 201 and
+# increment.
+###########################################################
 sub set_transcript_display_xrefs{
   my ($self, $arg_ref) = @_;
-#  my ($self, $max_xref_id, $max_object_xref_id, $gene_id, $gene_symbol, $desc, $source_id, $xref_added, $seen_gene, $gene_to_transcripts, $tran_to_vega_ext) = @_;
 
   my $max_xref_id =         $arg_ref->{max_xref};
   my $max_object_xref_id =  $arg_ref->{max_object};
@@ -533,21 +571,21 @@ sub set_transcript_display_xrefs{
   my $gene_to_transcripts = $arg_ref->{gene_to_tran};
   my $tran_to_vega_ext =    $arg_ref->{tran_to_vega_ext};
 
-#  croak ("MISSING parameters for set_transcript_display_xrefs")
-#    if any (!defined($_))  $max_xref_id, $max_object_xref_id, $gene_id, $gene_symbol, $desc, $source_id, $xref_added, $seen_gene, $gene_to_transcripts, $tran_to_vega_ext;
 
+  # statement handles needed
   my $ins_xref_sth =              $self->get_ins_xref_sth();
   my $ins_dep_ix_sth =            $self->get_ins_dep_ix_sth();
   my $set_tran_display_xref_sth = $self->get_set_transcript_display_xref_sth();
   my $ins_object_xref_sth =       $self->get_ins_object_xref_sth();
 
+
   my $no_vega_ext = 201;
   if(defined($seen_gene->{$gene_symbol})){
     $no_vega_ext = $seen_gene->{$gene_symbol};
   }
+
   foreach my $tran_id ( @{$gene_to_transcripts->{$gene_id}} ){
     my $ext;
-#    my $source_id = $dbname_to_source_id->($tran_source);
     if(defined($tran_to_vega_ext->{$tran_id})){
       $ext = $tran_to_vega_ext->{$tran_id};
     }
@@ -573,6 +611,12 @@ sub set_transcript_display_xrefs{
   return;
 }
 
+
+#################################################
+# Get statement handle to retrieve what xrefs
+# are attached to a specific ensembl_id and type
+# for a particular source name
+#################################################
 sub get_dbentrie_sth{
   my $self = shift;
 
@@ -593,7 +637,36 @@ SQ1
   return  $self->{'_dbentrie_sth'};
 }
 
+#################################################
+# Get statement handle to retrieve what xrefs
+# are attached to a specific ensembl_id and type
+# for a particular source name with description
+#################################################
+sub get_dbentrie_with_desc_sth{
+  my $self = shift;
+  # Prepared once, cached on $self. Placeholders: source name, ensembl_id, ensembl_object_type.
 
+  if(!defined($self->{'_dbentrie_desc_sth'})){
+    my $sql =(<<"SQD");
+SELECT x.label, x.xref_id, ox.object_xref_id, s.priority, x.description 
+  FROM xref x, object_xref ox, source s
+    WHERE x.xref_id = ox.xref_id AND
+          x.source_id = s.source_id AND
+          s.name = ? AND
+          ox.ox_status = 'DUMP_OUT' AND
+          ox.ensembl_id = ? AND
+          ox.ensembl_object_type = ?
+SQD
+    $self->{'_dbentrie_desc_sth'}  = $self->xref->dbc->prepare($sql);
+  }
+  return  $self->{'_dbentrie_desc_sth'};
+}
+
+#################################################
+# Get statement handle to set the display xref
+# for a transcript in the xref database.
+# Stored in the transcript_stable_id table.
+#################################################
 sub get_set_transcript_display_xref_sth {
   my $self = shift;
   if(!defined($self->{'_set_tran_display'})){
@@ -602,6 +675,12 @@ sub get_set_transcript_display_xref_sth {
   return $self->{'_set_tran_display'}
 }
 
+
+#################################################
+# Get statement handle to set the display xref
+# for a gene in the xref database.
+# Stored in the gene_stable_id table.
+#################################################
 sub get_set_gene_display_xref_sth {
   my $self = shift;
   if(!defined($self->{'_set_gene_display'})){
@@ -611,6 +690,9 @@ sub get_set_gene_display_xref_sth {
 }
 
 
+###############################################
+# Get statement handle to insert an xref
+############################################### 
 sub get_ins_xref_sth{
   my $self= shift;
 
@@ -622,6 +704,10 @@ sub get_ins_xref_sth{
   return $self->{'_ins_xref_sth'};
 }
 
+
+#################################################
+# Get statement handle to insert an identity xref
+#################################################
 sub get_ins_dep_ix_sth{
   my $self= shift;
 
@@ -632,6 +718,9 @@ sub get_ins_dep_ix_sth{
   return $self->{'_ins_identity_sth'};
 }
 
+###############################################
+# Get statement handle to insert an object_xref
+############################################### 
 sub get_ins_object_xref_sth{
   my $self= shift;
 
@@ -671,16 +760,15 @@ sub find_max_ids{
   return $max_object_xref_id, $max_xref_id;
 }
 
+
+
 sub get_tran_to_vega_data{
-#  my ($self, $gene_id, $ref_gene_to_transcripts, $ref_display_label_to_desc, $ref_synonym, $ref_best_list, $ref_name_count) = @_;
   my ($self, $arg_ref) = @_;
 
   my $gene_id                   = $arg_ref->{gene_id};
   my $ref_gene_to_transcripts   = $arg_ref->{gene_to_tran};
   my $ref_display_label_to_desc = $arg_ref->{label_to_desc};
   my $ref_synonym               = $arg_ref->{synonym};
-  my $ref_best_list             = $arg_ref->{best_list};
-  my $ref_name_count            = $arg_ref->{name_count};
   my $ref_xref_added            = $arg_ref->{xref_added};
   my $clone_based_vega_transcript_id = $arg_ref->{cbvt};
 
@@ -728,9 +816,10 @@ sub get_tran_to_vega_data{
       if( (!defined($num)) or (!$num) or ($num eq "")){
 	print "Problem finding number for $display\n";
       }
-      
+
+      $tran_to_vega_name{$tran_id} = $symbol_bit;
       $tran_to_vega_ext{$tran_id} = $num;
-      
+
       if(defined($ref_display_label_to_desc->{$symbol_bit})){
       }
       elsif(defined($ref_synonym->{$symbol_bit})){
@@ -745,9 +834,6 @@ sub get_tran_to_vega_data{
 	  next;
 	}
       }
-#      if($ref_best_list->{$symbol_bit}){
-#	$ref_name_count->{$symbol_bit}++;
-#      }
     }
   }
   return \%tran_to_vega_ext,\%tran_to_vega_name, \%tran_to_vega_xref_id;
@@ -779,7 +865,8 @@ SYN
 }
 
 sub get_display_label_data{
-  my ($self, $label_to_id, $label_to_desc) = @_;
+#  my ($self, $label_to_id, $label_to_desc) = @_;
+  my ($self, $label_to_desc) = @_;
 
   my $dbname = $self->get_official_name();
 
@@ -797,7 +884,7 @@ GD1
   my ($display_label, $acc, $syn, $desc);
   $gd1_sth->bind_columns(\$acc,\$display_label, \$desc);
   while($gd1_sth->fetch){
-    $label_to_id->{$display_label} = $acc;
+#    $label_to_id->{$display_label} = $acc;
     $label_to_desc->{$display_label} = $desc;
   }
   $gd1_sth->finish;
@@ -817,7 +904,7 @@ GD2
   $gd2_sth->execute();
   $gd2_sth->bind_columns(\$acc,\$display_label, \$desc);
   while($gd2_sth->fetch){
-    $label_to_id->{$display_label} = $acc;
+#    $label_to_id->{$display_label} = $acc;
     if(!defined($desc)){
       warn "undef desc for $display_label\n";
     }
@@ -839,27 +926,31 @@ sub get_other_name_hash{
   return  $self->{'_other_name'};
 }
 
+
+
+
 sub find_from_other_sources{
   my ($self, $ref_args) = @_;
-  my $tran_source = $ref_args->{tran_source};
-  my $gene_id     = $ref_args->{gene_id};
-
+  my $tran_source           = $ref_args->{tran_source};
+  my $gene_id               = $ref_args->{gene_id};
+  my $display_label_to_desc = $ref_args->{label_to_desc}; 
 
 
   my ($gene_symbol, $gene_symbol_xref_id);
 
-  my $dbentrie_sth = $self->get_dbentrie_sth();
+  my $dbentrie_sth = $self->get_dbentrie_with_desc_sth();
   
   my $other_name_num = $self->get_other_name_hash();
 
-  my ($display, $xref_id, $object_xref_id, $level);
+  my ($display, $xref_id, $object_xref_id, $level, $desc);
   foreach my $ext_db_name (qw(miRBase RFAM)){
     $dbentrie_sth->execute($ext_db_name, $gene_id, "Gene");
-    $dbentrie_sth->bind_columns(\$display, \$xref_id, \$object_xref_id, \$level);
+    $dbentrie_sth->bind_columns(\$display, \$xref_id, \$object_xref_id, \$level, \$desc);
     while($dbentrie_sth->fetch){
       $gene_symbol = $display;
       $gene_symbol_xref_id = $xref_id;
       $$tran_source = $ext_db_name;
+      $display_label_to_desc->{$display} = $desc;
       if(defined($other_name_num->{$gene_symbol})){
 	$other_name_num->{$gene_symbol}++;
       }
@@ -890,9 +981,6 @@ sub get_delete_odn_sth{
 sub set_the_best_odn{
   my ($self, $odn, $ref_list, $ref_list_ox, $ref_xref_id_to_display) = @_;
 
-#  my @list = @{$ref_list};
-#  my @list_ox = @{$ref_list_ox};
-
   my $delete_odn_sth = $self->get_delete_odn_sth();
 
 
-- 
GitLab