From 47f9f5ca4f01f9638c950d0f4c423de312a202c2 Mon Sep 17 00:00:00 2001
From: Glenn Proctor <gp1@sanger.ac.uk>
Date: Wed, 10 Nov 2004 15:49:06 +0000
Subject: [PATCH] Print header for xrefs as well.

---
 .../xref_mapping/XrefMapper/BasicMapper.pm    | 156 ++++++++++++++++--
 1 file changed, 146 insertions(+), 10 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm
index 03505ab0c2..0fe29e04f2 100644
--- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm
+++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm
@@ -86,7 +86,11 @@ sub run_matching{
     $i++;
   }
 
-  $self->run_mapping(\@list);
+  if (!defined($self->use_existing_mappings)) {
+    $self->run_mapping(\@list);
+  } else {
+    print "Using existing mappings";
+  }
 
 }
 
@@ -149,7 +153,8 @@ sub get_set_lists{
 #	  ["method2",[$self->species,"*"]],
 #	  ["method3",["*","*"]]];
 
-  return [["ExonerateGappedBest1", ["homo_sapiens","UniProtSwissProt"]]];
+  #return [["ExonerateGappedBest1", ["homo_sapiens","UniProtSwissProt"]]];
+return [["ExonerateGappedBest1", ["homo_sapiens","RefSeq"]]];
 #  return [["ExonerateBest1",["*","*"]]];
 
 }
@@ -622,7 +627,7 @@ sub run_mapping {
   submit_depend_job($self->dir, @job_names);
 
 
-} # run_exonerate
+} # run_mapping
 
 
 =head2 submit_depend_job
@@ -734,6 +739,14 @@ sub store {
   # also keep track of types of ensembl objects
   my %ensembl_object_types;
 
+  # and a list of mappings of ensembl objects to xrefs
+  # (primary now, dependent added in dump_xrefs)
+  # this is required for display_xref generation later
+  # format:
+  #   key: ensembl object type:ensembl object id
+  #   value: list of xref_id (NON-OFFSET; the offset is added on output)
+  my %object_xref_mappings;
+
   my $dir = $self->dir();
   foreach my $file (glob("$dir/*.map")) {
 
@@ -774,6 +787,11 @@ sub store {
 
       $ensembl_object_types{$target_id} = $type;
 
+      # store mapping for later - note NON-OFFSET xref_id is used
+      my $key = $type . ":" . $target_id;
+      my $xref_id = $query_id;
+      push @{$object_xref_mappings{$key}}, $xref_id;
+
       # note the NON-OFFSET xref_id is stored here as the values are used in
       # a query against the original xref database
       $primary_xref_ids{$query_id}{$target_id} = $target_id;
@@ -799,7 +817,8 @@ sub store {
   print "Read $total_lines lines from $total_files exonerate output files\n";
 
   # write relevant xrefs to file
-  $self->dump_xrefs(\%primary_xref_ids, $object_xref_id+1, $xref_id_offset, \%ensembl_object_types);
+  print "passing object_xref_mappings to dump_xrefs with " . scalar (keys %object_xref_mappings) . "\n";
+  $self->dump_xrefs(\%primary_xref_ids, $object_xref_id+1, $xref_id_offset, \%ensembl_object_types, \%object_xref_mappings);
 
   # write comparison info. Can be removed after development
   dump_comparison();
@@ -869,7 +888,8 @@ sub get_analysis_id {
 
 sub dump_xrefs {
 
-  my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $ensembl_object_types_hashref) = @_;
+  my ($self, $xref_ids_hashref, $start_object_xref_id, $xref_id_offset, $ensembl_object_types_hashref, $object_xref_mappings) = @_;
+
   my @xref_ids = keys %$xref_ids_hashref;
   my %xref_to_objects = %$xref_ids_hashref;
   my %ensembl_object_types = %$ensembl_object_types_hashref;
@@ -910,8 +930,8 @@ sub dump_xrefs {
     my $xref_sth = $xref_dbi->prepare($sql);
     $xref_sth->execute();
 
-    my ($xref_id, $accession, $label, $description, $source_id, $species_id);
-    $xref_sth->bind_columns(\$xref_id, \$accession, \$label, \$description, \$source_id, \$species_id);
+    my ($xref_id, $accession, $version, $label, $description, $source_id, $species_id);
+    $xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$source_id, \$species_id);
 
     # note the xref_id we write to the file is NOT the one we've just read
     # from the internal xref database as the ID may already exist in the core database
@@ -936,6 +956,7 @@ sub dump_xrefs {
       $source_ids{$source_id} = $source_id;
 
       # create an object_xref linking this (dependent) xref with any objects it maps to
+      # write to file and add to object_xref_mappings
       if (defined $xref_to_objects{$xref_id+$xref_id_offset}) {
 	my @objects = keys( %{$xref_to_objects{$xref_id+$xref_id_offset}} );
 	print "xref $accession has " . scalar(@objects) . " associated ensembl objects\n";
@@ -943,6 +964,9 @@ sub dump_xrefs {
 	  my $type = $ensembl_object_types{$object_id};
 	  print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "DEPENDENT\n";
 	  $object_xref_id++;
+	  # Add this mapping to the list - note NON-OFFSET xref_id is used
+	  my $key = $type . ":" . $object_id;
+	  push @{$object_xref_mappings->{$key}}, $xref_id;
 	}
       }
     }
@@ -992,13 +1016,14 @@ sub dump_xrefs {
     $source_id_str = "= " . $source_id_array[0];
   }
 
-  my $source_sql = "SELECT name, release FROM source WHERE source_id $source_id_str";
+  # get source names, releases and source IDs
+  my $source_sql = "SELECT name, release, source_id FROM source WHERE source_id $source_id_str";
   my $source_sth = $xref_dbi->prepare($source_sql);
   #print STDERR $source_sql."\n";
   $source_sth->execute();
 
-  my ($source_name, $release);
-  $source_sth->bind_columns(\$source_name, \$release);
+  my ($source_name, $release, $source_id);
+  $source_sth->bind_columns(\$source_name, \$release, \$source_id);
 
   while (my @row = $source_sth->fetchrow_array()) {
     print EXTERNAL_DB "$edb_id\t$source_name\t$release\tXREF\n";
@@ -1008,8 +1033,14 @@ sub dump_xrefs {
 
   close(EXTERNAL_DB);
 
+  print "Before calling display_xref, object_xref_mappings size " . scalar (keys %{$object_xref_mappings}) . "\n";
+
+  # calculate display_xref_ids for transcripts and genes
+  $self->build_transcript_display_xrefs($object_xref_mappings, $xref_id_offset);
+
 }
 
+
 # produce output for comparison with existing ensembl mappings
 # format is (with header)
 # xref_accession ensembl_type ensembl_id
@@ -1025,6 +1056,7 @@ sub dump_comparison {
   # first read all the xrefs that were dumped and get an xref_id->accession map
   my %xref_id_to_accesson;
   open (XREF, "xref.txt");
+  print XREF "xref_accession" . "\t" . "ensembl_type" . "\t" . "ensembl_id\n";
   while (<XREF>) {
     my ($xref_id,$accession,$label,$description) = split;
     $xref_id_to_accesson{$xref_id} = $accession;
@@ -1043,4 +1075,108 @@ sub dump_comparison {
 
 }
 
+# Write transcript_display_xref.sql (relative path, so in the current working
+# directory): one UPDATE per transcript, setting display_xref_id to the mapped
+# xref whose source ranks highest in transcript_display_xref_sources().
+#   $object_xref_mappings - hashref: "type:object_id" => [non-offset xref_ids]
+#   $xref_id_offset       - added to the chosen xref_id when it is written
+# Dies if the output file cannot be opened or closed.
+sub build_transcript_display_xrefs {
+  my ($self, $object_xref_mappings, $xref_id_offset) = @_;
+
+  # get a list of xref sources; format:
+  # key: xref_id value: source_name
+  # lots of these; if memory is a problem, just get the source ID (not the name)
+  # and look it up elsewhere
+  print "Building xref->source mapping table\n";
+  my %xref_to_source;
+  my $sql = "SELECT x.xref_id, s.name FROM source s, xref x WHERE x.source_id=s.source_id";
+  my $sth = $self->xref->dbi()->prepare($sql);
+  $sth->execute();
+  my ($xref_id, $source_name);
+  $sth->bind_columns(\$xref_id, \$source_name);
+
+  while ($sth->fetch()) {    # fetch() fills the bound variables
+    $xref_to_source{$xref_id} = $source_name;
+  }
+
+  print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n";
+
+  print "Building transcript display_xrefs\n";
+  my @priorities = $self->transcript_display_xref_sources();
+
+  my $file = "transcript_display_xref.sql";
+  open(my $transcript_dx, '>', $file) or die "Could not open $file for writing: $!\n";
+
+  my $n = 0;    # statements written; 0 keeps the final summary valid when none are
+  # go through each transcript/xref mapping
+  foreach my $key (keys %{$object_xref_mappings}) {
+    my ($type, $obj) = split /:/, $key;
+    next if ($type !~ /Transcript/i);
+
+    # if a transcript has more than one associated xref, use the one whose
+    # source comes earliest in @priorities (index 0 = highest priority)
+    my ($best_xref, $best_xref_idx);
+    foreach my $xref (@{$object_xref_mappings->{$key}}) {
+      my $source = $xref_to_source{$xref};
+      if (!$source) {
+        warn("Couldn't find a source for xref $xref \n");
+        next;
+      }
+      my $i = find_in_list($source, @priorities);
+      if ($i > -1 && (!defined($best_xref_idx) || $i < $best_xref_idx)) {
+        ($best_xref, $best_xref_idx) = ($xref, $i);
+      }
+    }
+
+    if (!defined($best_xref)) {
+      warn("Couldn't find a display xref for transcript id $obj\n");
+      next;
+    }
+    # Write record with xref_id_offset; ';' terminates each SQL statement
+    print $transcript_dx "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $obj . ";\n";
+    $n++;
+  }
+
+  close($transcript_dx) or die "Could not close $file: $!\n";
+  print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n";
+}
+
+# Names of the sources that may supply a transcript's display xref, ordered
+# from highest priority (first) to lowest (last). Every name has to be
+# identical to the corresponding name in the source table.
+sub transcript_display_xref_sources {
+  return qw(
+    HUGO
+    MarkerSymbol
+    wormbase_transcript
+    flybase_symbol
+    Anopheles_symbol
+    Genoscope_annotated_gene
+    Genoscope_predicted_transcript
+    Genoscope_predicted_gene
+    UniProtSwissProt
+    RefSeq
+    UniProtSPTrEMBL
+    LocusLink
+  );
+}
+
+
+# Return the zero-based index of the first element of @list that is exactly
+# equal (string comparison) to $item, or -1 if $item is undefined or absent.
+# Exact comparison is deliberate: treating $item as a regex let a partial or
+# metacharacter-bearing source name match the wrong priority entry.
+sub find_in_list {
+
+  my ($item, @list) = @_;
+
+  return -1 if !defined $item;
+
+  for my $i (0 .. $#list) {
+    return $i if defined $list[$i] && $list[$i] eq $item;
+  }
+  return -1;
+}
+
 1;
-- 
GitLab