From 1ac3b437eb68a37c97bdf23d012c1055942b0c1c Mon Sep 17 00:00:00 2001 From: Glenn Proctor <gp1@sanger.ac.uk> Date: Thu, 16 Dec 2004 11:02:30 +0000 Subject: [PATCH] Fixed calculation of transcript display_xrefs; wasn't considering translation-transcript relationship properly. Now use | as separator character in composite hash keys; : was a bad choice since it appears in GO xrefs. Don't write xref/object_xrefs for sources that have no corresponding external_db entry (but warn about this). --- .../xref_mapping/XrefMapper/BasicMapper.pm | 147 ++++++++++++------ 1 file changed, 96 insertions(+), 51 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm index 49e70d3e51..4731268a01 100644 --- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm +++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm @@ -791,7 +791,7 @@ sub parse_mappings { $ensembl_object_types{$target_id} = $type; # store mapping for later - note NON-OFFSET xref_id is used - my $key = $type . ":" . $target_id; + my $key = $type . "|" . $target_id; my $xref_id = $query_id; push @{$object_xref_mappings{$key}}, $xref_id; @@ -965,9 +965,11 @@ sub dump_core_xrefs { if (!$xrefs_written{$xref_id}) { my $external_db_id = $source_to_external_db{$source_id}; - print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n"; - $xrefs_written{$xref_id} = 1; - $source_ids{$source_id} = $source_id; + if ($external_db_id) { # skip "unknown" sources + print XREF ($xref_id+$xref_id_offset) . "\t" . $external_db_id . "\t" . $accession . "\t" . $label . "\t" . $version . "\t" . $description . "\n"; + $xrefs_written{$xref_id} = 1; + $source_ids{$source_id} = $source_id; + } } } @@ -981,6 +983,8 @@ sub dump_core_xrefs { while ($dep_sth->fetch()) { my $external_db_id = $source_to_external_db{$source_id}; + next if (!$external_db_id); + $label = $accession if (!$label); if (!$xrefs_written{$xref_id}) { @@ -991,17 +995,17 @@ sub dump_core_xrefs { # create an object_xref linking this (dependent) xref with any objects it maps to # write to file and add to object_xref_mappings - if (defined $xref_to_objects{$master_xref_id}) { # XXX check - my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} ); # XXX check + if (defined $xref_to_objects{$master_xref_id}) { + my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} ); #print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n"; foreach my $object_id (@ensembl_object_ids) { my $type = $ensembl_object_types{$object_id}; - my $full_key = $type.":".$object_id.":".$xref_id; + my $full_key = $type."|".$object_id."|".$xref_id; if (!$object_xrefs_written{$full_key}) { print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tDEPENDENT\n"; $object_xref_id++; # Add this mapping to the list - note NON-OFFSET xref_id is used - my $key = $type . ":" . $object_id; + my $key = $type . "|" . $object_id; push @{$object_xref_mappings->{$key}}, $xref_id; $object_xrefs_written{$full_key} = 1; } @@ -1097,9 +1101,10 @@ sub build_transcript_display_xrefs { print "Got " . scalar(keys %xref_to_source) . " xref-source mappings\n"; - # Cache the list of translation->transcript mappings + # Cache the list of translation->transcript mappings & vice versa print "Building translation to transcript mappings\n"; my %translation_to_transcript; + my %transcript_to_translation; my $sth = $self->dbi()->prepare("SELECT translation_id, transcript_id FROM translation"); $sth->execute(); @@ -1108,23 +1113,20 @@ sub build_transcript_display_xrefs { while ($sth->fetch()) { $translation_to_transcript{$translation_id} = $transcript_id; + $transcript_to_translation{$transcript_id} = $translation_id if ($translation_id); } print "Building transcript display_xrefs\n"; my @priorities = $self->transcript_display_xref_sources(); - open (TRANSCRIPT_DX, ">transcript_display_xref.sql"); - my $n = 0; - # go through each object/xref mapping - # store the best ones as we go along - # hash keyed on transcript id, value is xref_id:source prioirity index - # xref is stored with offset added - # Note xrefs to translations are also considered; transcript ID is always stored - my %transcript_display_xrefs; + + # go through each object/xref mapping and store the best ones as we go along + my %obj_to_best_xref; + foreach my $key (keys %{$object_xref_mappings}) { - my ($type, $obj) = split /:/, $key; + my ($type, $object_id) = split /\|/, $key; next if ($type !~ /(Transcript|Translation)/i); @@ -1134,6 +1136,7 @@ sub build_transcript_display_xrefs { my ($best_xref, $best_xref_priority_idx); $best_xref_priority_idx = 99999; foreach my $xref (@xrefs) { + my $source = $xref_to_source{$xref}; if ($source) { my $i = find_in_list($source, @priorities); @@ -1145,29 +1148,67 @@ sub build_transcript_display_xrefs { warn("Couldn't find a source for xref $xref \n"); } } + # store object type, id, and best xref id and source priority + if ($best_xref) { + $obj_to_best_xref{$key} = $best_xref . "|" . $best_xref_priority_idx; + } - if (!$best_xref) { - #warn("Couldn't find a display xref for transcript id $obj\n"); - } else { + } + + # Now go through each of the calculated best xrefs and convert any that are + # calculated against translations to be associated with their transcript, + # if the priority of the translation xref is higher than that of the transcript + # xref. + # Needs to be done this way to avoid clobbering higher-priority transcripts. + + # hash keyed on transcript id, value is xref_id|source prioirity index + my %transcript_display_xrefs; + + # Write a .sql file that can be executed, and a .txt file that can be processed + open (TRANSCRIPT_DX, ">transcript_display_xref.sql"); + open (TRANSCRIPT_DX_TXT, ">transcript_display_xref.txt"); + + foreach my $key (keys %obj_to_best_xref) { + + my ($type, $object_id) = split /\|/, $key; + my ($best_xref, $best_xref_priority_idx) = split /\|/, $obj_to_best_xref{$object_id}; + + # If transcript has a translation, use the best xref out of the transcript & translation + if ($type =~ /Transcript/i) { + my $transcript_id = $object_id; + my $translation_id = $transcript_to_translation{$transcript_id}; + if ($translation_id) { + my ($translation_xref, $translation_priority) = split /\|/, $obj_to_best_xref{"Translation|$translation_id"}; + my ($transcript_xref, $transcript_priority) = split /\|/, $obj_to_best_xref{"Transcript|$transcript_id"}; + + if ($translation_priority < $transcript_priority) { + $best_xref = $translation_xref; + $best_xref_priority_idx = $translation_priority; + } else { + $best_xref = $transcript_xref; + $best_xref_priority_idx = $transcript_priority; + } - # If transcript, store directly - # If translation, lookup transcript id - my $object_id; - if ($type =~ /Transcript/i) { - $object_id = $obj; - } elsif ($type =~ /Translation/i) { - $object_id = $translation_to_transcript{$obj}; } + } + + if ($best_xref) { + # Write record with xref_id_offset print TRANSCRIPT_DX "UPDATE transcript SET display_xref_id=" . ($best_xref+$xref_id_offset) . " WHERE transcript_id=" . $object_id . ";\n"; + print "wrote " . $best_xref . " (plus offset) for 95625\n" if ($object_id eq 95625); + print TRANSCRIPT_DX_TXT ($best_xref+$xref_id_offset) . "\t" . $object_id . "\n"; $n++; - my $value = ($best_xref+$xref_id_offset) . ":" . $best_xref_priority_idx; + + my $value = ($best_xref+$xref_id_offset) . "|" . $best_xref_priority_idx; $transcript_display_xrefs{$object_id} = $value; + } } close(TRANSCRIPT_DX); + close(TRANSCRIPT_DX_TXT); print "Wrote $n transcript display_xref entries to transcript_display_xref.sql\n"; @@ -1213,6 +1254,7 @@ sub build_gene_display_xrefs { print "Assigning display_xrefs to genes\n"; open (GENE_DX, ">gene_display_xref.sql"); + open (GENE_DX_TXT, ">gene_display_xref.txt"); my $hit = 0; my $miss = 0; my $trans_no_xref = 0; @@ -1231,9 +1273,10 @@ sub build_gene_display_xrefs { } else { $trans_xref++; } - my ($xref_id, $priority) = split (/:/, $transcript_display_xrefs->{$transcript_id}); + my ($xref_id, $priority) = split (/\|/, $transcript_display_xrefs->{$transcript_id}); #print "gene $gene_id orig:" . $transcript_display_xrefs->{$transcript_id} . " xref id: " . $xref_id . " pri " . $priority . "\n"; # 2 separate if clauses to avoid having to fetch transcripts unnecessarily + if (($priority lt $best_xref_priority_idx)) { $best_xref_priority_idx = $priority; @@ -1252,21 +1295,21 @@ sub build_gene_display_xrefs { } } - if (!$best_xref) { - #XXXwarn("Couldn't find a display xref for gene id $gene_id\n"); - $miss++; - } else{ - # Write record + if ($best_xref) { + # Write record print GENE_DX "UPDATE gene SET display_xref_id=" . $best_xref . " WHERE gene_id=" . $gene_id . ";\n"; + print GENE_DX_TXT $best_xref . "\t" . $gene_id ."\n"; $hit++; } } close (GENE_DX); + close (GENE_DX_TXT); print "Transcripts with no xrefs: $trans_no_xref with xrefs: $trans_xref\n"; print "Wrote $hit gene display_xref entries to gene_display_xref.sql\n"; - print "Couldn't find display_xrefs for $miss genes\n"; + print "Couldn't find display_xrefs for $miss genes\n" if ($miss > 0); + print "Found display_xrefs for all genes\n" if ($miss eq 0); } @@ -1291,19 +1334,20 @@ sub transcript_display_xref_sources { } -# Find the index of an item in a list(ref), or -1 if it's not in the list +# Find the index of an item in a list(ref), or 999999 if it's not in the list. +# Only look for exact matches (case insensitive) sub find_in_list { my ($item, @list) = @_; for (my $i = 0; $i < scalar(@list); $i++) { - if ($list[$i] =~ /$item/) { + if (lc($list[$i]) eq lc($item)) { return $i; } } - return -1; + return 999999; } @@ -1337,7 +1381,7 @@ sub map_source_to_external_db { } else { - print STDERR "Can't find external_db entry for source name $source_name\n" + print STDERR "Can't find external_db entry for source name $source_name; xrefs for this source will not be written. Consider adding $source_name to external_db\n" } @@ -1379,29 +1423,30 @@ sub do_upload { # gene_display_xref.sql etc foreach my $table ("gene", "transcript") { - my $file = getcwd() . "/" . $table . "_display_xref.sql"; + my $file = getcwd() . "/" . $table . "_display_xref.txt"; my $sth; if ($deleteexisting) { $sth = $self->dbi()->prepare("UPDATE $table SET display_xref_id=NULL"); - print "Setting all existing display_xref_id in $table to NULL\n"; + print "Setting all existing display_xref_id in $table to null\n"; $sth->execute(); } - # is this nicer than using the mysql client? print "Setting $table display_xrefs from $file\n"; - open(DISPLAY_XREF, $file); - while(<DISPLAY_XREF>) { - - $sth = $self->dbi()->prepare($_); - $sth->execute(); - + # TODO this better + #my $str = "mysql -u " .$self->user() ." -p" . $self->password() . " -h " . $self->host() ." -P " . $self->port() . " " .$self->dbname() . " < $file"; + #system $str; + + $sth = $self->dbi()->prepare("UPDATE $table SET display_xref_id=? WHERE ${table}_id=?"); + open(DX_TXT, $file); + while (<DX_TXT>) { + my ($xref_id, $object_id) = split; + $sth->execute($xref_id, $object_id); } - close(DISPLAY_XREF); - + close(DX_TXT); } } -- GitLab