From 26152269f07dc8090d6f67b686b196100bf2f541 Mon Sep 17 00:00:00 2001
From: Magali Ruffier <mr6@ebi.ac.uk>
Date: Thu, 27 Nov 2014 15:17:26 +0000
Subject: [PATCH] ENSCORESW-1116: first attempt at storing some form of scoring
 in the overlap mappings

---
 .../XrefParser/RefSeqCoordinateParser.pm      | 48 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqCoordinateParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqCoordinateParser.pm
index 4cdcd2ebce..acbb03ebc1 100644
--- a/misc-scripts/xref_mapping/XrefParser/RefSeqCoordinateParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/RefSeqCoordinateParser.pm
@@ -204,6 +204,10 @@ sub run_script {
       foreach my $transcript_of (sort { $a->start() <=> $b->start() } @$transcripts_of) {
         my %transcript_result;
         my %tl_transcript_result;
+        my %transcript_to_dbid;
+        my %translation_to_dbid;
+        my %query_overlap;
+        my %target_overlap;
         my $id = $transcript_of->stable_id();
         if ($id =~ /^XM_/) { next; }
         my $exons_of = $transcript_of->get_all_Exons();
@@ -235,11 +239,16 @@ sub run_script {
           my $exon_match = 0;
           my $tl_exons = $transcript->get_all_translateable_Exons();
           my $tl_exon_match = 0;
+          my $overall_overlap = 0;
+          my $overall_overlap_of = 0;
+          my $tl_overall_overlap = 0;
+          my $tl_overall_overlap_of = 0;
 
           foreach my $exon (@$exons) {
             my $start = $exon->seq_region_start();
             my $end = $exon->seq_region_end();
             my $overlap = $rr1->overlap_size('exon', $start, $end);
+            $overall_overlap += $overlap;
             $exon_match += $overlap/($end - $start + 1);
             $rr2->check_and_register('exon', $start, $end);
           }
@@ -248,6 +257,7 @@ sub run_script {
             my $tl_start = $tl_exon->seq_region_start();
             my $tl_end = $tl_exon->seq_region_end();
             my $tl_overlap = $rr3->overlap_size('exon', $tl_start, $tl_end);
+            $tl_overall_overlap += $tl_overlap;
             $tl_exon_match += $tl_overlap/($tl_end - $tl_start + 1);
             $rr4->check_and_register('exon', $tl_start, $tl_end);
           }
@@ -255,11 +265,12 @@ sub run_script {
           my $exon_match_of = 0;
           my $tl_exon_match_of = 0;
 
-# Look for oeverlap between the two sets of exons
+# Look for overlap between the two sets of exons
           foreach my $exon_of (@$exons_of) {
             my $start_of = $exon_of->seq_region_start();
             my $end_of = $exon_of->seq_region_end();
             my $overlap_of = $rr2->overlap_size('exon', $start_of, $end_of);
+            $overall_overlap_of += $overlap_of;
             $exon_match_of += $overlap_of/($end_of - $start_of + 1);
           }
 
@@ -267,6 +278,7 @@ sub run_script {
             my $tl_start_of = $tl_exon_of->seq_region_start();
             my $tl_end_of = $tl_exon_of->seq_region_end();
             my $tl_overlap_of = $rr4->overlap_size('exon', $tl_start_of, $tl_end_of);
+            $tl_overall_overlap_of += $tl_overlap_of;
             $tl_exon_match_of += $tl_overlap_of/($tl_end_of - $tl_start_of + 1);
           }
 
@@ -275,10 +287,19 @@ sub run_script {
           my $tl_score = 0;
           if (scalar(@$tl_exons_of) > 0) {
             $tl_score = ( ($tl_exon_match_of + $tl_exon_match)) / (scalar(@$tl_exons_of) + scalar(@$tl_exons) );
+            if ($transcript->translation) {
+              my $tl_stable_id = $transcript->translation->stable_id;
+              $translation_to_dbid{$tl_stable_id} = $transcript->translation->stable_id;
+              $query_overlap{$tl_stable_id} = $tl_overall_overlap_of/$transcript_of->translation->length();
+              $target_overlap{$tl_stable_id} = $tl_overall_overlap/$transcript->translation->length();
+            }
           }
           if ($transcript->biotype eq $transcript_of->biotype) {
             $transcript_result{$transcript->stable_id} = $score;
+            $transcript_to_dbid{$transcript->stable_id} = $transcript->dbID;
             $tl_transcript_result{$transcript->stable_id} = $tl_score;
+            $query_overlap{$transcript->stable_id} = $overall_overlap_of/$transcript_of->length() * 100;
+            $target_overlap{$transcript->stable_id} = $overall_overlap/$transcript->length() * 100;
           }
         }
 
@@ -318,6 +339,17 @@ sub run_script {
                                           species_id => $species_id,
                                           info_type => 'DIRECT' });
           $self->add_direct_xref($xref_id, $best_id, "Transcript", "");
+          my $ensembl_id = $transcript_to_dbid{$best_id};
+          my $object_xref_id = $self->add_object_xref({
+             xref_id     => $xref_id,
+             ensembl_id  => $ensembl_id,
+             object_type => 'Transcript'});
+## Add 'identity_xref' to store the overlap values
+          $self->add_identity_xref({
+             object_xref_id  => $object_xref_id,
+             query_identity  => $query_overlap{$best_id},
+             target_identity => $target_overlap{$best_id},
+             score           => $best_score });
 
 # Also store refseq protein as direct xref for ensembl translation, if translation exists
           my $ta_of = $otherf_dba->get_TranscriptAdaptor();
@@ -329,6 +361,7 @@ sub run_script {
           if (defined $tl && defined $tl_of) {
             if ($tl_of->seq eq $tl->seq) {
               ($acc, $version) = split(/\./, $tl_of->stable_id());
+              my $tl_stable_id = $tl->stable_id;
               my $tl_xref_id = $self->add_xref({ acc => $acc,
                                               version => $version,
                                               label => $acc,
@@ -336,7 +369,18 @@ sub run_script {
                                               source_id => $peptide_source_id,
                                               species_id => $species_id,
                                               info_type => 'DIRECT' });
-              $self->add_direct_xref($tl_xref_id, $tl->stable_id(), "Translation", "");
+              $self->add_direct_xref($tl_xref_id, $tl_stable_id, "Translation", "");
+              my $tl_ensembl_id = $translation_to_dbid{$tl_stable_id};
+              my $tl_object_xref_id = $self->add_object_xref({
+                 xref_id     => $tl_xref_id,
+                 ensembl_id  => $tl_ensembl_id,
+                 object_type => 'Translation'});
+## Add 'identity_xref' to store the overlap values
+              $self->add_identity_xref({
+                 object_xref_id  => $tl_object_xref_id,
+                 query_identity  => $query_overlap{$tl_stable_id},
+                 target_identity => $target_overlap{$tl_stable_id},
+                 score           => $best_score });
             }
           }
         }
-- 
GitLab