From d31876bb85ee4e911d67334e9fc59df0ee88e5fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kusalananda=20K=C3=A4h=C3=A4ri?=
 <ak4@sanger.ac.uk>
Date: Thu, 8 Nov 2007 15:47:30 +0000
Subject: [PATCH] Simpler gene matching from transcript matches. Add "Was not
 best match" to unmapped reasons. Sort unmapped reasons.

---
 .../XrefMapper/CoordinateMapper.pm            | 116 ++++++++----------
 1 file changed, 53 insertions(+), 63 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefMapper/CoordinateMapper.pm b/misc-scripts/xref_mapping/XrefMapper/CoordinateMapper.pm
index a4b7ac9ec0..d29e46a9e8 100644
--- a/misc-scripts/xref_mapping/XrefMapper/CoordinateMapper.pm
+++ b/misc-scripts/xref_mapping/XrefMapper/CoordinateMapper.pm
@@ -22,8 +22,7 @@ our @EXPORT = qw( run_coordinatemapping );
 our $coding_weight = 2;
 our $ens_weight    = 3;
 
-our $transcript_score_threshold = 0.95;
-our $gene_score_threshold       = 0.95;
+our $transcript_score_threshold = 0.90;
 
 sub run_coordinatemapping {
   my ( $mapper, $do_upload ) = @_;
@@ -95,10 +94,8 @@ sub run_coordinatemapping {
   my $analysis_params =
     sprintf( "weights(coding,ensembl)="
                . "%.2f,%.2f;"
-               . "thresholds(transcript,gene)="
-               . "%.2f,%.2f",
-             $coding_weight, $ens_weight, $transcript_score_threshold,
-             $gene_score_threshold );
+               . "transcript_score_threshold=" . "%.2f",
+             $coding_weight, $ens_weight, $transcript_score_threshold );
 
   my $analysis_sql = qq(
     SELECT  analysis_id
@@ -146,7 +143,7 @@ sub run_coordinatemapping {
       'accession' => $xref->{'accession'},
       'reason'    => 'No overlap',
       'reason_full' =>
-        'No coordinate overlap with any Ensembl gene or transcript' };
+        'No coordinate overlap with any Ensembl transcript' };
   }
   $xref_sth->finish();
 
@@ -342,91 +339,82 @@ sub run_coordinatemapping {
         } ## end while ( $sth->fetch() )
         $sth->finish();
 
-        while ( my ( $coord_xref_id, $score ) =
-                each(%transcript_result) )
-        {
-          if ( !defined( $gene_result{$coord_xref_id} )
-               || $gene_result{$coord_xref_id} < $score )
-          {
-            $gene_result{$coord_xref_id} = $score;
-          }
-        }
-
         ################################################################
-        # Apply transcript threshold.                                  #
+        # Apply transcript threshold and pick the best match(es) for   #
+        # this transcript.                                             #
         ################################################################
 
-        foreach my $coord_xref_id ( keys(%transcript_result) ) {
+        my $best_score;
+        foreach my $coord_xref_id (
+             sort( { $transcript_result{$b} <=> $transcript_result{$a} }
+                   keys(%transcript_result) ) )
+        {
           my $score = $transcript_result{$coord_xref_id};
 
           if ( $score > $transcript_score_threshold ) {
-            if ( exists( $unmapped{$coord_xref_id} ) ) {
-              $mapped{$coord_xref_id} = $unmapped{$coord_xref_id};
-              delete( $unmapped{$coord_xref_id} );
-              $mapped{$coord_xref_id}{'reason'}      = undef;
-              $mapped{$coord_xref_id}{'reason_full'} = undef;
-            }
+            $best_score ||= $score;
+
+            if ( $score == $best_score ) {
+              if ( exists( $unmapped{$coord_xref_id} ) ) {
+                $mapped{$coord_xref_id} = $unmapped{$coord_xref_id};
+                delete( $unmapped{$coord_xref_id} );
+                $mapped{$coord_xref_id}{'reason'}      = undef;
+                $mapped{$coord_xref_id}{'reason_full'} = undef;
+              }
+
+              push( @{ $mapped{$coord_xref_id}{'mapped_to'} }, {
+                      'ensembl_id'          => $transcript->dbID(),
+                      'ensembl_object_type' => 'Transcript'
+                    } );
 
-            push( @{ $mapped{$coord_xref_id}{'mapped_to'} }, {
-                    'ensembl_id'          => $transcript->dbID(),
-                    'ensembl_object_type' => 'Transcript'
-                  } );
+              # This is now a candidate Xref for the gene.
+              if ( !defined( $gene_result{$coord_xref_id} )
+                   || $gene_result{$coord_xref_id} < $score )
+              {
+                $gene_result{$coord_xref_id} = $score;
+              }
+
+            } elsif ( exists( $unmapped{$coord_xref_id} ) ) {
+              $unmapped{$coord_xref_id}{'reason'} =
+                'Was not best match';
+              $unmapped{$coord_xref_id}{'reason_full'} =
+                sprintf(
+                       "Did not top best transcript match score (%.2f)",
+                       $best_score );
+            }
 
           } elsif ( exists( $unmapped{$coord_xref_id} ) ) {
             $unmapped{$coord_xref_id}{'reason'} =
               'Did not meet threshold';
             $unmapped{$coord_xref_id}{'reason_full'} =
-              sprintf( "Overlap score for transcript "
+              sprintf( "Match score for transcript "
                          . "lower than threshold (%.2f)",
                        $transcript_score_threshold );
           }
         } ## end foreach my $coord_xref_id (...
 
-        ################################################################
-        # For this transcript, pick out the best matche(s) from the    #
-        # ones passing the transcript threshold.                       #
-        ################################################################
-
-        # TODO
-
       } ## end foreach my $transcript ( sort...
 
       ##################################################################
-      # Apply gene threshold.                                          #
+      # Pick the best match(es) for this gene.                         #
       ##################################################################
 
-      foreach my $coord_xref_id ( keys(%gene_result) ) {
+      my $best_score;
+      foreach my $coord_xref_id (
+                         sort( { $gene_result{$b} <=> $gene_result{$a} }
+                               keys(%gene_result) ) )
+      {
         my $score = $gene_result{$coord_xref_id};
 
-        if ( $score > $gene_score_threshold ) {
-          if ( exists( $unmapped{$coord_xref_id} ) ) {
-            $mapped{$coord_xref_id} = $unmapped{$coord_xref_id};
-            delete( $unmapped{$coord_xref_id} );
-            $mapped{$coord_xref_id}{'reason'}      = undef;
-            $mapped{$coord_xref_id}{'reason_full'} = undef;
-          }
+        $best_score ||= $score;
 
+        if ( $score == $best_score ) {
           push( @{ $mapped{$coord_xref_id}{'mapped_to'} }, {
                   'ensembl_id'          => $gene->dbID(),
                   'ensembl_object_type' => 'Gene'
                 } );
-
-        } elsif ( exists( $unmapped{$coord_xref_id} ) ) {
-          $unmapped{$coord_xref_id}{'reason'} =
-            'Did not meet threshold';
-          $unmapped{$coord_xref_id}{'reason_full'} =
-            sprintf( "Overlap score for gene "
-                       . "lower than threshold (%.2f)",
-                     $gene_score_threshold );
         }
-      } ## end foreach my $coord_xref_id (...
-
-      ##################################################################
-      # For this gene, pick out the best matche(s) from the ones       #
-      # passing the gene threshold.                                    #
-      ##################################################################
-
-      # TODO
+      }
 
     } ## end while ( my $gene = shift(...
   } ## end foreach my $chromosome (@chromosomes)
@@ -540,7 +528,9 @@ sub dump_unmapped_reason {
 
   log_progress( "Dumping for 'unmapped_reason' to '%s'\n", $filename );
 
-  foreach my $reason ( values(%reasons) ) {
+  foreach my $reason (
+            sort( { $a->{'full'} cmp $b->{'full'} } values(%reasons) ) )
+  {
     # Assign 'unmapped_reason_id' to this reason.
     $reason->{'unmapped_reason_id'} = ++$unmapped_reason_id;
 
-- 
GitLab