From 6d276ea9aea70de07e877efab74e828a113e68fd Mon Sep 17 00:00:00 2001
From: Graham McVicker <mcvicker@sanger.ac.uk>
Date: Wed, 4 Feb 2004 18:03:56 +0000
Subject: [PATCH] keep track of types of mapping problems now

---
 misc-scripts/chimp/human2chimp.pl | 291 +++++++++++++++---------------
 1 file changed, 146 insertions(+), 145 deletions(-)

diff --git a/misc-scripts/chimp/human2chimp.pl b/misc-scripts/chimp/human2chimp.pl
index 6706d8c9ae..968403f0cb 100644
--- a/misc-scripts/chimp/human2chimp.pl
+++ b/misc-scripts/chimp/human2chimp.pl
@@ -9,89 +9,139 @@ use Bio::EnsEMBL::Exon;
 
 use Bio::EnsEMBL::Utils::Exception qw(throw);
 
-my $MAX_CODING_INDEL     = 12; #max allowed coding indel in exon
-my $MAX_FRAMESHIFT_INDEL = 5; #max allowed frameshifting indel in exon
+use ErrCode qw(get_err set_err ec2str);
 
+my $MAX_CODING_INDEL     = 18; #max allowed coding indel in exon
+my $MAX_FRAMESHIFT_INDEL = 8; #max allowed frameshifting indel in exon
 
-my ($hhost, $hdbname, $huser, $hpass, $hport, $hassembly,  #human vars
-    $chost, $cdbname, $cuser, $cpass, $cport, $cassembly,  #chimp vars
-    $help,  $verbose);
+my $verbose;
 
-GetOptions('hhost=s'   => \$hhost,
-           'hdbname=s' => \$hdbname,
-           'huser=s'   => \$huser,
-           'hpass=s'   => \$hpass,
-           'hport=i'   => \$hport,
-           'hassembly=s' => \$hassembly,
-           'chost=s'   => \$chost,
-           'cdbname=s' => \$cdbname,
-           'cuser=s'   => \$cuser,
-           'cpass=s'   => \$cpass,
-           'cport=i'   => \$cport,
-           'cassembly=s' => \$cassembly,
-           'help'      => \$help,
-           'verbose'   => \$verbose);
+{  #block to avoid namespace pollution
 
+  my ($hhost, $hdbname, $huser, $hpass, $hport, $hassembly,  #human vars
+      $hchromosome,
+      $chost, $cdbname, $cuser, $cpass, $cport, $cassembly,  #chimp vars
+      $help);
 
-usage() if($help);
-usage("-hdbname option is required") if (!$hdbname);
-usage("-cdbname option is required") if (!$cdbname);
+  GetOptions('hhost=s'   => \$hhost,
+             'hdbname=s' => \$hdbname,
+             'huser=s'   => \$huser,
+             'hpass=s'   => \$hpass,
+             'hport=i'   => \$hport,
+             'hassembly=s' => \$hassembly,
+             'hchromosome=s' => \$hchromosome,
+             'chost=s'   => \$chost,
+             'cdbname=s' => \$cdbname,
+             'cuser=s'   => \$cuser,
+             'cpass=s'   => \$cpass,
+             'cport=i'   => \$cport,
+             'cassembly=s' => \$cassembly,
+             'help'      => \$help,
+             'verbose'   => \$verbose);
 
-$hport ||= 3306;
-$cport ||= 3306;
 
-$hdbname ||= 'localhost';
-$cdbname ||= 'localhost';
+  usage() if($help);
+  usage("-hdbname option is required") if (!$hdbname);
+  usage("-cdbname option is required") if (!$cdbname);
 
-$hassembly ||= 'NCBI34';
-$cassembly ||= 'BROAD1';
+  $hport ||= 3306;
+  $cport ||= 3306;
 
-debug("Connecting to human database");
+  $hdbname ||= 'localhost';
+  $cdbname ||= 'localhost';
 
-my $human_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new
-  (-host    => $hhost,
-   -dbname => $hdbname,
-   -pass   => $hpass,
-   -user   => $huser,
-   -port   => $hport);
+  $hassembly ||= 'NCBI34';
+  $cassembly ||= 'BROAD1';
 
-debug("Connecting to chimp database");
+  debug("Connecting to human database");
 
-my $chimp_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new
-  (-host    => $chost,
-   -dbname  => $cdbname,
-   -pass    => $cpass,
-   -user    => $cuser,
-   -port    => $cport);
+  my $human_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new
+    (-host    => $hhost,
+     -dbname => $hdbname,
+     -pass   => $hpass,
+     -user   => $huser,
+     -port   => $hport);
 
+  debug("Connecting to chimp database");
 
-$human_db->dnadb($chimp_db);
+  my $chimp_db = Bio::EnsEMBL::DBSQL::DBAdaptor->new
+    (-host    => $chost,
+     -dbname  => $cdbname,
+     -pass    => $cpass,
+     -user    => $cuser,
+     -port    => $cport);
 
-my $slice_adaptor   = $human_db->get_SliceAdaptor();
-my $gene_adaptor    = $human_db->get_GeneAdaptor();
 
+  $human_db->dnadb($chimp_db);
 
-debug("Fetching chromosomes");
+  my $slice_adaptor   = $human_db->get_SliceAdaptor();
+  my $gene_adaptor    = $human_db->get_GeneAdaptor();
 
-my $slices = $slice_adaptor->fetch_all('chromosome', $hassembly);
 
-foreach my $slice (@$slices) {
+  debug("Fetching chromosomes");
 
-  debug("Chromosome: " . $slice->seq_region_name());
-  debug("Fetching Genes");
+  my $slices;
 
-  my $genes = $gene_adaptor->fetch_all_by_Slice($slice);
+  if($hchromosome) {
+    my $slice = $slice_adaptor->fetch_by_region('chromosome',
+                                                $hchromosome,
+                                                undef, undef, undef,
+                                                $hassembly);
+    if(!$slice) {
+      throw("unknown chromosome $hchromosome");
+    }
+
+    $slices = [$slice];
+  } else {
+    $slices = $slice_adaptor->fetch_all('chromosome', $hassembly);
+  }
+
+  my %err_hash;
+  my $total_transcripts = 0;
+  my $mapped_transcripts = 0;
+
+  foreach my $slice (@$slices) {
+
+    debug("Chromosome: " . $slice->seq_region_name());
+    debug("Fetching Genes");
+
+    my $genes = $gene_adaptor->fetch_all_by_Slice($slice);
+
+    foreach my $gene (reverse @$genes) {
+      debug("Gene: ".$gene->stable_id);
+      my $transcripts = $gene->get_all_Transcripts();
+
+      foreach my $transcript (@$transcripts) {
+        next if(!$transcript->translation);
 
-  foreach my $gene (reverse @$genes) {
-    debug("Gene: ".$gene->stable_id);
-    my $transcripts = $gene->get_all_Transcripts();
+        $total_transcripts++;
 
-    foreach my $transcript (@$transcripts) {
+        my @transcripts = transfer_transcript($human_db,$chimp_db,$transcript,
+                                             $hassembly, $cassembly);
 
-      my @transcripts = transfer_transcript($human_db,$chimp_db,$transcript);
+        if(@transcripts) {
+          foreach my $transcript (@transcripts) {
+            debug("\nTranslation:" . $transcript->translate()->seq() . "\n\n");
+          }
+          $mapped_transcripts++;
+        } else {
+          $err_hash{get_err()}++ if(!@transcripts);
+        }
 
+        print STDERR "Mapped/Total Transcripts: " .
+          "$mapped_transcripts/$total_transcripts\n";
+
+      }
     }
   }
+
+  print STDERR "==Error Report==\n";
+  print STDERR "#Occurances\tCode:Description\n";
+  print STDERR "-----------\t----------------\n";
+
+  foreach my $ec (sort {$a <=> $b} keys %err_hash) {
+    print STDERR $err_hash{$ec}, "\t\t$ec:", ec2str($ec), "\n";
+  }
 }
 
 
@@ -105,6 +155,8 @@ sub transfer_transcript {
   my $human_db   = shift;
   my $chimp_db   = shift;
   my $transcript = shift;
+  my $hassembly  = shift;
+  my $cassembly  = shift;
 
   debug("Transcript: " . $transcript->stable_id());
 
@@ -159,7 +211,8 @@ sub transfer_transcript {
 
         } else {
           ### TBD: can do more sensible checking and maybe split transcript
-          debug('entire coding exon deletion - discarding transcript');
+          set_err(ErrCode::EXON_DELETE_CODING);
+
           return ();
         }
       } else {
@@ -174,12 +227,6 @@ sub transfer_transcript {
         $chimp_cdna_pos += $c->length();
         $cdna_exon_start += $c->length();
 
-      #debug("exon mapped - incrementing chimp_cdna_pos and cdna_exon_start\n" .
-      #        "  exon_len = " . $c->length() . "\n" .
-      #        "  chimp_cdna_pos  = $chimp_cdna_pos\n" .
-      #        "  cdna_exon_start = $cdna_exon_start"); 
-
-
         push @chimp_exons, [$c->start(), $c->end(), $c->strand(), $c->id()];
       }
     } else {
@@ -190,7 +237,7 @@ sub transfer_transcript {
       if(!$result) {
         #failed to obtain extent of coords due to scaffold spanning
         #strand flipping, or exon inversion
-	
+
         ### TBD - fix this, may be ok to drop exon and continue esp. if exon
         ###       is entirely UTR
         return ();
@@ -241,7 +288,8 @@ sub transfer_transcript {
 
             #sanity check:
             if($insert_len < 0) {
-              throw("Unexpected - negative insert - undetected exon inversion?");
+              throw("Unexpected - negative insert " .
+                    "- undetected exon inversion?");
             }
 
             if($insert_len > 0) {
@@ -249,8 +297,6 @@ sub transfer_transcript {
               #
               # insert in chimp, deletion in human
               #
-              #debug("before insertion processing\n" .
-              #      "  cdna_exon_start = $cdna_exon_start");
 
               my $result = 
                 process_insertion(\$chimp_cdna_pos, $insert_len,
@@ -259,9 +305,6 @@ sub transfer_transcript {
                                   \$cdna_coding_start, \$cdna_coding_end,
                                  $exon_seq_region);
 
-              #debug("after insertion processing\n" .
-              #      "  cdna_exon_start = $cdna_exon_start");
-
               return () if(!defined($result));
 
               push @chimp_exons, @$result;
@@ -272,20 +315,11 @@ sub transfer_transcript {
 
           $chimp_cdna_pos += $c->length();
 
-          #debug("match - incremented chimp_cdna_pos\n" .
-          #      "  match_len = " . $c->length() . "\n" .
-          #      "  chimp_cdna_pos = $chimp_cdna_pos");
-
         }
       }  # foreach coord
 
       $cdna_exon_start += $exon_end - $exon_start + 1;
 
-      #debug("exon complete - incremented cdna_exon_start\n" .
-      #      "  exon_len = " . ($exon_end - $exon_start + 1) .
-      #      "  cdna_exon_start = $cdna_exon_start\n");
-
-
       push @chimp_exons, [$exon_start, $exon_end, $exon_strand,
                           $exon_seq_region];
     }
@@ -293,13 +327,10 @@ sub transfer_transcript {
 
   my $slice_adaptor = $chimp_db->get_SliceAdaptor();
 
-  my @transcripts = create_transcripts(\@chimp_exons,
-                                       $cdna_coding_start, $cdna_coding_end,
-                                       $slice_adaptor);
+  return create_transcripts(\@chimp_exons,
+                            $cdna_coding_start, $cdna_coding_end,
+                            $slice_adaptor);
 
-  foreach my $transcript (@transcripts) {
-    debug("\nTranslation:" . $transcript->translate()->seq() . "\n\n");
-  }
 
 }
 
@@ -339,11 +370,8 @@ sub process_deletion {
   my $exon_len = $$exon_end_ref - $$exon_start_ref + 1;
   my $cdna_exon_end = $$cdna_exon_start_ref + $exon_len - 1;
 
-  #debug("processing deletion");
-  #debug("  del_start = $del_start");
-  #debug("  del_end   = $del_end");
-
-  #sanity check, deletion should be completely in or adjacent to exon boundaries
+  # sanity check, deletion should be completely in
+  # or adjacent to exon boundaries
   if($del_start < $$cdna_exon_start_ref - 1 ||
      $del_start > $cdna_exon_end + 1) {
 
@@ -362,7 +390,7 @@ sub process_deletion {
      $del_end >= $$cdna_coding_end_ref) {
 
     # nothing can be done with this exon since entire CDS is deleted
-    debug("entire cds deleted - discarding transcript");
+    set_err(ErrCode::PART_EXON_CDS_DELETE_ENTIRE);
     return undef;
   }
 
@@ -379,8 +407,7 @@ sub process_deletion {
 
     if($cds_del_len > $MAX_CODING_INDEL) {
       # too much coding sequence has been deleted
-      debug("cds deletion is too long ($cds_del_len) - " .
-            "discarding transcript");
+      set_err(ErrCode::PART_EXON_CDS_DELETE_TOO_LONG);
       return undef;
     }
 
@@ -396,8 +423,7 @@ sub process_deletion {
     if($frameshift) {
       if($cds_del_len > $MAX_FRAMESHIFT_INDEL) {
         # frameshift deletion is too long
-        debug("frameshift deletion is too long ($cds_del_len) " .
-              "- discarding transcript");
+        set_err(ErrCode::PART_EXON_CDS_FRAMESHIFT_DELETE_TOO_LONG);
         return undef;
       }
 
@@ -406,7 +432,7 @@ sub process_deletion {
       $$cdna_coding_start_ref += 3 - $frameshift;
 
       if($$cdna_coding_start_ref >= $$cdna_coding_end_ref) {
-        debug("no cds left after shift - discarding transcript");
+        set_err(ErrCode::NO_CDS_LEFT);
         return undef;
       }
     }
@@ -425,8 +451,7 @@ sub process_deletion {
 
     if($cds_del_len > $MAX_CODING_INDEL) {
       # too much coding sequence has been deleted
-      debug("cds deletion is too long ($cds_del_len) - " .
-            "discarding transcript");
+      set_err(ErrCode::PART_EXON_CDS_DELETE_TOO_LONG);
       return undef;
     }
 
@@ -437,9 +462,8 @@ sub process_deletion {
 
     if($frameshift) {
       if($cds_del_len > $MAX_FRAMESHIFT_INDEL) {
-        debug("frameshift deletion is too long ($cds_del_len) " .
-              "- discarding transcript");
-      return undef;
+        set_err(ErrCode::PART_EXON_CDS_FRAMESHIFT_DELETE_TOO_LONG);
+        return undef;
       }
 
       #move up CDS end to put reading frame back (shrink CDS)
@@ -447,7 +471,7 @@ sub process_deletion {
       $$cdna_coding_end_ref -= 3 - $frameshift;
 
       if($$cdna_coding_start_ref >= $$cdna_coding_end_ref) {
-        debug("no cds left after shift - discarding transcript");
+        set_err(ErrCode::NO_CDS_LEFT);
         return undef;
       }
     }
@@ -461,8 +485,7 @@ sub process_deletion {
 
     if($del_len > $MAX_CODING_INDEL) {
       # too much coding sequence has been deleted
-      debug("cds deletion is too long ($del_len) - " .
-            "discarding transcript");
+      set_err(ErrCode::PART_EXON_CDS_DELETE_TOO_LONG);
       return undef;
     }
 
@@ -473,8 +496,7 @@ sub process_deletion {
 
     if($frameshift) {
       if($del_len > $MAX_FRAMESHIFT_INDEL) {
-        debug("frameshift deletion is too long ($del_len) " .
-              "- discarding transcript");
+        set_err(ErrCode::PART_EXON_CDS_FRAMESHIFT_DELETE_TOO_LONG);
         return undef;
       }
 
@@ -483,7 +505,7 @@ sub process_deletion {
       $$cdna_coding_start_ref += 3 - $frameshift;
 
       if($$cdna_coding_start_ref >= $$cdna_coding_end_ref) {
-        debug("no cds left after shift - discarding transcript");
+        set_err(ErrCode::NO_CDS_LEFT);
         return undef;
       }
     }
@@ -497,8 +519,7 @@ sub process_deletion {
 
     if($del_len > $MAX_CODING_INDEL) {
       # too much coding sequence has been deleted
-      debug("cds deletion is too long ($del_len) - " .
-            "discarding transcript");
+      set_err(ErrCode::PART_EXON_CDS_DELETE_TOO_LONG);
       return undef;
     }
 
@@ -509,8 +530,7 @@ sub process_deletion {
 
     if($frameshift) {
       if($del_len > $MAX_FRAMESHIFT_INDEL) {
-        debug("frameshift deletion is too long ($del_len) " .
-              "- discarding transcript");
+        set_err(ErrCode::PART_EXON_CDS_FRAMESHIFT_DELETE_TOO_LONG);
         return undef;
       }
 
@@ -518,7 +538,7 @@ sub process_deletion {
       $$cdna_coding_end_ref -= 3 - $frameshift;
 
       if($$cdna_coding_start_ref >= $$cdna_coding_end_ref) {
-        debug("no cds left after shift - discarding transcript");
+        set_err(ErrCode::NO_CDS_LEFT);
         return undef;
       }
     }
@@ -532,9 +552,7 @@ sub process_deletion {
     debug("delete ($del_len) in middle of cds");
 
     if($del_len > $MAX_CODING_INDEL) {
-      # too much coding sequence has been deleted
-      debug("cds deletion is too long ($del_len) - " .
-            "discarding transcript");
+      set_err(ErrCode::PART_EXON_CDS_DELETE_TOO_LONG);
       return undef;
     }
 
@@ -545,8 +563,7 @@ sub process_deletion {
 
     if($frameshift) {
       if($del_len > $MAX_FRAMESHIFT_INDEL) {
-        debug("frameshift deletion is too long ($del_len) " .
-              " - discarding transcript");
+        set_err(ErrCode::PART_EXON_CDS_DELETE_TOO_LONG);
         return undef;
       }
 
@@ -673,9 +690,6 @@ sub process_insertion {
   my $exon_len      = $$exon_end_ref - $$exon_start_ref + 1;
   my $cdna_exon_end = $$cdna_exon_start_ref + $exon_len - 1;
 
-  #debug("processing insertion");
-  #debug("  ins_left = $$cdna_ins_pos_ref");
-  #debug("  ins_len  = $insert_len");
 
   # sanity check, insert should be completely in exon boundaries
   if($$cdna_ins_pos_ref < $$cdna_exon_start_ref ||
@@ -688,8 +702,8 @@ sub process_insertion {
     if($$cdna_ins_pos_ref < $$cdna_exon_start_ref  &&
        $$cdna_ins_pos_ref + 3 >= $$cdna_exon_start_ref  ) {
       ### TBD not sure what should be done with this situation
-      debug("insert following the introduction of a frameshift intron" .
-            "and a very small match - confused and bailing on transcript");
+      set_err(ErrCode::PART_EXON_CONFUSED);
+
       return ();
     }
 
@@ -712,8 +726,7 @@ sub process_insertion {
 
     if($insert_len > $MAX_CODING_INDEL) {
       # too much coding sequence has been inserted
-      debug("cds insertion is too long ($insert_len) - " .
-            "discarding transcript");
+      set_err(ErrCode::PART_EXON_CDS_INSERT_TOO_LONG);
       return undef;
     }
 
@@ -724,8 +737,7 @@ sub process_insertion {
 
     if($frameshift) {
       if($insert_len > $MAX_FRAMESHIFT_INDEL) {
-        debug("frameshift insertion is too long ($insert_len) - " .
-              "discarding transcript");
+        set_err(ErrCode::PART_EXON_CDS_FRAMESHIFT_INSERT_TOO_LONG);
         return undef;
       }
 
@@ -818,21 +830,14 @@ sub get_coords_extent {
 
   my($start, $end, $strand, $seq_region);
 
-  #debug("calculating coord span");
-
   foreach my $c (@$coords) {
     next if($c->isa('Bio::EnsEMBL::Mapper::Gap'));
 
-    #debug(" coord_start  = " . $c->start );
-    #debug(" coord_end    = " . $c->end   );
-    #debug(" coord_strand = " . $c->strand );
-    #debug(" coord_id     = " . $c->id);
-
     if(!defined($seq_region)) {
       $seq_region = $c->id();
     }
     elsif($seq_region ne $c->id()) {
-      debug("coords spans multiple scaffolds - unable to get extent");
+      set_err(ErrCode::EXON_SCAFFOLD_SPAN);
       return undef;
     }
 
@@ -840,8 +845,7 @@ sub get_coords_extent {
       $strand = $c->strand();
     }
     elsif($strand != $c->strand()) {
-      debug("coords flip strands - unable to get extent");
-
+      set_err(ErrCode::EXON_STRAND_FLIP);
       return undef;
     }
 
@@ -849,11 +853,11 @@ sub get_coords_extent {
       $start = $c->start if(!defined($start));
     } else {
       if($strand == 1 && $start > $c->start()) {
-        debug("coord inversion - unable to get extent");
+        set_err(ErrCode::EXON_INVERT);
         return undef;
       }
       if($strand == -1 && $start < $c->start()) {
-        debug("coord inversion - unable to get extent");
+        set_err(ErrCode::EXON_INVERT);
         return undef;
       }
 
@@ -866,11 +870,11 @@ sub get_coords_extent {
       $end = $c->end();
     } else {
       if($strand == 1 && $end > $c->end()) {
-        debug("coord inversion - unable to get extent");
+        set_err(ErrCode::EXON_INVERT);
         return undef;
       }
       if($strand == -1 && $end < $c->end()) {
-        debug("coord inversion - unable to get extent");
+        set_err(ErrCode::EXON_INVERT);
         return undef;
       }
       if($c->end > $end) {
@@ -879,9 +883,6 @@ sub get_coords_extent {
     }
   }
 
-  #debug("  calculated span = $start-$end($strand)\n" .
-  #      "  length = " . ($end - $start + 1));
-
   return [$start, $end, $strand, $seq_region];
 }
 
@@ -929,7 +930,7 @@ sub create_transcripts {
       $transcript_seq_region = $seq_region;
     }
     elsif($transcript_seq_region ne $seq_region) {
-      debug("transcript exons on different scaffolds - discarding transcript");
+      set_err(ErrCode::TRANSCRIPT_SCAFFOLD_SPAN);
       ### TBD can probably split transcript rather than discarding
       return ();
     }
@@ -938,7 +939,7 @@ sub create_transcripts {
       $transcript_strand = $exon_strand;
     }
     elsif($transcript_strand != $exon_strand) {
-      debug("transcript exons on different strands - discarding transcript");
+      set_err(ErrCode::TRANSCRIPT_STRAND_FLIP);
       ### TBD can probably split transcript rather than discarding
       return ();
     }
-- 
GitLab