From 0f7f8ef7ac178cd58bf72bf230a2f64b3696a0b5 Mon Sep 17 00:00:00 2001
From: Nathan Johnson <njohnson@ebi.ac.uk>
Date: Thu, 24 Jul 2008 13:31:58 +0000
Subject: [PATCH] added some comments

---
 .../probe_mapping/probe2transcript.pl         | 27 ++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/misc-scripts/probe_mapping/probe2transcript.pl b/misc-scripts/probe_mapping/probe2transcript.pl
index 75b6e21274..d0972a7731 100644
--- a/misc-scripts/probe_mapping/probe2transcript.pl
+++ b/misc-scripts/probe_mapping/probe2transcript.pl
@@ -16,6 +16,7 @@
 #To do
 #Remove median & get_date and implement EFGUtils when migrating to eFG
 #Add unannotated UTR clipping dependant on nearest neighbour
+#Extend UTRs to the default length if they are shorter than the defaults, so long as they don't overlap a neighbour; otherwise use the annotated UTR if present, or clip to the neighbour start/end if not, also accounting for default UTRs in the neighbour.
 
 use strict;
 
@@ -55,6 +56,7 @@ my $max_mismatches = 1;
 #What we want is to use annotated else use calc or preset default
 #so calc and preset default are mutually exclusive
 #but annotated can be used with both
+#shouldn't median be mode?
 
 
 my $annotated_utrs;
@@ -258,6 +260,21 @@ if($calc_utrs){
 	$three_utr = $transcript->five_prime_utr;
 	$five_utr  = $transcript->three_prime_utr;
 
+	#We actually want to extend the potentially conservative ensembl UTRs
+	#to the calculated default if they are shorter, but only if this does 
+	#not cause overlap with a neighbouring gene.
+	#Do not implement UTR extension until clipping is in place
+	#This will require knowledge of genomic context
+	#What is fastest solution here?
+	#1. Run in a slice context to fetch all genes, then we know the previous transcript
+	#and can easily access the next transcript
+	#2. Simply generate an extended slice from the transcript and pull back genes
+	#We would have to either do this for every trans or just for the longest
+	#
+	#1 is probably the most efficient, and will also reduce memory usage by
+	#chunking by chromosome
+
+
 	if(defined $five_utr){
 	  $five_cnt++;
 	  push @five_lengths, $five_utr->length;
@@ -337,6 +354,8 @@ foreach my $transcript (@transcripts) {
   #we want to be able to test calc and annotated separatly
   my %utr_lengths = %utr_defaults;
 
+  #Need to rework the logic slightly considering UTRs are included in the transcript start/end if they are annotated.
+
   if($annotated_utrs){
 	my ($method, $utr);
 	
@@ -345,7 +364,10 @@ foreach my $transcript (@transcripts) {
 	  $utr = $transcript->$method;
 	  
 	  if(defined $utr){# && $utr->length != 0){
-		$utr_lengths{$flank} = $utr->length;
+		#$utr_lengths{$flank} = $utr->length;
+		#Set extend to 0 if the UTRs are already included in the transcript
+		#need to rename this hash
+		$utr_lengths{$flank} = 0;
 	  }
 	  else{
 		$unannotated_utrs{$flank}++;
@@ -355,6 +377,9 @@ foreach my $transcript (@transcripts) {
 
   my $slice = $transcript->feature_Slice();
   #my $extended_slice = $slice->expand(0, $utr_length); # this takes account of strand
+
+  #The UTRs are already included in the transcript!!
+  #We only need to extend if we have no annotated UTR.
   my $extended_slice = $slice->expand($utr_lengths{'five'}, $utr_lengths{'three'}); # this takes account of strand
   
 
-- 
GitLab