From 0f7f8ef7ac178cd58bf72bf230a2f64b3696a0b5 Mon Sep 17 00:00:00 2001 From: Nathan Johnson <njohnson@ebi.ac.uk> Date: Thu, 24 Jul 2008 13:31:58 +0000 Subject: [PATCH] added some comments --- .../probe_mapping/probe2transcript.pl | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/misc-scripts/probe_mapping/probe2transcript.pl b/misc-scripts/probe_mapping/probe2transcript.pl index 75b6e21274..d0972a7731 100644 --- a/misc-scripts/probe_mapping/probe2transcript.pl +++ b/misc-scripts/probe_mapping/probe2transcript.pl @@ -16,6 +16,7 @@ #To do #Remove median & get_date and implement EFGUtils when migrating to eFG #Add unannotated UTR clipping dependant on nearest neighbour +#Extend UTRs to default length if they are less than defaults, so long as they don't overlap neighbour, then use annotated if present or clip to neighbour start/end if not, also accounting for default UTRs in the neighbour. use strict; @@ -55,6 +56,7 @@ my $max_mismatches = 1; #What we want is to use annotated else use calc or preset default #so calc and preset default are mutually exclusive #but annotated can be used with both +#shouldn't median be mode? my $annotated_utrs; @@ -258,6 +260,21 @@ if($calc_utrs){ $three_utr = $transcript->five_prime_utr; $five_utr = $transcript->three_prime_utr; + #We actually want to extend the potentially conservative ensembl UTRs + #to the calculated default if they are shorter, but only if this does + #not cause overlap with a neighbouring gene. + #Do not implement UTR extension until clipping is in place + #This will require knowledge of genomic context + #What is fastest solution here? + #1. Run in a slice context to fetch all genes, then we know the previous transcript + #and can easily access the next transcript + #2. 
Simply generate an extended slice from the transcript and pull back genes + #We would have to either do this for every trans or just for the longest + # + #1 is probably most efficient and will actually reduce the memory usage by + #chunking by chromosome + + if(defined $five_utr){ $five_cnt++; push @five_lengths, $five_utr->length; @@ -337,6 +354,8 @@ foreach my $transcript (@transcripts) { #we want to be able to test calc and annotated separatly my %utr_lengths = %utr_defaults; + #Need to rework the logic slightly considering UTRs are included in the transcript start/end if they are annotated. + if($annotated_utrs){ my ($method, $utr); @@ -345,7 +364,10 @@ foreach my $transcript (@transcripts) { $utr = $transcript->$method; if(defined $utr){# && $utr->length != 0){ - $utr_lengths{$flank} = $utr->length; + #$utr_lengths{$flank} = $utr->length; + #Set extend to 0 if they are already included in the transcript + #need to rename this hash + $utr_lengths{$flank} = 0; } else{ $unannotated_utrs{$flank}++; @@ -355,6 +377,9 @@ foreach my $transcript (@transcripts) { my $slice = $transcript->feature_Slice(); #my $extended_slice = $slice->expand(0, $utr_length); # this takes account of strand + + #The UTRs are already included in the transcript!! + #We only need to extend if we have no annotated UTR. my $extended_slice = $slice->expand($utr_lengths{'five'}, $utr_lengths{'three'}); # this takes account of strand -- GitLab