diff --git a/modules/Bio/EnsEMBL/Utils/PolyA.pm b/modules/Bio/EnsEMBL/Utils/PolyA.pm index b16a59364773d0fa46d8c7fdcb339f4ec9b8b0f4..59fc71c1afcbaebc4bc15d8943958508d02b893c 100644 --- a/modules/Bio/EnsEMBL/Utils/PolyA.pm +++ b/modules/Bio/EnsEMBL/Utils/PolyA.pm @@ -124,6 +124,20 @@ sub mask{ ############################################################ +# $seq is a string and $length is the length we want to clip from both ends + +sub clip_ends{ + my ($self,$seq,$length) = @_; + + my $seq_length = length( $seq ); + my $subseq = substr( $seq, $length, $seq_length - 2*$length ); + + return $subseq; +} + + + +############################################################ sub _find_polyA{ my ($self, $seq) = @_; my $new_seq; @@ -137,7 +151,6 @@ sub _find_polyA{ my $t_count = $check_polyT =~ tr/Tt//; my $a_count = $check_polyA =~ tr/Aa//; - #### polyA #### if ( $a_count >= 5 && $a_count > $t_count ){ @@ -293,76 +306,22 @@ sub has_polyA_track{ my $length = length($seq); # is it a polyA or polyT? - my $check_polyT = substr( $seq, 0, 6 ); + my $check_polyT = substr( $seq, 0, 10 ); - my $check_polyA = substr( $seq, -6 ); + my $check_polyA = substr( $seq, -10 ); + print STDERR "polyA: $check_polyA\n"; + my $t_count = $check_polyT =~ tr/Tt//; my $a_count = $check_polyA =~ tr/Aa//; - - my $length_to_mask = 0; - #### polyA #### - if ( $a_count >= 5 && $a_count > $t_count ){ - - # we calculate the number of bases we want to chop - my $length_to_mask = 0; - - # we start with 3 bases - my ($piece, $count ) = (3,0); - - # count also the number of Ns, consider the Ns as potential As - my $n_count = 0; - - # take 3 by 3 bases from the end - while( $length_to_mask < $length ){ - my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece); - $count = $chunk =~ tr/Aa//; - $n_count = $chunk =~ tr/Nn//; - if ( ($count + $n_count) >= 2*( $piece )/3 ){ - $length_to_mask += 3; - } - else{ - last; - } - } - } - #### polyT #### - elsif( $t_count >=5 && $t_count > $a_count ){ - - # calculate the number of bases to chop - my $length_to_mask = -3; - - # we start with 3 bases: - my ($piece, $count) = (3,3); - - # count also the number of Ns, consider the Ns as potential As - my $n_count = 0; - - # take 3 by 3 bases from the beginning - while ( $length_to_mask < $length ){ - my $chunk = substr( $seq, $length_to_mask + 3, $piece ); - #print STDERR "length to mask: $length_to_mask\n"; - #print "chunk: $chunk\n"; - $count = $chunk =~ tr/Tt//; - $n_count = $chunk =~ tr/Nn//; - if ( ($count+$n_count) >= 2*( $piece )/3 ){ - $length_to_mask +=3; - } - else{ - last; - - } - } - } - - if ( $length_to_mask >= 5 ){ + ## testing with this short cut + if ( $a_count >=7 || $t_count >=7 ){ return 1; } else{ return 0; } - }