From 7a283a2344b398eb7e4d10d81a57465142e3c944 Mon Sep 17 00:00:00 2001 From: Tim Hubbard <th@sanger.ac.uk> Date: Tue, 15 Jul 2003 08:38:19 +0000 Subject: [PATCH] dna compress fixed for all base letters --- modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm | 29 ++++++++++++++++---- scripts/dna_compress.pl | 9 ++++-- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm index c1d7640472..90e533b33d 100644 --- a/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm @@ -391,15 +391,29 @@ sub store_compressed { my $len=length($sequence); #print "DEBUG $sequence ($len)\n"; { - if($sequence=~/^([^N]*)([N]+)(.*)$/){ + if($sequence=~/^([ACGT]*)([^ACGT]+)(.*)$/){ my $l1=length($1); my $l2=length($2); my $start=$l1+1; my $end=$l1+$l2; $sequence=$1.('A' x $l2).$3; - $n_line.=" " if $n_line; - $n_line.="$start-$end"; - #print "DEBUG: Ns from $start-$end\n"; + # might be more than one letter + my $insert=$2; + #print "DEBUG: $insert ($start,$end)\n"; + { + if($insert=~/^(\w)(\1*)(.*)$/){ + my $insert2=$1.$2; + my $end2=$start+length($insert2)-1; + $n_line.=" " if $n_line; + my $modifier=''; + if($1 ne 'N'){$modifier=":$1";} + $n_line.="$start-$end2$modifier"; + #print "DEBUG: write $start-$end2$modifier\n"; + $insert=$3; + $start=$end2+1; + redo if $insert; + } + } redo; } } @@ -538,6 +552,11 @@ sub store_compressed { # mask with N's foreach my $range (split(/ /,$n_line)){ my($st,$ed)=split(/\-/,$range); + my $char='N'; + if($ed=~/(\d+):(\w)/){ + $ed=$1; + $char=$2; + } #print "before: $st-$ed $start-$end\n"; # check in range next if($ed<$start || $st>$end); @@ -546,7 +565,7 @@ sub store_compressed { #print "after: $st-$ed\n"; my $len=$ed-$st+1; $st-=$start; - substr($seq,$st,$len)='N'x$len; + substr($seq,$st,$len)=$char x $len; } return $seq; diff --git a/scripts/dna_compress.pl b/scripts/dna_compress.pl index 84967e5696..a647e0ba65 100755 --- a/scripts/dna_compress.pl +++ b/scripts/dna_compress.pl @@ -17,11 +17,11 @@ use strict; use Getopt::Std; use vars qw($opt_h $opt_T $opt_i $opt_U $opt_D $opt_P $opt_H $opt_p - $opt_s $opt_c $opt_C $opt_l $opt_d); + $opt_s $opt_c $opt_C $opt_l $opt_d $opt_r); use Bio::EnsEMBL::DBSQL::DBAdaptor; -getopts("hTi:U:D:P:H:p:s:cCl:d"); +getopts("hTi:U:D:P:H:p:s:cCl:dr:"); $|=1; @@ -30,6 +30,7 @@ my $def_U='ensadmin'; my $def_D='testdna'; my $def_P='3307'; my $def_H='127.0.0.1'; +my $def_r=5; if($opt_h){ &help; @@ -46,6 +47,7 @@ dna_compress.pl -l char label of clone, contig (for writing) -c convert -C compare + -r num number of randomisation subsequence selections [$def_r] -i id clone_dbid (for comparing, converting, extracting DNA) -d delete clone @@ -68,6 +70,7 @@ $opt_U=$def_U unless $opt_U; $opt_D=$def_D unless $opt_D; $opt_P=$def_P unless $opt_P; $opt_H=$def_H unless $opt_H; +$opt_r=$def_r unless $opt_r; # db connection my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $opt_H, @@ -157,7 +160,7 @@ if($opt_C || $opt_c){ # now test a subsequence my $lenr=$len-1; - for(my $i=0;$i<5;$i++){ + for(my $i=0;$i<$opt_r;$i++){ my $r=rand; my $st=int($r*$lenr)+1; $r=rand; -- GitLab