From 7a283a2344b398eb7e4d10d81a57465142e3c944 Mon Sep 17 00:00:00 2001
From: Tim Hubbard <th@sanger.ac.uk>
Date: Tue, 15 Jul 2003 08:38:19 +0000
Subject: [PATCH] dna compress fixed for all base letters

---
 modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm | 29 ++++++++++++++++----
 scripts/dna_compress.pl                      |  9 ++++--
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
index c1d7640472..90e533b33d 100644
--- a/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
@@ -391,15 +391,29 @@ sub store_compressed {
   my $len=length($sequence);
   #print "DEBUG $sequence ($len)\n";
   {
-      if($sequence=~/^([^N]*)([N]+)(.*)$/){
+      if($sequence=~/^([ACGT]*)([^ACGT]+)(.*)$/){
 	  my $l1=length($1);
 	  my $l2=length($2);
 	  my $start=$l1+1;
 	  my $end=$l1+$l2;
 	  $sequence=$1.('A' x $l2).$3;
-	  $n_line.=" " if $n_line;
-	  $n_line.="$start-$end";
-	  #print "DEBUG: Ns from $start-$end\n";
+	  # might be more than one letter
+	  my $insert=$2;
+	  #print "DEBUG: $insert ($start,$end)\n";
+	  {
+	    if($insert=~/^(\w)(\1*)(.*)$/){
+	      my $insert2=$1.$2;
+	      my $end2=$start+length($insert2)-1;
+	      $n_line.=" " if $n_line;
+	      my $modifier='';
+	      if($1 ne 'N'){$modifier=":$1";}
+	      $n_line.="$start-$end2$modifier";
+	      #print "DEBUG: write $start-$end2$modifier\n";
+	      $insert=$3;
+	      $start=$end2+1;
+	      redo if $insert;
+	    }
+	  }
 	  redo;
       }
   }
@@ -538,6 +552,11 @@ sub store_compressed {
     # mask with N's
     foreach my $range (split(/ /,$n_line)){
       my($st,$ed)=split(/\-/,$range);
+      my $char='N';
+      if($ed=~/(\d+):(\w)/){
+	$ed=$1;
+	$char=$2;
+      }
       #print "before: $st-$ed $start-$end\n";
       # check in range
       next if($ed<$start || $st>$end);
@@ -546,7 +565,7 @@ sub store_compressed {
       #print "after: $st-$ed\n";
       my $len=$ed-$st+1;
       $st-=$start;
-      substr($seq,$st,$len)='N'x$len;
+      substr($seq,$st,$len)=$char x $len;
     }
 
     return $seq;
diff --git a/scripts/dna_compress.pl b/scripts/dna_compress.pl
index 84967e5696..a647e0ba65 100755
--- a/scripts/dna_compress.pl
+++ b/scripts/dna_compress.pl
@@ -17,11 +17,11 @@ use strict;
 use Getopt::Std;
 use vars qw($opt_h $opt_T $opt_i 
 	    $opt_U $opt_D $opt_P $opt_H $opt_p
-	    $opt_s $opt_c $opt_C $opt_l $opt_d);
+	    $opt_s $opt_c $opt_C $opt_l $opt_d $opt_r);
 
 use Bio::EnsEMBL::DBSQL::DBAdaptor;
 
-getopts("hTi:U:D:P:H:p:s:cCl:d");
+getopts("hTi:U:D:P:H:p:s:cCl:dr:");
 
 $|=1;
 
@@ -30,6 +30,7 @@ my $def_U='ensadmin';
 my $def_D='testdna';
 my $def_P='3307';
 my $def_H='127.0.0.1';
+my $def_r=5;
 
 if($opt_h){
     &help;
@@ -46,6 +47,7 @@ dna_compress.pl
   -l char  label of clone, contig (for writing)
   -c       convert
   -C       compare
+  -r num   number of randomisation subsequence selections [$def_r]
   -i id    clone_dbid (for comparing, converting, extracting DNA)
   -d       delete clone
 
@@ -68,6 +70,7 @@ $opt_U=$def_U unless $opt_U;
 $opt_D=$def_D unless $opt_D;
 $opt_P=$def_P unless $opt_P;
 $opt_H=$def_H unless $opt_H;
+$opt_r=$def_r unless $opt_r;
 
 # db connection
 my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host   => $opt_H,
@@ -157,7 +160,7 @@ if($opt_C || $opt_c){
 
 	# now test a subsequence
 	my $lenr=$len-1;
-	for(my $i=0;$i<5;$i++){
+	for(my $i=0;$i<$opt_r;$i++){
 	  my $r=rand;
 	  my $st=int($r*$lenr)+1;
 	  $r=rand;
-- 
GitLab