Commit 7a283a23 authored by Tim Hubbard's avatar Tim Hubbard
Browse files

dna compress fixed for all base letters

parent 02dc1452
......@@ -391,15 +391,29 @@ sub store_compressed {
my $len=length($sequence);
#print "DEBUG $sequence ($len)\n";
{
if($sequence=~/^([^N]*)([N]+)(.*)$/){
if($sequence=~/^([ACGT]*)([^ACGT]+)(.*)$/){
my $l1=length($1);
my $l2=length($2);
my $start=$l1+1;
my $end=$l1+$l2;
$sequence=$1.('A' x $l2).$3;
$n_line.=" " if $n_line;
$n_line.="$start-$end";
#print "DEBUG: Ns from $start-$end\n";
# might be more than one letter
my $insert=$2;
#print "DEBUG: $insert ($start,$end)\n";
{
if($insert=~/^(\w)(\1*)(.*)$/){
my $insert2=$1.$2;
my $end2=$start+length($insert2)-1;
$n_line.=" " if $n_line;
my $modifier='';
if($1 ne 'N'){$modifier=":$1";}
$n_line.="$start-$end2$modifier";
#print "DEBUG: write $start-$end2$modifier\n";
$insert=$3;
$start=$end2+1;
redo if $insert;
}
}
redo;
}
}
......@@ -538,6 +552,11 @@ sub store_compressed {
# mask with N's
foreach my $range (split(/ /,$n_line)){
my($st,$ed)=split(/\-/,$range);
my $char='N';
if($ed=~/(\d+):(\w)/){
$ed=$1;
$char=$2;
}
#print "before: $st-$ed $start-$end\n";
# check in range
next if($ed<$start || $st>$end);
......@@ -546,7 +565,7 @@ sub store_compressed {
#print "after: $st-$ed\n";
my $len=$ed-$st+1;
$st-=$start;
substr($seq,$st,$len)='N'x$len;
substr($seq,$st,$len)=$char x $len;
}
return $seq;
......
......@@ -17,11 +17,11 @@ use strict;
use Getopt::Std;
use vars qw($opt_h $opt_T $opt_i
$opt_U $opt_D $opt_P $opt_H $opt_p
$opt_s $opt_c $opt_C $opt_l $opt_d);
$opt_s $opt_c $opt_C $opt_l $opt_d $opt_r);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
getopts("hTi:U:D:P:H:p:s:cCl:d");
getopts("hTi:U:D:P:H:p:s:cCl:dr:");
$|=1;
......@@ -30,6 +30,7 @@ my $def_U='ensadmin';
my $def_D='testdna';
my $def_P='3307';
my $def_H='127.0.0.1';
my $def_r=5;
if($opt_h){
&help;
......@@ -46,6 +47,7 @@ dna_compress.pl
-l char label of clone, contig (for writing)
-c convert
-C compare
-r num number of randomisation subsequence selections [$def_r]
-i id clone_dbid (for comparing, converting, extracting DNA)
-d delete clone
......@@ -68,6 +70,7 @@ $opt_U=$def_U unless $opt_U;
$opt_D=$def_D unless $opt_D;
$opt_P=$def_P unless $opt_P;
$opt_H=$def_H unless $opt_H;
$opt_r=$def_r unless $opt_r;
# db connection
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $opt_H,
......@@ -157,7 +160,7 @@ if($opt_C || $opt_c){
# now test a subsequence
my $lenr=$len-1;
for(my $i=0;$i<5;$i++){
for(my $i=0;$i<$opt_r;$i++){
my $r=rand;
my $st=int($r*$lenr)+1;
$r=rand;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment