Skip to content
Snippets Groups Projects
Commit 7a283a23 authored by Tim Hubbard's avatar Tim Hubbard
Browse files

dna compress fixed for all base letters

parent 02dc1452
No related branches found
No related tags found
No related merge requests found
......@@ -391,15 +391,29 @@ sub store_compressed {
my $len=length($sequence);
#print "DEBUG $sequence ($len)\n";
{
if($sequence=~/^([^N]*)([N]+)(.*)$/){
if($sequence=~/^([ACGT]*)([^ACGT]+)(.*)$/){
my $l1=length($1);
my $l2=length($2);
my $start=$l1+1;
my $end=$l1+$l2;
$sequence=$1.('A' x $l2).$3;
$n_line.=" " if $n_line;
$n_line.="$start-$end";
#print "DEBUG: Ns from $start-$end\n";
# might be more than one letter
my $insert=$2;
#print "DEBUG: $insert ($start,$end)\n";
{
if($insert=~/^(\w)(\1*)(.*)$/){
my $insert2=$1.$2;
my $end2=$start+length($insert2)-1;
$n_line.=" " if $n_line;
my $modifier='';
if($1 ne 'N'){$modifier=":$1";}
$n_line.="$start-$end2$modifier";
#print "DEBUG: write $start-$end2$modifier\n";
$insert=$3;
$start=$end2+1;
redo if $insert;
}
}
redo;
}
}
......@@ -538,6 +552,11 @@ sub store_compressed {
# mask with N's
foreach my $range (split(/ /,$n_line)){
my($st,$ed)=split(/\-/,$range);
my $char='N';
if($ed=~/(\d+):(\w)/){
$ed=$1;
$char=$2;
}
#print "before: $st-$ed $start-$end\n";
# check in range
next if($ed<$start || $st>$end);
......@@ -546,7 +565,7 @@ sub store_compressed {
#print "after: $st-$ed\n";
my $len=$ed-$st+1;
$st-=$start;
substr($seq,$st,$len)='N'x$len;
substr($seq,$st,$len)=$char x $len;
}
return $seq;
......
......@@ -17,11 +17,11 @@ use strict;
use Getopt::Std;
use vars qw($opt_h $opt_T $opt_i
$opt_U $opt_D $opt_P $opt_H $opt_p
$opt_s $opt_c $opt_C $opt_l $opt_d);
$opt_s $opt_c $opt_C $opt_l $opt_d $opt_r);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
getopts("hTi:U:D:P:H:p:s:cCl:d");
getopts("hTi:U:D:P:H:p:s:cCl:dr:");
$|=1;
......@@ -30,6 +30,7 @@ my $def_U='ensadmin';
my $def_D='testdna';
my $def_P='3307';
my $def_H='127.0.0.1';
my $def_r=5;
if($opt_h){
&help;
......@@ -46,6 +47,7 @@ dna_compress.pl
-l char label of clone, contig (for writing)
-c convert
-C compare
-r num number of randomisation subsequence selections [$def_r]
-i id clone_dbid (for comparing, converting, extracting DNA)
-d delete clone
......@@ -68,6 +70,7 @@ $opt_U=$def_U unless $opt_U;
$opt_D=$def_D unless $opt_D;
$opt_P=$def_P unless $opt_P;
$opt_H=$def_H unless $opt_H;
$opt_r=$def_r unless $opt_r;
# db connection
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $opt_H,
......@@ -157,7 +160,7 @@ if($opt_C || $opt_c){
# now test a subsequence
my $lenr=$len-1;
for(my $i=0;$i<5;$i++){
for(my $i=0;$i<$opt_r;$i++){
my $r=rand;
my $st=int($r*$lenr)+1;
$r=rand;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment