support for dnac compressed dna table

35729e7b · Tim Hubbard · 7a2d4026 · 35729e7b · 35729e7b · 35729e7b
Commit 35729e7b authored 21 years ago by Tim Hubbard
--- a/modules/Bio/EnsEMBL/DBSQL/CloneAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/CloneAdaptor.pm
@@ -233,9 +233,9 @@ sub fetch_all {
  my $self = shift;

  my $sth = $self->prepare("SELECT clone_id, name, embl_acc, version,
-                                   embl_version, htg_phase, created, modified
+                                   embl_version, htg_phase, 
+                                   UNIX_TIMESTAMP(created), UNIX_TIMESTAMP(modified)
                            FROM clone");
-  

  $sth->execute();


--- a/modules/Bio/EnsEMBL/DBSQL/RawContigAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/RawContigAdaptor.pm
@@ -509,8 +509,25 @@ sub remove {
    my $dna_id = $sth->fetchrow_array;
    $sth = $self->prepare("DELETE FROM dna WHERE dna_id = $dna_id");
    $sth->execute;
-    $self->throw("Failed to delete dna for dna_id '$dna_id'")
-      unless $sth->rows;
+    my $flag_found=0;
+    if($sth->rows){
+      $flag_found=1;
+    }
+    # now try dnac - eval since table might not be present..
+    $sth = $self->prepare("DELETE FROM dnac WHERE dna_id = $dna_id");
+    eval{
+      $sth->execute;
+    };
+    if($@){
+    }else{
+      if($sth->rows){
+	$flag_found=1;
+      }
+    }
+    print "got here\n";
+    if($flag_found==0){
+      $self->throw("Failed to delete dna for dna_id '$dna_id'")
+    }
  }

  # Remove the contig

--- a/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
 #
-# EnsEMBL module for Bio::EnsEMBL::DBSQL::SequenceAdaptor
+# EnsEMBL module for Bio::EnsEMBL::DSQL::SequenceAdaptor
 #
 # Cared for by Arne Stabenau <stabenau@ebi.ac.uk>
 #
@@ -55,6 +55,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
               a -1 means until the end
  Arg [4]    : int $strand
               -1, 1 are possible values
+  Arg [5]    : (optional) force read from compressed DNA table
  Example    : $dna = $seq_adp->fetch_by_RawContig_start_end_strand($contig, 1,
 								    1000, -1);
  Description: retrieves the dna string from the database from the 
@@ -66,7 +67,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
 =cut

 sub fetch_by_RawContig_start_end_strand {
-  my ( $self, $contig, $start, $end, $strand ) = @_;
+  my ( $self, $contig, $start, $end, $strand, $compressed ) = @_;
  my $sth;
  
  if( $start < 1 ) {
@@ -77,46 +78,125 @@ sub fetch_by_RawContig_start_end_strand {
    $self->throw("contig arg must be contig not a [$contig]");
  } 

-  my $query;
+  {

-  if( $end == -1 ) { 
+    my $query;

-    $query = "SELECT c.length, SUBSTRING( d.sequence, $start )
-                FROM dna d, contig c 
-               WHERE d.dna_id = c.dna_id 
-                 AND c.contig_id = ?";
+    if($compressed==1 || $self->is_compressed){
+
+      # query for compressed mode, uses hex and a different range
+      # needs to getback dna_id as will have to check for Ns in dnan table
+      my $start4=int(($start-1)/4)+1;
      
-  } else {
-    my $length = $end - $start + 1;
-    if( $length < 1 ) {
-      $self->throw( "Wrong parameters" );
-    }
-    
-    $query = "SELECT c.length, SUBSTRING( d.sequence, $start, $length )
-                FROM dna d, contig c 
-               WHERE d.dna_id = c.dna_id 
-                 AND c.contig_id = ?";    
-  }
+      if( $end == -1 ) { 
+	$query = "SELECT c.length, HEX( SUBSTRING( d.sequence, $start4 ) ), 
+                         d.n_line
+                    FROM dnac d, contig c 
+                   WHERE d.dna_id = c.dna_id 
+                     AND c.contig_id = ?";
+      
+      } else {
+	my $end4=int(($end-1)/4)+1;
+	my $length4 = $end4 - $start4 + 1;
+	if( $length4 < 1 ) {
+	  $self->throw( "Wrong parameters" );
+	}
+      
+	$query = "SELECT c.length, HEX( SUBSTRING( d.sequence, $start4, $length4 ) ), 
+                         d.n_line
+                    FROM dnac d, contig c 
+                   WHERE d.dna_id = c.dna_id 
+                     AND c.contig_id = ?";    
+      }
+
+    }else{
+      
+      if( $end == -1 ) { 

-  #use the dna db if defined
-  if( defined $self->db()->dnadb() ) {
-    $sth = $self->db()->dnadb()->prepare( $query );
-  } else {
-    $sth = $self->prepare( $query );
-  }
+	$query = "SELECT c.length, SUBSTRING( d.sequence, $start )
+                    FROM dna d, contig c 
+                   WHERE d.dna_id = c.dna_id 
+                     AND c.contig_id = ?";
+      
+      } else {
+	my $length = $end - $start + 1;
+	if( $length < 1 ) {
+	  $self->throw( "Wrong parameters" );
+	}
+    
+	$query = "SELECT c.length, SUBSTRING( d.sequence, $start, $length )
+                    FROM dna d, contig c 
+                   WHERE d.dna_id = c.dna_id 
+                     AND c.contig_id = ?";    
+      }
+      
+    }

-  $sth->execute($contig->dbID());
+    #use the dna db if defined
+    if( defined $self->db()->dnadb() ) {
+      $sth = $self->db()->dnadb()->prepare( $query );
+    } else {
+      $sth = $self->prepare( $query );
+    }

-  if( my $aref = $sth->fetchrow_arrayref() ) {
-    my ( $length, $seq ) = @$aref;
-    $seq =~ s/\s//g;
-    if( $strand == -1 ) {
-      return $self->_reverse_comp( $seq );
+    $sth->execute($contig->dbID());
+
+    if( my $aref = $sth->fetchrow_arrayref() ) {
+      my ( $length, $seq, $n_line ) = @$aref;
+      my $lenx=length($seq);
+      if($compressed || $self->is_compressed()){
+	$seq=$self->_dna_uncompress($length,$n_line,$start,$end,\$seq);
+      }
+      $seq =~ s/\s//g;
+      if( $strand == -1 ) {
+	return $self->_reverse_comp( $seq );
+      } else {
+	return $seq;
+      }
    } else {
-      return $seq;
+
+      my $flag;
+      if($self->is_compressed==0){
+	# found nothing...perhaps in this entry is compressed - check
+	# and if present, set is_compressed
+	eval{
+	  $query = "SELECT c.length, d.dna_id
+                      FROM dnac d, contig c 
+                     WHERE d.dna_id = c.dna_id 
+                       AND c.contig_id = ?";
+	  $sth = $self->prepare( $query );
+	  $sth->execute($contig->dbID());
+	  if( my $aref = $sth->fetchrow_arrayref() ) {
+	    $flag=1;
+	  }
+	};
+	if($flag==1){
+	  $self->is_compressed(1);
+	  print "Switched to compressed DNA reading mode\n";
+	  redo;
+	}
+      }else{
+	# found nothing...perhaps in this entry is uncompressed - check
+	# and if present, set is_compressed
+	eval{
+	  $query = "SELECT c.length, d.dna_id
+                      FROM dna d, contig c 
+                     WHERE d.dna_id = c.dna_id 
+                       AND c.contig_id = ?";
+	  $sth = $self->prepare( $query );
+	  $sth->execute($contig->dbID());
+	  if( my $aref = $sth->fetchrow_arrayref() ) {
+	    $flag=1;
+	  }
+	};
+	if($flag==1){
+	  $self->is_compressed(0);
+	  print "Switched to uncompressed DNA reading mode\n";
+	  redo;
+	}
+      }
+      return undef;
    }
-  } else {
-    return undef;
  }
 }

@@ -264,6 +344,11 @@ sub store {
  my ($self, $sequence, $date) = @_;
  
  $sequence =~ tr/atgcn/ATGCN/;
+
+  # save compressed
+  if($self->is_compressed){
+      return $self->store_compressed($sequence,$date);
+  }
  
  my $statement = $self->prepare("
        INSERT INTO dna(sequence,created) 
@@ -284,6 +369,215 @@ sub store {
 }


+=head2 store_compressed
+
+  Arg [1]    : string $sequence the dna sequence to be stored in the database
+  Arg [2]    : string $date create date to be associated with the dna sequence
+               to be stored.
+  Arg [3]    : dbid (optional)
+  Returntype : int
+  Exceptions : none
+  Caller     : Bio::EnsEMBL::DBSQL::RawContigAdaptor::store_compressed, 
+               Bio::EnsEMBL::DBSQL::RawContigAdaptor::store
+
+=cut
+
+sub store_compressed {
+  my ($self, $sequence, $date, $dbid) = @_;
+
+  # look for N's, save list of start and ends and convert to A's
+  my $n_line;
+  my $len=length($sequence);
+  #print "DEBUG $sequence ($len)\n";
+  {
+      if($sequence=~/^([^N]*)([N]+)(.*)$/){
+	  my $l1=length($1);
+	  my $l2=length($2);
+	  my $start=$l1+1;
+	  my $end=$l1+$l2;
+	  $sequence=$1.('A' x $l2).$3;
+	  $n_line.=" " if $n_line;
+	  $n_line.="$start-$end";
+	  #print "DEBUG: Ns from $start-$end\n";
+	  redo;
+      }
+  }
+  #print "DEBUG $sequence\n";
+
+  # convert ACGT -> 00011011
+  my $sequence_comp=$self->_dna_compress($sequence);
+  
+  my $id;
+  if($dbid){
+      my $statement = $self->prepare("
+            INSERT INTO dnac(dna_id,sequence,created,n_line) 
+            VALUES(?, ?, FROM_UNIXTIME(?), ?)
+            "); 
+  
+      my $rv = $statement->execute($dbid,$sequence_comp, $date, $n_line);  
+      $self->throw("Failed to insert dna $sequence") unless $rv;  
+      $statement->finish;
+      $id=$dbid;
+  }else{
+      my $statement = $self->prepare("
+            INSERT INTO dnac(sequence,created,flag_dnan) 
+            VALUES(?, FROM_UNIXTIME(?), ?)
+            "); 
+  
+      my $rv = $statement->execute($sequence_comp, $date, $n_line);  
+      $self->throw("Failed to insert dna $sequence") unless $rv;  
+      $statement->finish;
+
+      $statement = $self->prepare("SELECT last_insert_id()");
+      $statement->execute();
+      ($id) = $statement->fetchrow();
+      $statement->finish;
+  }
+  return $id;
+}
+
+
+=head2 _dna_compress
+
+  Arg  1    : txt $dna_sequence
+  Function  : build hex representation of string (ACGT -> 1B)
+  Returntype: txt
+  Exceptions: none
+  Caller    : private to this module
+
+=cut
+
+{
+
+  my %table=(
+	     'AA'=>'0',
+	     'AC'=>'1',
+	     'AG'=>'2',
+	     'AT'=>'3',
+	     'CA'=>'4',
+	     'CC'=>'5',
+	     'CG'=>'6',
+	     'CT'=>'7',
+	     'GA'=>'8',
+	     'GC'=>'9',
+	     'GG'=>'A',
+	     'GT'=>'B',
+	     'TA'=>'C',
+	     'TC'=>'D',
+	     'TG'=>'E',
+	     'TT'=>'F',
+	     );
+
+  sub _dna_compress {
+    my $self = shift;
+    my $seq = shift;
+    # length must be multiple of 4, so pad if necesary
+    my $len=length($seq);
+    $seq.='A' x ($len-(int($len/4))*4);
+    #print "DEBUG $seq\n";
+    # process in blocks of 2 to do conversion
+    my $seq_hex;
+    my @seq=split(//,$seq);
+    while(my $pair=shift(@seq)){
+      $pair.=shift(@seq);
+      $seq_hex.=$table{$pair};
+    }
+    return pack("H*",$seq_hex);
+  }
+}
+
+
+=head2 _dna_uncompress
+
+  Arg [1]    : int $length
+  Arg [2]    : int $n_line
+  Arg [3]    : int $start
+  Arg [4]    : int $end
+  Arg [5]    : reference to $seq (hex string)
+  Function   : converts hex string from DB into ACGT with appropriate truncation
+  Returntype : string
+  Exceptions : none
+  Caller     : private to this module
+
+=cut
+
+{
+
+  my %table=(
+	     '0'=>'AA',
+	     '1'=>'AC',
+	     '2'=>'AG',
+	     '3'=>'AT',
+	     '4'=>'CA',
+	     '5'=>'CC',
+	     '6'=>'CG',
+	     '7'=>'CT',
+	     '8'=>'GA',
+	     '9'=>'GC',
+	     'A'=>'GG',
+	     'B'=>'GT',
+	     'C'=>'TA',
+	     'D'=>'TC',
+	     'E'=>'TG',
+	     'F'=>'TT',
+	     );
+
+  sub _dna_uncompress {
+    my($self,$length,$n_line,$start,$end,$rseq)=@_;
+
+    # convert sequence back to ACGT (already in Hex)
+    my $seq2=join('',map{$table{$_}}split(//,$$rseq));
+
+    # calculate extent of $seq2 and truncate
+    my $start4=(int(($start-1)/4)+1)*4-3;
+    my $off=$start-$start4;
+    my $len;
+    if($end==-1){
+      $end=$length;
+    }
+    $len=$end-$start+1;
+    #print "truncate: $start,$start4,$end,$len\n";
+    my $seq=substr($seq2,$off,$len);
+    
+    # mask with N's
+    foreach my $range (split(/ /,$n_line)){
+      my($st,$ed)=split(/\-/,$range);
+      #print "before: $st-$ed $start-$end\n";
+      # check in range
+      next if($ed<$start || $st>$end);
+      $ed=$end if $ed>$end;
+      $st=$start if $st<$start;
+      #print "after: $st-$ed\n";
+      my $len=$ed-$st+1;
+      $st-=$start;
+      substr($seq,$st,$len)='N'x$len;
+    }
+
+    return $seq;
+  }
+}
+
+
+=head2 is_compressed
+
+  Arg [1]    : (optional) flag
+               to set or unset
+  Example    : $adaptor->is_compressed(1)
+  Description: Getter/Setter for whether compressed DNA should be read/written
+  Returntype : true/false
+  Exceptions : none
+  Caller     : Adaptors inherited from BaseAdaptor
+
+=cut
+
+sub is_compressed { 
+    my( $obj, $flag ) = @_;
+    if (defined $flag) {
+        $obj->{'_is_truncated'} = $flag ? 1 : 0;
+    }
+    return $obj->{'_is_truncated'};
+}
+

 =head2 _reverse_comp


--- a/scripts/dna_compress.pl
+++ b/scripts/dna_compress.pl
+#!/usr/local/bin/perl --    # -*-Perl-*-
+#
+# Copyright (c) 2003 Tim Hubbard (th@sanger.ac.uk)
+# Sanger Centre, Wellcome Trust Genome Campus, Cambs, UK
+# All rights reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation
+#
+# $Header$
+
+# This is a driver script to populate and test the experimental dnac
+# (compressed dna) table of ensembl.  Use -T for pod based tutorial
+
+use strict;
+use Getopt::Std;
+use vars qw($opt_h $opt_T $opt_i 
+	    $opt_U $opt_D $opt_P $opt_H $opt_p
+	    $opt_s $opt_c $opt_C $opt_l $opt_d);
+
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+getopts("hTi:U:D:P:H:p:s:cCl:d");
+
+$|=1;
+
+# specify defaults
+my $def_U='ensadmin';
+my $def_D='testdna';
+my $def_P='3307';
+my $def_H='127.0.0.1';
+
+if($opt_h){
+    &help;
+}elsif($opt_T){
+    &help2;
+}
+
+sub help {
+    print <<ENDHELP;
+dna_compress.pl
+
+  -h       for help
+  -s char  string of DNA (for writing)
+  -l char  label of clone, contig (for writing)
+  -c       convert
+  -C       compare
+  -i id    clone_dbid (for comparing, converting, extracting DNA)
+  -d       delete clone
+
+  -U user  ensembl db user [$def_U]
+  -D db    ensembl db      [$def_D]
+  -P port  port of mysql   [$def_P]
+  -H host  host of mysql   [$def_H]
+  -p pass  passwd for mysqluser
+ENDHELP
+
+    exit 0;
+}
+
+sub help2 {
+    exec('perldoc', $0);
+}
+
+# defaults or options
+$opt_U=$def_U unless $opt_U;
+$opt_D=$def_D unless $opt_D;
+$opt_P=$def_P unless $opt_P;
+$opt_H=$def_H unless $opt_H;
+
+# db connection
+my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host   => $opt_H,
+					    -user   => $opt_U,
+					    -port   => $opt_P,
+					    -dbname => $opt_D,
+					    -pass   => $opt_p);
+
+my $seq_apt=$db->get_SequenceAdaptor;
+my $contig_apt=$db->get_RawContigAdaptor;
+my $clone_apt=$db->get_CloneAdaptor;
+
+if($opt_C || $opt_c){
+  
+  # convert dna of sequences (loop over clone->contig->dna table,
+  # inserting into dnac table when no entry is already present)
+  # OR
+  # compare raw and compressed sequences
+
+  my $clones=$clone_apt->fetch_all();
+  my $nd=0;
+  my $ns=0;
+  foreach my $clone (@$clones){
+    my $clone_dbid=$clone->dbID;
+    next if($opt_i && $opt_i!=$clone_dbid);
+    my $clone_id=$clone->id;
+    my $created=$clone->created();
+    print "Processing Clone $clone_id ($clone_dbid) $created\n";
+    my $contigs=$clone->get_all_Contigs();
+    foreach my $contig (@$contigs){
+      my $contig_dbid=$contig->dbID;
+      my $contig_id=$clone->id;
+      my $seq=$seq_apt->fetch_by_RawContig_start_end_strand($contig,1,-1,1);
+      my $len=length($seq);
+      
+      my $dna_id;
+      # custom sql is required to get dna_id and check if there is a compressed id
+      my $query="SELECT c.dna_id FROM contig c WHERE c.contig_id = ?";
+      my $sth=$contig_apt->prepare($query);
+      $sth->execute($contig_dbid);
+      if(my $aref=$sth->fetchrow_arrayref()){
+	($dna_id)=@$aref;
+      }else{
+	$contig_apt->thrown("No dna_id for Contig $contig_id");
+      }
+      
+      print "  Processing Contig $contig_id ($contig_dbid) [$len] $dna_id\n";
+      
+      my $flag_compressed_exists=0;
+      $query="SELECT d.dna_id FROM dnac d WHERE d.dna_id = ?";
+      $sth=$seq_apt->prepare($query);
+      $sth->execute($dna_id);
+      if(my $aref=$sth->fetchrow_arrayref()){
+	$flag_compressed_exists=1;
+      }
+      # convert
+      if($opt_c){
+	if($flag_compressed_exists){
+	  print "    Compressed DNA entry already exists for dna_id $dna_id\n";
+	  next;
+	}
+	# write compressed DNA record
+	my $id=$seq_apt->store_compressed($seq,$created,$dna_id);
+	print "    Created compressed DNA entry\n";
+      }elsif($opt_C){
+	if(!$flag_compressed_exists){
+	  print "    Compressed DNA entry missing for dna_id $dna_id\n";
+	  next;
+	}
+	my $seq2=$seq_apt->fetch_by_RawContig_start_end_strand($contig,1,-1,1,1);
+	if($seq ne $seq2){
+	  print "DIFFERENT\n";
+	  $nd++;
+	}else{
+	  print "Same\n";
+	  $ns++;
+	}
+	my $len2=length($seq2);
+	my $lenx=30;
+	if($len<$lenx){
+	  $lenx=$len;
+	  print substr($seq,0,$lenx)."\n".substr($seq2,0,$lenx)."\n\n";
+	}else{
+	  print substr($seq,0,$lenx)." .. ".substr($seq,$len-$lenx,$len)."\n";
+	  print substr($seq2,0,$lenx)." .. ".substr($seq2,$len-$lenx,$len2)."\n\n";
+	}
+
+	# now test a subsequence
+	my $lenr=$len-1;
+	for(my $i=0;$i<5;$i++){
+	  my $r=rand;
+	  my $st=int($r*$lenr)+1;
+	  $r=rand;
+	  my $ed=int($r*($lenr-$st))+$st+1;
+	  print "$lenr: $st-$ed\n";
+	  $seq=$seq_apt->fetch_by_RawContig_start_end_strand($contig,$st,$ed,1);
+	  $len=length($seq);
+	  $seq2=$seq_apt->fetch_by_RawContig_start_end_strand($contig,$st,$ed,1,1);
+	  if($seq ne $seq2){
+	    print "DIFFERENT\n";
+	    $nd++;
+	  }else{
+	    print "Same\n";
+	    $ns++;
+	  }
+	  my $len2=length($seq2);
+	  my $lenx=30;
+	  if($len<$lenx){
+	    $lenx=$len;
+	    print substr($seq,0,$lenx)."\n".substr($seq2,0,$lenx)."\n\n";
+	  }else{
+	    print substr($seq,0,$lenx)." .. ".substr($seq,$len-$lenx,$len)."\n";
+	    print substr($seq2,0,$lenx)." .. ".substr($seq2,$len-$lenx,$len2)."\n\n";
+	  }
+	}
+      }
+      
+    }
+  }
+  print "Same: $ns; Different: $nd\n" if $opt_C;
+  exit 0;
+
+}elsif($opt_s && $opt_l){
+
+  # sequence and label defined, so write into db;
+
+  my $acc=$opt_l;
+  my $seq=$opt_s;
+  my $ver=1;
+  # clone object
+  my $clone=new Bio::EnsEMBL::Clone;
+  $clone->id($acc);
+  $clone->htg_phase(4);
+  $clone->embl_id($acc);
+  $clone->version($ver);
+  $clone->embl_version($ver);
+  my $now = time;
+  $clone->created($now);
+  $clone->modified($now);
+
+  # contig object
+  my $contig = new Bio::EnsEMBL::RawContig;
+  my $length = length($seq);
+  $contig->name("$acc.$ver.1.$length");
+  $contig->embl_offset(1);
+  $contig->length($length);
+  $contig->seq($seq);
+  $clone->add_Contig($contig);
+
+  my $dbclone;
+  eval {
+    $dbclone = $clone_apt->fetch_by_accession_version($acc,$ver);
+  };
+  if($dbclone){
+    #$dbclone->delete_by_dbID;
+    $clone_apt->remove($dbclone);
+    print "remove old version of clone\n";
+  }
+  my $id=$clone_apt->store($clone);
+
+  print "Clone '$acc' stored under dbid $id\n";
+  
+  exit 0;
+
+}elsif($opt_d && $opt_i){
+
+  # remove clone
+
+  my $clone=$clone_apt->fetch_by_dbID($opt_i);
+  if($clone ne ''){
+    $clone_apt->remove($clone);
+    print "Clone $opt_i removed\n";
+  }else{
+    print "Clone not found\n";
+  }
+
+}elsif($opt_i){
+
+  foreach my $id (split(/,/,$opt_i)){
+    my $clone=$clone_apt->fetch_by_dbID($id);
+    my $contigs=$clone->get_all_Contigs;
+    foreach my $contig (@$contigs){
+      my $cid=$contig->dbID;
+      print "$cid: ".$contig->seq."\n";
+    }
+  }
+  exit 0;
+
+}else{
+
+    print "WARN - provide a valid contig_id (reading) or sequence to write\n";
+
+}
+
+__END__
+
+=pod
+
+=head1 NAME - name
+
+dna_compress.pl
+
+=head1 DESCRIPTION
+
+Populates and tests the experimental dnac (compressed dna) table of ensembl.
+
+=head1 SYNOPSIS
+
+    
+
+=head1 EXAMPLES
+
+Create a clone/clontig/dna entry
+
+    dna_compress.pl -s 'ACGTTTGGAANANGCCGTTNNNNACCGNGTGCTAAGCC' -l test
+
+Clone 'test' stored under dbid 42
+
+
+Create a compressed version of this entry
+
+    dna_compress.pl -c -i 42
+
+
+Compare compressed and uncompressed versions of this entry
+
+    dna_compress.pl -C -i 42
+
+(carries out a fetch of the entire contig and then 5 different random parts from it)
+
+
+Extract sequence from DB, autosensing when only compressed is present.
+
+    dna_compress.pl -i 42
+
+28: ACGTTTGGAANANGCCGTTNNNNACCGNGTGCTAAGCC
+
+    mysql> delete from dna where dna_id=42;
+
+    dna_compress.pl -i 42
+
+Switched to compressed DNA reading mode
+28: ACGTTTGGAANANGCCGTTNNNNACCGNGTGCTAAGCC
+
+    
+
+=head1 FLAGS
+
+=over 4
+
+=item -h
+
+Displays short help
+
+=item -T
+
+Displays this help message
+
+=item -s <string>
+
+String of DNA to write into database
+
+=item -l <string>
+
+Label for clonename, contigname to write into database
+
+=item -c
+
+Make compressed DNA entry for all clones in DB, or just those listed by dbid (-i)
+
+=item -C
+
+Compare uncompressed and compressed DNA entries where present, and/or listed by dbid (-i)
+
+=item -i <num[,num]>
+
+List of clone dbid
+
+=back
+
+=head1 VERSION HISTORY
+
+=over 4
+
+=item 14-Jul-2003
+
+B<th> initial release
+
+=back
+
+=head1 BUGS
+
+=head1 AUTHOR
+
+B<Tim Hubbard> Email th@sanger.ac.uk
+
+=cut
--- a/sql/table.sql
+++ b/sql/table.sql
@@ -145,6 +145,24 @@ CREATE TABLE dna (
  PRIMARY KEY (dna_id)
 ) MAX_ROWS = 750000 AVG_ROW_LENGTH = 19000;

+#
+# Table structure for table 'dnac'
+#
+
+# Contains equivalent data to dna table, but 4 letters of DNA code are represented
+# by a single binary character, based on 2 bit encoding
+# don't need to worry about ambiguity of length, since this is stored in contig.length
+# n_line column contains start-end pairs of coordinates in the string that are really Ns
+
+CREATE TABLE dnac (
+  dna_id    int(10) unsigned NOT NULL auto_increment,
+  sequence  mediumblob NOT NULL,
+  created   datetime NOT NULL,
+  n_line    text,  
+
+  PRIMARY KEY (dna_id)
+) MAX_ROWS = 750000 AVG_ROW_LENGTH = 19000;
+
 #
 # Table structure for table 'exon'
 #