Skip to content
Snippets Groups Projects
Commit 35729e7b authored by Tim Hubbard's avatar Tim Hubbard
Browse files

support for dnac compressed dna table

parent 7a2d4026
No related branches found
No related tags found
No related merge requests found
......@@ -233,9 +233,9 @@ sub fetch_all {
my $self = shift;
my $sth = $self->prepare("SELECT clone_id, name, embl_acc, version,
embl_version, htg_phase, created, modified
embl_version, htg_phase,
UNIX_TIMESTAMP(created), UNIX_TIMESTAMP(modified)
FROM clone");
$sth->execute();
......
......@@ -509,8 +509,25 @@ sub remove {
my $dna_id = $sth->fetchrow_array;
$sth = $self->prepare("DELETE FROM dna WHERE dna_id = $dna_id");
$sth->execute;
$self->throw("Failed to delete dna for dna_id '$dna_id'")
unless $sth->rows;
my $flag_found=0;
if($sth->rows){
$flag_found=1;
}
# now try dnac - eval since table might not be present..
$sth = $self->prepare("DELETE FROM dnac WHERE dna_id = $dna_id");
eval{
$sth->execute;
};
if($@){
}else{
if($sth->rows){
$flag_found=1;
}
}
print "got here\n";
if($flag_found==0){
$self->throw("Failed to delete dna for dna_id '$dna_id'")
}
}
# Remove the contig
......
#
# EnsEMBL module for Bio::EnsEMBL::DBSQL::SequenceAdaptor
# EnsEMBL module for Bio::EnsEMBL::DSQL::SequenceAdaptor
#
# Cared for by Arne Stabenau <stabenau@ebi.ac.uk>
#
......@@ -55,6 +55,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
a -1 means until the end
Arg [4] : int $strand
-1, 1 are possible values
Arg [5] : (optional) force read from compressed DNA table
Example : $dna = $seq_adp->fetch_by_RawContig_start_end_strand($contig, 1,
1000, -1);
Description: retrieves the dna string from the database from the
......@@ -66,7 +67,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
=cut
sub fetch_by_RawContig_start_end_strand {
my ( $self, $contig, $start, $end, $strand ) = @_;
my ( $self, $contig, $start, $end, $strand, $compressed ) = @_;
my $sth;
if( $start < 1 ) {
......@@ -77,46 +78,125 @@ sub fetch_by_RawContig_start_end_strand {
$self->throw("contig arg must be contig not a [$contig]");
}
my $query;
{
if( $end == -1 ) {
my $query;
$query = "SELECT c.length, SUBSTRING( d.sequence, $start )
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
if($compressed==1 || $self->is_compressed){
# query for compressed mode, uses hex and a different range
# needs to getback dna_id as will have to check for Ns in dnan table
my $start4=int(($start-1)/4)+1;
} else {
my $length = $end - $start + 1;
if( $length < 1 ) {
$self->throw( "Wrong parameters" );
}
$query = "SELECT c.length, SUBSTRING( d.sequence, $start, $length )
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
}
if( $end == -1 ) {
$query = "SELECT c.length, HEX( SUBSTRING( d.sequence, $start4 ) ),
d.n_line
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
} else {
my $end4=int(($end-1)/4)+1;
my $length4 = $end4 - $start4 + 1;
if( $length4 < 1 ) {
$self->throw( "Wrong parameters" );
}
$query = "SELECT c.length, HEX( SUBSTRING( d.sequence, $start4, $length4 ) ),
d.n_line
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
}
}else{
if( $end == -1 ) {
#use the dna db if defined
if( defined $self->db()->dnadb() ) {
$sth = $self->db()->dnadb()->prepare( $query );
} else {
$sth = $self->prepare( $query );
}
$query = "SELECT c.length, SUBSTRING( d.sequence, $start )
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
} else {
my $length = $end - $start + 1;
if( $length < 1 ) {
$self->throw( "Wrong parameters" );
}
$query = "SELECT c.length, SUBSTRING( d.sequence, $start, $length )
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
}
}
$sth->execute($contig->dbID());
#use the dna db if defined
if( defined $self->db()->dnadb() ) {
$sth = $self->db()->dnadb()->prepare( $query );
} else {
$sth = $self->prepare( $query );
}
if( my $aref = $sth->fetchrow_arrayref() ) {
my ( $length, $seq ) = @$aref;
$seq =~ s/\s//g;
if( $strand == -1 ) {
return $self->_reverse_comp( $seq );
$sth->execute($contig->dbID());
if( my $aref = $sth->fetchrow_arrayref() ) {
my ( $length, $seq, $n_line ) = @$aref;
my $lenx=length($seq);
if($compressed || $self->is_compressed()){
$seq=$self->_dna_uncompress($length,$n_line,$start,$end,\$seq);
}
$seq =~ s/\s//g;
if( $strand == -1 ) {
return $self->_reverse_comp( $seq );
} else {
return $seq;
}
} else {
return $seq;
my $flag;
if($self->is_compressed==0){
# found nothing...perhaps in this entry is compressed - check
# and if present, set is_compressed
eval{
$query = "SELECT c.length, d.dna_id
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
$sth = $self->prepare( $query );
$sth->execute($contig->dbID());
if( my $aref = $sth->fetchrow_arrayref() ) {
$flag=1;
}
};
if($flag==1){
$self->is_compressed(1);
print "Switched to compressed DNA reading mode\n";
redo;
}
}else{
# found nothing...perhaps in this entry is uncompressed - check
# and if present, set is_compressed
eval{
$query = "SELECT c.length, d.dna_id
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
$sth = $self->prepare( $query );
$sth->execute($contig->dbID());
if( my $aref = $sth->fetchrow_arrayref() ) {
$flag=1;
}
};
if($flag==1){
$self->is_compressed(0);
print "Switched to uncompressed DNA reading mode\n";
redo;
}
}
return undef;
}
} else {
return undef;
}
}
......@@ -264,6 +344,11 @@ sub store {
my ($self, $sequence, $date) = @_;
$sequence =~ tr/atgcn/ATGCN/;
# save compressed
if($self->is_compressed){
return $self->store_compressed($sequence,$date);
}
my $statement = $self->prepare("
INSERT INTO dna(sequence,created)
......@@ -284,6 +369,215 @@ sub store {
}
=head2 store_compressed
Arg [1] : string $sequence the dna sequence to be stored in the database
Arg [2] : string $date create date to be associated with the dna sequence
to be stored.
Arg [3] : dbid (optional)
Returntype : int
Exceptions : none
Caller : Bio::EnsEMBL::DBSQL::RawContigAdaptor::store_compressed,
Bio::EnsEMBL::DBSQL::RawContigAdaptor::store
=cut
sub store_compressed {
my ($self, $sequence, $date, $dbid) = @_;
# look for N's, save list of start and ends and convert to A's
my $n_line;
my $len=length($sequence);
#print "DEBUG $sequence ($len)\n";
{
if($sequence=~/^([^N]*)([N]+)(.*)$/){
my $l1=length($1);
my $l2=length($2);
my $start=$l1+1;
my $end=$l1+$l2;
$sequence=$1.('A' x $l2).$3;
$n_line.=" " if $n_line;
$n_line.="$start-$end";
#print "DEBUG: Ns from $start-$end\n";
redo;
}
}
#print "DEBUG $sequence\n";
# convert ACGT -> 00011011
my $sequence_comp=$self->_dna_compress($sequence);
my $id;
if($dbid){
my $statement = $self->prepare("
INSERT INTO dnac(dna_id,sequence,created,n_line)
VALUES(?, ?, FROM_UNIXTIME(?), ?)
");
my $rv = $statement->execute($dbid,$sequence_comp, $date, $n_line);
$self->throw("Failed to insert dna $sequence") unless $rv;
$statement->finish;
$id=$dbid;
}else{
my $statement = $self->prepare("
INSERT INTO dnac(sequence,created,flag_dnan)
VALUES(?, FROM_UNIXTIME(?), ?)
");
my $rv = $statement->execute($sequence_comp, $date, $n_line);
$self->throw("Failed to insert dna $sequence") unless $rv;
$statement->finish;
$statement = $self->prepare("SELECT last_insert_id()");
$statement->execute();
($id) = $statement->fetchrow();
$statement->finish;
}
return $id;
}
=head2 _dna_compress
Arg 1 : txt $dna_sequence
Function : build hex representation of string (ACGT -> 1B)
Returntype: txt
Exceptions: none
Caller : private to this module
=cut
{
my %table=(
'AA'=>'0',
'AC'=>'1',
'AG'=>'2',
'AT'=>'3',
'CA'=>'4',
'CC'=>'5',
'CG'=>'6',
'CT'=>'7',
'GA'=>'8',
'GC'=>'9',
'GG'=>'A',
'GT'=>'B',
'TA'=>'C',
'TC'=>'D',
'TG'=>'E',
'TT'=>'F',
);
sub _dna_compress {
my $self = shift;
my $seq = shift;
# length must be multiple of 4, so pad if necesary
my $len=length($seq);
$seq.='A' x ($len-(int($len/4))*4);
#print "DEBUG $seq\n";
# process in blocks of 2 to do conversion
my $seq_hex;
my @seq=split(//,$seq);
while(my $pair=shift(@seq)){
$pair.=shift(@seq);
$seq_hex.=$table{$pair};
}
return pack("H*",$seq_hex);
}
}
=head2 _dna_uncompress
Arg [1] : int $length
Arg [2] : int $n_line
Arg [3] : int $start
Arg [4] : int $end
Arg [5] : reference to $seq (hex string)
Function : converts hex string from DB into ACGT with appropriate truncation
Returntype : string
Exceptions : none
Caller : private to this module
=cut
{
my %table=(
'0'=>'AA',
'1'=>'AC',
'2'=>'AG',
'3'=>'AT',
'4'=>'CA',
'5'=>'CC',
'6'=>'CG',
'7'=>'CT',
'8'=>'GA',
'9'=>'GC',
'A'=>'GG',
'B'=>'GT',
'C'=>'TA',
'D'=>'TC',
'E'=>'TG',
'F'=>'TT',
);
sub _dna_uncompress {
my($self,$length,$n_line,$start,$end,$rseq)=@_;
# convert sequence back to ACGT (already in Hex)
my $seq2=join('',map{$table{$_}}split(//,$$rseq));
# calculate extent of $seq2 and truncate
my $start4=(int(($start-1)/4)+1)*4-3;
my $off=$start-$start4;
my $len;
if($end==-1){
$end=$length;
}
$len=$end-$start+1;
#print "truncate: $start,$start4,$end,$len\n";
my $seq=substr($seq2,$off,$len);
# mask with N's
foreach my $range (split(/ /,$n_line)){
my($st,$ed)=split(/\-/,$range);
#print "before: $st-$ed $start-$end\n";
# check in range
next if($ed<$start || $st>$end);
$ed=$end if $ed>$end;
$st=$start if $st<$start;
#print "after: $st-$ed\n";
my $len=$ed-$st+1;
$st-=$start;
substr($seq,$st,$len)='N'x$len;
}
return $seq;
}
}
=head2 is_compressed
Arg [1] : (optional) flag
to set or unset
Example : $adaptor->is_compressed(1)
Description: Getter/Setter for whether compressed DNA should be read/written
Returntype : true/false
Exceptions : none
Caller : Adaptors inherited from BaseAdaptor
=cut
sub is_compressed {
my( $obj, $flag ) = @_;
if (defined $flag) {
$obj->{'_is_truncated'} = $flag ? 1 : 0;
}
return $obj->{'_is_truncated'};
}
=head2 _reverse_comp
......
#!/usr/local/bin/perl -- # -*-Perl-*-
#
# Copyright (c) 2003 Tim Hubbard (th@sanger.ac.uk)
# Sanger Centre, Wellcome Trust Genome Campus, Cambs, UK
# All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation
#
# $Header$
# This is a driver script to populate and test the experimental dnac
# (compressed dna) table of ensembl. Use -T for pod based tutorial
use strict;
use Getopt::Std;
use vars qw($opt_h $opt_T $opt_i
$opt_U $opt_D $opt_P $opt_H $opt_p
$opt_s $opt_c $opt_C $opt_l $opt_d);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
getopts("hTi:U:D:P:H:p:s:cCl:d");
$|=1;
# specify defaults
my $def_U='ensadmin';
my $def_D='testdna';
my $def_P='3307';
my $def_H='127.0.0.1';
if($opt_h){
&help;
}elsif($opt_T){
&help2;
}
sub help {
print <<ENDHELP;
dna_compress.pl
-h for help
-s char string of DNA (for writing)
-l char label of clone, contig (for writing)
-c convert
-C compare
-i id clone_dbid (for comparing, converting, extracting DNA)
-d delete clone
-U user ensembl db user [$def_U]
-D db ensembl db [$def_D]
-P port port of mysql [$def_P]
-H host host of mysql [$def_H]
-p pass passwd for mysqluser
ENDHELP
exit 0;
}
sub help2 {
exec('perldoc', $0);
}
# defaults or options
$opt_U=$def_U unless $opt_U;
$opt_D=$def_D unless $opt_D;
$opt_P=$def_P unless $opt_P;
$opt_H=$def_H unless $opt_H;
# db connection
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $opt_H,
-user => $opt_U,
-port => $opt_P,
-dbname => $opt_D,
-pass => $opt_p);
my $seq_apt=$db->get_SequenceAdaptor;
my $contig_apt=$db->get_RawContigAdaptor;
my $clone_apt=$db->get_CloneAdaptor;
if($opt_C || $opt_c){
# convert dna of sequences (loop over clone->contig->dna table,
# inserting into dnac table when no entry is already present)
# OR
# compare raw and compressed sequences
my $clones=$clone_apt->fetch_all();
my $nd=0;
my $ns=0;
foreach my $clone (@$clones){
my $clone_dbid=$clone->dbID;
next if($opt_i && $opt_i!=$clone_dbid);
my $clone_id=$clone->id;
my $created=$clone->created();
print "Processing Clone $clone_id ($clone_dbid) $created\n";
my $contigs=$clone->get_all_Contigs();
foreach my $contig (@$contigs){
my $contig_dbid=$contig->dbID;
my $contig_id=$clone->id;
my $seq=$seq_apt->fetch_by_RawContig_start_end_strand($contig,1,-1,1);
my $len=length($seq);
my $dna_id;
# custom sql is required to get dna_id and check if there is a compressed id
my $query="SELECT c.dna_id FROM contig c WHERE c.contig_id = ?";
my $sth=$contig_apt->prepare($query);
$sth->execute($contig_dbid);
if(my $aref=$sth->fetchrow_arrayref()){
($dna_id)=@$aref;
}else{
$contig_apt->thrown("No dna_id for Contig $contig_id");
}
print " Processing Contig $contig_id ($contig_dbid) [$len] $dna_id\n";
my $flag_compressed_exists=0;
$query="SELECT d.dna_id FROM dnac d WHERE d.dna_id = ?";
$sth=$seq_apt->prepare($query);
$sth->execute($dna_id);
if(my $aref=$sth->fetchrow_arrayref()){
$flag_compressed_exists=1;
}
# convert
if($opt_c){
if($flag_compressed_exists){
print " Compressed DNA entry already exists for dna_id $dna_id\n";
next;
}
# write compressed DNA record
my $id=$seq_apt->store_compressed($seq,$created,$dna_id);
print " Created compressed DNA entry\n";
}elsif($opt_C){
if(!$flag_compressed_exists){
print " Compressed DNA entry missing for dna_id $dna_id\n";
next;
}
my $seq2=$seq_apt->fetch_by_RawContig_start_end_strand($contig,1,-1,1,1);
if($seq ne $seq2){
print "DIFFERENT\n";
$nd++;
}else{
print "Same\n";
$ns++;
}
my $len2=length($seq2);
my $lenx=30;
if($len<$lenx){
$lenx=$len;
print substr($seq,0,$lenx)."\n".substr($seq2,0,$lenx)."\n\n";
}else{
print substr($seq,0,$lenx)." .. ".substr($seq,$len-$lenx,$len)."\n";
print substr($seq2,0,$lenx)." .. ".substr($seq2,$len-$lenx,$len2)."\n\n";
}
# now test a subsequence
my $lenr=$len-1;
for(my $i=0;$i<5;$i++){
my $r=rand;
my $st=int($r*$lenr)+1;
$r=rand;
my $ed=int($r*($lenr-$st))+$st+1;
print "$lenr: $st-$ed\n";
$seq=$seq_apt->fetch_by_RawContig_start_end_strand($contig,$st,$ed,1);
$len=length($seq);
$seq2=$seq_apt->fetch_by_RawContig_start_end_strand($contig,$st,$ed,1,1);
if($seq ne $seq2){
print "DIFFERENT\n";
$nd++;
}else{
print "Same\n";
$ns++;
}
my $len2=length($seq2);
my $lenx=30;
if($len<$lenx){
$lenx=$len;
print substr($seq,0,$lenx)."\n".substr($seq2,0,$lenx)."\n\n";
}else{
print substr($seq,0,$lenx)." .. ".substr($seq,$len-$lenx,$len)."\n";
print substr($seq2,0,$lenx)." .. ".substr($seq2,$len-$lenx,$len2)."\n\n";
}
}
}
}
}
print "Same: $ns; Different: $nd\n" if $opt_C;
exit 0;
}elsif($opt_s && $opt_l){
# sequence and label defined, so write into db;
my $acc=$opt_l;
my $seq=$opt_s;
my $ver=1;
# clone object
my $clone=new Bio::EnsEMBL::Clone;
$clone->id($acc);
$clone->htg_phase(4);
$clone->embl_id($acc);
$clone->version($ver);
$clone->embl_version($ver);
my $now = time;
$clone->created($now);
$clone->modified($now);
# contig object
my $contig = new Bio::EnsEMBL::RawContig;
my $length = length($seq);
$contig->name("$acc.$ver.1.$length");
$contig->embl_offset(1);
$contig->length($length);
$contig->seq($seq);
$clone->add_Contig($contig);
my $dbclone;
eval {
$dbclone = $clone_apt->fetch_by_accession_version($acc,$ver);
};
if($dbclone){
#$dbclone->delete_by_dbID;
$clone_apt->remove($dbclone);
print "remove old version of clone\n";
}
my $id=$clone_apt->store($clone);
print "Clone '$acc' stored under dbid $id\n";
exit 0;
}elsif($opt_d && $opt_i){
# remove clone
my $clone=$clone_apt->fetch_by_dbID($opt_i);
if($clone ne ''){
$clone_apt->remove($clone);
print "Clone $opt_i removed\n";
}else{
print "Clone not found\n";
}
}elsif($opt_i){
foreach my $id (split(/,/,$opt_i)){
my $clone=$clone_apt->fetch_by_dbID($id);
my $contigs=$clone->get_all_Contigs;
foreach my $contig (@$contigs){
my $cid=$contig->dbID;
print "$cid: ".$contig->seq."\n";
}
}
exit 0;
}else{
print "WARN - provide a valid contig_id (reading) or sequence to write\n";
}
__END__
=pod
=head1 NAME - name
dna_compress.pl
=head1 DESCRIPTION
Populates and tests the experimental dnac (compressed dna) table of ensembl.
=head1 SYNOPSIS
=head1 EXAMPLES
Create a clone/clontig/dna entry
dna_compress.pl -s 'ACGTTTGGAANANGCCGTTNNNNACCGNGTGCTAAGCC' -l test
Clone 'test' stored under dbid 42
Create a compressed version of this entry
dna_compress.pl -c -i 42
Compare compressed and uncompressed versions of this entry
dna_compress.pl -C -i 42
(carries out a fetch of the entire contig and then 5 different random parts from it)
Extract sequence from DB, autosensing when only compressed is present.
dna_compress.pl -i 42
28: ACGTTTGGAANANGCCGTTNNNNACCGNGTGCTAAGCC
mysql> delete from dna where dna_id=42;
dna_compress.pl -i 42
Switched to compressed DNA reading mode
28: ACGTTTGGAANANGCCGTTNNNNACCGNGTGCTAAGCC
=head1 FLAGS
=over 4
=item -h
Displays short help
=item -T
Displays this help message
=item -s <string>
String of DNA to write into database
=item -l <string>
Label for clonename, contigname to write into database
=item -c
Make compressed DNA entry for all clones in DB, or just those listed by dbid (-i)
=item -C
Compare uncompressed and compressed DNA entries where present, and/or listed by dbid (-i)
=item -i <num[,num]>
List of clone dbid
=back
=head1 VERSION HISTORY
=over 4
=item 14-Jul-2003
B<th> initial release
=back
=head1 BUGS
=head1 AUTHOR
B<Tim Hubbard> Email th@sanger.ac.uk
=cut
......@@ -145,6 +145,24 @@ CREATE TABLE dna (
PRIMARY KEY (dna_id)
) MAX_ROWS = 750000 AVG_ROW_LENGTH = 19000;
#
# Table structure for table 'dnac'
#
# Contains equivalent data to dna table, but 4 letters of DNA code are represented
# by a single binary character, based on 2 bit encoding
# don't need to worry about ambiguity of length, since this is stored in contig.length
# n_line column contains start-end pairs of coordinates in the string that are really Ns
CREATE TABLE dnac (
dna_id int(10) unsigned NOT NULL auto_increment,
sequence mediumblob NOT NULL,
created datetime NOT NULL,
n_line text,
PRIMARY KEY (dna_id)
) MAX_ROWS = 750000 AVG_ROW_LENGTH = 19000;
#
# Table structure for table 'exon'
#
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment