Commit 35729e7b authored by Tim Hubbard's avatar Tim Hubbard
Browse files

support for dnac compressed dna table

parent 7a2d4026
......@@ -233,10 +233,10 @@ sub fetch_all {
my $self = shift;
my $sth = $self->prepare("SELECT clone_id, name, embl_acc, version,
embl_version, htg_phase, created, modified
embl_version, htg_phase,
UNIX_TIMESTAMP(created), UNIX_TIMESTAMP(modified)
FROM clone");
$sth->execute();
my ($clone_id, $name, $embl_acc, $version, $embl_version, $htg_phase,
......
......@@ -509,8 +509,25 @@ sub remove {
my $dna_id = $sth->fetchrow_array;
$sth = $self->prepare("DELETE FROM dna WHERE dna_id = $dna_id");
$sth->execute;
my $flag_found=0;
if($sth->rows){
$flag_found=1;
}
# now try dnac - eval since table might not be present..
$sth = $self->prepare("DELETE FROM dnac WHERE dna_id = $dna_id");
eval{
$sth->execute;
};
if($@){
}else{
if($sth->rows){
$flag_found=1;
}
}
print "got here\n";
if($flag_found==0){
$self->throw("Failed to delete dna for dna_id '$dna_id'")
unless $sth->rows;
}
}
# Remove the contig
......
#
# EnsEMBL module for Bio::EnsEMBL::DBSQL::SequenceAdaptor
# EnsEMBL module for Bio::EnsEMBL::DSQL::SequenceAdaptor
#
# Cared for by Arne Stabenau <stabenau@ebi.ac.uk>
#
......@@ -55,6 +55,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
a -1 means until the end
Arg [4] : int $strand
-1, 1 are possible values
Arg [5] : (optional) force read from compressed DNA table
Example : $dna = $seq_adp->fetch_by_RawContig_start_end_strand($contig, 1,
1000, -1);
Description: retrieves the dna string from the database from the
......@@ -66,7 +67,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
=cut
sub fetch_by_RawContig_start_end_strand {
my ( $self, $contig, $start, $end, $strand ) = @_;
my ( $self, $contig, $start, $end, $strand, $compressed ) = @_;
my $sth;
if( $start < 1 ) {
......@@ -77,8 +78,39 @@ sub fetch_by_RawContig_start_end_strand {
$self->throw("contig arg must be contig not a [$contig]");
}
{
my $query;
if($compressed==1 || $self->is_compressed){
# query for compressed mode, uses hex and a different range
# needs to getback dna_id as will have to check for Ns in dnan table
my $start4=int(($start-1)/4)+1;
if( $end == -1 ) {
$query = "SELECT c.length, HEX( SUBSTRING( d.sequence, $start4 ) ),
d.n_line
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
} else {
my $end4=int(($end-1)/4)+1;
my $length4 = $end4 - $start4 + 1;
if( $length4 < 1 ) {
$self->throw( "Wrong parameters" );
}
$query = "SELECT c.length, HEX( SUBSTRING( d.sequence, $start4, $length4 ) ),
d.n_line
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
}
}else{
if( $end == -1 ) {
$query = "SELECT c.length, SUBSTRING( d.sequence, $start )
......@@ -98,6 +130,8 @@ sub fetch_by_RawContig_start_end_strand {
AND c.contig_id = ?";
}
}
#use the dna db if defined
if( defined $self->db()->dnadb() ) {
$sth = $self->db()->dnadb()->prepare( $query );
......@@ -108,7 +142,11 @@ sub fetch_by_RawContig_start_end_strand {
$sth->execute($contig->dbID());
if( my $aref = $sth->fetchrow_arrayref() ) {
my ( $length, $seq ) = @$aref;
my ( $length, $seq, $n_line ) = @$aref;
my $lenx=length($seq);
if($compressed || $self->is_compressed()){
$seq=$self->_dna_uncompress($length,$n_line,$start,$end,\$seq);
}
$seq =~ s/\s//g;
if( $strand == -1 ) {
return $self->_reverse_comp( $seq );
......@@ -116,8 +154,50 @@ sub fetch_by_RawContig_start_end_strand {
return $seq;
}
} else {
my $flag;
if($self->is_compressed==0){
# found nothing...perhaps in this entry is compressed - check
# and if present, set is_compressed
eval{
$query = "SELECT c.length, d.dna_id
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
$sth = $self->prepare( $query );
$sth->execute($contig->dbID());
if( my $aref = $sth->fetchrow_arrayref() ) {
$flag=1;
}
};
if($flag==1){
$self->is_compressed(1);
print "Switched to compressed DNA reading mode\n";
redo;
}
}else{
# found nothing...perhaps in this entry is uncompressed - check
# and if present, set is_compressed
eval{
$query = "SELECT c.length, d.dna_id
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?";
$sth = $self->prepare( $query );
$sth->execute($contig->dbID());
if( my $aref = $sth->fetchrow_arrayref() ) {
$flag=1;
}
};
if($flag==1){
$self->is_compressed(0);
print "Switched to uncompressed DNA reading mode\n";
redo;
}
}
return undef;
}
}
}
......@@ -265,6 +345,11 @@ sub store {
$sequence =~ tr/atgcn/ATGCN/;
# save compressed
if($self->is_compressed){
return $self->store_compressed($sequence,$date);
}
my $statement = $self->prepare("
INSERT INTO dna(sequence,created)
VALUES(?, FROM_UNIXTIME(?))
......@@ -284,6 +369,215 @@ sub store {
}
=head2 store_compressed
Arg [1] : string $sequence the dna sequence to be stored in the database
Arg [2] : string $date create date to be associated with the dna sequence
to be stored.
Arg [3] : dbid (optional)
Returntype : int
Exceptions : none
Caller : Bio::EnsEMBL::DBSQL::RawContigAdaptor::store_compressed,
Bio::EnsEMBL::DBSQL::RawContigAdaptor::store
=cut
sub store_compressed {
my ($self, $sequence, $date, $dbid) = @_;
# look for N's, save list of start and ends and convert to A's
my $n_line;
my $len=length($sequence);
#print "DEBUG $sequence ($len)\n";
{
if($sequence=~/^([^N]*)([N]+)(.*)$/){
my $l1=length($1);
my $l2=length($2);
my $start=$l1+1;
my $end=$l1+$l2;
$sequence=$1.('A' x $l2).$3;
$n_line.=" " if $n_line;
$n_line.="$start-$end";
#print "DEBUG: Ns from $start-$end\n";
redo;
}
}
#print "DEBUG $sequence\n";
# convert ACGT -> 00011011
my $sequence_comp=$self->_dna_compress($sequence);
my $id;
if($dbid){
my $statement = $self->prepare("
INSERT INTO dnac(dna_id,sequence,created,n_line)
VALUES(?, ?, FROM_UNIXTIME(?), ?)
");
my $rv = $statement->execute($dbid,$sequence_comp, $date, $n_line);
$self->throw("Failed to insert dna $sequence") unless $rv;
$statement->finish;
$id=$dbid;
}else{
my $statement = $self->prepare("
INSERT INTO dnac(sequence,created,flag_dnan)
VALUES(?, FROM_UNIXTIME(?), ?)
");
my $rv = $statement->execute($sequence_comp, $date, $n_line);
$self->throw("Failed to insert dna $sequence") unless $rv;
$statement->finish;
$statement = $self->prepare("SELECT last_insert_id()");
$statement->execute();
($id) = $statement->fetchrow();
$statement->finish;
}
return $id;
}
=head2 _dna_compress
Arg 1 : txt $dna_sequence
Function : build hex representation of string (ACGT -> 1B)
Returntype: txt
Exceptions: none
Caller : private to this module
=cut
{
my %table=(
'AA'=>'0',
'AC'=>'1',
'AG'=>'2',
'AT'=>'3',
'CA'=>'4',
'CC'=>'5',
'CG'=>'6',
'CT'=>'7',
'GA'=>'8',
'GC'=>'9',
'GG'=>'A',
'GT'=>'B',
'TA'=>'C',
'TC'=>'D',
'TG'=>'E',
'TT'=>'F',
);
sub _dna_compress {
my $self = shift;
my $seq = shift;
# length must be multiple of 4, so pad if necesary
my $len=length($seq);
$seq.='A' x ($len-(int($len/4))*4);
#print "DEBUG $seq\n";
# process in blocks of 2 to do conversion
my $seq_hex;
my @seq=split(//,$seq);
while(my $pair=shift(@seq)){
$pair.=shift(@seq);
$seq_hex.=$table{$pair};
}
return pack("H*",$seq_hex);
}
}
=head2 _dna_uncompress
Arg [1] : int $length
Arg [2] : int $n_line
Arg [3] : int $start
Arg [4] : int $end
Arg [5] : reference to $seq (hex string)
Function : converts hex string from DB into ACGT with appropriate truncation
Returntype : string
Exceptions : none
Caller : private to this module
=cut
{
my %table=(
'0'=>'AA',
'1'=>'AC',
'2'=>'AG',
'3'=>'AT',
'4'=>'CA',
'5'=>'CC',
'6'=>'CG',
'7'=>'CT',
'8'=>'GA',
'9'=>'GC',
'A'=>'GG',
'B'=>'GT',
'C'=>'TA',
'D'=>'TC',
'E'=>'TG',
'F'=>'TT',
);
sub _dna_uncompress {
my($self,$length,$n_line,$start,$end,$rseq)=@_;
# convert sequence back to ACGT (already in Hex)
my $seq2=join('',map{$table{$_}}split(//,$$rseq));
# calculate extent of $seq2 and truncate
my $start4=(int(($start-1)/4)+1)*4-3;
my $off=$start-$start4;
my $len;
if($end==-1){
$end=$length;
}
$len=$end-$start+1;
#print "truncate: $start,$start4,$end,$len\n";
my $seq=substr($seq2,$off,$len);
# mask with N's
foreach my $range (split(/ /,$n_line)){
my($st,$ed)=split(/\-/,$range);
#print "before: $st-$ed $start-$end\n";
# check in range
next if($ed<$start || $st>$end);
$ed=$end if $ed>$end;
$st=$start if $st<$start;
#print "after: $st-$ed\n";
my $len=$ed-$st+1;
$st-=$start;
substr($seq,$st,$len)='N'x$len;
}
return $seq;
}
}
=head2 is_compressed
Arg [1] : (optional) flag
to set or unset
Example : $adaptor->is_compressed(1)
Description: Getter/Setter for whether compressed DNA should be read/written
Returntype : true/false
Exceptions : none
Caller : Adaptors inherited from BaseAdaptor
=cut
sub is_compressed {
my( $obj, $flag ) = @_;
if (defined $flag) {
$obj->{'_is_truncated'} = $flag ? 1 : 0;
}
return $obj->{'_is_truncated'};
}
=head2 _reverse_comp
......
#!/usr/local/bin/perl -- # -*-Perl-*-
#
# Copyright (c) 2003 Tim Hubbard (th@sanger.ac.uk)
# Sanger Centre, Wellcome Trust Genome Campus, Cambs, UK
# All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation
#
# $Header$
# This is a driver script to populate and test the experimental dnac
# (compressed dna) table of ensembl. Use -T for pod based tutorial
use strict;
use Getopt::Std;
use vars qw($opt_h $opt_T $opt_i
$opt_U $opt_D $opt_P $opt_H $opt_p
$opt_s $opt_c $opt_C $opt_l $opt_d);
use Bio::EnsEMBL::DBSQL::DBAdaptor;
getopts("hTi:U:D:P:H:p:s:cCl:d");
$|=1;
# specify defaults
my $def_U='ensadmin';
my $def_D='testdna';
my $def_P='3307';
my $def_H='127.0.0.1';
if($opt_h){
&help;
}elsif($opt_T){
&help2;
}
sub help {
print <<ENDHELP;
dna_compress.pl
-h for help
-s char string of DNA (for writing)
-l char label of clone, contig (for writing)
-c convert
-C compare
-i id clone_dbid (for comparing, converting, extracting DNA)
-d delete clone
-U user ensembl db user [$def_U]
-D db ensembl db [$def_D]
-P port port of mysql [$def_P]
-H host host of mysql [$def_H]
-p pass passwd for mysqluser
ENDHELP
exit 0;
}
sub help2 {
exec('perldoc', $0);
}
# defaults or options
$opt_U=$def_U unless $opt_U;
$opt_D=$def_D unless $opt_D;
$opt_P=$def_P unless $opt_P;
$opt_H=$def_H unless $opt_H;
# db connection
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(-host => $opt_H,
-user => $opt_U,
-port => $opt_P,
-dbname => $opt_D,
-pass => $opt_p);
my $seq_apt=$db->get_SequenceAdaptor;
my $contig_apt=$db->get_RawContigAdaptor;
my $clone_apt=$db->get_CloneAdaptor;
if($opt_C || $opt_c){
# convert dna of sequences (loop over clone->contig->dna table,
# inserting into dnac table when no entry is already present)
# OR
# compare raw and compressed sequences
my $clones=$clone_apt->fetch_all();
my $nd=0;
my $ns=0;
foreach my $clone (@$clones){
my $clone_dbid=$clone->dbID;
next if($opt_i && $opt_i!=$clone_dbid);
my $clone_id=$clone->id;
my $created=$clone->created();
print "Processing Clone $clone_id ($clone_dbid) $created\n";
my $contigs=$clone->get_all_Contigs();
foreach my $contig (@$contigs){
my $contig_dbid=$contig->dbID;
my $contig_id=$clone->id;
my $seq=$seq_apt->fetch_by_RawContig_start_end_strand($contig,1,-1,1);
my $len=length($seq);
my $dna_id;
# custom sql is required to get dna_id and check if there is a compressed id
my $query="SELECT c.dna_id FROM contig c WHERE c.contig_id = ?";
my $sth=$contig_apt->prepare($query);
$sth->execute($contig_dbid);
if(my $aref=$sth->fetchrow_arrayref()){
($dna_id)=@$aref;
}else{
$contig_apt->thrown("No dna_id for Contig $contig_id");
}
print " Processing Contig $contig_id ($contig_dbid) [$len] $dna_id\n";
my $flag_compressed_exists=0;
$query="SELECT d.dna_id FROM dnac d WHERE d.dna_id = ?";
$sth=$seq_apt->prepare($query);
$sth->execute($dna_id);
if(my $aref=$sth->fetchrow_arrayref()){
$flag_compressed_exists=1;
}
# convert
if($opt_c){
if($flag_compressed_exists){
print " Compressed DNA entry already exists for dna_id $dna_id\n";
next;
}
# write compressed DNA record
my $id=$seq_apt->store_compressed($seq,$created,$dna_id);
print " Created compressed DNA entry\n";
}elsif($opt_C){
if(!$flag_compressed_exists){
print " Compressed DNA entry missing for dna_id $dna_id\n";
next;
}
my $seq2=$seq_apt->fetch_by_RawContig_start_end_strand($contig,1,-1,1,1);
if($seq ne $seq2){
print "DIFFERENT\n";
$nd++;
}else{
print "Same\n";
$ns++;
}
my $len2=length($seq2);
my $lenx=30;
if($len<$lenx){
$lenx=$len;
print substr($seq,0,$lenx)."\n".substr($seq2,0,$lenx)."\n\n";
}else{
print substr($seq,0,$lenx)." .. ".substr($seq,$len-$lenx,$len)."\n";
print substr($seq2,0,$lenx)." .. ".substr($seq2,$len-$lenx,$len2)."\n\n";
}
# now test a subsequence
my $lenr=$len-1;
for(my $i=0;$i<5;$i++){
my $r=rand;
my $st=int($r*$lenr)+1;
$r=rand;
my $ed=int($r*($lenr-$st))+$st+1;
print "$lenr: $st-$ed\n";
$seq=$seq_apt->fetch_by_RawContig_start_end_strand($contig,$st,$ed,1);
$len=length($seq);
$seq2=$seq_apt->fetch_by_RawContig_start_end_strand($contig,$st,$ed,1,1);
if($seq ne $seq2){
print "DIFFERENT\n";
$nd++;
}else{
print "Same\n";
$ns++;
}
my $len2=length($seq2);
my $lenx=30;
if($len<$lenx){
$lenx=$len;
print substr($seq,0,$lenx)."\n".substr($seq2,0,$lenx)."\n\n";
}else{
print substr($seq,0,$lenx)." .. ".substr($seq,$len-$lenx,$len)."\n";
print substr($seq2,0,$lenx)." .. ".substr($seq2,$len-$lenx,$len2)."\n\n";
}
}
}
}
}
print "Same: $ns; Different: $nd\n" if $opt_C;
exit 0;
}elsif($opt_s && $opt_l){
# sequence and label defined, so write into db;
my $acc=$opt_l;
my $seq=$opt_s;
my $ver=1;
# clone object
my $clone=new Bio::EnsEMBL::Clone;
$clone->id($acc);
$clone->htg_phase(4);
$clone->embl_id($acc);
$clone->version($ver);
$clone->embl_version($ver);
my $now = time;