Commit b8c6251f authored by Magali Ruffier's avatar Magali Ruffier
Browse files

removed dnac data, tests, adaptors

parent 6ae7371b
=head1 LICENSE
Copyright (c) 1999-2013 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=cut
=head1 NAME
Bio::EnsEMBL::DBSQL::CompressedSequenceAdaptor - Facilitates DB storage and retrieval of compressed sequence
=head1 SYNOPSIS
$seq_adptr = $database_adaptor->get_CompressedSequenceAdaptor();
$dna =
${ $seq_adptr->fetch_by_Slice_start_end_strand( $slice, 1, 1000,
-1 ) };
=head1 DESCRIPTION
An adaptor for the storage and retrieval of compressed DNA sequence
(held in the dnac table) in an EnsEMBL database
=head1 METHODS
=cut
package Bio::EnsEMBL::DBSQL::CompressedSequenceAdaptor;
use vars qw(@ISA);
use strict;
use Bio::EnsEMBL::DBSQL::SequenceAdaptor;
@ISA = qw(Bio::EnsEMBL::DBSQL::SequenceAdaptor);
# _fetch_seq
#
#   Arg [1]    : int $seq_region_id
#                seq_region whose dnac row is read
#   Arg [2]    : int $start
#                1-based position of the first base wanted
#   Arg [3]    : int $len
#                number of bases wanted
#   Description: Reads the requested stretch of packed sequence from the
#                dnac table and returns it as a plain string.
#                Every byte of dnac.sequence holds four bases at two bits
#                each (0,1,2,3 = A,C,T,G). Characters other than ACTG are
#                kept in dnac.n_line as colon-delimited
#                <offset><char><length> runs and are painted back over the
#                decoded string here.
#   Returntype : reference to a string
sub _fetch_seq {
  my ($self, $seq_region_id, $start, $len) = @_;

  # 1-based byte that holds the first requested base, and enough bytes to
  # cover $len bases even when the request starts and ends mid-byte
  my $comp_start = (($start - 1) >> 2) + 1;
  my $comp_len   = ($len >> 2) + 2;

  my ($bvector, $nline);

  my $sth = $self->prepare(
    "SELECT SUBSTRING( d.sequence, ?, ?), n_line
       FROM dnac d
      WHERE d.seq_region_id = ?");
  $sth->bind_param(1, $comp_start,    SQL_INTEGER);
  $sth->bind_param(2, $comp_len,      SQL_INTEGER);
  $sth->bind_param(3, $seq_region_id, SQL_INTEGER);
  $sth->execute();
  $sth->bind_columns(\$bvector, \$nline);
  $sth->fetch();
  $sth->finish();

  # no matching row leaves both columns undefined
  $bvector = '' unless defined $bvector;

  # convert sequence from binary string to 0123 string, four 2-bit
  # elements per byte
  my $n_bases = length($bvector) << 2;
  my $str = join('', map { vec($bvector, $_, 2) } 0 .. $n_bases - 1);

  # convert from 0123 to ACTG
  $str =~ tr/0123/ACTG/;

  # drop the bases that precede $start within the first byte and anything
  # fetched beyond the requested length
  my $skip = ($start - 1) % 4;
  $str = $skip < length($str) ? substr($str, $skip, $len) : '';

  # expand the nlines and place them back in the sequence
  my @nlines = defined $nline ? split(/:/, $nline) : ();
  foreach my $nl (@nlines) {
    my ($offset, $char, $nlen) = $nl =~ /(\d+)(\D)(\d+)/;
    next unless defined $offset;

    # skip nlines entirely out of range
    next if ($offset + $nlen - 1) < $start || $offset > ($start + $len - 1);

    # obtain relative (1-based) offset into requested region
    $offset = $offset - $start + 1;

    # nlines that partially overlap requested region have to be shrunk
    if ($offset < 1) {
      $nlen  -= (1 - $offset);
      $offset = 1;
    }

    # $offset is relative by now, so the end of the run must be compared
    # with $len and not with the absolute end coordinate; otherwise the
    # assignment below would lengthen the returned string
    my $overhang = ($offset + $nlen - 1) - $len;
    $nlen -= $overhang if $overhang > 0;

    substr($str, $offset - 1, $nlen) = $char x $nlen;
  }

  return \$str;
}
=head2 store

  Arg [1]    : int $seq_region_id
               the id of the sequence region this dna will be associated
               with.
  Arg [2]    : string or string reference $sequence
               the dna sequence to be stored in the database
  Example    : $seq_adaptor->store(12, \'ACTGGGTACCAAACAAACACAACA');
  Description: Packs a dna sequence at two bits per base (A,C,T,G =
               0,1,2,3) and inserts it into the dnac table. Runs of any
               other character (such as N) are packed as 0 and recorded
               in the n_line column as colon-delimited
               <offset><char><length> triplets, offset being 1-based.
  Returntype : none
  Exceptions : thrown if $seq_region_id is not supplied
  Caller     : Bio::EnsEMBL::DBSQL::RawContigAdaptor::store
  Status     : Stable

=cut

sub store {
  my ($self, $seq_region_id, $sequence) = @_;

  if (!$seq_region_id) {
    throw('seq_region_id is required');
  }

  # the sequence is documented as a reference but has also been passed as
  # a plain string, so accept both
  $sequence = ${$sequence} if ref($sequence) eq 'SCALAR';
  $sequence = uc($sequence);

  my $bvector = '';

  # convert sequence to 0s,1s,2s and 3s
  $sequence =~ tr/ACTG/0123/;

  # nlines cover sequence which is not ACTG such as N
  # nline format is a set of colon delimited int, char, int triplets:
  # <offset><code><length>
  my ($nline_char, $nline_len, $nline_off);
  my @nlines;

  my $len = length($sequence);
  for (my $i = 0; $i < $len; $i++) {
    my $char = substr($sequence, $i, 1);

    # check if this character was an A,C,T or G (and was converted to a
    # 0,1,2,3)
    if ($char =~ /[0-3]/) {
      vec($bvector, $i, 2) = $char;

      if (defined $nline_char) {
        # end of an nline
        push @nlines, "$nline_off$nline_char$nline_len";
        $nline_char = undef;
      }
      next;
    }

    # this was not an ACTG
    if (defined $nline_char && $nline_char eq $char) {
      # continuation of an nline
      $nline_len++;
    }
    else {
      # a different character closes the previous nline, if there was one
      push @nlines, "$nline_off$nline_char$nline_len"
        if defined $nline_char;

      # start of a new nline
      $nline_char = $char;
      $nline_len  = 1;
      $nline_off  = $i + 1;
    }

    # need to put numeric val into bitvector despite nline
    vec($bvector, $i, 2) = 0;
  }

  # an nline that runs up to the end of the sequence is still open here
  # and would otherwise be lost
  push @nlines, "$nline_off$nline_char$nline_len" if defined $nline_char;

  my $nline = join(':', @nlines);

  my $statement = $self->prepare(
    "INSERT INTO dnac(seq_region_id, sequence, n_line) VALUES(?,?,?)");
  $statement->bind_param(1, $seq_region_id, SQL_INTEGER);
  $statement->bind_param(2, $bvector,       SQL_BLOB);
  $statement->bind_param(3, $nline,         SQL_LONGVARCHAR);
  $statement->execute();
  $statement->finish();

  return;
}
1;
......@@ -326,7 +326,7 @@ sub get_available_adaptors {
map( { $_ => "Bio::EnsEMBL::DBSQL::${_}Adaptor" } qw(
Analysis ArchiveStableId Attribute
AssemblyExceptionFeature AssemblyMapper CoordSystem
CompressedSequence DBEntry DnaAlignFeature
DBEntry DnaAlignFeature
DensityFeature DensityType Exon
Gene KaryotypeBand MiscSet
MiscFeature PredictionTranscript PredictionExon
......
......@@ -63,7 +63,7 @@ my $aaga = $db->get_adaptor('AltAlleleGroup');
my $group_list = $aaga->fetch_all_Groups;
my $aag = $group_list->[0];
ok ($aag->rep_Gene_id == 18256,"Check for correct selection of allele");
is($aag->rep_Gene_id, 18256,"Check for correct selection of allele");
is_deeply ($aag->get_all_Gene_ids,[18256,18257,18258,18259],"Check group members");
is_deeply ($aag->get_all_Gene_ids('no ref'),[18257,18258,18259],"Test effect of excluding reference gene");
......@@ -125,4 +125,4 @@ $aag = $group_list->[0];
ok(scalar(@$group_list) == 1, "Pretend multi-species fetch returns same groups as normal.");
done_testing();
\ No newline at end of file
done_testing();
use strict;
use warnings;

use Test::More;

use Bio::EnsEMBL::Test::TestUtils;
use Bio::EnsEMBL::Test::MultiTestDB;
use Bio::EnsEMBL::Slice;

our $verbose = 0;

my $multi_db = Bio::EnsEMBL::Test::MultiTestDB->new;
my $db       = $multi_db->get_DBAdaptor('core');

#
# Test fetch_by_Slice_start_end_strand
#
my $slice_adaptor = $db->get_SliceAdaptor;
my $seq_adaptor   = $db->get_CompressedSequenceAdaptor();

# the sequence mixes ACTG with other characters (N, Y) so that both the
# packed data and the n_line column are exercised
my $seq = 'ACTGAAANTTANNNATYTTTAAATTACCC';
my $len = length($seq);

my $contig_cs = $db->get_CoordSystemAdaptor->fetch_by_name('contig');

# keep copies of the tables this test writes to; they are put back at the
# end of the script
$multi_db->save('core', 'dnac', 'seq_region');

#we need to create a fake seq region because the length of the seq region
#needs to match the length of the sequence we are inserting. Otherwise
#we can get weird padding at the end due to the last byte being not
#fully packed but still labelled as non-gap
my $sth =
  $db->dbc->prepare('INSERT INTO seq_region (name, length, coord_system_id) ' .
                    'VALUES (?,?,?)');
$sth->execute('testfrag', $len, $contig_cs->dbID);

my $slice         = $slice_adaptor->fetch_by_region('contig', 'testfrag');
my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);

debug("Storing sequence: $seq");
$seq_adaptor->store($seq_region_id, $seq);
ok(1, 'Stored compressed sequence');

my $new_seq =
  ${ $seq_adaptor->fetch_by_Slice_start_end_strand($slice, 1, $len, 1) };
is($new_seq, $seq, 'Retrieved sequence matches the stored sequence');
# report what came back from the database, not what was sent to it
debug("Retrieved sequence: $new_seq");

# a request reaching beyond both ends of the seq region is expected to
# come back padded with Ns
my $flanking = 5;
$new_seq = ${ $seq_adaptor->fetch_by_Slice_start_end_strand($slice,
                                                            1 - $flanking,
                                                            $len + $flanking,
                                                            1) };
is($new_seq,
   ('N' x $flanking) . $seq . ('N' x $flanking),
   'Retrieved sequence is padded with Ns in the flanking regions');
debug("Retrieved sequence (with $flanking flanking): $new_seq");

$multi_db->restore('core', 'dnac', 'seq_region');

done_testing();
......@@ -233,7 +233,7 @@ foreach my $seg (@projection) {
my $csa = $db->get_CoordSystemAdaptor();
my $ctg_cs = $csa->fetch_by_name('contig');
$multi->save('core', 'seq_region', 'dna', 'dnac');
$multi->save('core', 'seq_region', 'dna');
my $ctg_len = 50;
my $name = 'testregion';
......@@ -308,7 +308,7 @@ my $chr_map = $ctg_slice->project( $chr_cs->name, $chr_cs->version );
# $chr_map->[0]->[2]->name eq $chr_slice->name );
$multi->restore('core', 'seq_region', 'dna', 'dnac');
$multi->restore('core', 'seq_region', 'dna');
#
......
......@@ -226,13 +226,6 @@ CREATE TABLE `dna_align_feature` (
KEY `pair_idx` (`pair_dna_align_feature_id`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1 MAX_ROWS=100000000 AVG_ROW_LENGTH=80;
CREATE TABLE `dnac` (
`seq_region_id` int(10) unsigned NOT NULL,
`sequence` mediumblob NOT NULL,
`n_line` text,
PRIMARY KEY (`seq_region_id`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1 MAX_ROWS=750000 AVG_ROW_LENGTH=19000;
CREATE TABLE `exon` (
`exon_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`seq_region_id` int(10) unsigned NOT NULL,
......
......@@ -226,13 +226,6 @@ CREATE TABLE `dna_align_feature` (
KEY `pair_idx` (`pair_dna_align_feature_id`)
) ENGINE=MyISAM MAX_ROWS=100000000 AVG_ROW_LENGTH=80;
CREATE TABLE `dnac` (
`seq_region_id` int(10) unsigned NOT NULL,
`sequence` mediumblob NOT NULL,
`n_line` text ,
PRIMARY KEY (`seq_region_id`)
) ENGINE=MyISAM MAX_ROWS=750000 AVG_ROW_LENGTH=19000;
CREATE TABLE `exon` (
`exon_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`seq_region_id` int(10) unsigned NOT NULL,
......
......@@ -226,13 +226,6 @@ CREATE TABLE `dna_align_feature` (
KEY `pair_idx` (`pair_dna_align_feature_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 MAX_ROWS=100000000 AVG_ROW_LENGTH=80;
CREATE TABLE `dnac` (
`seq_region_id` int(10) unsigned NOT NULL DEFAULT '0',
`sequence` mediumblob NOT NULL,
`n_line` text CHARACTER SET latin1 COLLATE latin1_bin,
PRIMARY KEY (`seq_region_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 MAX_ROWS=750000 AVG_ROW_LENGTH=19000;
CREATE TABLE `exon` (
`exon_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`seq_region_id` int(10) unsigned NOT NULL,
......
......@@ -235,13 +235,6 @@ CREATE TABLE `dna_align_feature` (
KEY `pair_idx` (`pair_dna_align_feature_id`)
) ENGINE=MyISAM AUTO_INCREMENT=29797338 DEFAULT CHARSET=latin1 COLLATE=latin1_bin MAX_ROWS=100000000 AVG_ROW_LENGTH=80;
CREATE TABLE `dnac` (
`seq_region_id` int(10) unsigned NOT NULL DEFAULT '0',
`sequence` mediumblob NOT NULL,
`n_line` text COLLATE latin1_bin,
PRIMARY KEY (`seq_region_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_bin MAX_ROWS=750000 AVG_ROW_LENGTH=19000;
CREATE TABLE `exon` (
`exon_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`seq_region_id` int(10) unsigned NOT NULL,
......
......@@ -187,27 +187,6 @@ CREATE TABLE dna (
) COLLATE=latin1_swedish_ci ENGINE=MyISAM MAX_ROWS=750000 AVG_ROW_LENGTH=19000;
/**
@table dnac
@desc Contains equivalent data to dna table, but 4 letters of DNA code are represented by a single binary character, based on 2 bit encoding.
@column seq_region_id Primary key, internal identifier. Foreign key references to the @link seq_region table.
@column sequence Compressed DNA sequence. A, C, T and G are encoded as 0, 1, 2 and 3; any other character is encoded as 0 and described by n_line.
@column n_line Colon-delimited triplets of offset, character and length, one per run of characters other than A, C, T and G (such as N). The offset is the 1-based position of the run in the sequence.
*/
CREATE TABLE dnac (
seq_region_id INT(10) UNSIGNED NOT NULL,
sequence MEDIUMBLOB NOT NULL,
n_line TEXT,
PRIMARY KEY (seq_region_id)
) COLLATE=latin1_swedish_ci ENGINE=MyISAM MAX_ROWS=750000 AVG_ROW_LENGTH=19000;
/**
@table karyotype
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment