Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ensembl-gh-mirror
ensembl
Commits
35729e7b
Commit
35729e7b
authored
Jul 14, 2003
by
Tim Hubbard
Browse files
support for dnac compressed dna table
parent
7a2d4026
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
734 additions
and
38 deletions
+734
-38
modules/Bio/EnsEMBL/DBSQL/CloneAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/CloneAdaptor.pm
+2
-2
modules/Bio/EnsEMBL/DBSQL/RawContigAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/RawContigAdaptor.pm
+19
-2
modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
+328
-34
scripts/dna_compress.pl
scripts/dna_compress.pl
+367
-0
sql/table.sql
sql/table.sql
+18
-0
No files found.
modules/Bio/EnsEMBL/DBSQL/CloneAdaptor.pm
View file @
35729e7b
...
...
@@ -233,9 +233,9 @@ sub fetch_all {
my
$self
=
shift
;
my
$sth
=
$self
->
prepare
("
SELECT clone_id, name, embl_acc, version,
embl_version, htg_phase, created, modified
embl_version, htg_phase,
UNIX_TIMESTAMP(created), UNIX_TIMESTAMP(modified)
FROM clone
");
$sth
->
execute
();
...
...
modules/Bio/EnsEMBL/DBSQL/RawContigAdaptor.pm
View file @
35729e7b
...
...
@@ -509,8 +509,25 @@ sub remove {
my
$dna_id
=
$sth
->
fetchrow_array
;
$sth
=
$self
->
prepare
("
DELETE FROM dna WHERE dna_id =
$dna_id
");
$sth
->
execute
;
$self
->
throw
("
Failed to delete dna for dna_id '
$dna_id
'
")
unless
$sth
->
rows
;
my
$flag_found
=
0
;
if
(
$sth
->
rows
){
$flag_found
=
1
;
}
# now try dnac - eval since table might not be present..
$sth
=
$self
->
prepare
("
DELETE FROM dnac WHERE dna_id =
$dna_id
");
eval
{
$sth
->
execute
;
};
if
(
$@
){
}
else
{
if
(
$sth
->
rows
){
$flag_found
=
1
;
}
}
print
"
got here
\n
";
if
(
$flag_found
==
0
){
$self
->
throw
("
Failed to delete dna for dna_id '
$dna_id
'
")
}
}
# Remove the contig
...
...
modules/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm
View file @
35729e7b
#
# EnsEMBL module for Bio::EnsEMBL::D
B
SQL::SequenceAdaptor
# EnsEMBL module for Bio::EnsEMBL::DSQL::SequenceAdaptor
#
# Cared for by Arne Stabenau <stabenau@ebi.ac.uk>
#
...
...
@@ -55,6 +55,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
a -1 means until the end
Arg [4] : int $strand
-1, 1 are possible values
Arg [5] : (optional) force read from compressed DNA table
Example : $dna = $seq_adp->fetch_by_RawContig_start_end_strand($contig, 1,
1000, -1);
Description: retrieves the dna string from the database from the
...
...
@@ -66,7 +67,7 @@ use Bio::EnsEMBL::DBSQL::BaseAdaptor;
=cut
sub
fetch_by_RawContig_start_end_strand
{
my
(
$self
,
$contig
,
$start
,
$end
,
$strand
)
=
@_
;
my
(
$self
,
$contig
,
$start
,
$end
,
$strand
,
$compressed
)
=
@_
;
my
$sth
;
if
(
$start
<
1
)
{
...
...
@@ -77,46 +78,125 @@ sub fetch_by_RawContig_start_end_strand {
$self
->
throw
("
contig arg must be contig not a [
$contig
]
");
}
my
$query
;
{
if
(
$end
==
-
1
)
{
my
$query
;
$query
=
"
SELECT c.length, SUBSTRING( d.sequence,
$start
)
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
if
(
$compressed
==
1
||
$self
->
is_compressed
){
# query for compressed mode, uses hex and a different range
# needs to getback dna_id as will have to check for Ns in dnan table
my
$start4
=
int
((
$start
-
1
)
/
4
)
+
1
;
}
else
{
my
$length
=
$end
-
$start
+
1
;
if
(
$length
<
1
)
{
$self
->
throw
(
"
Wrong parameters
"
);
}
$query
=
"
SELECT c.length, SUBSTRING( d.sequence,
$start
,
$length
)
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
}
if
(
$end
==
-
1
)
{
$query
=
"
SELECT c.length, HEX( SUBSTRING( d.sequence,
$start4
) ),
d.n_line
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
}
else
{
my
$end4
=
int
((
$end
-
1
)
/
4
)
+
1
;
my
$length4
=
$end4
-
$start4
+
1
;
if
(
$length4
<
1
)
{
$self
->
throw
(
"
Wrong parameters
"
);
}
$query
=
"
SELECT c.length, HEX( SUBSTRING( d.sequence,
$start4
,
$length4
) ),
d.n_line
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
}
}
else
{
if
(
$end
==
-
1
)
{
#use the dna db if defined
if
(
defined
$self
->
db
()
->
dnadb
()
)
{
$sth
=
$self
->
db
()
->
dnadb
()
->
prepare
(
$query
);
}
else
{
$sth
=
$self
->
prepare
(
$query
);
}
$query
=
"
SELECT c.length, SUBSTRING( d.sequence,
$start
)
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
}
else
{
my
$length
=
$end
-
$start
+
1
;
if
(
$length
<
1
)
{
$self
->
throw
(
"
Wrong parameters
"
);
}
$query
=
"
SELECT c.length, SUBSTRING( d.sequence,
$start
,
$length
)
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
}
}
$sth
->
execute
(
$contig
->
dbID
());
#use the dna db if defined
if
(
defined
$self
->
db
()
->
dnadb
()
)
{
$sth
=
$self
->
db
()
->
dnadb
()
->
prepare
(
$query
);
}
else
{
$sth
=
$self
->
prepare
(
$query
);
}
if
(
my
$aref
=
$sth
->
fetchrow_arrayref
()
)
{
my
(
$length
,
$seq
)
=
@$aref
;
$seq
=~
s/\s//g
;
if
(
$strand
==
-
1
)
{
return
$self
->
_reverse_comp
(
$seq
);
$sth
->
execute
(
$contig
->
dbID
());
if
(
my
$aref
=
$sth
->
fetchrow_arrayref
()
)
{
my
(
$length
,
$seq
,
$n_line
)
=
@$aref
;
my
$lenx
=
length
(
$seq
);
if
(
$compressed
||
$self
->
is_compressed
()){
$seq
=
$self
->
_dna_uncompress
(
$length
,
$n_line
,
$start
,
$end
,
\
$seq
);
}
$seq
=~
s/\s//g
;
if
(
$strand
==
-
1
)
{
return
$self
->
_reverse_comp
(
$seq
);
}
else
{
return
$seq
;
}
}
else
{
return
$seq
;
my
$flag
;
if
(
$self
->
is_compressed
==
0
){
# found nothing...perhaps in this entry is compressed - check
# and if present, set is_compressed
eval
{
$query
=
"
SELECT c.length, d.dna_id
FROM dnac d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
$sth
=
$self
->
prepare
(
$query
);
$sth
->
execute
(
$contig
->
dbID
());
if
(
my
$aref
=
$sth
->
fetchrow_arrayref
()
)
{
$flag
=
1
;
}
};
if
(
$flag
==
1
){
$self
->
is_compressed
(
1
);
print
"
Switched to compressed DNA reading mode
\n
";
redo
;
}
}
else
{
# found nothing...perhaps in this entry is uncompressed - check
# and if present, set is_compressed
eval
{
$query
=
"
SELECT c.length, d.dna_id
FROM dna d, contig c
WHERE d.dna_id = c.dna_id
AND c.contig_id = ?
";
$sth
=
$self
->
prepare
(
$query
);
$sth
->
execute
(
$contig
->
dbID
());
if
(
my
$aref
=
$sth
->
fetchrow_arrayref
()
)
{
$flag
=
1
;
}
};
if
(
$flag
==
1
){
$self
->
is_compressed
(
0
);
print
"
Switched to uncompressed DNA reading mode
\n
";
redo
;
}
}
return
undef
;
}
}
else
{
return
undef
;
}
}
...
...
@@ -264,6 +344,11 @@ sub store {
my
(
$self
,
$sequence
,
$date
)
=
@_
;
$sequence
=~
tr/atgcn/ATGCN/
;
# save compressed
if
(
$self
->
is_compressed
){
return
$self
->
store_compressed
(
$sequence
,
$date
);
}
my
$statement
=
$self
->
prepare
("
INSERT INTO dna(sequence,created)
...
...
@@ -284,6 +369,215 @@ sub store {
}
=head2 store_compressed
Arg [1] : string $sequence the dna sequence to be stored in the database
Arg [2] : string $date create date to be associated with the dna sequence
to be stored.
Arg [3] : dbid (optional)
Returntype : int
Exceptions : none
Caller : Bio::EnsEMBL::DBSQL::RawContigAdaptor::store_compressed,
Bio::EnsEMBL::DBSQL::RawContigAdaptor::store
=cut
sub
store_compressed
{
my
(
$self
,
$sequence
,
$date
,
$dbid
)
=
@_
;
# look for N's, save list of start and ends and convert to A's
my
$n_line
;
my
$len
=
length
(
$sequence
);
#print "DEBUG $sequence ($len)\n";
{
if
(
$sequence
=~
/^([^N]*)([N]+)(.*)$/
){
my
$l1
=
length
(
$
1
);
my
$l2
=
length
(
$
2
);
my
$start
=
$l1
+
1
;
my
$end
=
$l1
+
$l2
;
$sequence
=
$
1
.
('
A
'
x
$l2
)
.
$
3
;
$n_line
.=
"
"
if
$n_line
;
$n_line
.=
"
$start
-
$end
";
#print "DEBUG: Ns from $start-$end\n";
redo
;
}
}
#print "DEBUG $sequence\n";
# convert ACGT -> 00011011
my
$sequence_comp
=
$self
->
_dna_compress
(
$sequence
);
my
$id
;
if
(
$dbid
){
my
$statement
=
$self
->
prepare
("
INSERT INTO dnac(dna_id,sequence,created,n_line)
VALUES(?, ?, FROM_UNIXTIME(?), ?)
");
my
$rv
=
$statement
->
execute
(
$dbid
,
$sequence_comp
,
$date
,
$n_line
);
$self
->
throw
("
Failed to insert dna
$sequence
")
unless
$rv
;
$statement
->
finish
;
$id
=
$dbid
;
}
else
{
my
$statement
=
$self
->
prepare
("
INSERT INTO dnac(sequence,created,flag_dnan)
VALUES(?, FROM_UNIXTIME(?), ?)
");
my
$rv
=
$statement
->
execute
(
$sequence_comp
,
$date
,
$n_line
);
$self
->
throw
("
Failed to insert dna
$sequence
")
unless
$rv
;
$statement
->
finish
;
$statement
=
$self
->
prepare
("
SELECT last_insert_id()
");
$statement
->
execute
();
(
$id
)
=
$statement
->
fetchrow
();
$statement
->
finish
;
}
return
$id
;
}
=head2 _dna_compress
Arg 1 : txt $dna_sequence
Function : build hex representation of string (ACGT -> 1B)
Returntype: txt
Exceptions: none
Caller : private to this module
=cut
{
my
%table
=
(
'
AA
'
=>
'
0
',
'
AC
'
=>
'
1
',
'
AG
'
=>
'
2
',
'
AT
'
=>
'
3
',
'
CA
'
=>
'
4
',
'
CC
'
=>
'
5
',
'
CG
'
=>
'
6
',
'
CT
'
=>
'
7
',
'
GA
'
=>
'
8
',
'
GC
'
=>
'
9
',
'
GG
'
=>
'
A
',
'
GT
'
=>
'
B
',
'
TA
'
=>
'
C
',
'
TC
'
=>
'
D
',
'
TG
'
=>
'
E
',
'
TT
'
=>
'
F
',
);
sub
_dna_compress
{
my
$self
=
shift
;
my
$seq
=
shift
;
# length must be multiple of 4, so pad if necesary
my
$len
=
length
(
$seq
);
$seq
.=
'
A
'
x
(
$len
-
(
int
(
$len
/
4
))
*
4
);
#print "DEBUG $seq\n";
# process in blocks of 2 to do conversion
my
$seq_hex
;
my
@seq
=
split
(
//
,
$seq
);
while
(
my
$pair
=
shift
(
@seq
)){
$pair
.=
shift
(
@seq
);
$seq_hex
.=
$table
{
$pair
};
}
return
pack
("
H*
",
$seq_hex
);
}
}
=head2 _dna_uncompress
Arg [1] : int $length
Arg [2] : int $n_line
Arg [3] : int $start
Arg [4] : int $end
Arg [5] : reference to $seq (hex string)
Function : converts hex string from DB into ACGT with appropriate truncation
Returntype : string
Exceptions : none
Caller : private to this module
=cut
{
my
%table
=
(
'
0
'
=>
'
AA
',
'
1
'
=>
'
AC
',
'
2
'
=>
'
AG
',
'
3
'
=>
'
AT
',
'
4
'
=>
'
CA
',
'
5
'
=>
'
CC
',
'
6
'
=>
'
CG
',
'
7
'
=>
'
CT
',
'
8
'
=>
'
GA
',
'
9
'
=>
'
GC
',
'
A
'
=>
'
GG
',
'
B
'
=>
'
GT
',
'
C
'
=>
'
TA
',
'
D
'
=>
'
TC
',
'
E
'
=>
'
TG
',
'
F
'
=>
'
TT
',
);
sub
_dna_uncompress
{
my
(
$self
,
$length
,
$n_line
,
$start
,
$end
,
$rseq
)
=
@_
;
# convert sequence back to ACGT (already in Hex)
my
$seq2
=
join
('',
map
{
$table
{
$_
}}
split
(
//
,
$$rseq
));
# calculate extent of $seq2 and truncate
my
$start4
=
(
int
((
$start
-
1
)
/
4
)
+
1
)
*
4
-
3
;
my
$off
=
$start
-
$start4
;
my
$len
;
if
(
$end
==-
1
){
$end
=
$length
;
}
$len
=
$end
-
$start
+
1
;
#print "truncate: $start,$start4,$end,$len\n";
my
$seq
=
substr
(
$seq2
,
$off
,
$len
);
# mask with N's
foreach
my
$range
(
split
(
/ /
,
$n_line
)){
my
(
$st
,
$ed
)
=
split
(
/\-/
,
$range
);
#print "before: $st-$ed $start-$end\n";
# check in range
next
if
(
$ed
<
$start
||
$st
>
$end
);
$ed
=
$end
if
$ed
>
$end
;
$st
=
$start
if
$st
<
$start
;
#print "after: $st-$ed\n";
my
$len
=
$ed
-
$st
+
1
;
$st
-=
$start
;
substr
(
$seq
,
$st
,
$len
)
=
'
N
'
x$len
;
}
return
$seq
;
}
}
=head2 is_compressed
Arg [1] : (optional) flag
to set or unset
Example : $adaptor->is_compressed(1)
Description: Getter/Setter for whether compressed DNA should be read/written
Returntype : true/false
Exceptions : none
Caller : Adaptors inherited from BaseAdaptor
=cut
sub
is_compressed
{
my
(
$obj
,
$flag
)
=
@_
;
if
(
defined
$flag
)
{
$obj
->
{'
_is_truncated
'}
=
$flag
?
1
:
0
;
}
return
$obj
->
{'
_is_truncated
'};
}
=head2 _reverse_comp
...
...
scripts/dna_compress.pl
0 → 100755
View file @
35729e7b
#!/usr/local/bin/perl -- # -*-Perl-*-
#
# Copyright (c) 2003 Tim Hubbard (th@sanger.ac.uk)
# Sanger Centre, Wellcome Trust Genome Campus, Cambs, UK
# All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation
#
# $Header$
# This is a driver script to populate and test the experimental dnac
# (compressed dna) table of ensembl. Use -T for pod based tutorial
use
strict
;
use
Getopt::
Std
;
use
vars
qw($opt_h $opt_T $opt_i
$opt_U $opt_D $opt_P $opt_H $opt_p
$opt_s $opt_c $opt_C $opt_l $opt_d)
;
use
Bio::EnsEMBL::DBSQL::
DBAdaptor
;
getopts
("
hTi:U:D:P:H:p:s:cCl:d
");
$|
=
1
;
# specify defaults
my
$def_U
=
'
ensadmin
';
my
$def_D
=
'
testdna
';
my
$def_P
=
'
3307
';
my
$def_H
=
'
127.0.0.1
';
if
(
$opt_h
){
&help
;
}
elsif
(
$opt_T
){
&help2
;
}
sub
help
{
print
<<ENDHELP;
dna_compress.pl
-h for help
-s char string of DNA (for writing)
-l char label of clone, contig (for writing)
-c convert
-C compare
-i id clone_dbid (for comparing, converting, extracting DNA)
-d delete clone
-U user ensembl db user [$def_U]
-D db ensembl db [$def_D]
-P port port of mysql [$def_P]
-H host host of mysql [$def_H]
-p pass passwd for mysqluser
ENDHELP
exit
0
;
}
sub
help2
{
exec
('
perldoc
',
$
0
);
}
# defaults or options
$opt_U
=
$def_U
unless
$opt_U
;
$opt_D
=
$def_D
unless
$opt_D
;
$opt_P
=
$def_P
unless
$opt_P
;
$opt_H
=
$def_H
unless
$opt_H
;
# db connection
my
$db
=
new
Bio::EnsEMBL::DBSQL::
DBAdaptor
(
-
host
=>
$opt_H
,
-
user
=>
$opt_U
,
-
port
=>
$opt_P
,
-
dbname
=>
$opt_D
,
-
pass
=>
$opt_p
);
my
$seq_apt
=
$db
->
get_SequenceAdaptor
;
my
$contig_apt
=
$db
->
get_RawContigAdaptor
;
my
$clone_apt
=
$db
->
get_CloneAdaptor
;
if
(
$opt_C
||
$opt_c
){
# convert dna of sequences (loop over clone->contig->dna table,
# inserting into dnac table when no entry is already present)
# OR
# compare raw and compressed sequences
my
$clones
=
$clone_apt
->
fetch_all
();
my
$nd
=
0
;
my
$ns
=
0
;
foreach
my
$clone
(
@$clones
){
my
$clone_dbid
=
$clone
->
dbID
;
next
if
(
$opt_i
&&
$opt_i
!=
$clone_dbid
);
my
$clone_id
=
$clone
->
id
;
my
$created
=
$clone
->
created
();
print
"
Processing Clone
$clone_id
(
$clone_dbid
)
$created
\n
";
my
$contigs
=
$clone
->
get_all_Contigs
();
foreach
my
$contig
(
@$contigs
){
my
$contig_dbid
=
$contig
->
dbID
;
my
$contig_id
=
$clone
->
id
;
my
$seq
=
$seq_apt
->
fetch_by_RawContig_start_end_strand
(
$contig
,
1
,
-
1
,
1
);
my
$len
=
length
(
$seq
);
my
$dna_id
;
# custom sql is required to get dna_id and check if there is a compressed id
my
$query
=
"
SELECT c.dna_id FROM contig c WHERE c.contig_id = ?
";
my
$sth
=
$contig_apt
->
prepare
(
$query
);
$sth
->
execute
(
$contig_dbid
);
if
(
my
$aref
=
$sth
->
fetchrow_arrayref
()){
(
$dna_id
)
=
@$aref
;
}
else
{
$contig_apt
->
thrown
("
No dna_id for Contig
$contig_id
");
}
print
"
Processing Contig
$contig_id
(
$contig_dbid
) [
$len
]
$dna_id
\n
";
my
$flag_compressed_exists
=
0
;
$query
=
"
SELECT d.dna_id FROM dnac d WHERE d.dna_id = ?
";
$sth
=
$seq_apt
->
prepare
(
$query
);
$sth
->
execute
(
$dna_id
);
if
(
my
$aref
=
$sth
->
fetchrow_arrayref
()){
$flag_compressed_exists
=
1
;
}
# convert
if
(
$opt_c
){
if
(
$flag_compressed_exists
){
print
"
Compressed DNA entry already exists for dna_id
$dna_id
\n
";
next
;
}
# write compressed DNA record
my
$id
=
$seq_apt
->
store_compressed
(
$seq
,
$created
,
$dna_id
);
print
"
Created compressed DNA entry
\n
";
}
elsif
(
$opt_C
){
if
(
!
$flag_compressed_exists
){
print
"
Compressed DNA entry missing for dna_id
$dna_id
\n
";
next
;
}
my
$seq2
=
$seq_apt
->
fetch_by_RawContig_start_end_strand
(
$contig
,
1
,
-
1
,
1
,
1
);
if
(
$seq
ne
$seq2
){
print
"
DIFFERENT
\n
";
$nd
++
;
}
else
{
print
"
Same
\n
";
$ns
++
;
}
my
$len2
=
length
(
$seq2
);
my
$lenx
=
30
;
if
(
$len
<
$lenx
){
$lenx
=
$len
;
print
substr
(
$seq
,
0
,
$lenx
)
.
"
\n
"
.
substr
(
$seq2
,
0
,
$lenx
)
.
"
\n\n
";
}
else
{
print
substr
(
$seq
,
0
,
$lenx
)
.
"
..
"
.
substr
(
$seq
,
$len
-
$lenx
,
$len
)
.
"
\n
";
print
substr
(
$seq2
,
0
,
$lenx
)
.
"
..
"
.
substr
(
$seq2
,
$len
-
$lenx
,
$len2
)
.
"
\n\n
";
}
# now test a subsequence
my
$lenr
=
$len
-
1
;