Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Open sidebar
ensembl-gh-mirror
ensembl
Commits
bed40a14
Commit
bed40a14
authored
Jan 19, 2018
by
premanand17
Browse files
All updates to ProteinFeature to supoort GIFTS IMPORT
parent
16a9f646
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1245 additions
and
22 deletions
+1245
-22
misc-scripts/gifts_import/populate_protein_feature.pl
misc-scripts/gifts_import/populate_protein_feature.pl
+350
-0
modules/Bio/EnsEMBL/BaseAlignFeature.pm
modules/Bio/EnsEMBL/BaseAlignFeature.pm
+302
-3
modules/Bio/EnsEMBL/DBSQL/ProteinFeatureAdaptor.pm
modules/Bio/EnsEMBL/DBSQL/ProteinFeatureAdaptor.pm
+152
-14
modules/Bio/EnsEMBL/ProteinFeature.pm
modules/Bio/EnsEMBL/ProteinFeature.pm
+73
-3
modules/t/baseAlignFeature.t
modules/t/baseAlignFeature.t
+135
-0
modules/t/proteinFeature.t
modules/t/proteinFeature.t
+138
-1
modules/t/proteinFeatureAdaptor.t
modules/t/proteinFeatureAdaptor.t
+93
-1
modules/t/test-genome-DBs/homo_sapiens/core/analysis.txt
modules/t/test-genome-DBs/homo_sapiens/core/analysis.txt
+1
-0
modules/t/test-genome-DBs/homo_sapiens/core/protein_feature.txt
...s/t/test-genome-DBs/homo_sapiens/core/protein_feature.txt
+1
-0
No files found.
misc-scripts/gifts_import/populate_protein_feature.pl
0 → 100644
View file @
bed40a14
#!/usr/bin/env perl
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2018] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#The script populates the CORE db's protein_feature table with all the uniprot mappings found in GIFTS db's ensp_u_cigar table.
use
strict
;
use
warnings
;
use
DBI
qw( :sql_types )
;
use
Getopt::
Long
;
use
Bio::EnsEMBL::
Registry
;
use
Data::
Dumper
;
my
(
$gfhost
,
$gfport
,
$gfuser
,
$gfpass
,
$gfdb
);
my
(
$pfhost
,
$pfport
,
$pfuser
,
$pfpass
,
$pfdb
);
my
$create_index
=
0
;
my
$dryrun
=
0
;
#my $limit = 10;
my
$limit
=
0
;
GetOptions
(
"
gfhost=s
"
=>
\
$gfhost
,
"
gfport=i
"
=>
\
$gfport
,
"
gfuser=s
"
=>
\
$gfuser
,
"
gfpass=s
"
=>
\
$gfpass
,
"
gfdb=s
"
=>
\
$gfdb
,
"
pfhost=s
"
=>
\
$pfhost
,
"
pfport=i
"
=>
\
$pfport
,
"
pfuser=s
"
=>
\
$pfuser
,
"
pfpass=s
"
=>
\
$pfpass
,
"
pfdb=s
"
=>
\
$pfdb
,
"
dryrun!
"
=>
\
$dryrun
,
"
help
",
\
&usage
,
);
usage
()
if
(
!
defined
$gfhost
||
!
defined
$gfport
||
!
defined
$gfuser
||
!
defined
$gfdb
);
usage
()
if
(
!
defined
$pfhost
||
!
defined
$pfport
||
!
defined
$pfuser
||
!
defined
$pfpass
);
my
(
$giftsDB
,
$protfeatureDB
)
=
init_db
();
die
unless
defined
(
$giftsDB
)
and
defined
(
$protfeatureDB
);
print
"
giftsDB credentials:
\n
";
print
Dumper
(
$giftsDB
);
print
"
protfeatureDB credentials:
\n
";
print
Dumper
(
$protfeatureDB
);
my
$registry
=
get_registry
();
my
$translation_adaptor
=
$registry
->
get_adaptor
(
'
human
',
'
core
',
'
translation
'
);
sub
init_db
{
# This is where the gifts info will be read from
my
$giftsDB
=
{
user
=>
$gfuser
,
pass
=>
$gfpass
,
host
=>
$gfhost
,
port
=>
$gfport
,
dbname
=>
$gfdb
};
# This is where the gifts info will be written to
my
$protfeatureDB
=
{
user
=>
$pfuser
,
pass
=>
$pfpass
,
host
=>
$pfhost
,
port
=>
$pfport
,
dbname
=>
$pfdb
};
return
(
$giftsDB
,
$protfeatureDB
);
}
populate_protein_feature_db
();
sub
get_analysis_id
{
my
(
$logic_name
)
=
@_
;
my
$analysis_id
;
my
$dba
=
get_dba
(
$protfeatureDB
);
my
$analysis_dba
=
$dba
->
get_AnalysisAdaptor
();
my
$analysis
=
$analysis_dba
->
fetch_by_logic_name
(
$logic_name
);
$analysis_id
=
$analysis
->
dbID
()
if
defined
$analysis
;
return
$analysis_id
;
}
sub
populate_protein_feature_db
{
my
$gifts_dbh
=
db_connect
(
$giftsDB
);
my
$protfeature_dbh
=
db_connect
(
$protfeatureDB
);
if
(
$gifts_dbh
){
print
"
Got connection to GiftsDB
\n
";
}
if
(
$protfeature_dbh
){
print
"
Got connection to ProtfeatureDB
\n
";
}
my
$insert_container
=
build_insert_sql
(
$gifts_dbh
,
$protfeature_dbh
);
my
$rows_inserted
=
0
;
my
@failed_stmts
=
[]
;
unless
(
$dryrun
){
foreach
my
$insert_stmt
(
@$insert_container
)
{
eval
{
my
$affected
=
$protfeature_dbh
->
do
(
$insert_stmt
);
$rows_inserted
+=
$affected
;
};
if
(
$@
){
print
"
$@
\n
";
push
(
@failed_stmts
,
$@
);
}
}
print
Dumper
(
\
@failed_stmts
);
print
"
INSERT END
\n
";
}
print
"
Number of rows inserted into protein_feature
$protfeatureDB
->{'dbname'}.protein_feature
",
$rows_inserted
,
"
\n
";
return
$rows_inserted
;
$gifts_dbh
->
disconnect
();
$protfeature_dbh
->
disconnect
();
}
sub
annotate_ref
{
my
$ref
=
shift
;
my
$ensp_id
=
$ref
->
[
0
];
my
$ensp
=
$translation_adaptor
->
fetch_by_stable_id
(
$ensp_id
)
;
if
(
$ensp
){
$ref
->
[
0
]
=
$ensp
->
dbID
;
$ref
->
[
1
]
=
1
;
$ref
->
[
2
]
=
$ensp
->
length
()
-
1
;
}
return
$ref
;
}
sub
build_insert_sql
{
my
(
$gifts_dbh
,
$protfeature_dbh
)
=
@_
;
#get analysis id
my
$analysis_id
=
get_analysis_id
("
gifts_import
");
print
("
Analysis id
$analysis_id
\n
");
my
$select_sql
=
qq{SELECT DISTINCT ensp_id, 0, 0, uniprot_acc, $analysis_id as analysis_id, mdz, 'mdtag' FROM ensp_u_cigar}
;
$select_sql
=
$limit
?
$select_sql
.
"
limit 10
"
:
$select_sql
;
my
$insert_sql
=
qq{INSERT INTO $protfeatureDB->{'dbname'}.protein_feature(translation_id, seq_start, seq_end, hit_name, analysis_id, cigar_line, align_type) VALUES }
;
my
$import_sth
=
$gifts_dbh
->
prepare
(
$select_sql
);
$import_sth
->
execute
;
my
$rows
=
$import_sth
->
rows
;
my
$start
=
0
;
my
$insert_values
;
my
$max_rows_in_insert
=
9999
;
#Batch size of 10,000
# my $max_rows_in_insert = 10; #Batch size of 10
my
$rows_in_insert
=
0
;
my
@insert_container
=
();
my
$miss
=
1
;
while
(
my
$ref
=
$import_sth
->
fetchrow_arrayref
)
{
my
$not_ensp
=
1
;
if
(
$ref
->
[
0
]
!~
/^ENSP.*/
){
warn
"
\t\t
$not_ensp
ensp id not in right format for
",
$ref
->
[
0
],
"
Uniprot id
",
$ref
->
[
3
],
"
\n
";
$not_ensp
++
;
next
;
}
$ref
=
annotate_ref
(
$ref
);
if
(
$ref
->
[
0
]
=~
/^ENSP.*/
){
warn
"
$miss
No translation record found in core translation for
",
$ref
->
[
0
],
"
Uniprot id
",
$ref
->
[
3
],
"
\n
";
$miss
++
;
next
;
}
if
(
$rows_in_insert
<
$max_rows_in_insert
)
{
$insert_values
.=
'
,
'
if
$start
++
;
$insert_values
.=
'
(
'
.
(
join
(
"
,
",
map
{
$gifts_dbh
->
quote
(
$_
)
}
@
{
$ref
}
)
)
.
'
)
';
$rows_in_insert
++
;
}
else
{
$insert_values
.=
'
,
'
if
$start
++
;
$insert_values
.=
'
(
'
.
(
join
(
"
,
",
map
{
$gifts_dbh
->
quote
(
$_
)
}
@
{
$ref
}
)
)
.
'
)
';
my
$insert_sql_with_values
=
$insert_sql
.
$insert_values
;
push
(
@insert_container
,
$insert_sql_with_values
);
$insert_values
=
undef
;
$rows_in_insert
=
0
;
$start
=
0
;
}
}
if
(
$insert_values
)
{
my
$insert_sql_with_values
=
$insert_sql
.
$insert_values
;
push
(
@insert_container
,
$insert_sql_with_values
);
}
print
"
Number of rows fetched from GIFTS
$rows
\n
";
return
\
@insert_container
;
}
#===========DB Connection routines=============
sub
db_connect
{
my
(
$connectDB
)
=
@_
;
my
$dbname
=
$connectDB
->
{'
dbname
'};
my
$host
=
$connectDB
->
{'
host
'};
my
$user
=
$connectDB
->
{'
user
'};
my
$pass
=
$connectDB
->
{'
pass
'};
my
$port
=
$connectDB
->
{'
port
'};
my
$dsn
=
"
DBI:mysql:host=
$host
;
";
if
(
$port
)
{
$dsn
.=
"
port=
$port
;
";
}
$dsn
.=
"
database=
$dbname
";
my
$dbh
=
DBI
->
connect
(
$dsn
,
$user
,
$pass
,
{
'
PrintError
'
=>
1
,
'
RaiseError
'
=>
1
}
);
if
(
!
$dbh
)
{
die
"
ERROR:
$DBI
::errstr
";
}
return
$dbh
;
}
sub
get_dba
{
my
(
$connectDB
)
=
@_
;
my
$dbname
=
$connectDB
->
{'
dbname
'};
my
$host
=
$connectDB
->
{'
host
'};
my
$user
=
$connectDB
->
{'
user
'};
my
$pass
=
$connectDB
->
{'
pass
'};
my
$port
=
$connectDB
->
{'
port
'};
my
$dba
=
new
Bio::EnsEMBL::DBSQL::
DBAdaptor
(
-
host
=>
$host
,
-
user
=>
$user
,
-
dbname
=>
$dbname
,
-
pass
=>
$pass
,
-
port
=>
$port
);
return
$dba
;
}
sub
get_registry
{
my
$registry
=
"
Bio::EnsEMBL::Registry
";
$registry
->
load_registry_from_db
(
-
host
=>
$protfeatureDB
->
{'
host
'},
-
user
=>
$protfeatureDB
->
{'
user
'},
-
pass
=>
$protfeatureDB
->
{'
pass
'},
-
port
=>
$protfeatureDB
->
{'
port
'}
);
return
$registry
;
}
sub
usage
{
my
$indent
=
'
'
x
length
(
$
0
);
print
<<
EOF
;
exit
(
0
);
The
script
populates
the
CORE
db
'
s protein_feature table with all the uniprot mappings found in GIFTS db
'
s ensp_u_cigar table.
Options -gfhost -gfport -gfuser
-
gfpass
-
gfdb
are
mandatory
and
specify
the
credentials
for
the
server
on
which
a
GIFTS
database
exists
Options
-
pfhost
-
pfport
-
pfuser
-
pfpass
-
pfdb
are
credentials
of
the
core
db
where
the
GIFTS
data
will
be
copied
into
To
run
the
script
cd
into
the
directory
where
the
script
lives
and
run
the
script:
cd
ensembl
/misc-scripts/gi
fts_import
/
eg:
(
with
-
dryrun
flag
won
'
t insert)
perl populate_protein_feature.pl -gfhost localhost -gfport 3306 -gfuser prem -gfpass prem -gfdb carlos_ensembl_gifts -pfhost localhost -pfport 3306 -pfuser prem -pfpass prem -pfdb homo_sapiens_core_91_38 -dryrun
(without -dryrun flag creates a full version)
perl populate_protein_feature.pl -gfhost localhost -gfport 3306 -gfuser prem -gfpass prem -gfdb carlos_ensembl_gifts -pfhost localhost -pfport 3306 -pfuser prem -pfpass prem -pfdb homo_sapiens_core_91_38
Usage:
$0 -gfhost giftsdb_host -gfport giftsdb_port -gfuser giftsdb_user -gfpass giftsdb_pass -gfdb giftsdb_dbname -pfhost protfeatdb_host -pfport protfeatdb_port -pfuser protfeatdb_user -pfpass protfeatdb_pass -pfdb protfeatdb_dbname
$indent [-help]
-gfhost GIFTS database host
-gfport GIFTS database port
-gfuser GIFTS database user
-gfpass GIFTS database password
-gfdb GIFTS database name
-pfhost core database host
-pfport core database port
-pfuser core database user
-pfpass core database pass
-pfdb core database name
-dryrun dryrun, no insert
-help This message
EOF
}
modules/Bio/EnsEMBL/BaseAlignFeature.pm
View file @
bed40a14
...
...
@@ -270,6 +270,8 @@ sub alignment_length {
if
(
$self
->
{'
align_type
'}
eq
'
ensembl
')
{
return
$self
->
_ensembl_cigar_alignment_length
();
}
elsif
(
$self
->
{'
align_type
'}
eq
'
mdtag
')
{
return
$self
->
_mdtag_alignment_length
();
}
else
{
throw
("
No alignment_length method available for
"
.
$self
->
{'
align_type
'});
}
...
...
@@ -513,7 +515,6 @@ sub _parse_ensembl_cigar {
throw
("
No cigar string defined in object
")
if
(
!
defined
(
$string
));
my
@pieces
=
(
$string
=~
/(\d*[MDI])/g
);
#print "cigar: ",join ( ",", @pieces ),"\n";
my
@features
;
my
$strand1
=
$self
->
{'
strand
'}
||
1
;
...
...
@@ -603,7 +604,7 @@ sub _parse_ensembl_cigar {
-
PERCENT_ID
=>
$self
->
{'
percent_id
'},
-
ANALYSIS
=>
$self
->
{'
analysis
'},
-
P_VALUE
=>
$self
->
{'
p_value
'},
-
EXTERNAL_DB_ID
=>
$self
->
{'
external_db_id
'},
-
EXTERNAL_DB_ID
=>
$self
->
{'
external_db_id
'},
-
HCOVERAGE
=>
$self
->
{'
hcoverage
'},
-
GROUP_ID
=>
$self
->
{'
group_id
'},
-
LEVEL_ID
=>
$self
->
{'
level_id
'});
...
...
@@ -639,9 +640,11 @@ sub _parse_ensembl_cigar {
sub
_parse_cigar
{
my
$self
=
shift
;
if
(
$self
->
{'
align_type
'}
eq
'
ensembl
')
{
return
$self
->
_parse_ensembl_cigar
();
}
else
{
}
else
{
throw
("
No parsing method implemented for
"
.
$self
->
{'
align_type
'});
}
}
...
...
@@ -1009,7 +1012,303 @@ sub _query_unit {
throw
(
"
Abstract method call!
"
);
}
=head2 _mdtag_alignment_length
Arg [1] : None
Description: return the alignment length (including indels) based on the mdtag (mdz) string
Returntype : int
Exceptions : none
Caller : internal
Status : Stable
=cut
sub
_mdtag_alignment_length
{
my
$self
=
shift
;
if
(
!
defined
$self
->
{'
_alignment_length
'}
&&
defined
$self
->
cigar_string
)
{
my
$mdz_string
=
$self
->
cigar_string
;
my
$chunks
=
$self
->
_get_mdz_chunks
(
$mdz_string
);
my
$alignment_length
=
0
;
$alignment_length
=
$self
->
_get_mdz_alignment_length
(
$chunks
);
$self
->
{'
_alignment_length
'}
=
$alignment_length
;
}
return
$self
->
{'
_alignment_length
'};
}
=head2 _get_mdz_chunks
Arg [1] : mdz string
Description: parses the mdz string and group it according the type
Returntype : array of strings
Exceptions : none
Caller : internal
Status : Stable
=cut
sub
_get_mdz_chunks
{
my
$self
=
shift
;
my
$mdz_string
=
shift
;
my
$alignment_length
=
0
;
my
@chunks
;
if
(
$mdz_string
=~
/^MD:Z:(.*)/
){
my
$mdtag
=
$
1
;
#if start to end is a number then all match return as it is
if
(
$mdtag
=~
/^\d+$/
){
return
[
$mdtag
];
}
else
{
my
@char_arr
=
split
('',
$mdtag
);
for
(
my
$i
=
0
;
$i
<
scalar
(
@char_arr
);
$i
++
){
my
$char
=
$char_arr
[
$i
];
#determine the type and next type
my
$type
=
$self
->
_get_mdz_chunk_type
(
$char
);
my
$next_char
=
$char_arr
[
++
$i
];
my
$next_type
=
$self
->
_get_mdz_chunk_type
(
$next_char
);
my
$chunk
=
$char
;
#group chars in mdz based on the type, break if it differenc and push to chunks
while
(
$type
eq
$next_type
and
$i
<=
$#char_arr
){
$chunk
=
$chunk
.
$next_char
;
$next_char
=
$char_arr
[
++
$i
];
$next_type
=
$self
->
_get_mdz_chunk_type
(
$next_char
);
}
if
(
$i
==
$#char_arr
){
if
(
$type
eq
$next_type
){
$chunk
=
$chunk
.
$next_char
;
push
(
@chunks
,
$chunk
);
}
else
{
push
(
@chunks
,
$chunk
);
}
}
else
{
push
(
@chunks
,
$chunk
);
}
--
$i
;
}
}
}
return
\
@chunks
;
}
=head2 _get_mdz_alignment_length
Arg [1] : array of strings
Description: calculate the alignment length from the given chunks
Returntype : array of strings
Exceptions : none
Caller : internal
Status : Stable
=cut
sub
_get_mdz_alignment_length
{
my
$self
=
shift
;
my
$chunks
=
shift
;
my
$length
=
0
;
for
(
my
$i
=
0
;
$i
<
scalar
(
@$chunks
);
$i
++
){
my
$chunk
=
$$chunks
[
$i
];
my
$type
=
$self
->
_get_mdz_chunk_type
(
$chunk
);
if
(
$type
eq
'
num
'){
$length
=
$length
+
$chunk
;
}
elsif
(
$type
eq
'
alpha
'){
$length
=
$length
+
length
(
$chunk
);
}
elsif
(
$type
eq
'
del
'){
#skip next chunk
$i
++
;
}
}
return
$length
;
}
=head2 _get_mdz_chunk_type
Arg [1] : char
Description: get the chunk type
Returntype : string
Exceptions : none
Caller : internal
Status : Stable
=cut
sub
_get_mdz_chunk_type
{
my
$self
=
shift
;
my
$char
=
shift
;
my
$type
;
if
(
$char
eq
'
^
'){
$type
=
"
del
";
}
elsif
(
$char
=~
/\d+/
){
$type
=
'
num
';
}
elsif
(
$char
=~
/\w+/
){
$type
=
'
alpha
';
}
return
$type
;
}
=head2 alignment_strings
Arg [1] : list of string $flags
Example : $pf->alignment_strings or
Description: Allows to rebuild the alignment string of both the seq and hseq sequence
using the mdz_string information and the translation object
Returntype : array reference containing 2 strings
the first corresponds to seq
the second corresponds to hseq
Exceptions : none
Caller : general
Status : Stable
=cut
sub
alignment_strings
{
my
$self
=
shift
;
#Translations
my
$transl_adaptor
=
$self
->
adaptor
->
db
->
get_TranslationAdaptor
();
my
$transl_object
=
$transl_adaptor
->
fetch_by_dbID
(
$self
->
translation_id
);
my
$seq
;
if
(
defined
$transl_object
&&
$transl_object
->
isa
('
Bio::EnsEMBL::Translation
'))
{
$seq
=
$transl_object
->
transcript
()
->
translate
()
->
seq
();
}
if
(
$self
->
align_type
eq
'
mdtag
')
{
if
(
defined
$seq
&&
defined
$self
->
cigar_string
){
return
$self
->
_mdz_alignment_string
(
$seq
,
$self
->
cigar_string
);
}
else
{
warn
"
sequence or cigar_line not found for
"
.
$self
->
translation_id
;
}
}
else
{
throw
("
alignment_strings method not implemented for
"
.
$self
->
align_type
);
}
return
undef
;
}
=head2 _mdz_alignment_string
Arg [1] : input sequence
Arg [2] : mdz cigar string
Example : $pf->alignment_strings or
Description: Allows to rebuild the alignment string of both the seq and hseq sequence
Returntype : array reference containing 2 strings
the first corresponds to seq
the second corresponds to hseq
Exceptions : none
Caller : general
Status : Stable
=cut
# Try to get the sequence from translation object
# mdz_string from current object
sub
_mdz_alignment_string
{
my
(
$self
,
$input_seq
,
$mdz_string
)
=
@_
;
my
$chunks
=
$self
->
_get_mdz_chunks
(
$mdz_string
);
my
$target_seq
=
"";
my
$query_seq
=
"";