Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
3c4fb39f
Commit
3c4fb39f
authored
21 years ago
by
Abel Ureta-Vidal
Browse files
Options
Downloads
Patches
Plain Diff
removed from here, moved to ensembl/misc-scripts
parent
18e38f73
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
scripts/gene-descriptions.pl
+0
-394
0 additions, 394 deletions
scripts/gene-descriptions.pl
with
0 additions
and
394 deletions
scripts/gene-descriptions.pl
deleted
100755 → 0
+
0
−
394
View file @
18e38f73
#!/usr/local/bin/perl -w
# $Header$
#
# Script to create a tab-separated file (to be loaded into the 'gene_description' table)
# from protein database file(s) either SWISSPROT/SPTREMBL or RefSeq and
# the protein mapping information stored in the currently used Ensembl core database.
#
# When file is created and checked for "junk" data, the same script is used to load the
# corresponding data to the Ensembl core database in 'gene_description' table
# EXIT STATUS
# 0 all is fine
# 1 Problem in parsing protein database file
# 2 Problem when loading data
# 3 Gene_description_file does not exist
# 4 Problem with an Acc without description
use
strict
;
use
Bio::EnsEMBL::DBSQL::
DBAdaptor
;
use
Getopt::
Long
;
# db_name used in the Ensembl core database (to be checked in table externalDB) and
# used in the protein mapping.
my
%protein_db_naming
=
('
swissprot
'
=>
"
SWISSPROT
",
'
sptrembl
'
=>
"
SPTREMBL
",
'
refseq
'
=>
"
RefSeq
");
my
@databases_used_for_guessing_descriptions
;
## list of regexps tested, in order of increasing preference, when
## deciding which description to use (used by sub compare_desc):
my
@word_order
=
qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein)
;
## Getting parameters/options and usage
my
$usage
=
"
Usage: $0 -h host
\\
-u dbuser
\\
-d dbname protein_database_file(s) > gene-descriptions.tab
where protein_database_file could SWISSPROT/SPTREMBL entries in SWISSPROT-like format
and/or
RefSeq entries in gnp-like format.
OR to load the data from gene-descriptions.tab file to
\
'gene-description
\
' table
$0 -h host
\\
-u dbuser
\\
-d dbname
\\
-p password
\\
-load gene-descriptions.tab
\n
";
my
$help
=
0
;
my
(
$host
,
$dbname
,
$dbuser
,
$dbpass
,
$gene_description_file
);
unless
(
GetOptions
('
help
'
=>
\
$help
,
'
h=s
'
=>
\
$host
,
'
d=s
'
=>
\
$dbname
,
'
u=s
'
=>
\
$dbuser
,
'
p=s
'
=>
\
$dbpass
,
'
load=s
'
=>
\
$gene_description_file
))
{
die
$usage
;
}
if
(
$help
)
{
print
$usage
;
exit
0
;
}
## Some checks before starting
if
(
!
defined
$host
)
{
warn
"
\n
Must specify a host with -h
\n
";
die
$usage
;
}
elsif
(
!
defined
$dbname
)
{
warn
"
\n
Must specify a database with -d
\n
";
die
$usage
;
}
elsif
(
!
defined
$dbuser
)
{
warn
"
\n
Must specify a database username with -u
\n
";
die
$usage
;
}
if
(
defined
$gene_description_file
||
defined
$dbpass
)
{
if
(
!
defined
$dbpass
)
{
warn
"
\n
Must specify a database password with -p to load your data
\n
";
die
$usage
;
}
elsif
(
!
defined
$gene_description_file
)
{
warn
"
Assuming you want to load your data in the database...
If so you must specify a path for your gene-descriptions.tab file with -load.
If not -p is not needed
\n
";
die
$usage
;
}
}
elsif
(
scalar
@ARGV
==
0
)
{
warn
"
\n
Must specify an protein database input file
\n
";
die
$usage
;
}
## Loading data to database if requested and exit
if
(
defined
$gene_description_file
)
{
if
(
load_data
(
$gene_description_file
,
$host
,
$dbname
,
$dbuser
,
$dbpass
)
==
1
)
{
exit
0
;
}
else
{
warn
"
Problem when loading data
exit 2
";
exit
2
;
}
}
## Parsing of protein database flat files(Swissprot/SPTrEMBL/RefSeq) used in protein mapping
my
%sp_desc
;
while
(
defined
(
my
$protein_database_file
=
shift
@ARGV
))
{
unless
(
parse_protein_database
(
$protein_database_file
,
\
%protein_db_naming
,
\
%sp_desc
)
==
1
)
{
warn
"
Problem in parsing protein database file :
$protein_database_file
exit 1
";
exit
1
;
}
}
## Connecting to the Ensembl database where protein mapping is stored
my
$db
=
new
Bio::EnsEMBL::DBSQL::
DBAdaptor
(
-
host
=>
$host
,
-
dbname
=>
$dbname
,
-
user
=>
$dbuser
);
foreach
my
$db
(
@databases_used_for_guessing_descriptions
)
{
$db
=
"
'
"
.
$db
.
"
'
";
}
my
$db_query
=
join
"
,
",
@databases_used_for_guessing_descriptions
;
my
$sth
=
$db
->
prepare
("
SELECT
tsc.translation_id,tsc.gene_id,xdb.db_name,x.dbprimary_id,ix.query_identity,ix.target_identity
FROM
transcript tsc,objectXref ox,Xref x,externalDB xdb,identityXref ix
WHERE
tsc.translation_id = ox.ensembl_id AND
ox.xrefId = x.xrefId AND
x.externalDBId = xdb.externalDBId AND
xdb.db_name in (
$db_query
) AND
ox.objectxrefId = ix.objectxrefId
ORDER BY
tsc.gene_id asc,
xdb.db_name desc,
x.dbprimary_id asc
");
unless
(
$sth
->
execute
())
{
$db
->
throw
("
Failed execution of a select query
");
}
## Assigning the descriptions to Ensembl genes.
my
%gene_desc
;
while
(
my
(
$ensp
,
$ensg
,
$db
,
$acc
,
$qy_percid
,
$tg_percid
)
=
$sth
->
fetchrow_array
())
{
my
$desc
;
if
(
defined
$gene_desc
{
$ensg
})
{
my
(
$prevdb
,
$prev_desc
,
$prev_acc
,
$prev_qy_percid
,
$prev_tg_percid
)
=
@
{
$gene_desc
{
$ensg
}};
if
(
$prevdb
eq
$db
&&
(
$db
eq
$protein_db_naming
{'
swissprot
'}
||
$db
eq
$protein_db_naming
{'
refseq
'}))
{
if
(
$prev_qy_percid
<
$qy_percid
)
{
$desc
=
$sp_desc
{"
$db
:
$acc
"};
$gene_desc
{
$ensg
}
=
[
$db
,
$desc
,
$acc
,
$qy_percid
,
$tg_percid
];
# taking better SWISSPROT/RefSeq desc.
next
;
}
elsif
(
$prev_qy_percid
==
$qy_percid
&&
$prev_tg_percid
<
$tg_percid
)
{
$desc
=
$sp_desc
{"
$db
:
$acc
"};
$gene_desc
{
$ensg
}
=
[
$db
,
$desc
,
$acc
,
$qy_percid
,
$tg_percid
];
# taking better SWISSPROT/RefSeq desc.
next
;
}
else
{
next
;
# nothing to change
}
}
if
(
$db
eq
$protein_db_naming
{'
swissprot
'})
{
$desc
=
$sp_desc
{"
$db
:
$acc
"};
$gene_desc
{
$ensg
}
=
[
$db
,
$desc
,
$acc
,
$qy_percid
,
$tg_percid
];
# kick out the SPTREMBL desc.
next
;
}
if
(
$db
eq
$protein_db_naming
{'
refseq
'}
&&
$prevdb
ne
$protein_db_naming
{'
swissprot
'})
{
$desc
=
$sp_desc
{"
$db
:
$acc
"};
$gene_desc
{
$ensg
}
=
[
$db
,
$desc
,
$acc
,
$qy_percid
,
$tg_percid
];
# kick out the SPTREMBL desc.
next
;
}
if
(
$prevdb
eq
$db
&&
$db
eq
$protein_db_naming
{'
sptrembl
'})
{
$desc
=
$sp_desc
{"
$db
:
$acc
"};
if
(
compare_desc
(
$prev_desc
,
$desc
)
<
0
)
{
# new desc is better
# warn "new better: $desc (old was: $prev_desc)\n";
$gene_desc
{
$ensg
}
=
[
$db
,
$desc
,
$acc
,
$qy_percid
,
$tg_percid
];
next
;
}
else
{
# warn "old better: $prev_desc (new is: $desc)\n";
next
;
}
# die "should not reach this point: only know SWISSPROT and SPTREMBL";
}
}
else
{
$desc
=
$sp_desc
{"
$db
:
$acc
"};
$gene_desc
{
$ensg
}
=
[
$db
,
$desc
,
$acc
,
$qy_percid
,
$tg_percid
];
}
}
## Dump Ensembl gene descriptions assignations to stdout
foreach
my
$ensg
(
keys
%gene_desc
)
{
my
(
$db
,
$desc
,
$acc
,
$qy_percid
,
$tg_percid
)
=
@
{
$gene_desc
{
$ensg
}};
### final cleanup
### get rid of the Rik mess:
if
(
defined
$desc
)
{
$_
=
$desc
;
if
(
s/[0-9A-Z]{10}RIK PROTEIN[ \.]//ig
)
{
warn
"
throwing away:
$desc
\n
";
}
if
(
s/RIKEN CDNA [0-9A-Z]{10}[ \.]//ig
)
{
warn
"
throwing away:
$desc
\n
";
}
s/^\s*\(FRAGMENT\)\.?\s*$//ig
;
s/^\s*\(GENE\)\.?\s*$//ig
;
s/^\s*\(\s*\)\s*$//g
;
### add more as appropriate
if
(
$_
)
{
# =~ /[a-z]/;???
print
STDOUT
"
$ensg
\t
"
.
substr
(
$_
,
0
,
255
-
length
("
[Source:
$db
;Acc:
$acc
]
"))
.
"
[Source:
$db
;Acc:
$acc
]
\n
";
}
else
{
warn
"
throwing away:
$desc
\n
";
}
}
else
{
warn
"
Description for Acc
$acc
in
$db
file not defined.
Check in Ensembl core database if all Acc included in protein mapping are in the protein database files
exit 4
";
exit
4
;
}
}
## following taken from ensembl-external/scripts/family-input.pl
sub
compare_desc
($$)
{
my
(
$a
,
$b
)
=
@_
;
my
(
$am
,
$bm
);
foreach
my
$w
(
@word_order
)
{
$am
=
(
$a
=~
/$w/i
)?
1
:
0
;
$bm
=
(
$b
=~
/$w/i
)?
1
:
0
;
if
(
$am
!=
$bm
)
{
# ie, one matches, other doesn't
if
(
$am
==
1
)
{
# first one worse than second
return
-
1
;
}
else
{
return
1
;
}
}
}
# still look same; base result on length: longer is better
return
length
(
$a
)
<=>
length
(
$b
);
}
# compare_desc
sub
load_data
($$$$)
{
my
(
$gene_description_file
,
$host
,
$dbname
,
$dbuser
,
$dbpass
)
=
@_
;
if
(
-
e
$gene_description_file
)
{
symlink
$gene_description_file
,"
$ENV
{PWD}/gene_description_file.symlink
";
}
else
{
warn
"
File
$gene_description_file
does not exist
exit 3
";
exit
3
;
}
my
$db
=
new
Bio::EnsEMBL::DBSQL::
DBAdaptor
(
-
host
=>
$host
,
-
dbname
=>
$dbname
,
-
user
=>
$dbuser
,
-
pass
=>
$dbpass
);
my
$sth
=
$db
->
prepare
("
load data infile '
$ENV
{PWD}/gene_description_file.symlink' into table gene_description
");
unless
(
$sth
->
execute
())
{
unlink
"
$ENV
{PWD}/gene_description_file.symlink
";
$db
->
throw
("
Failed execution of loading data
");
}
else
{
unlink
"
$ENV
{PWD}/gene_description_file.symlink
";
print
STDERR
"
Data in
$gene_description_file
file successfully loaded in
\
'gene_description
\
' table in
",
$dbname
,"
\@
",
$host
,"
\n
";
return
1
;
}
}
# load_data
sub
parse_protein_database
($$$)
{
my
(
$protein_database_file
,
$protein_db_naming_href
,
$sp_desc_href
)
=
@_
;
my
$db
;
my
$acc
;
my
$desc
=
"";
open
PROTDBF
,
$protein_database_file
||
die
"
$protein_database_file
:$!
";
while
(
defined
(
my
$line
=
<
PROTDBF
>
))
{
next
if
(
$line
=~
/^$/
);
if
(
$line
=~
/^ID\s{3}.*$/o
)
{
if
(
$line
=~
/^ID\s{3}.*\s+PRELIMINARY;\s+.*$/o
)
{
$db
=
$protein_db_naming_href
->
{'
sptrembl
'};
push
@databases_used_for_guessing_descriptions
,
$db
unless
(
grep
/^$db$/
,
@databases_used_for_guessing_descriptions
);
}
elsif
(
$line
=~
/^ID\s{3}.*\s+STANDARD;\s+.*$/o
)
{
$db
=
$protein_db_naming_href
->
{'
swissprot
'};
push
@databases_used_for_guessing_descriptions
,
$db
unless
(
grep
/^$db$/
,
@databases_used_for_guessing_descriptions
);
}
else
{
chomp
$line
;
warn
"
\n
Could not recognize line :
\"
$line
\"\n
Check the input file format in
$protein_database_file
.
\n\n
";
return
0
;
}
}
if
(
$line
=~
/^LOCUS\s+\S+\s+.*$/o
)
{
$db
=
$protein_db_naming_href
->
{'
refseq
'};
push
@databases_used_for_guessing_descriptions
,
$db
unless
(
grep
/^$db$/
,
@databases_used_for_guessing_descriptions
);
}
if
(
$db
eq
$protein_db_naming_href
->
{'
swissprot
'}
||
$db
eq
$protein_db_naming_href
->
{'
sptrembl
'})
{
if
(
$line
=~
/^AC\s{3}(\S+);.*$/o
&&
!
defined
$acc
)
{
$acc
=
$
1
;
}
if
(
$line
=~
/^DE\s{3}(\S.*\S)$/o
)
{
$desc
.=
"
"
unless
(
$desc
eq
"");
$desc
.=
uc
$
1
;
}
if
(
$line
=~
/^\/\/$/o
)
{
$desc
=~
s/\{.*\}//g
;
$sp_desc
{"
$db
:
$acc
"}
=
$desc
;
$db
=
undef
;
$acc
=
undef
;
$desc
=
"";
}
}
elsif
(
$db
eq
$protein_db_naming_href
->
{'
refseq
'})
{
if
(
$line
=~
/^DEFINITION\s+(\S.*\S)$/o
)
{
$desc
.=
uc
$
1
;
while
(
defined
(
$line
=
<
PROTDBF
>
))
{
last
if
(
$line
=~
/^ACCESSION\s+(\S+)\s*.*$/o
);
if
(
$line
=~
/^\s+(\S.*\S)$/o
)
{
$desc
.=
"
"
.
uc
$
1
;
}
}
}
if
(
$line
=~
/^DBSOURCE\s+(\S+)\s*.*$/o
)
{
# Accession Number got in this line here during protein
# not in ACCESSION line.
if
(
$line
=~
/^DBSOURCE\s+REFSEQ:\s+accession\s+(\S+)\.\d+$/o
)
{
$acc
=
$
1
;
}
else
{
chomp
$line
;
warn
"
Problem in parsing DBSOURCE line:
$line
Expecting to match this regexp /^DBSOURCE
\\
s+REFSEQ:
\\
s+accession
\\
s+(
\\
S+)
\\
.
\\
d+
\$
/o
\n
";
return
0
;
}
}
if
(
$line
=~
/^\/\/$/o
)
{
$desc
=~
s/\s*\[.*\]//g
;
$sp_desc
{"
$db
:
$acc
"}
=
$desc
;
$db
=
undef
;
$acc
=
undef
;
$desc
=
"";
}
}
}
close
PROTDBF
;
return
1
;
}
# parse_protein_database
exit
0
;
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment