Skip to content
Snippets Groups Projects
Commit c04494aa authored by Abel Ureta-Vidal's avatar Abel Ureta-Vidal
Browse files

Integration of sp-description.pl.

Possibility to add of RefSeq entries to get gene description with precedence on SPTREMBL,
and SWISSPROT having precedence on RefSeq.
Then ranking descriptions using the similarity measure used in establishing the mapping.
Manage now to load gene descriptions file to the corresponding Ensembl core database.
parent 56520097
No related branches found
No related tags found
No related merge requests found
#!/usr/local/bin/perl -w
# $Header$
#
# Script to create a tab-separated file (to be loaded into the
# gene_description table) from a file sp-descriptions.dat
# (swiss-prot/sptrembl -> description mapping, from running
# sp-descriptions.pl on a swissprot/trembl flatfile) and from the
# mapping.dat (ensg -> sp/trembl accno mapping, obtained from script
# sp-mapping-dump.sh).
# Script to create a tab-separated file (to be loaded into the 'gene_description' table)
# from protein database file(s) either SWISSPROT/SPTREMBL or RefSeq and
# the protein mapping information stored in the currently used Ensembl core database.
#
# When file is created and checked for "junk" data, the same script is used to load the
# corresponding data to the Ensembl core database in 'gene_description' table
## list of regexps tested, in order of increasing preference, when
## deciding which description to use (used by sub compare_desc):
# EXIT STATUS
# 0 all is fine
# 1 Problem in parsing protein database file
# 2 Problem when loading data
# 3 Gene_description_file does not exist
# 4 Problem with an Acc without description
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Getopt::Long;
my $host;
my $dbname;
my $dbuser;
# db_name used in the Ensembl core database (to be checked in table externalDB) and
# used in the protein mapping.
my %protein_db_naming = ('swissprot' => "SWISSPROT",
'sptrembl' => "SPTREMBL",
'refseq' => "RefSeq");
&GetOptions('h=s' => \$host,
'd=s' => \$dbname,
'u=s' => \$dbuser);
my @databases_used_for_guessing_descriptions;
## list of regexps tested, in order of increasing preference, when
## deciding which description to use (used by sub compare_desc):
my @word_order = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);
my $usage = "\n Usage: $0 -h host -u dbuser -d dbname sp-descriptions.dat > gene-descriptions.dat\n\n";
## Getting parameters/options and usage
my $usage = "
Usage: $0 -h host \\
-u dbuser \\
-d dbname protein_database_file(s) > gene-descriptions.tab
where protein_database_file could SWISSPROT/SPTREMBL entries in SWISSPROT-like format
and/or
RefSeq entries in gnp-like format.
OR to load the data from gene-descriptions.tab file to \'gene-description\' table
$0 -h host \\
-u dbuser \\
-d dbname \\
-p password \\
-load gene-descriptions.tab
\n";
my $help = 0;
my ($host, $dbname, $dbuser, $dbpass, $gene_description_file);
unless (GetOptions('help' => \$help,
'h=s' => \$host,
'd=s' => \$dbname,
'u=s' => \$dbuser,
'p=s' => \$dbpass,
'load=s' => \$gene_description_file)) {
die $usage;
}
if ($help) {
print $usage;
exit 0;
}
## Some checks before starting
if (! defined $host) {
warn "\n Must specifie a host with -h\n";
warn "\n Must specify a host with -h\n";
die $usage;
} elsif (! defined $dbname) {
warn "\n Must specifie a database with -d\n";
warn "\n Must specify a database with -d\n";
die $usage;
} elsif (! defined $dbuser) {
warn "\n Must specifie a database username with -u\n";
warn "\n Must specify a database username with -u\n";
die $usage;
}
if (defined $gene_description_file || defined $dbpass) {
if (! defined $dbpass) {
warn "\n Must specify a database password with -p to load your data\n";
die $usage;
} elsif (! defined $gene_description_file) {
warn "
Assuming you want to load your data in the database...
If so you must specify a path for your gene-descriptions.tab file with -load.
If not -p is not needed\n";
die $usage;
}
} elsif (scalar @ARGV ==0) {
warn "\n Must give an input file, expects to have the sp-description.pl output as input\n";
warn "\n Must specify an protein database input file\n";
die $usage;
}
my $spdesc = shift;
## Loading data to database if requested and exit
if (defined $gene_description_file) {
if (load_data($gene_description_file,$host,$dbname,$dbuser,$dbpass) == 1) {
exit 0;
} else {
warn "Problem when loading data
exit 2";
exit 2;
}
}
open SPDESC,$spdesc || die "$spdesc:$!";
## Parsing of protein database flat files(Swissprot/SPTrEMBL/RefSeq) used in protein mapping
my %sp_desc;
while (<SPDESC>) {
chomp;
my ($db, $acc, $desc) = split(/\t/);
$sp_desc{"$db:$acc"}=$desc;
while (defined (my $protein_database_file = shift @ARGV)) {
unless (parse_protein_database($protein_database_file,\%protein_db_naming,\%sp_desc) == 1) {
warn "Problem in parsing protein database file : $protein_database_file
exit 1";
exit 1;
}
}
close SPDESC || die "$!";
## Connecting to the Ensembl database where protein mapping is stored
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor (-host => $host,
-user => $dbuser,
-dbname => $dbname );
-dbname => $dbname,
-user => $dbuser);
foreach my $db (@databases_used_for_guessing_descriptions) {
$db = "'".$db."'";
}
my $db_query = join ",",@databases_used_for_guessing_descriptions;
my $sth = $db->prepare("SELECT tsc.translation_id, tsc.gene_id, xdb.db_name, x.dbprimary_id, ix.query_identity, ix.target_identity FROM transcript tsc, objectXref ox, Xref x, externalDB xdb, identityXref ix WHERE tsc.translation_id = ox.ensembl_id AND ox.xrefId = x.xrefId AND x.externalDBId = xdb.externalDBId AND xdb.db_name in ('SWISSPROT', 'SPTREMBL') AND ox.objectxrefId = ix.objectxrefId order by tsc.gene_id asc, xdb.db_name desc, x.dbprimary_id asc");
my $sth = $db->prepare("
SELECT
tsc.translation_id,tsc.gene_id,xdb.db_name,x.dbprimary_id,ix.query_identity,ix.target_identity
FROM
transcript tsc,objectXref ox,Xref x,externalDB xdb,identityXref ix
WHERE
tsc.translation_id = ox.ensembl_id AND
ox.xrefId = x.xrefId AND
x.externalDBId = xdb.externalDBId AND
xdb.db_name in ($db_query) AND
ox.objectxrefId = ix.objectxrefId
ORDER BY
tsc.gene_id asc,
xdb.db_name desc,
x.dbprimary_id asc");
unless ($sth->execute()) {
$db->throw("Failed execution of a select query");
}
my %gene_desc;
## Assigning the descriptions to Ensembl genes.
LINE:
my %gene_desc;
while (my ($ensp, $ensg, $db, $acc, $qy_percid, $tg_percid) = $sth->fetchrow_array()) {
my $desc;
if ( defined($gene_desc{$ensg}) ) {
if (defined $gene_desc{$ensg}) {
my ($prevdb, $prev_desc, $prev_acc, $prev_qy_percid, $prev_tg_percid) = @{$gene_desc{$ensg}};
# if ($prev_qy_percid < $qy_percid) {
# $desc = $sp_desc{"$db:$acc"};
# $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc.
# next LINE;
# }
# next LINE;
if ($prevdb eq 'SWISSPROT') {
next LINE; # nothing to change
if ($prevdb eq $db &&
($db eq $protein_db_naming{'swissprot'} ||
$db eq $protein_db_naming{'refseq'})) {
if ($prev_qy_percid < $qy_percid) {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # taking better SWISSPROT/RefSeq desc.
next;
} elsif ($prev_qy_percid == $qy_percid &&
$prev_tg_percid < $tg_percid) {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # taking better SWISSPROT/RefSeq desc.
next;
} else {
next; # nothing to change
}
}
if ($db eq 'SWISSPROT') {
if ($db eq $protein_db_naming{'swissprot'}) {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc.
next;
}
if ($db eq $protein_db_naming{'refseq'} &&
$prevdb ne $protein_db_naming{'swissprot'}) {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc.
next LINE;
next;
}
if ($db eq 'SPTREMBL' && $prevdb eq $db ) {
if ($prevdb eq $db &&
$db eq $protein_db_naming{'sptrembl'}) {
$desc = $sp_desc{"$db:$acc"};
if ( &compare_desc($prev_desc, $desc) < 0 ) {
if (compare_desc($prev_desc,$desc) < 0) {
# new desc is better
# warn "new better: $desc (old was: $prev_desc)\n";
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ];
next LINE;
next;
} else {
# warn "old better: $prev_desc (new is: $desc)\n";
next LINE;
next;
}
die "should not reach this point: only know SWISS-PROT and SPTREMBL";
# die "should not reach this point: only know SWISSPROT and SPTREMBL";
}
} else {
$desc = $sp_desc{"$db:$acc"};
......@@ -110,7 +215,7 @@ while (my ($ensp, $ensg, $db, $acc, $qy_percid, $tg_percid) = $sth->fetchrow_arr
}
}
# now dump the stuff to stdout.
## Dump Ensembl gene descriptions assignations to stdout
foreach my $ensg ( keys %gene_desc ) {
......@@ -118,36 +223,179 @@ foreach my $ensg ( keys %gene_desc ) {
### final cleanup
### get rid of the Rik mess:
$_ = $desc;
if (s/[0-9A-Z]{10}Rik protein[ \.]//g) {
warn "throwing away: $desc\n";
if (defined $desc) {
$_ = $desc;
if (s/[0-9A-Z]{10}Rik protein[ \.]//g) {
warn "throwing away: $desc\n";
}
s/^\s*\(Fragment\)\.?\s*$//g;
s/^\s*\(\s*\)\s*$//g;
### add more as appropriate
if ($_) { # =~ /[a-z]/;???
# print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc,%qy:$qy_percid,\%tg:$tg_percid]\n";
print STDOUT "$ensg\t".substr($_,0,255-length(" [Source:$db;Acc:$acc]"))." [Source:$db;Acc:$acc]\n";
}
} else {
warn "Description for Acc $acc in $db file not defined.
Check in Ensembl core database if all Acc included in protein mapping are in the protein database files
exit 4";
exit 4;
}
s/^\s*\(Fragment\)\.?\s*$//g;
s/^\s*\(\s*\)\s*$//g;
### add more as appropriate
# print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc,%qy:$qy_percid,\%tg:$tg_percid]\n" if $_; # =~ /[a-z]/;???
print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc]\n" if $_; # =~ /[a-z]/;???
}
#### following taken from ensembl-external/scripts/family-input.pl
## following taken from ensembl-external/scripts/family-input.pl
sub compare_desc ($$) {
my ($a, $b) = @_;
my ($am, $bm);
foreach my $w (@word_order) {
$am = ($a =~ /$w/i)?1:0;
$bm = ($b =~ /$w/i)?1:0;
if ($am != $bm ) { # ie, one matches, other doesn't
if ( $am == 1 ) { # first one worse than second
return -1;
} else {
return 1;
}
}
}
# still look same; base result on length: longer is better
return length($a) <=> length($b);
} # compare_desc
sub load_data ($$$$) {
my ($gene_description_file,$host,$dbname,$dbuser,$dbpass) = @_;
if (-e $gene_description_file) {
symlink $gene_description_file,"$ENV{PWD}/gene_description_file.symlink";
} else {
warn "File $gene_description_file does not exist
exit 3";
exit 3;
}
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor (-host => $host,
-dbname => $dbname,
-user => $dbuser,
-pass => $dbpass);
my $sth = $db->prepare("load data infile '$ENV{PWD}/gene_description_file.symlink' into table gene_description");
unless ($sth->execute()) {
unlink "$ENV{PWD}/gene_description_file.symlink";
$db->throw("Failed execution of loading data");
} else {
unlink "$ENV{PWD}/gene_description_file.symlink";
print STDERR "Data in $gene_description_file file successfully loaded in \'gene_description\' table in ",$dbname,"\@",$host,"\n";
return 1;
}
} # load_data
sub parse_protein_database ($$$) {
my ($protein_database_file,$protein_db_naming_href,$sp_desc_href) = @_;
sub compare_desc {
my ($a, $b) = @_;
my $db;
my $acc;
my $desc = "";
my ($am, $bm);
foreach my $w (@word_order) {
$am = ($a =~ /$w/i)?1:0;
$bm = ($b =~ /$w/i)?1:0;
open PROTDBF,$protein_database_file || die "$protein_database_file:$!";
while (defined (my $line = <PROTDBF>)) {
if ($am != $bm ) { # ie, one matches, other doesn't
if ( $am == 1 ) { # first one worse than second
return -1;
} else {
return 1;
}
}
if ($line =~ /^ID\s{3}.*$/o) {
if ($line =~ /^ID\s{3}.*\s+PRELIMINARY;\s+.*$/o) {
$db = $protein_db_naming_href->{'sptrembl'};
# print "OK1:$db\n";
push @databases_used_for_guessing_descriptions, $db unless (grep /^$db$/, @databases_used_for_guessing_descriptions);
# print join " ",@databases_used_for_guessing_descriptions,"\n";
} elsif ($line =~ /^ID\s{3}.*\s+STANDARD;\s+.*$/o) {
$db = $protein_db_naming_href->{'swissprot'};
# print "OK2:$db\n";
push @databases_used_for_guessing_descriptions, $db unless (grep /^$db$/, @databases_used_for_guessing_descriptions);
# print join " ",@databases_used_for_guessing_descriptions,"\n";
} else {
chomp $line;
warn "\nCould not recognize line : \"$line\"\nCheck the input file format in $protein_database_file.\n\n";
return 0;
}
}
if ($line =~ /^LOCUS\s+\S+\s+.*$/o) {
$db = $protein_db_naming_href->{'refseq'};
# print "OK3:$db\n";
push @databases_used_for_guessing_descriptions, $db unless (grep /^$db$/, @databases_used_for_guessing_descriptions);
# print join " ",@databases_used_for_guessing_descriptions,"\n";
}
# still look same; base result on length: longer is better
return length($a) <=> length($b);
} # compare_desc
if ($db eq $protein_db_naming_href->{'swissprot'} ||
$db eq $protein_db_naming_href->{'sptrembl'}) {
if ($line =~ /^AC\s{3}(\S+);.*$/o &&
! defined $acc) {
$acc = $1;
}
if ($line =~ /^DE\s{3}(\S.*\S)$/o) {
$desc .= " " unless ($desc eq "");
$desc .= uc $1;
}
if ($line =~ /^\/\/$/o) {
$desc =~ s/\{.*\}//g;
$sp_desc{"$db:$acc"} = $desc;
$db = undef;
$acc = undef;
$desc = "";
}
} elsif ($db eq $protein_db_naming_href->{'refseq'}) {
if ($line =~ /^DEFINITION\s+(\S.*\S)$/o) {
$desc .= uc $1;
while (defined ($line = <PROTDBF>)) {
last if ($line =~ /^ACCESSION\s+(\S+)\s*.*$/o);
if ($line =~ /^\s+(\S.*\S)$/o) {
$desc .= " ".uc $1;
}
}
}
if ($line =~ /^DBSOURCE\s+(\S+)\s*.*$/o) {
# Accession Number got in this line here during protein
# not in ACCESSION line.
if ($line =~ /^DBSOURCE\s+REFSEQ:\s+accession\s+(\S+)\.\d+$/o) {
$acc = $1;
} else {
chomp $line;
warn "Problem in parsing DBSOURCE line: $line
Expecting to match this regexp /^DBSOURCE\\s+REFSEQ:\\s+accession\\s+(\\S+)\\.\\d+\$/o\n";
return 0;
}
}
if ($line =~ /^\/\/$/o) {
$desc =~ s/\s*\[.*\]//g;
$sp_desc{"$db:$acc"} = $desc;
$db = undef;
$acc = undef;
$desc = "";
}
}
}
close PROTDBF;
# foreach my $db_acc (keys %sp_desc) {
# my $desc = $sp_desc{$db_acc};
# print "$db_acc\t$desc\n";
# }
return 1;
} # parse_protein_database
exit 0;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment