diff --git a/scripts/gene-descriptions.pl b/scripts/gene-descriptions.pl index 126559b551c7c0dfc41fd933bfc7e78357bc4e35..6a4275c05e3cb30afddaf5094d0e0995f1f62cbd 100755 --- a/scripts/gene-descriptions.pl +++ b/scripts/gene-descriptions.pl @@ -1,108 +1,213 @@ #!/usr/local/bin/perl -w # $Header$ # -# Script to create a tab-separated file (to be loaded into the -# gene_description table) from a file sp-descriptions.dat -# (swiss-prot/sptrembl -> description mapping, from running -# sp-descriptions.pl on a swissprot/trembl flatfile) and from the -# mapping.dat (ensg -> sp/trembl accno mapping, obtained from script -# sp-mapping-dump.sh). +# Script to create a tab-separated file (to be loaded into the 'gene_description' table) +# from protein database file(s) either SWISSPROT/SPTREMBL or RefSeq and +# the protein mapping information stored in the currently used Ensembl core database. +# +# When file is created and checked for "junk" data, the same script is used to load the +# corresponding data to the Ensembl core database in 'gene_description' table - -## list of regexps tested, in order of increasing preference, when -## deciding which description to use (used by sub compare_desc): +# EXIT STATUS +# 0 all is fine +# 1 Problem in parsing protein database file +# 2 Problem when loading data +# 3 Gene_description_file does not exist +# 4 Problem with an Acc without description use strict; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Getopt::Long; -my $host; -my $dbname; -my $dbuser; +# db_name used in the Ensembl core database (to be checked in table externalDB) and +# used in the protein mapping. +my %protein_db_naming = ('swissprot' => "SWISSPROT", + 'sptrembl' => "SPTREMBL", + 'refseq' => "RefSeq"); -&GetOptions('h=s' => \$host, - 'd=s' => \$dbname, - 'u=s' => \$dbuser); +my @databases_used_for_guessing_descriptions; + +## list of regexps tested, in order of increasing preference, when +## deciding which description to use (used by sub compare_desc): my @word_order = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein); -my $usage = "\n Usage: $0 -h host -u dbuser -d dbname sp-descriptions.dat > gene-descriptions.dat\n\n"; +## Getting parameters/options and usage + +my $usage = " + Usage: $0 -h host \\ + -u dbuser \\ + -d dbname protein_database_file(s) > gene-descriptions.tab + + where protein_database_file could SWISSPROT/SPTREMBL entries in SWISSPROT-like format + and/or + RefSeq entries in gnp-like format. + + OR to load the data from gene-descriptions.tab file to \'gene-description\' table + + $0 -h host \\ + -u dbuser \\ + -d dbname \\ + -p password \\ + -load gene-descriptions.tab +\n"; + +my $help = 0; +my ($host, $dbname, $dbuser, $dbpass, $gene_description_file); + +unless (GetOptions('help' => \$help, + 'h=s' => \$host, + 'd=s' => \$dbname, + 'u=s' => \$dbuser, + 'p=s' => \$dbpass, + 'load=s' => \$gene_description_file)) { + die $usage; +} + +if ($help) { + print $usage; + exit 0; +} + +## Some checks before starting if (! defined $host) { - warn "\n Must specifie a host with -h\n"; + warn "\n Must specify a host with -h\n"; die $usage; } elsif (! defined $dbname) { - warn "\n Must specifie a database with -d\n"; + warn "\n Must specify a database with -d\n"; die $usage; } elsif (! defined $dbuser) { - warn "\n Must specifie a database username with -u\n"; + warn "\n Must specify a database username with -u\n"; die $usage; +} + +if (defined $gene_description_file || defined $dbpass) { + + if (! defined $dbpass) { + warn "\n Must specify a database password with -p to load your data\n"; + die $usage; + } elsif (! defined $gene_description_file) { + warn " + Assuming you want to load your data in the database... + If so you must specify a path for your gene-descriptions.tab file with -load. + If not -p is not needed\n"; + die $usage; + } + } elsif (scalar @ARGV ==0) { - warn "\n Must give an input file, expects to have the sp-description.pl output as input\n"; + warn "\n Must specify an protein database input file\n"; die $usage; } -my $spdesc = shift; +## Loading data to database if requested and exit + +if (defined $gene_description_file) { + if (load_data($gene_description_file,$host,$dbname,$dbuser,$dbpass) == 1) { + exit 0; + } else { + warn "Problem when loading data +exit 2"; + exit 2; + } +} -open SPDESC,$spdesc || die "$spdesc:$!"; +## Parsing of protein database flat files(Swissprot/SPTrEMBL/RefSeq) used in protein mapping my %sp_desc; -while (<SPDESC>) { - chomp; - my ($db, $acc, $desc) = split(/\t/); - $sp_desc{"$db:$acc"}=$desc; +while (defined (my $protein_database_file = shift @ARGV)) { + unless (parse_protein_database($protein_database_file,\%protein_db_naming,\%sp_desc) == 1) { + warn "Problem in parsing protein database file : $protein_database_file +exit 1"; + exit 1; + } } -close SPDESC || die "$!"; - +## Connecting to the Ensembl database where protein mapping is stored my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor (-host => $host, - -user => $dbuser, - -dbname => $dbname ); + -dbname => $dbname, + -user => $dbuser); + +foreach my $db (@databases_used_for_guessing_descriptions) { + $db = "'".$db."'"; +} +my $db_query = join ",",@databases_used_for_guessing_descriptions; -my $sth = $db->prepare("SELECT tsc.translation_id, tsc.gene_id, xdb.db_name, x.dbprimary_id, ix.query_identity, ix.target_identity FROM transcript tsc, objectXref ox, Xref x, externalDB xdb, identityXref ix WHERE tsc.translation_id = ox.ensembl_id AND ox.xrefId = x.xrefId AND x.externalDBId = xdb.externalDBId AND xdb.db_name in ('SWISSPROT', 'SPTREMBL') AND ox.objectxrefId = ix.objectxrefId order by tsc.gene_id asc, xdb.db_name desc, x.dbprimary_id asc"); +my $sth = $db->prepare(" +SELECT + tsc.translation_id,tsc.gene_id,xdb.db_name,x.dbprimary_id,ix.query_identity,ix.target_identity +FROM + transcript tsc,objectXref ox,Xref x,externalDB xdb,identityXref ix +WHERE + tsc.translation_id = ox.ensembl_id AND + ox.xrefId = x.xrefId AND + x.externalDBId = xdb.externalDBId AND + xdb.db_name in ($db_query) AND + ox.objectxrefId = ix.objectxrefId +ORDER BY + tsc.gene_id asc, + xdb.db_name desc, + x.dbprimary_id asc"); unless ($sth->execute()) { $db->throw("Failed execution of a select query"); } -my %gene_desc; +## Assigning the descriptions to Ensembl genes. -LINE: +my %gene_desc; while (my ($ensp, $ensg, $db, $acc, $qy_percid, $tg_percid) = $sth->fetchrow_array()) { my $desc; - if ( defined($gene_desc{$ensg}) ) { + if (defined $gene_desc{$ensg}) { my ($prevdb, $prev_desc, $prev_acc, $prev_qy_percid, $prev_tg_percid) = @{$gene_desc{$ensg}}; -# if ($prev_qy_percid < $qy_percid) { -# $desc = $sp_desc{"$db:$acc"}; -# $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc. -# next LINE; -# } -# next LINE; - if ($prevdb eq 'SWISSPROT') { - next LINE; # nothing to change + + if ($prevdb eq $db && + ($db eq $protein_db_naming{'swissprot'} || + $db eq $protein_db_naming{'refseq'})) { + if ($prev_qy_percid < $qy_percid) { + $desc = $sp_desc{"$db:$acc"}; + $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # taking better SWISSPROT/RefSeq desc. + next; + } elsif ($prev_qy_percid == $qy_percid && + $prev_tg_percid < $tg_percid) { + $desc = $sp_desc{"$db:$acc"}; + $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # taking better SWISSPROT/RefSeq desc. + next; + } else { + next; # nothing to change + } } - if ($db eq 'SWISSPROT') { + + if ($db eq $protein_db_naming{'swissprot'}) { + $desc = $sp_desc{"$db:$acc"}; + $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc. + next; + } + + if ($db eq $protein_db_naming{'refseq'} && + $prevdb ne $protein_db_naming{'swissprot'}) { $desc = $sp_desc{"$db:$acc"}; $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc. - next LINE; + next; } - if ($db eq 'SPTREMBL' && $prevdb eq $db ) { + if ($prevdb eq $db && + $db eq $protein_db_naming{'sptrembl'}) { $desc = $sp_desc{"$db:$acc"}; - if ( &compare_desc($prev_desc, $desc) < 0 ) { + if (compare_desc($prev_desc,$desc) < 0) { # new desc is better # warn "new better: $desc (old was: $prev_desc)\n"; $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; - next LINE; + next; } else { # warn "old better: $prev_desc (new is: $desc)\n"; - next LINE; + next; } - die "should not reach this point: only know SWISS-PROT and SPTREMBL"; +# die "should not reach this point: only know SWISSPROT and SPTREMBL"; } } else { $desc = $sp_desc{"$db:$acc"}; @@ -110,7 +215,7 @@ while (my ($ensp, $ensg, $db, $acc, $qy_percid, $tg_percid) = $sth->fetchrow_arr } } -# now dump the stuff to stdout. +## Dump Ensembl gene descriptions assignations to stdout foreach my $ensg ( keys %gene_desc ) { @@ -118,36 +223,179 @@ foreach my $ensg ( keys %gene_desc ) { ### final cleanup ### get rid of the Rik mess: - $_ = $desc; - if (s/[0-9A-Z]{10}Rik protein[ \.]//g) { - warn "throwing away: $desc\n"; + if (defined $desc) { + $_ = $desc; + if (s/[0-9A-Z]{10}Rik protein[ \.]//g) { + warn "throwing away: $desc\n"; + } + s/^\s*\(Fragment\)\.?\s*$//g; + s/^\s*\(\s*\)\s*$//g; + ### add more as appropriate + + if ($_) { # =~ /[a-z]/;??? + # print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc,%qy:$qy_percid,\%tg:$tg_percid]\n"; + print STDOUT "$ensg\t".substr($_,0,255-length(" [Source:$db;Acc:$acc]"))." [Source:$db;Acc:$acc]\n"; + } + } else { + warn "Description for Acc $acc in $db file not defined. +Check in Ensembl core database if all Acc included in protein mapping are in the protein database files +exit 4"; + exit 4; } - s/^\s*\(Fragment\)\.?\s*$//g; - s/^\s*\(\s*\)\s*$//g; - ### add more as appropriate - -# print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc,%qy:$qy_percid,\%tg:$tg_percid]\n" if $_; # =~ /[a-z]/;??? - print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc]\n" if $_; # =~ /[a-z]/;??? } -#### following taken from ensembl-external/scripts/family-input.pl +## following taken from ensembl-external/scripts/family-input.pl + +sub compare_desc ($$) { + my ($a, $b) = @_; + + my ($am, $bm); + foreach my $w (@word_order) { + $am = ($a =~ /$w/i)?1:0; + $bm = ($b =~ /$w/i)?1:0; + + if ($am != $bm ) { # ie, one matches, other doesn't + if ( $am == 1 ) { # first one worse than second + return -1; + } else { + return 1; + } + } + } + # still look same; base result on length: longer is better + return length($a) <=> length($b); +} # compare_desc + +sub load_data ($$$$) { + my ($gene_description_file,$host,$dbname,$dbuser,$dbpass) = @_; + + if (-e $gene_description_file) { + symlink $gene_description_file,"$ENV{PWD}/gene_description_file.symlink"; + } else { + warn "File $gene_description_file does not exist +exit 3"; + exit 3; + } + + my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor (-host => $host, + -dbname => $dbname, + -user => $dbuser, + -pass => $dbpass); + + my $sth = $db->prepare("load data infile '$ENV{PWD}/gene_description_file.symlink' into table gene_description"); + + unless ($sth->execute()) { + unlink "$ENV{PWD}/gene_description_file.symlink"; + $db->throw("Failed execution of loading data"); + } else { + unlink "$ENV{PWD}/gene_description_file.symlink"; + print STDERR "Data in $gene_description_file file successfully loaded in \'gene_description\' table in ",$dbname,"\@",$host,"\n"; + return 1; + } +} # load_data + +sub parse_protein_database ($$$) { + my ($protein_database_file,$protein_db_naming_href,$sp_desc_href) = @_; -sub compare_desc { - my ($a, $b) = @_; + my $db; + my $acc; + my $desc = ""; - my ($am, $bm); - foreach my $w (@word_order) { - $am = ($a =~ /$w/i)?1:0; - $bm = ($b =~ /$w/i)?1:0; + open PROTDBF,$protein_database_file || die "$protein_database_file:$!"; + + while (defined (my $line = <PROTDBF>)) { - if ($am != $bm ) { # ie, one matches, other doesn't - if ( $am == 1 ) { # first one worse than second - return -1; - } else { - return 1; - } - } + if ($line =~ /^ID\s{3}.*$/o) { + if ($line =~ /^ID\s{3}.*\s+PRELIMINARY;\s+.*$/o) { + $db = $protein_db_naming_href->{'sptrembl'}; +# print "OK1:$db\n"; + push @databases_used_for_guessing_descriptions, $db unless (grep /^$db$/, @databases_used_for_guessing_descriptions); +# print join " ",@databases_used_for_guessing_descriptions,"\n"; + } elsif ($line =~ /^ID\s{3}.*\s+STANDARD;\s+.*$/o) { + $db = $protein_db_naming_href->{'swissprot'}; +# print "OK2:$db\n"; + push @databases_used_for_guessing_descriptions, $db unless (grep /^$db$/, @databases_used_for_guessing_descriptions); +# print join " ",@databases_used_for_guessing_descriptions,"\n"; + } else { + chomp $line; + warn "\nCould not recognize line : \"$line\"\nCheck the input file format in $protein_database_file.\n\n"; + return 0; + } + } + + if ($line =~ /^LOCUS\s+\S+\s+.*$/o) { + $db = $protein_db_naming_href->{'refseq'}; +# print "OK3:$db\n"; + push @databases_used_for_guessing_descriptions, $db unless (grep /^$db$/, @databases_used_for_guessing_descriptions); +# print join " ",@databases_used_for_guessing_descriptions,"\n"; } - # still look same; base result on length: longer is better - return length($a) <=> length($b); -} # compare_desc + + if ($db eq $protein_db_naming_href->{'swissprot'} || + $db eq $protein_db_naming_href->{'sptrembl'}) { + + if ($line =~ /^AC\s{3}(\S+);.*$/o && + ! defined $acc) { + $acc = $1; + } + + if ($line =~ /^DE\s{3}(\S.*\S)$/o) { + $desc .= " " unless ($desc eq ""); + $desc .= uc $1; + } + + if ($line =~ /^\/\/$/o) { + $desc =~ s/\{.*\}//g; + $sp_desc{"$db:$acc"} = $desc; + $db = undef; + $acc = undef; + $desc = ""; + } + + } elsif ($db eq $protein_db_naming_href->{'refseq'}) { + + if ($line =~ /^DEFINITION\s+(\S.*\S)$/o) { + $desc .= uc $1; + while (defined ($line = <PROTDBF>)) { + last if ($line =~ /^ACCESSION\s+(\S+)\s*.*$/o); + if ($line =~ /^\s+(\S.*\S)$/o) { + $desc .= " ".uc $1; + } + } + } + + if ($line =~ /^DBSOURCE\s+(\S+)\s*.*$/o) { +# Accession Number got in this line here during protein +# not in ACCESSION line. + if ($line =~ /^DBSOURCE\s+REFSEQ:\s+accession\s+(\S+)\.\d+$/o) { + $acc = $1; + } else { + chomp $line; + warn "Problem in parsing DBSOURCE line: $line +Expecting to match this regexp /^DBSOURCE\\s+REFSEQ:\\s+accession\\s+(\\S+)\\.\\d+\$/o\n"; + return 0; + } + } + + if ($line =~ /^\/\/$/o) { + $desc =~ s/\s*\[.*\]//g; + $sp_desc{"$db:$acc"} = $desc; + $db = undef; + $acc = undef; + $desc = ""; + } + } + + } + + close PROTDBF; + +# foreach my $db_acc (keys %sp_desc) { +# my $desc = $sp_desc{$db_acc}; +# print "$db_acc\t$desc\n"; +# } + + return 1; + +} # parse_protein_database + +exit 0;