Skip to content
Snippets Groups Projects
Commit 9be38159 authored by Abel Ureta-Vidal's avatar Abel Ureta-Vidal
Browse files

Introducing good programming practices :) such as perl -w or use strict.

Then identified bug in Accession Number parsing, when in more than one line for an entry, script was getting the first
AC of the last parsed line. But we want the first AC of the first parsed line. Fixed now. Added possibility of parsing
parent 02474ab8
No related branches found
No related tags found
No related merge requests found
#!/usr/local/bin/perl
#!/usr/local/bin/perl -w
# $Header$
#
# Script to create a tab-separated file (to be loaded into the
......@@ -11,82 +11,106 @@
## list of regexps tested, in order of increasing preference, when
## deciding which description to use (used by sub compare_desc):
my @word_order =
qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);
$Usage = "Usage: $0 sp-descriptions.dat mapping.dat > gene-descriptions.dat\n";
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
my @word_order = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);
my $Usage = "Usage: $0 sp-descriptions.dat mapping.dat > gene-descriptions.dat\n";
die $Usage if @ARGV == 0;
$spdesc = shift;
$map = shift;
my $spdesc = shift;
#my $map = shift;
open(SPDESC, $spdesc) || die "$spdesc:$!";
open SPDESC,$spdesc || die "$spdesc:$!";
undef %sp_desc;
while(<SPDESC>) {
chomp;
($db, $acc, $desc)=split(/\t/);
my %sp_desc;
while (<SPDESC>) {
chomp;
my ($db, $acc, $desc) = split(/\t/);
$sp_desc{"$db:$acc"}=$desc;
}
close(SPDESC) || die "$!";
close SPDESC || die "$!";
open(MAP, $map) || die "$map:$!";
my $host = 'ecs1d.sanger.ac.uk';
my $dbname = 'homo_sapiens_core_4_28';
my $dbuser = 'ensro';
undef %gene_desc;
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor (-host => $host,
-user => $dbuser,
-dbname => $dbname );
LINE:
while (<MAP>) {
chomp;
my ($ensp, $ensg, $db, $acc)=split(/\t/);
my $sth = $db->prepare("SELECT tsc.translation_id, tsc.gene_id, xdb.db_name, x.dbprimary_id, ix.query_identity, ix.target_identity FROM transcript tsc, objectXref ox, Xref x, externalDB xdb, identityXref ix WHERE tsc.translation_id = ox.ensembl_id AND ox.xrefId = x.xrefId AND x.externalDBId = xdb.externalDBId AND xdb.db_name in ('SWISSPROT', 'SPTREMBL') AND ox.objectxrefId = ix.objectxrefId order by tsc.gene_id asc, xdb.db_name desc, x.dbprimary_id asc");
if ( defined($gene_desc{$ensg}) ) {
($prevdb, $prev_desc) = @{$gene_desc{$ensg}};
if ($prevdb eq 'SWISS-PROT') {
next LINE; # nothing to change
}
if ($db eq 'SWISS-PROT') {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc ]; # kick out the SPTREMBL desc.
next LINE;
}
unless ($sth->execute()) {
$db->throw("Failed execution of a select query");
}
if ($db eq 'SPTREMBL' && $prevdb eq $db ) {
$desc = $sp_desc{"$db:$acc"};
if ( &compare_desc($prev_desc, $desc) < 0 ) {
# new desc is better
# warn "new better: $desc (old was: $prev_desc)\n";
$gene_desc{$ensg} = [ $db, $desc, $acc ];
next LINE;
} else {
# warn "old better: $prev_desc (new is: $desc)\n";
next LINE;
}
die "should not reach this point: only know SWISS-PROT and SPTREMBL";
}
} else {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc ];
my %gene_desc;
LINE:
while (my ($ensp, $ensg, $db, $acc, $qy_percid, $tg_percid) = $sth->fetchrow_array()) {
my $desc;
if ( defined($gene_desc{$ensg}) ) {
my ($prevdb, $prev_desc, $prev_acc, $prev_qy_percid, $prev_tg_percid) = @{$gene_desc{$ensg}};
# if ($prev_qy_percid < $qy_percid) {
# $desc = $sp_desc{"$db:$acc"};
# $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc.
# next LINE;
# }
# next LINE;
if ($prevdb eq 'SWISSPROT') {
next LINE; # nothing to change
}
if ($db eq 'SWISSPROT') {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc.
next LINE;
}
if ($db eq 'SPTREMBL' && $prevdb eq $db ) {
$desc = $sp_desc{"$db:$acc"};
if ( &compare_desc($prev_desc, $desc) < 0 ) {
# new desc is better
# warn "new better: $desc (old was: $prev_desc)\n";
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ];
next LINE;
} else {
# warn "old better: $prev_desc (new is: $desc)\n";
next LINE;
}
die "should not reach this point: only know SWISS-PROT and SPTREMBL";
}
} # while <MAP>
} else {
$desc = $sp_desc{"$db:$acc"};
$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ];
}
}
# now dump the stuff to stdout.
foreach $ensg ( keys %gene_desc ) {
my ($db,$desc,$acc) = @{$gene_desc{$ensg}};
### final cleanup
### get rid of the Rik mess:
$_ = $desc;
if (s/[0-9A-Z]{10}Rik protein[ \.]//g) {
warn "throwing away: $desc\n";
}
s/^\s*\(Fragment\)\.?\s*$//g;
s/^\s*\(\s*\)\s*$//g;
### add more as appropriate
print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc]\n" if $_; # =~ /[a-z]/;???
foreach my $ensg ( keys %gene_desc ) {
my ($db,$desc,$acc,$qy_percid,$tg_percid) = @{$gene_desc{$ensg}};
### final cleanup
### get rid of the Rik mess:
$_ = $desc;
if (s/[0-9A-Z]{10}Rik protein[ \.]//g) {
warn "throwing away: $desc\n";
}
s/^\s*\(Fragment\)\.?\s*$//g;
s/^\s*\(\s*\)\s*$//g;
### add more as appropriate
# print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc,%qy:$qy_percid,\%tg:$tg_percid]\n" if $_; # =~ /[a-z]/;???
print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc]\n" if $_; # =~ /[a-z]/;???
}
#### following taken from ensembl-external/scripts/family-input.pl
......
#!/usr/local/bin/perl
#!/usr/local/bin/perl -w
# $Header$
# Create a file dbname\taccno\tdescription from swissprot flat file.
# Create a file dbname\taccno\tdescription from SWISSPROT/SPTrEMBL/RefSeq flat file.
# Used by gene-descriptions.pl
# Usage: sp-descriptions.pl < swiss-prot-flatfile > sp-descriptions.dat
# Usage: sp-descriptions.pl < swissprot-flatfile [RefSeq-flatfile] > sp-descriptions.dat
use strict;
my $swissprot_db_name = "SWISSPROT";
my $sptrembl_db_name = "SPTREMBL";
my $refseq_db_name = "RefSeq";
my $db;
my $acc;
my $desc="";
while( <> ) {
if ( /^ID/ ) {
if (/PRELIMINARY;/) {
$db='SPTREMBL';
} elsif (/STANDARD;/) {
$db='SWISS-PROT';
} else {
chomp;
die "can't recognize: $_";
}
my $desc = "";
my %sp_desc;
while (defined (my $line = <>)) {
if ($line =~ /^ID\s{3}.*$/o) {
if ($line =~ /^ID\s{3}.*\s+PRELIMINARY;\s+.*$/o) {
$db = $sptrembl_db_name;
} elsif ($line =~ /^ID\s{3}.*\s+STANDARD;\s+.*$/o) {
$db = $swissprot_db_name;
} else {
chomp $line;
warn "\nCould not recognize line : \"$line\"\nCheck the input file format.\n\n";
exit 1;
}
}
if ($line =~ /^LOCUS\s+\S+\s+.*$/o) {
$db = $refseq_db_name;
}
if ($db eq $swissprot_db_name ||
$db eq $sptrembl_db_name) {
if ($line =~ /^AC\s{3}(\S+);.*$/o &&
! defined $acc) {
$acc = $1;
}
if ($line =~ /^DE\s{3}(\S.*\S)$/o) {
$desc .= " " unless ($desc eq "");
$desc .= uc $1;
}
if ( /^AC\s+(\S+)/ ) {
$acc = $1;
$acc =~ s/;$//g;
if ($line =~ /^\/\/$/o) {
$desc =~ s/\{.*\}//g;
$sp_desc{"$db\t$acc"} = $desc;
$db = undef;
$acc = undef;
$desc = "";
}
} elsif ($db eq $refseq_db_name) {
if ( /^DE\s+(\S.*\S)/) {
$desc .= $1;
if ($line =~ /^DEFINITION\s+(\S.*\S)$/o) {
$desc .= uc $1;
while (defined ($line = <>)) {
last if ($line =~ /^ACCESSION\s+(\S+)\s*.*$/o);
if ($line =~ /^\s+(\S.*\S)$/o) {
$desc .= " ".uc $1;
}
}
}
if ( m://: ) {
$desc =~ s/\{.*\}//g;
$sp_desc{"$db\t$acc"} = $desc;
$acc = undef;
$desc = "";
if ($line =~ /^ACCESSION\s+(\S+)\s*.*$/o) {
$acc = $1;
}
if ($line =~ /^\/\/$/o) {
$desc =~ s/\s*\[.*\]//g;
$sp_desc{"$db\t$acc"} = $desc;
$db = undef;
$acc = undef;
$desc = "";
}
}
}
foreach $acc ( keys %sp_desc ) {
$desc = $sp_desc{$acc};
$desc =~ s/\n/\\n/g;
print "$acc\t$desc\n";
foreach my $db_acc (keys %sp_desc) {
my $desc = $sp_desc{$db_acc};
print "$db_acc\t$desc\n";
}
exit;
exit 0;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment