From 9be38159e9e93e18f5ba3c4f447ed81ccd609fe0 Mon Sep 17 00:00:00 2001
From: Abel Ureta-Vidal <abel@sanger.ac.uk>
Date: Wed, 27 Feb 2002 09:37:03 +0000
Subject: [PATCH] Introducing good programming practices :) such as perl -w or
 use strict. Then identified bug in Accession Number parsing, when in more
 than one line for an entry, script was getting the first AC of the last
 parsed line. But we want the first AC of the first parsed line. Fixed now.
 Added possibility of parsing

---
 scripts/gene-descriptions.pl | 140 ++++++++++++++++++++---------------
 scripts/sp-descriptions.pl   | 105 ++++++++++++++++++--------
 2 files changed, 158 insertions(+), 87 deletions(-)

diff --git a/scripts/gene-descriptions.pl b/scripts/gene-descriptions.pl
index 3e1fed1bd8..a7a14d9535 100755
--- a/scripts/gene-descriptions.pl
+++ b/scripts/gene-descriptions.pl
@@ -1,4 +1,4 @@
-#!/usr/local/bin/perl
+#!/usr/local/bin/perl -w
 # $Header$
 # 
 # Script to create a tab-separated file (to be loaded into the
@@ -11,82 +11,106 @@
 
 ## list of regexps tested, in order of increasing preference, when
 ## deciding which description to use (used by sub compare_desc):
-my @word_order = 
-   qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);
 
-$Usage = "Usage: $0 sp-descriptions.dat mapping.dat > gene-descriptions.dat\n";
+use strict;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+
+my @word_order = qw(unknown hypothetical putative novel probable [0-9]{3} kDa fragment cdna protein);
+
+my $Usage = "Usage: $0 sp-descriptions.dat mapping.dat > gene-descriptions.dat\n";
 
 die $Usage if @ARGV == 0;
 
-$spdesc = shift;
-$map = shift;
+my $spdesc = shift;
+#my $map = shift;
 
-open(SPDESC, $spdesc) || die "$spdesc:$!";
+open SPDESC,$spdesc || die "$spdesc:$!";
 
-undef %sp_desc;
-while(<SPDESC>) {
-    chomp;
-    ($db, $acc, $desc)=split(/\t/);
+my %sp_desc;
 
+while (<SPDESC>) {
+    chomp;
+    my ($db, $acc, $desc) = split(/\t/);
     $sp_desc{"$db:$acc"}=$desc;
 }
 
-close(SPDESC) || die "$!";
+close SPDESC || die "$!";
 
-open(MAP, $map) || die "$map:$!";
+my $host = 'ecs1d.sanger.ac.uk';
+my $dbname = 'homo_sapiens_core_4_28';
+my $dbuser = 'ensro';
 
-undef %gene_desc;
+my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor (-host => $host,
+					     -user => $dbuser,
+					     -dbname => $dbname );
 
-LINE:
-while (<MAP>) {                       
-    chomp;
-    my ($ensp, $ensg, $db, $acc)=split(/\t/);
+my $sth = $db->prepare("SELECT tsc.translation_id, tsc.gene_id, xdb.db_name, x.dbprimary_id, ix.query_identity, ix.target_identity FROM transcript tsc, objectXref ox, Xref x, externalDB xdb, identityXref ix WHERE tsc.translation_id = ox.ensembl_id AND ox.xrefId = x.xrefId AND x.externalDBId = xdb.externalDBId AND xdb.db_name in ('SWISSPROT', 'SPTREMBL') AND ox.objectxrefId = ix.objectxrefId order by tsc.gene_id asc, xdb.db_name desc, x.dbprimary_id asc");
 
-    if ( defined($gene_desc{$ensg}) ) {
-        ($prevdb, $prev_desc)  = @{$gene_desc{$ensg}};
-        if ($prevdb  eq 'SWISS-PROT') {   
-            next LINE;                  # nothing to change
-        }
-        if ($db  eq 'SWISS-PROT') {   
-            $desc = $sp_desc{"$db:$acc"};
-            $gene_desc{$ensg} = [ $db, $desc, $acc ]; # kick out the SPTREMBL desc.
-            next LINE;
-        }
+unless ($sth->execute()) {
+  $db->throw("Failed execution of a select query");
+}
 
-        if ($db  eq 'SPTREMBL' &&  $prevdb eq $db ) {   
-            $desc = $sp_desc{"$db:$acc"}; 
-            if ( &compare_desc($prev_desc, $desc) < 0 ) {
-                # new desc is better
-                # warn "new better: $desc (old was: $prev_desc)\n";
-                $gene_desc{$ensg} = [ $db, $desc, $acc ];
-                next LINE;
-            } else {
-                # warn "old better: $prev_desc (new is: $desc)\n";
-                next LINE;
-            }
-            die "should not reach this point: only know SWISS-PROT and SPTREMBL";
-        }
-    } else {
-        $desc = $sp_desc{"$db:$acc"};
-        $gene_desc{$ensg} = [ $db, $desc, $acc ];
+my %gene_desc;
+
+LINE:
+
+while (my ($ensp, $ensg, $db, $acc, $qy_percid, $tg_percid) = $sth->fetchrow_array()) {
+
+  my $desc;
+  if ( defined($gene_desc{$ensg}) ) {
+    my ($prevdb, $prev_desc, $prev_acc, $prev_qy_percid, $prev_tg_percid)  = @{$gene_desc{$ensg}}; 
+#    if ($prev_qy_percid < $qy_percid) { 
+#      $desc = $sp_desc{"$db:$acc"};
+#      $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc.
+#      next LINE;
+#    }
+#    next LINE;
+    if ($prevdb  eq 'SWISSPROT') {   
+      next LINE;                  # nothing to change
+    }
+    if ($db  eq 'SWISSPROT') { 
+      $desc = $sp_desc{"$db:$acc"};
+      $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ]; # kick out the SPTREMBL desc.
+      next LINE;
+    }
+    
+    if ($db  eq 'SPTREMBL' &&  $prevdb eq $db ) {   
+      $desc = $sp_desc{"$db:$acc"}; 
+      if ( &compare_desc($prev_desc, $desc) < 0 ) {
+	# new desc is better
+	# warn "new better: $desc (old was: $prev_desc)\n";
+	$gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ];
+	next LINE;
+      } else {
+	# warn "old better: $prev_desc (new is: $desc)\n";
+	next LINE;
+      }
+      die "should not reach this point: only know SWISS-PROT and SPTREMBL";
     }
-}                                       # while <MAP>
+  } else {
+    $desc = $sp_desc{"$db:$acc"};
+    $gene_desc{$ensg} = [ $db, $desc, $acc, $qy_percid, $tg_percid ];
+  }
+}
 
 #  now dump the stuff to stdout.
-foreach $ensg ( keys %gene_desc )  { 
-    my ($db,$desc,$acc)   = @{$gene_desc{$ensg}};
-
-    ### final cleanup 
-    ### get rid of the Rik mess:
-    $_ = $desc;
-    if (s/[0-9A-Z]{10}Rik protein[ \.]//g) {
-        warn "throwing away: $desc\n";
-    }
-    s/^\s*\(Fragment\)\.?\s*$//g;
-    s/^\s*\(\s*\)\s*$//g;
-    ### add more as appropriate
 
-    print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc]\n" if $_; #  =~ /[a-z]/;???
+foreach my $ensg ( keys %gene_desc )  { 
+
+  my ($db,$desc,$acc,$qy_percid,$tg_percid) = @{$gene_desc{$ensg}};
+
+  ### final cleanup 
+  ### get rid of the Rik mess:
+  $_ = $desc;
+  if (s/[0-9A-Z]{10}Rik protein[ \.]//g) {
+    warn "throwing away: $desc\n";
+  }
+  s/^\s*\(Fragment\)\.?\s*$//g;
+  s/^\s*\(\s*\)\s*$//g;
+  ### add more as appropriate
+
+#  print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc,%qy:$qy_percid,\%tg:$tg_percid]\n" if $_; #  =~ /[a-z]/;???
+  print STDOUT "$ensg\t $_ [Source:$db;Acc:$acc]\n" if $_; #  =~ /[a-z]/;???
 }
 
 #### following taken from ensembl-external/scripts/family-input.pl
diff --git a/scripts/sp-descriptions.pl b/scripts/sp-descriptions.pl
index 36339bb713..43a9511d70 100755
--- a/scripts/sp-descriptions.pl
+++ b/scripts/sp-descriptions.pl
@@ -1,45 +1,92 @@
-#!/usr/local/bin/perl
+#!/usr/local/bin/perl -w
 # $Header$
 
-# Create a file dbname\taccno\tdescription from swissprot flat file.
+# Create a file dbname\taccno\tdescription from SWISSPROT/SPTrEMBL/RefSeq flat file.
 # Used by gene-descriptions.pl
 
-# Usage: sp-descriptions.pl < swiss-prot-flatfile  > sp-descriptions.dat
+# Usage: sp-descriptions.pl < swissprot-flatfile [RefSeq-flatfile] > sp-descriptions.dat
 
+use strict;
+
+my $swissprot_db_name = "SWISSPROT";
+my $sptrembl_db_name = "SPTREMBL";
+my $refseq_db_name = "RefSeq";
+
+my $db;
 my $acc; 
-my $desc="";
-while( <> ) {
-    if ( /^ID/ ) {
-        if (/PRELIMINARY;/) {
-            $db='SPTREMBL';
-        } elsif (/STANDARD;/) { 
-            $db='SWISS-PROT';
-        } else {
-            chomp;
-            die "can't recognize: $_";
-        }
+my $desc = "";
+my %sp_desc;
+
+while (defined (my $line = <>)) {
+
+  if ($line =~ /^ID\s{3}.*$/o) {
+    if ($line =~ /^ID\s{3}.*\s+PRELIMINARY;\s+.*$/o) {
+      $db = $sptrembl_db_name;
+    } elsif ($line =~ /^ID\s{3}.*\s+STANDARD;\s+.*$/o) { 
+      $db = $swissprot_db_name;
+    } else {
+      chomp $line;
+      warn "\nCould not recognize line : \"$line\"\nCheck the input file format.\n\n";
+      exit 1;
+    }
+  }
+
+  if ($line =~ /^LOCUS\s+\S+\s+.*$/o) {
+    $db = $refseq_db_name;
+  }
+  
+  if ($db eq $swissprot_db_name ||
+      $db eq $sptrembl_db_name) {
+    
+    if ($line =~ /^AC\s{3}(\S+);.*$/o &&
+       ! defined $acc)  {
+      $acc = $1;
+    }
+    
+    if ($line =~ /^DE\s{3}(\S.*\S)$/o) {
+      $desc .= " " unless ($desc eq "");
+      $desc .= uc $1;
     }
-    if ( /^AC\s+(\S+)/ )  {
-        $acc = $1;
-        $acc =~ s/;$//g;
+    
+    if ($line =~ /^\/\/$/o) {
+      $desc =~ s/\{.*\}//g; 
+      $sp_desc{"$db\t$acc"} = $desc; 
+      $db = undef;
+      $acc = undef; 
+      $desc = ""; 
     }
+    
+  } elsif ($db eq $refseq_db_name) {
 
-    if ( /^DE\s+(\S.*\S)/) {
-        $desc .= $1;
+    if ($line =~ /^DEFINITION\s+(\S.*\S)$/o) {
+      $desc .= uc $1;
+      while (defined ($line = <>)) {
+	last if ($line =~ /^ACCESSION\s+(\S+)\s*.*$/o);
+	if ($line =~ /^\s+(\S.*\S)$/o) {
+	  $desc .= " ".uc $1;
+	}
+      }
     }
 
-    if ( m://: ) {
-        $desc =~ s/\{.*\}//g; 
-        $sp_desc{"$db\t$acc"} = $desc; 
-        $acc = undef; 
-        $desc = ""; 
+    if ($line =~ /^ACCESSION\s+(\S+)\s*.*$/o)  {
+      $acc = $1;
     }
+
+    if ($line =~ /^\/\/$/o) {
+      $desc =~ s/\s*\[.*\]//g; 
+      $sp_desc{"$db\t$acc"} = $desc; 
+      $db = undef;
+      $acc = undef; 
+      $desc = ""; 
+    }
+  }
+
+  
 }
 
-foreach $acc ( keys %sp_desc )  {
-    $desc = $sp_desc{$acc};
-    $desc =~ s/\n/\\n/g;
-    print "$acc\t$desc\n";
+foreach my $db_acc (keys %sp_desc)  {
+  my $desc = $sp_desc{$db_acc};
+  print "$db_acc\t$desc\n";
 }
 
-exit;
+exit 0;
-- 
GitLab