use strict;
=head1 get_xrefs
=head2 Description
This script takes the post-processed pmatch output and a file which contains the links of each known gene to other databases (e.g. SP to HUGO or EMBL) and puts them back together in a format suitable for the DBlink tables.
=head2 Options
-mapping: Name of the file corresponding to postprocessed pmatch
-xrefs: Name of the file linking known genes to other DB
-dbmap: File giving for each known gene its DB
-refseq: If RefSeq accession numbers are used, file which stores for each NP its corresponding NM
-output: Name of the output file
=head2 Contact
use Getopt::Long;
# Main body of get_xrefs: load three lookup tables (accession => database,
# RefSeq protein => RefSeq DNA accession, known gene => linked "db:id" pairs)
# and then write, for every PRIMARY mapping line, the known-gene cross
# reference followed by all of its linked cross references.
# NOTE(review): the closing braces of the blocks below are not visible in this
# section of the file -- confirm block boundaries against the full script.
# Command-line option values (see the Options section of the POD).
# NOTE(review): no GetOptions call is visible in this section -- confirm these
# variables are populated before the open() calls below.
my ($mapping,$xrefs,$dbmap,$refseq,$out);
# External accession number => database name, filled from DBMAP.
my %map;
# Known-gene accession => array ref, initialised in the XREF loop and read as
# a list of "db:id" strings in the MAP loop.
my %hash;
# RefSeq protein accession => RefSeq DNA accession, filled from REFSEQ.
my %ref_map;
# Open the input files; each open dies naming the file it could not open.
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (XREF,"$xrefs") || die "Can't open file $xrefs\n";
open (MAP,"$mapping") || die "Can't open file $mapping\n";
# The RefSeq file is only opened when a file name was supplied.
if ($refseq) {
open (REFSEQ,"$refseq") || die "Can't open file $refseq\n";
open (OUT,">$out") || die "Can't open file $out\n";
while (<DBMAP>) {
#Store in a hash the corresponding database for each external accession number. The information comes from an already processed file with one tab-separated "accession, database" pair per line, following this format:
#P31946 SP
# NOTE(review): no chomp is visible in this loop, so $mapdb would keep the
# trailing newline of the line -- confirm against the full script.
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
#Read the file by genbank entries (separated by //)
$/ = "\/\/\n";
while (<REFSEQ>) {
#For each entry, store for the NP (refseq protein accession number) its corresponding NM (DNA accession number)
# The protein accession is the first token after ACCESSION; the DNA accession
# is the word following "DBSOURCE REFSEQ: accession".
my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
my ($dna_ac) = $_ =~ /DBSOURCE REFSEQ: accession\s+(\w+)/;
$ref_map{$prot_ac} = $dna_ac;
#Put back the default (new line) for reading file
$/ = "\n";
while (<XREF>) {
# Each line is tab-separated: database of the known gene, known-gene
# accession, linked database, linked identifier, e.g.:
#SP P31946 EMBL X57346
my ($xrdb,$xrac,$db,$id) = split (/\t/,$_);
# Linked database and identifier joined as "db:id"; the MAP loop splits such
# strings again on ":".
my $both = "$db:$id";
# First time this accession is seen: start with an empty list.
# NOTE(review): the statement that appends $both to this list is not visible
# in this section -- confirm against the full script.
if( !defined $hash{$xrac} ) {
$hash{$xrac} = [];
while (<MAP>) {
# Each line is tab-separated: known accession, Ensembl peptide, percentage,
# tag, e.g.:
#P01111 COBP00000000001 100 PRIMARY
my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
# Lines tagged PRIMARY are the ones handled here.
if ($tag eq "PRIMARY") {
#This is a hack and another solution will have to be found: if the external known gene is a refseq protein accession number, get back the equivalent refseq DNA accession number
if ($xr =~ /^NP_\d+/) {
$xr = $ref_map{$xr};
#Print the known gene AC and its database
print OUT "$ens\t$map{$xr}\t$xr\n";
#Print all of the external databases it links to (eg: HUGO)
foreach my $both (@{$hash{$xr}}){
# NOTE(review): $a and $b are the package globals reserved for sort, which is
# why they can be assigned without "my" under strict; lexical variables would
# be safer.
($a,$b) = split(/:/,$both);
print OUT "$ens\t$a\t$b\n";
......@@ -13,6 +13,10 @@ NB: All of the intermediary files are written.
-sp : SP, SPTREMBL fasta file
-refseq: Refseq peptide fasta file
=head2 Contacts
use Getopt::Long;
......@@ -68,17 +72,18 @@ sub postprocesspmatch {
my ($len,$id,$start,$end,$perc,$query,$qst,$qend,$qperc) = split;
if ($db eq $refseq) {
#Get the refseq ac (NP_\d+)
#Get only the refseq ac (NP_\d+)
($query) = $query =~ /\w+\|\d+\|\w+\|(\w+)/;
my $uniq = "$id:$query";
#Add the percentage of similarity for the Ensembl peptide for a single match
#There is a bug at this step, some similarities can be over 100% !!! This problem may be solved by changing pmatch source code
$hash1{$uniq} += $perc;
#Write out the processed data
foreach my $key ( keys %hash1 ) {
($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n";
......@@ -88,6 +93,7 @@ sub postprocesspmatch {
sub finalprocess {
#This final subroutine will use the postprocessed pmatch file and get back the best Ensembl match (labelled as PRIMARY) for a given external known protein.
my ($db) = @_;
if ($db eq $sp) {
......@@ -108,10 +114,13 @@ sub finalprocess {
#if ($perc > 100) {
# print "$ens\t$known\t$perc\n";
if( !defined $hash2{$known} ) {
$hash2{$known} = [];
#Each single external protein corresponds to an array of objects holding the name and the percentage of similarity of each Ensembl peptide matching the known external protein.
my $p= NamePerc->new;
......@@ -119,11 +128,11 @@ sub finalprocess {
foreach my $know ( keys %hash2 ) {
my @array = @{$hash2{$know}};
@array = sort { $b->perc <=> $a->perc } @array;
#The Ensembl match to the known protein is labelled as PRIMARY and will be used later for the mapping
my $top = shift @array;
print OUT "$know\t",$top->name,"\t",$top->perc,"\tPRIMARY\n";
......@@ -132,13 +141,15 @@ sub finalprocess {
die "Not good....";
#If 20 or more other Ensembl peptides (besides the PRIMARY one) match a single known protein, these Ensembl peptides are labelled as REPEAT
if (scalar(@array) >= 20) {
foreach my $repeat (@array) {
print OUT "$know\t",$repeat->name,"\t",$repeat->perc,"\tREPEAT\n";
#If there are fewer than 20, each one is labelled as DUPLICATE when its percentage of identity is within 1 of the PRIMARY's, and as PSEUDO otherwise. DUPLICATEs can also be used for the mapping
if (scalar(@array) < 20) {
foreach my $duplicate (@array) {
if( $duplicate->perc+1 >= $top->perc ) {
......@@ -154,6 +165,7 @@ sub finalprocess {
close (OUT);
#Set of objects to deal with the script
package NamePerc;
