get_xrefs.pl 2.72 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
use strict;

=head1 get_xrefs

=head2 Description
This script take the post processed pmatch output (see process_pmatch.pl) and a file which contains the links of each known gene to other databases (eg: SP to hugo or EMBL) and put them back together in a format suitable for the DBlink tables.

=head2 Options
-mapping:  Name of the file corresponding to postprocessed pmatch
-xrefs: Name of the file linking known genes to other DB
-dbmap: File giving for each known gene its DB
-refseq: If refseq ac is used, file which store for each NP its corresponding NM
-output: Name of the output file 

=head2 Contact
mongin@ebi.ac.uk
birney@ebi.ac.uk

=cut


use Getopt::Long;

my ($mapping,$xrefs,$dbmap,$refseq,$out);

my %map;
my %hash;
my %ref_map;

&GetOptions(
            
            'mapping:s'=>\$mapping,
            'xrefs:s'=>\$xrefs,
	    'dbmap:s'=>\$dbmap,
	    'refseq:s'=>\$refseq,
	    'output:s'=>\$out
            );

open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n"; 
open (XREF,"$xrefs") || die "Can't open file $xrefs\n";
open (MAP,"$mapping") || die "Can't open file $mapping\n";
if ($refseq) {
    open (REFSEQ,"$refseq") || die "Can't open file $refseq\n";
}
open (OUT,">$out") || die "Can't open file $out\n";

while (<DBMAP>) {
    chomp;
#Get put in a hash the corresponding database for an external accession number. Get the infos from a file already processed following this format:
#P31946  SP     
     my ($mapac,$mapdb) = split(/\t/,$_);
     $map{$mapac} = $mapdb;
}

#Read the file by genbank entries (separated by //) 
$/ = "\/\/\n";
while (<REFSEQ>) {
#This subroutine store for each NP (refseq protein accession number) its corresponding NM (DNA accession number)
    my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
    my ($dna_ac) = $_ =~ /DBSOURCE    REFSEQ: accession\s+(\w+)/;

    $ref_map{$prot_ac} = $dna_ac;
}

#Put back the default (new line) for reading file
$/ = "\n";   
while (<XREF>) {
    chomp;

#SP      P31946  EMBL    X57346 
    my ($xrdb,$xrac,$db,$id) = split (/\t/,$_);
    my $both = "$db:$id";
  
    if( !defined $hash{$xrac} ) {
	$hash{$xrac} = [];
    }
    
    push(@{$hash{$xrac}},$both);
}

while (<MAP>) {
    chomp;

#P01111  COBP00000000001 100     PRIMARY  
    my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
    if ($tag eq "PRIMARY") {

#Its a hack an another solution will have to be found, if the external known gene is a refseq protein accession number get back the equivalent refseq DNA accession number 
	if ($xr =~ /^NP_\d+/) {
	    $xr = $ref_map{$xr};
	}

#Print the know gene AC and its database
	print OUT "$ens\t$map{$xr}\t$xr\n";

#Print all of the external database it links to (eg: HUGO)
	foreach my $both (@{$hash{$xr}}){
	    ($a,$b) = split(/:/,$both);
	    print OUT "$ens\t$a\t$b\n";
	}
    }
}