get_xrefs.pl 4.43 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
use strict;

=head1 get_xrefs

=head2 Description
This script take the post processed pmatch output (see process_pmatch.pl) and a file which contains the links of each known gene to other databases (eg: SP to hugo or EMBL) and put them back together in a format suitable for the DBlink tables.

=head2 Options
-mapping:  Name of the file corresponding to postprocessed pmatch
-xrefs: Name of the file linking known genes to other DB
-dbmap: File giving for each known gene its DB
-refseq: If refseq ac is used, file which store for each NP its corresponding NM
-output: Name of the output file 

=head2 Contact
mongin@ebi.ac.uk
birney@ebi.ac.uk

=cut


use Getopt::Long;

my ($mapping,$xrefs,$dbmap,$refseq,$out);

my %map;
my %hash;
my %ref_map;
29 30
my %ens2embl;
my %sp2embl;
Emmanuel Mongin's avatar
Emmanuel Mongin committed
31
my %embl_clone;
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49

&GetOptions(
            
            'mapping:s'=>\$mapping,
            'xrefs:s'=>\$xrefs,
	    'dbmap:s'=>\$dbmap,
	    'refseq:s'=>\$refseq,
	    'output:s'=>\$out
            );

open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n"; 
open (XREF,"$xrefs") || die "Can't open file $xrefs\n";
open (MAP,"$mapping") || die "Can't open file $mapping\n";
if ($refseq) {
    open (REFSEQ,"$refseq") || die "Can't open file $refseq\n";
}
open (OUT,">$out") || die "Can't open file $out\n";

50
#open (CLONE,"clones.txt") || die "Can't open file\n";
Emmanuel Mongin's avatar
Emmanuel Mongin committed
51 52 53 54 55 56 57 58 59 60

#Put in a hash all of the embl clones used by Ensembl
while (<CLONE>) {
    chomp;
    my ($embl_ac,$id) = split(/\t/,$_);
    print "$embl_ac\n";
    $embl_clone{$embl_ac}=1;
}


61 62
while (<DBMAP>) {
    chomp;
63
#Get put in a hash the corresponding database for an external accession number.Get the infos from a file already processed following this format:
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
#P31946  SP     
     my ($mapac,$mapdb) = split(/\t/,$_);
     $map{$mapac} = $mapdb;
}

#Read the file by genbank entries (separated by //) 
$/ = "\/\/\n";
while (<REFSEQ>) {
#This subroutine store for each NP (refseq protein accession number) its corresponding NM (DNA accession number)
    my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
    my ($dna_ac) = $_ =~ /DBSOURCE    REFSEQ: accession\s+(\w+)/;

    $ref_map{$prot_ac} = $dna_ac;
}

#Put back the default (new line) for reading file
$/ = "\n";   
while (<XREF>) {
    chomp;

#SP      P31946  EMBL    X57346 
    my ($xrdb,$xrac,$db,$id) = split (/\t/,$_);
86 87
    
    if ($xrdb ne "ENSEMBL") {
88
	my $both = "$db&$id";
89
  
90 91 92
	if( !defined $hash{$xrac} ) {
	    $hash{$xrac} = [];
	}
93
    
94 95
	push(@{$hash{$xrac}},$both);
    }
Emmanuel Mongin's avatar
Emmanuel Mongin committed
96 97

#Get the embl clone corresponding for each Ensembl peptides
98
    #if (($xrdb eq "ENSEMBL")) {
Emmanuel Mongin's avatar
Emmanuel Mongin committed
99
	
100 101
	#push(@{$ens2embl{$xrac}},$id);
    #}
102

Emmanuel Mongin's avatar
Emmanuel Mongin committed
103
#Get the embl ACs for each SP and SPTREMBL proteins
104
    #if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) {
Emmanuel Mongin's avatar
Emmanuel Mongin committed
105
	 #print "$id\n";
106
	#if ($embl_clone{$id}) {
Emmanuel Mongin's avatar
Emmanuel Mongin committed
107
	    
108 109 110
	 #   push(@{$sp2embl{$xrac}},$id);
	#}
    #}
111 112 113 114
}

while (<MAP>) {
    chomp;
115
    
116 117
#P01111  COBP00000000001 100     PRIMARY  
    my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
118
    
Emmanuel Mongin's avatar
Emmanuel Mongin committed
119
#Hack to be taken away
120 121 122
    #my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/;
    #my $enst = $en1."T".$en2;
    
Emmanuel Mongin's avatar
Emmanuel Mongin committed
123 124 125
#For now take primary or duplicates and only matches which correspond to more than 25% of the external peptide. These criteria will have to be lowered up.
    if ((($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) && ($perc >= 25)) {
	
126 127 128 129
#Its a hack an another solution will have to be found, if the external known gene is a refseq protein accession number get back the equivalent refseq DNA accession number 
	if ($xr =~ /^NP_\d+/) {
	    $xr = $ref_map{$xr};
	}
130
	
Emmanuel Mongin's avatar
Emmanuel Mongin committed
131
#If the external peptide correspond to an embl clone, we will take the match only if the Ensembl peptide correspond to the same clone (at least one exon)
132 133 134 135 136 137 138 139 140 141 142
	#if ($sp2embl{$xr}) {
	#    print "$xr\t".@{$sp2embl{$xr}}."\n";
	#    my $tot_sp_embl;
	#    my $tot_ens_embl;
	#    my @sp_embl = @{$sp2embl{$xr}};
	
	#    foreach my $sing1 (@sp_embl) {
	#	#print "$sing1\n";
	#	$tot_sp_embl .= $sing1;
	
	#    }
Emmanuel Mongin's avatar
Emmanuel Mongin committed
143
	  	  	    
144 145
	#    if ($ens2embl{$enst}) {
	#	my @ens_embl = @{$ens2embl{$enst}};
Emmanuel Mongin's avatar
Emmanuel Mongin committed
146
	    
147 148 149 150 151 152 153 154 155 156 157 158
	#	foreach my $sing2 (@sp_embl) {
	#	    $tot_ens_embl .= $sing2;
	#	}
	#	if ($tot_ens_embl =~ $tot_sp_embl) {
	#	    print  OUT "$ens\t$map{$xr}\t$xr\n";
	#	}
	#	else {
	#	    #print "no\n";
	#	}
	#    }
	#}
	#else {
159
#Print the know gene AC and its database
160 161
	#print OUT "$ens\t$map{$xr}\t$xr\n";
	#}
162
	print OUT "$ens\t$map{$xr}\t$xr\n";
163 164 165 166 167 168
	
	#Print all of the external database it links to (eg: HUGO)
	    foreach my $both (@{$hash{$xr}}){
		($a,$b) = split(/&/,$both);
		print OUT "$ens\t$a\t$b\n";
	    }
169 170 171 172
    }
}