get_hugo_mapping.pl 4.09 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
use strict;

=head1 get_hugo_mapping

=head2 Description

This script reads a set of files produced by HUGO and builds a file which will be used to get the DBlink table. The format of the file is the following:

SP      P27348  HUGO    YWHAQ 

(known database\t known ac\t hugo or alias\t hugo ac)

=head2 Options

The different options only deal with file names

-nomeid: Hugo file (http://www.gene.ucl.ac.uk/public-files/nomen/nomeids.txt)

-ens1: Hugo file (http://www.gene.ucl.ac.uk/public-files/nomen/ens1.txt)

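-ens2: Hugo file (http://www.gene.ucl.ac.uk/public-files/nomen/ens2.txt)

-ens4: Hugo file giving, for each HUGO id, its aliases and withdrawn symbols

-ens5: Hugo file giving, for each HUGO id, its EC number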

-output: Filename where the output should be written

-dbmap: File which gives the corresponding database for each known protein

=cut

use Getopt::Long;

31 32 33
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_hugo_mapping.pl -ens1 ../secondary/ens1.txt -ens2 ../secondary/ens2.txt -ens4 ../secondary/ens4.txt -ens5 ../secondary/ens5.txt -out hugo.map -dbmap mapdb.map

# File names taken from the command line.
my ($ens1,$ens2,$ens4,$ens5,$out,$dbmap);

# Accession -> database name, loaded from the -dbmap file.
my %map;

# HUGO id -> accession, filled while the -ens1 file is read and looked up
# again by the passes over the other HUGO files.
my %hugo_sp;
my %hugo_refseq;

# HUGO id -> HUGO symbol, filled while the -ens2 file is read.
my %en2;

# Declared but not used by the code visible in this file.
my %hugohash;

# GetOptions reports unknown or malformed options on STDERR and returns
# false; stop there instead of running with a half-parsed command line.
GetOptions(
    'ens1:s'   => \$ens1,
    'ens2:s'   => \$ens2,
    'ens4:s'   => \$ens4,
    'ens5:s'   => \$ens5,
    'dbmap:s'  => \$dbmap,
    'output:s' => \$out,
) or die "Usage: $0 -ens1 FILE -ens2 FILE -ens4 FILE -ens5 FILE -dbmap FILE -output FILE\n";



# Every file name is mandatory. Check them up front: a three-argument open
# on an undefined name would quietly open an anonymous temporary file
# instead of failing.
foreach my $option (
    [ ens1   => $ens1 ],
    [ ens2   => $ens2 ],
    [ ens4   => $ens4 ],
    [ ens5   => $ens5 ],
    [ dbmap  => $dbmap ],
    [ output => $out ],
) {
    my ($name, $file) = @{$option};
    die "Missing file name for option -$name\n"
        unless defined $file && length $file;
}

# Three-argument open keeps characters such as '>' or '|' in a file name
# from being read as an open mode; $! adds the reason for a failure.
# The bareword handle names are kept because the rest of the script reads
# from and prints to them.
open(ENS1,  '<', $ens1)  or die "Can't open file $ens1: $!\n";
open(ENS2,  '<', $ens2)  or die "Can't open file $ens2: $!\n";
open(ENS4,  '<', $ens4)  or die "Can't open file $ens4: $!\n";
open(ENS5,  '<', $ens5)  or die "Can't open file $ens5: $!\n";
open(DBMAP, '<', $dbmap) or die "Can't open file $dbmap: $!\n";
open(OUT,   '>', $out)   or die "Can't open output file $out: $!\n";

# hugo.err is created (or truncated) in the current directory.
open(ERROR, '>', 'hugo.err') or die "Can't open output file hugo.err: $!\n";
59

60 61 62 63 64
# Load the accession -> database lookup: every DBMAP line is split on tabs
# and its first field becomes the %map key for its second field.
while (defined(my $record = <DBMAP>)) {
    chomp $record;
    my ($accession, $database) = split /\t/, $record;
    $map{$accession} = $database;
}
65 66 67

# Read the ens1 file. Each line is split on tabs into ($hgnc, $sp, $refseq).
# For every accession present on the line, a HUGOID row is written to OUT
# and the accession is remembered under its HUGO id for the later passes.
while (<ENS1>) {
    chomp;

    # Get rid of the annoying carriage return!
    s/\r//g;

    # Get hugo id
    my ($hgnc, $sp, $refseq) = split(/\t/, $_);

    if ($sp) {
        print OUT "$map{$sp}\t$sp\tHUGOID\t$hgnc\n";
        $hugo_sp{$hgnc} = $sp;
    }

    if ($refseq) {
        # The database is looked up by the accession itself; the literal
        # key 'refseq' is never present in %map and left the column empty.
        print OUT "$map{$refseq}\t$refseq\tHUGOID\t$hgnc\n";
        $hugo_refseq{$hgnc} = $refseq;
    }
}

# Read the ens2 file. Each line is split on tabs into ($hgnc1, $hugo).
# A HUGOSYMBOL row is written for every accession recorded for that HUGO id
# during the ens1 pass, and the symbol is stored in %en2.
while (<ENS2>) {
    chomp;

    # Strip carriage returns, as is done for the ens1 file.
    s/\r//g;

    # Get hugo symbol
    my ($hgnc1, $hugo) = split(/\t/, $_);

    if ($hugo_sp{$hgnc1}) {
        print OUT "$map{$hugo_sp{$hgnc1}}\t$hugo_sp{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
    }

    if ($hugo_refseq{$hgnc1}) {
        print OUT "$map{$hugo_refseq{$hgnc1}}\t$hugo_refseq{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
    }

    # %en2 holds one symbol per HUGO id. The former array-ref initialisation
    # was overwritten by this assignment straight away, so it is dropped.
    $en2{$hgnc1} = $hugo;
}
109

110
# Get hugo aliases given a hugo primary accession number.
# Each ens4 line is split on tabs into ($hgnc2, $symbol, $alias, $withdrawn);
# $alias and $withdrawn are ", "-separated lists. For every accession
# recorded for the HUGO id during the ens1 pass, one HUGOALIAS row per alias
# and one HUGOWITHDRAWN row per withdrawn symbol is written to OUT.
while (<ENS4>) {
    chomp;

    # Strip carriage returns as for the ens1/ens2 files, so that the last
    # field on a line does not carry a stray "\r" into the output.
    s/\r//g;

    my ($hgnc2, $symbol, $alias, $withdrawn) = split(/\t/, $_);

    my @aliases = defined $alias ? split(/, /, $alias) : ();

    # split drops trailing empty fields, so $withdrawn is undefined whenever
    # the column is empty; test for that before matching.
    my @withdrawns =
        (defined $withdrawn && $withdrawn =~ /\S/)
        ? split(/, /, $withdrawn)
        : ();

    # Copy the lookups into scalars first: naming the hash elements directly
    # in the foreach list could create the keys as a side effect.
    my $sp_accession     = $hugo_sp{$hgnc2};
    my $refseq_accession = $hugo_refseq{$hgnc2};

    # Each accession is reported under its own name; the RefSeq rows used to
    # repeat the SwissProt accession. Row order is unchanged: SwissProt
    # aliases, SwissProt withdrawn, RefSeq aliases, RefSeq withdrawn.
    foreach my $accession ($sp_accession, $refseq_accession) {
        next unless defined $accession;

        foreach my $one_alias (@aliases) {
            print OUT "$map{$accession}\t$accession\tHUGOALIAS\t$one_alias\n";
        }

        foreach my $one_withdrawn (@withdrawns) {
            print OUT "$map{$accession}\t$accession\tHUGOWITHDRAWN\t$one_withdrawn\n";
        }
    }
}

145 146 147 148 149 150 151 152
# Use Hugo mapping to get EC numbers.
# Each ens5 line is split on tabs into ($hgnc3, $symbol1, $name, $ec, $sp).
# For every accession recorded for the HUGO id during the ens1 pass, an EC
# row is written to OUT.
while (<ENS5>) {
    chomp;

    # Strip carriage returns, as is done for the ens1/ens2 files.
    s/\r//g;

    my ($hgnc3, $symbol1, $name, $ec, $sp) = split(/\t/, $_);

    # An empty EC column is still a defined (empty) string when a later
    # column is filled in; skip it rather than emit a row without a value.
    next unless defined $ec && $ec =~ /\S/;

    # Copy the lookups into scalars first: naming the hash elements directly
    # in the foreach list could create the keys as a side effect.
    my $sp_accession     = $hugo_sp{$hgnc3};
    my $refseq_accession = $hugo_refseq{$hgnc3};

    # Each accession is reported under its own name; the RefSeq row used to
    # repeat the SwissProt accession.
    foreach my $accession ($sp_accession, $refseq_accession) {
        next unless defined $accession;
        print OUT "$map{$accession}\t$accession\tEC\t$ec\n";
    }
}
158 159 160