Commit 58fc4a4b authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Mapping going on, many changes done...

parent 7671f083
...@@ -28,103 +28,141 @@ The different options only deal with file names ...@@ -28,103 +28,141 @@ The different options only deal with file names
use Getopt::Long; use Getopt::Long;
my ($nomeid,$ens1,$ens2,$out,$dbmap); #perl ../../../src/ensembl-live/misc-scripts/protein_match/get_hugo_mapping.pl -ens1 ../secondary/ens1.txt -ens2 ../secondary/ens2.txt -ens4 ../secondary/ens4.txt -ens5 ../secondary/ens5.txt -out hugo.map -dbmap mapdb.map
my ($ens1,$ens2,$ens4,$ens5,$out,$dbmap);
my %map; my %map;
my %en1; my %hugo_sp;
my %hugo_refseq;
my %en2; my %en2;
my %hugohash; my %hugohash;
&GetOptions( &GetOptions(
'nomeid:s'=>\$nomeid,
'ens1:s'=>\$ens1, 'ens1:s'=>\$ens1,
'ens2:s'=>\$ens2, 'ens2:s'=>\$ens2,
'output:s'=>\$out, 'ens4:s'=>\$ens4,
'dbmap:s'=>\$dbmap 'ens5:s'=>\$ens5,
'dbmap:s'=>\$dbmap,
'output:s'=>\$out
); );
open (ENS1,"$ens1") || die "Can't open file $ens1\n"; open (ENS1,"$ens1") || die "Can't open file $ens1\n";
open (ENS2,"$ens2") || die "Can't open file $ens2\n"; open (ENS2,"$ens2") || die "Can't open file $ens2\n";
open (NOME,"$nomeid") || die "Can't open file $nomeid\n"; open (ENS4,"$ens4") || die "Can't open file $ens4\n";
open (ENS5,"$ens5") || die "Can't open file $ens5\n";
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n"; open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (OUT,">$out") || die "Can't open output file $out\n"; open (OUT,">$out") || die "Can't open output file $out\n";
open (ERROR,">hugo.err") || die "Can't open output file hugo.err\n";
while (<DBMAP>) {
chomp;
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
}
while (<ENS1>) { while (<ENS1>) {
chomp; chomp;
#Get hugo id
#Get rid of the annoying carriage return! #Get rid of the annoying carriage return!
$_ =~ s/\r//g; $_ =~ s/\r//g;
my ($hgnc,$sp,$refseq) = split(/\t/,$_); my ($hgnc,$sp,$refseq) = split(/\t/,$_);
if ($sp) {
print OUT "$map{$sp}\t$sp\tHUGOID\t$hgnc\n";
}
if ($refseq) {
print OUT "$map{refseq}\t$refseq\tHUGOID\t$hgnc\n";
}
if ($sp) { if ($sp) {
$en1{$sp} = $hgnc; $hugo_sp{$hgnc} = $sp;
} }
if ($refseq) { if ($refseq) {
$en1{$refseq} = $hgnc; $hugo_refseq{$hgnc} = $refseq;
} }
} }
while (<ENS2>) { while (<ENS2>) {
chomp; chomp;
#Get hugo symbol
$_ =~ s/\r//g; $_ =~ s/\r//g;
my ($hgnc,$hugo) = split(/\t/,$_); my ($hgnc1,$hugo) = split(/\t/,$_);
if (!defined $en2{$hgnc}) { # if (!defined $hugo_sp{$hgnc1}) {
$en2{$hgnc} = []; # print ERROR "Can't map back $hugo_sp{$hgnc} (ENS2)\n";
# }
# if (!defined $hugo_refseq{$hgnc1}) {
# print ERROR "Can't map back $hugo_refseq{$hgnc} (ENS2)\n";
# }
if ($hugo_sp{$hgnc1}) {
print OUT "$map{$hugo_sp{$hgnc1}}\t$hugo_sp{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
} }
$en2{$hgnc} = $hugo;
}
if ($hugo_refseq{$hgnc1}) {
print OUT "$map{$hugo_refseq{$hgnc1}}\t$hugo_refseq{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
}
if (!defined $en2{$hgnc1}) {
$en2{$hgnc1} = [];
}
$en2{$hgnc1} = $hugo;
}
while (<NOME>) { while (<ENS4>) {
#Get hugo aliases given a hugo primary accession number. For each primary accession number, the aliases are put in a hash of array #Get hugo aliases given a hugo primary accession number. For each primary accession number, the aliases are put in a hash of array
chomp; chomp;
my @chunk = split (/\t/,$_); my ($hgnc2, $symbol, $alias, $withdrawn) = split (/\t/,$_);
if ((defined $hugo_sp{$hgnc2}) && (defined $alias)) {
my @aliases1 = split (/, /,$alias);
foreach my $aliase1 (@aliases1) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOALIAS\t$aliase1\n";
}
}
if ($chunk[8]) { if ((defined $hugo_sp{$hgnc2}) && ($withdrawn =~ /\S+/)) {
$hugohash{$chunk[1]} = $chunk[8]; my @withdrawns1 = split (/, /,$withdrawn);
foreach my $withdrawn1 (@withdrawns1) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOWITHDRAWN\t$withdrawn1\n";
}
} }
} if ((defined $hugo_refseq{$hgnc2}) && (defined $alias)) {
my @aliases2 = split (/, /,$alias);
while (<DBMAP>) { foreach my $aliase2 (@aliases2) {
chomp; print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOALIAS\t$aliase2\n";
my ($mapac,$mapdb) = split(/\t/,$_); }
my $hugo_ac = $en2{$en1{$mapac}}; }
if ($hugo_ac) {
if ((defined $hugo_refseq{$hgnc2}) && ($withdrawn =~ /\S+/)) {
#Print the HUGOs primary accession numbers my @withdrawns2 = split (/, /,$withdrawn);
print OUT "$mapdb\t$mapac\tHUGO\t$hugo_ac\n"; foreach my $withdrawn2 (@withdrawns2) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOWITHDRAWN\t$withdrawn2\n";
if ($hugohash{$hugo_ac}) {
my @syn = split (/, /,$hugohash{$hugo_ac});
foreach my $sol (@syn) {
#print the HUGOs aliases
print OUT "$mapdb\t$mapac\tHUGO\t$sol\n";
}
} }
#if (!defined $en2{$en1{$mapac}}) {
# print STDERR "$mapac\n";
#}
} }
} }
while (<ENS5>) {
#Use Hugo mapping to get EC numbers
chomp;
my ($hgnc3, $symbol1, $name, $ec, $sp) = split (/\t/,$_);
if ((defined $hugo_sp{$hgnc3}) && (defined $ec)) {
print OUT "$map{$hugo_sp{$hgnc3}}\t$hugo_sp{$hgnc3}\tEC\t$ec\n";
}
if ((defined $hugo_refseq{$hgnc3}) && (defined $ec)) {
print OUT "$map{$hugo_sp{$hgnc3}}\t$hugo_sp{$hgnc3}\tEC\t$ec\n";
}
}
...@@ -44,5 +44,5 @@ while (<SCOP>) { ...@@ -44,5 +44,5 @@ while (<SCOP>) {
} }
print OUT "$map{$scopac}\t$scopac\tSCOP1\t$pdb\|\|$chain\n"; print OUT "$map{$scopac}\t$scopac\tSCOP1\t$pdb\|\|$chain\n";
print OUT "$map{$scopac}\t$scopac\tSCOP1\t$scopnb\n"; print OUT "$map{$scopac}\t$scopac\tSCOP2\t$scopnb\n";
} }
...@@ -47,7 +47,7 @@ if ($refseq) { ...@@ -47,7 +47,7 @@ if ($refseq) {
} }
open (OUT,">$out") || die "Can't open file $out\n"; open (OUT,">$out") || die "Can't open file $out\n";
open (CLONE,"clones.txt") || die "Can't open file\n"; #open (CLONE,"clones.txt") || die "Can't open file\n";
#Put in a hash all of the embl clones used by Ensembl #Put in a hash all of the embl clones used by Ensembl
while (<CLONE>) { while (<CLONE>) {
...@@ -60,7 +60,7 @@ while (<CLONE>) { ...@@ -60,7 +60,7 @@ while (<CLONE>) {
while (<DBMAP>) { while (<DBMAP>) {
chomp; chomp;
#Get put in a hash the corresponding database for an external accession number. Get the infos from a file already processed following this format: #Get put in a hash the corresponding database for an external accession number.Get the infos from a file already processed following this format:
#P31946 SP #P31946 SP
my ($mapac,$mapdb) = split(/\t/,$_); my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb; $map{$mapac} = $mapdb;
...@@ -85,7 +85,7 @@ while (<XREF>) { ...@@ -85,7 +85,7 @@ while (<XREF>) {
my ($xrdb,$xrac,$db,$id) = split (/\t/,$_); my ($xrdb,$xrac,$db,$id) = split (/\t/,$_);
if ($xrdb ne "ENSEMBL") { if ($xrdb ne "ENSEMBL") {
my $both = "$db:$id"; my $both = "$db&$id";
if( !defined $hash{$xrac} ) { if( !defined $hash{$xrac} ) {
$hash{$xrac} = []; $hash{$xrac} = [];
...@@ -95,31 +95,31 @@ while (<XREF>) { ...@@ -95,31 +95,31 @@ while (<XREF>) {
} }
#Get the embl clone corresponding for each Ensembl peptides #Get the embl clone corresponding for each Ensembl peptides
if (($xrdb eq "ENSEMBL")) { #if (($xrdb eq "ENSEMBL")) {
push(@{$ens2embl{$xrac}},$id); #push(@{$ens2embl{$xrac}},$id);
} #}
#Get the embl ACs for each SP and SPTREMBL proteins #Get the embl ACs for each SP and SPTREMBL proteins
if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) { #if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) {
#print "$id\n"; #print "$id\n";
if ($embl_clone{$id}) { #if ($embl_clone{$id}) {
push(@{$sp2embl{$xrac}},$id); # push(@{$sp2embl{$xrac}},$id);
} #}
} #}
} }
while (<MAP>) { while (<MAP>) {
chomp; chomp;
#P01111 COBP00000000001 100 PRIMARY #P01111 COBP00000000001 100 PRIMARY
my ($xr,$ens,$perc,$tag) = split (/\t/,$_); my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
#Hack to be taken away #Hack to be taken away
my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/; #my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/;
my $enst = $en1."T".$en2; #my $enst = $en1."T".$en2;
#For now take primary or duplicates and only matches which correspond to more than 25% of the external peptide. These criteria will have to be lowered up. #For now take primary or duplicates and only matches which correspond to more than 25% of the external peptide. These criteria will have to be lowered up.
if ((($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) && ($perc >= 25)) { if ((($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) && ($perc >= 25)) {
...@@ -127,44 +127,45 @@ while (<MAP>) { ...@@ -127,44 +127,45 @@ while (<MAP>) {
if ($xr =~ /^NP_\d+/) { if ($xr =~ /^NP_\d+/) {
$xr = $ref_map{$xr}; $xr = $ref_map{$xr};
} }
#If the external peptide correspond to an embl clone, we will take the match only if the Ensembl peptide correspond to the same clone (at least one exon) #If the external peptide correspond to an embl clone, we will take the match only if the Ensembl peptide correspond to the same clone (at least one exon)
if ($sp2embl{$xr}) { #if ($sp2embl{$xr}) {
print "$xr\t".@{$sp2embl{$xr}}."\n"; # print "$xr\t".@{$sp2embl{$xr}}."\n";
my $tot_sp_embl; # my $tot_sp_embl;
my $tot_ens_embl; # my $tot_ens_embl;
my @sp_embl = @{$sp2embl{$xr}}; # my @sp_embl = @{$sp2embl{$xr}};
foreach my $sing1 (@sp_embl) { # foreach my $sing1 (@sp_embl) {
#print "$sing1\n"; # #print "$sing1\n";
$tot_sp_embl .= $sing1; # $tot_sp_embl .= $sing1;
} # }
if ($ens2embl{$enst}) { # if ($ens2embl{$enst}) {
my @ens_embl = @{$ens2embl{$enst}}; # my @ens_embl = @{$ens2embl{$enst}};
foreach my $sing2 (@sp_embl) { # foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2; # $tot_ens_embl .= $sing2;
} # }
if ($tot_ens_embl =~ $tot_sp_embl) { # if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n"; # print OUT "$ens\t$map{$xr}\t$xr\n";
} # }
else { # else {
#print "no\n"; # #print "no\n";
} # }
} # }
} #}
else { #else {
#Print the know gene AC and its database #Print the know gene AC and its database
#print OUT "$ens\t$map{$xr}\t$xr\n";
#}
print OUT "$ens\t$map{$xr}\t$xr\n"; print OUT "$ens\t$map{$xr}\t$xr\n";
}
#Print all of the external database it links to (eg: HUGO)
#Print all of the external database it links to (eg: HUGO) foreach my $both (@{$hash{$xr}}){
foreach my $both (@{$hash{$xr}}){ ($a,$b) = split(/&/,$both);
($a,$b) = split(/:/,$both); print OUT "$ens\t$a\t$b\n";
print OUT "$ens\t$a\t$b\n"; }
}
} }
} }
......
...@@ -50,7 +50,7 @@ sub runpmatch { ...@@ -50,7 +50,7 @@ sub runpmatch {
system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!"; system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!";
system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!"; system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!";
system($pmatch3); #== 0 or die "$0\Error running '$pmatch2' : $!"; system($pmatch3); #== 0 or die "$0\Error running '$pmatch2' : $!";
} }
...@@ -77,7 +77,7 @@ sub postprocesspmatch { ...@@ -77,7 +77,7 @@ sub postprocesspmatch {
elsif ($db eq $pdb) { elsif ($db eq $pdb) {
print STDERR "Postprocessing pmatch for PDB mapping\n"; print STDERR "Postprocessing pmatch for PDB mapping\n";
open (OUT, ">ens_pdb.processed") || die "Can't open File\n";; open (OUT, ">ens_pdb.processed") || die "Can't open File\n";;
open (PROC, "ens_pdb_rawpmatch") || die "Can't open file ens_refseq_rawpmatch\n"; open (PROC, "ens_pdb_rawpmatch") || die "Can't open file ens_pdb_rawpmatch\n";
} }
while (<PROC>) { while (<PROC>) {
...@@ -100,7 +100,8 @@ sub postprocesspmatch { ...@@ -100,7 +100,8 @@ sub postprocesspmatch {
#Write out the processed data #Write out the processed data
foreach my $key ( keys %hash1 ) { foreach my $key ( keys %hash1 ) {
if (($hashlength{$key} >= 20)) { #if (($hashlength{$key} >= 20)) {
if (($hash1{$key} >= 25)) {
($a,$b) = split(/:/,$key); ($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n"; print OUT "$a\t$b\t$hash1{$key}\n";
} }
...@@ -130,7 +131,7 @@ sub finalprocess { ...@@ -130,7 +131,7 @@ sub finalprocess {
elsif ($db eq $pdb) { elsif ($db eq $pdb) {
print STDERR "Getting final mapping for PDB mapping\n"; print STDERR "Getting final mapping for PDB mapping\n";
open (PROC, "ens_refseq.processed") || die "Can' open file ens_refseq.processed\n"; open (PROC, "ens_pdb.processed") || die "Can' open file ens_refseq.processed\n";
open (OUT, ">ens_pdb.final"); open (OUT, ">ens_pdb.final");
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment