Commit 58fc4a4b authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Mapping going on, many changes done...

parent 7671f083
......@@ -28,103 +28,141 @@ The different options only deal with file names
use Getopt::Long;
my ($nomeid,$ens1,$ens2,$out,$dbmap);
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_hugo_mapping.pl -ens1 ../secondary/ens1.txt -ens2 ../secondary/ens2.txt -ens4 ../secondary/ens4.txt -ens5 ../secondary/ens5.txt -out hugo.map -dbmap mapdb.map
my ($ens1,$ens2,$ens4,$ens5,$out,$dbmap);
my %map;
my %en1;
my %hugo_sp;
my %hugo_refseq;
my %en2;
my %hugohash;
&GetOptions(
'nomeid:s'=>\$nomeid,
'ens1:s'=>\$ens1,
'ens2:s'=>\$ens2,
'output:s'=>\$out,
'dbmap:s'=>\$dbmap
'ens4:s'=>\$ens4,
'ens5:s'=>\$ens5,
'dbmap:s'=>\$dbmap,
'output:s'=>\$out
);
open (ENS1,"$ens1") || die "Can't open file $ens1\n";
open (ENS2,"$ens2") || die "Can't open file $ens2\n";
open (NOME,"$nomeid") || die "Can't open file $nomeid\n";
open (ENS4,"$ens4") || die "Can't open file $ens4\n";
open (ENS5,"$ens5") || die "Can't open file $ens5\n";
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (OUT,">$out") || die "Can't open output file $out\n";
open (ERROR,">hugo.err") || die "Can't open output file hugo.err\n";
while (<DBMAP>) {
chomp;
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
}
while (<ENS1>) {
chomp;
#Get hugo id
#Get rid of the annoying carriage return!
$_ =~ s/\r//g;
my ($hgnc,$sp,$refseq) = split(/\t/,$_);
if ($sp) {
print OUT "$map{$sp}\t$sp\tHUGOID\t$hgnc\n";
}
if ($refseq) {
print OUT "$map{refseq}\t$refseq\tHUGOID\t$hgnc\n";
}
if ($sp) {
$en1{$sp} = $hgnc;
$hugo_sp{$hgnc} = $sp;
}
if ($refseq) {
$en1{$refseq} = $hgnc;
$hugo_refseq{$hgnc} = $refseq;
}
}
while (<ENS2>) {
chomp;
#Get hugo symbol
$_ =~ s/\r//g;
my ($hgnc,$hugo) = split(/\t/,$_);
my ($hgnc1,$hugo) = split(/\t/,$_);
if (!defined $en2{$hgnc}) {
$en2{$hgnc} = [];
# if (!defined $hugo_sp{$hgnc1}) {
# print ERROR "Can't map back $hugo_sp{$hgnc} (ENS2)\n";
# }
# if (!defined $hugo_refseq{$hgnc1}) {
# print ERROR "Can't map back $hugo_refseq{$hgnc} (ENS2)\n";
# }
if ($hugo_sp{$hgnc1}) {
print OUT "$map{$hugo_sp{$hgnc1}}\t$hugo_sp{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
}
$en2{$hgnc} = $hugo;
}
if ($hugo_refseq{$hgnc1}) {
print OUT "$map{$hugo_refseq{$hgnc1}}\t$hugo_refseq{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
}
if (!defined $en2{$hgnc1}) {
$en2{$hgnc1} = [];
}
$en2{$hgnc1} = $hugo;
}
while (<NOME>) {
while (<ENS4>) {
#Get hugo aliases given a hugo primary accession number. For each primary accession number, the aliases are put in a hash of array
chomp;
my @chunk = split (/\t/,$_);
my ($hgnc2, $symbol, $alias, $withdrawn) = split (/\t/,$_);
if ((defined $hugo_sp{$hgnc2}) && (defined $alias)) {
my @aliases1 = split (/, /,$alias);
foreach my $aliase1 (@aliases1) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOALIAS\t$aliase1\n";
}
}
if ($chunk[8]) {
$hugohash{$chunk[1]} = $chunk[8];
if ((defined $hugo_sp{$hgnc2}) && ($withdrawn =~ /\S+/)) {
my @withdrawns1 = split (/, /,$withdrawn);
foreach my $withdrawn1 (@withdrawns1) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOWITHDRAWN\t$withdrawn1\n";
}
}
}
while (<DBMAP>) {
chomp;
my ($mapac,$mapdb) = split(/\t/,$_);
my $hugo_ac = $en2{$en1{$mapac}};
if ($hugo_ac) {
#Print the HUGOs primary accession numbers
print OUT "$mapdb\t$mapac\tHUGO\t$hugo_ac\n";
if ($hugohash{$hugo_ac}) {
my @syn = split (/, /,$hugohash{$hugo_ac});
foreach my $sol (@syn) {
#print the HUGOs aliases
print OUT "$mapdb\t$mapac\tHUGO\t$sol\n";
}
if ((defined $hugo_refseq{$hgnc2}) && (defined $alias)) {
my @aliases2 = split (/, /,$alias);
foreach my $aliase2 (@aliases2) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOALIAS\t$aliase2\n";
}
}
if ((defined $hugo_refseq{$hgnc2}) && ($withdrawn =~ /\S+/)) {
my @withdrawns2 = split (/, /,$withdrawn);
foreach my $withdrawn2 (@withdrawns2) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOWITHDRAWN\t$withdrawn2\n";
}
#if (!defined $en2{$en1{$mapac}}) {
# print STDERR "$mapac\n";
#}
}
}
while (<ENS5>) {
#Use Hugo mapping to get EC numbers
chomp;
my ($hgnc3, $symbol1, $name, $ec, $sp) = split (/\t/,$_);
if ((defined $hugo_sp{$hgnc3}) && (defined $ec)) {
print OUT "$map{$hugo_sp{$hgnc3}}\t$hugo_sp{$hgnc3}\tEC\t$ec\n";
}
if ((defined $hugo_refseq{$hgnc3}) && (defined $ec)) {
print OUT "$map{$hugo_sp{$hgnc3}}\t$hugo_sp{$hgnc3}\tEC\t$ec\n";
}
}
......@@ -44,5 +44,5 @@ while (<SCOP>) {
}
print OUT "$map{$scopac}\t$scopac\tSCOP1\t$pdb\|\|$chain\n";
print OUT "$map{$scopac}\t$scopac\tSCOP1\t$scopnb\n";
print OUT "$map{$scopac}\t$scopac\tSCOP2\t$scopnb\n";
}
......@@ -47,7 +47,7 @@ if ($refseq) {
}
open (OUT,">$out") || die "Can't open file $out\n";
open (CLONE,"clones.txt") || die "Can't open file\n";
#open (CLONE,"clones.txt") || die "Can't open file\n";
#Put in a hash all of the embl clones used by Ensembl
while (<CLONE>) {
......@@ -60,7 +60,7 @@ while (<CLONE>) {
while (<DBMAP>) {
chomp;
#Get put in a hash the corresponding database for an external accession number. Get the infos from a file already processed following this format:
#Get put in a hash the corresponding database for an external accession number.Get the infos from a file already processed following this format:
#P31946 SP
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
......@@ -85,7 +85,7 @@ while (<XREF>) {
my ($xrdb,$xrac,$db,$id) = split (/\t/,$_);
if ($xrdb ne "ENSEMBL") {
my $both = "$db:$id";
my $both = "$db&$id";
if( !defined $hash{$xrac} ) {
$hash{$xrac} = [];
......@@ -95,31 +95,31 @@ while (<XREF>) {
}
#Get the embl clone corresponding for each Ensembl peptides
if (($xrdb eq "ENSEMBL")) {
#if (($xrdb eq "ENSEMBL")) {
push(@{$ens2embl{$xrac}},$id);
}
#push(@{$ens2embl{$xrac}},$id);
#}
#Get the embl ACs for each SP and SPTREMBL proteins
if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) {
#if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) {
#print "$id\n";
if ($embl_clone{$id}) {
#if ($embl_clone{$id}) {
push(@{$sp2embl{$xrac}},$id);
}
}
# push(@{$sp2embl{$xrac}},$id);
#}
#}
}
while (<MAP>) {
chomp;
#P01111 COBP00000000001 100 PRIMARY
my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
#Hack to be taken away
my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/;
my $enst = $en1."T".$en2;
#my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/;
#my $enst = $en1."T".$en2;
#For now take primary or duplicates and only matches which correspond to more than 25% of the external peptide. These criteria will have to be lowered up.
if ((($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) && ($perc >= 25)) {
......@@ -127,44 +127,45 @@ while (<MAP>) {
if ($xr =~ /^NP_\d+/) {
$xr = $ref_map{$xr};
}
#If the external peptide correspond to an embl clone, we will take the match only if the Ensembl peptide correspond to the same clone (at least one exon)
if ($sp2embl{$xr}) {
print "$xr\t".@{$sp2embl{$xr}}."\n";
my $tot_sp_embl;
my $tot_ens_embl;
my @sp_embl = @{$sp2embl{$xr}};
foreach my $sing1 (@sp_embl) {
#print "$sing1\n";
$tot_sp_embl .= $sing1;
}
#if ($sp2embl{$xr}) {
# print "$xr\t".@{$sp2embl{$xr}}."\n";
# my $tot_sp_embl;
# my $tot_ens_embl;
# my @sp_embl = @{$sp2embl{$xr}};
# foreach my $sing1 (@sp_embl) {
# #print "$sing1\n";
# $tot_sp_embl .= $sing1;
# }
if ($ens2embl{$enst}) {
my @ens_embl = @{$ens2embl{$enst}};
# if ($ens2embl{$enst}) {
# my @ens_embl = @{$ens2embl{$enst}};
foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2;
}
if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n";
}
else {
#print "no\n";
}
}
}
else {
# foreach my $sing2 (@sp_embl) {
# $tot_ens_embl .= $sing2;
# }
# if ($tot_ens_embl =~ $tot_sp_embl) {
# print OUT "$ens\t$map{$xr}\t$xr\n";
# }
# else {
# #print "no\n";
# }
# }
#}
#else {
#Print the know gene AC and its database
#print OUT "$ens\t$map{$xr}\t$xr\n";
#}
print OUT "$ens\t$map{$xr}\t$xr\n";
}
#Print all of the external database it links to (eg: HUGO)
foreach my $both (@{$hash{$xr}}){
($a,$b) = split(/:/,$both);
print OUT "$ens\t$a\t$b\n";
}
#Print all of the external database it links to (eg: HUGO)
foreach my $both (@{$hash{$xr}}){
($a,$b) = split(/&/,$both);
print OUT "$ens\t$a\t$b\n";
}
}
}
......
......@@ -50,7 +50,7 @@ sub runpmatch {
system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!";
system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!";
system($pmatch3); #== 0 or die "$0\Error running '$pmatch2' : $!";
system($pmatch3); #== 0 or die "$0\Error running '$pmatch2' : $!";
}
......@@ -77,7 +77,7 @@ sub postprocesspmatch {
elsif ($db eq $pdb) {
print STDERR "Postprocessing pmatch for PDB mapping\n";
open (OUT, ">ens_pdb.processed") || die "Can't open File\n";;
open (PROC, "ens_pdb_rawpmatch") || die "Can't open file ens_refseq_rawpmatch\n";
open (PROC, "ens_pdb_rawpmatch") || die "Can't open file ens_pdb_rawpmatch\n";
}
while (<PROC>) {
......@@ -100,7 +100,8 @@ sub postprocesspmatch {
#Write out the processed data
foreach my $key ( keys %hash1 ) {
if (($hashlength{$key} >= 20)) {
#if (($hashlength{$key} >= 20)) {
if (($hash1{$key} >= 25)) {
($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n";
}
......@@ -130,7 +131,7 @@ sub finalprocess {
elsif ($db eq $pdb) {
print STDERR "Getting final mapping for PDB mapping\n";
open (PROC, "ens_refseq.processed") || die "Can' open file ens_refseq.processed\n";
open (PROC, "ens_pdb.processed") || die "Can' open file ens_refseq.processed\n";
open (OUT, ">ens_pdb.final");
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment