Commit 123a2a50 authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Added a small script needed in the mapping process

parent 3656484b
use strict;
=head1 get_all_external
=head2 Description
This script will set up a mapping of each primary database (SP, SPTREMBL, Refseq and PDB) to their respective name. This mapping will be then used to give a database name to each ACs.
=head2 Options
-sp
-sptrembl
-refseq
-pdb
=head2 Contact
mongin@ebi.ac.uk
birney@ebi.ac.uk
=cut
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_dbmap.pl -sp ../primary/01_04_hum_sprot.txt -sptrembl ../primary/07_04_hum_sptrembl.txt -refseq ../primary/hs.gnp -scop ../primary/scop.fas
use Getopt::Long;
use Bio::SeqIO;
my ($sp, $sptrembl, $refseq, $scop, $out);
&GetOptions(
'sp:s'=>\$sp,
'sptrembl:s'=>\$sptrembl,
'refseq:s'=>\$refseq,
'scop:s'=>\$scop,
'out:s'=>\$out
);
open (OUT,">mapdb.map");
print STDERR "Getting SP mapping\n";
my $in = Bio::SeqIO->new(-file => $sp, '-format' =>'swiss');
while ( my $seq = $in->next_seq() ) {
print OUT $seq->accession."\tSP\n";
my @secs = $seq->get_secondary_accessions;
if (@secs) {
foreach my $sec (@secs) {
print OUT "$sec\tSP\n";
}
}
}
print STDERR "Getting SPTREMBL mapping\n";
my $in1 = Bio::SeqIO->new(-file => $sptrembl, '-format' =>'swiss');
while ( my $seq1 = $in1->next_seq() ) {
print OUT $seq1->accession."\tSPTREMBL\n";
my @secs = $seq1->get_secondary_accessions;
if (@secs) {
foreach my $sec (@secs) {
print OUT "$sec\tSPTREMBL\n";
}
}
}
print "Getting refseq mapping\n";
open (REFSEQ,"$refseq") || die "Can't open file\n";
$/ = "\/\/\n";
while (<REFSEQ>) {
my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
my ($dna_ac) = $_ =~ /DBSOURCE REFSEQ: accession\s+(\w+)/;
if ($dna_ac) {
print OUT "$dna_ac\tREFSEQ\n";
}
}
close (REFSEQ);
$/ = "\n";
print "Getting Scop mapping\n";
open (SCOP,"$scop") || die "Can't open file\n";
while (<SCOP>) {
my ($scop_ac) = $_ =~ /^>(\S+)/;
if ($scop_ac) {
print OUT "$scop_ac\tSCOP\n";
}
}
close (SCOP);
close (OUT);
...@@ -37,7 +37,7 @@ while ( my $seq1 = $in1->next_seq() ) { ...@@ -37,7 +37,7 @@ while ( my $seq1 = $in1->next_seq() ) {
foreach my $link(@dblink) { foreach my $link(@dblink) {
if (($link->database eq "EMBL") || ($link->database eq "MIM") || ($link->database eq "PDB")) { if (($link->database eq "EMBL") || ($link->database eq "MIM")) {
if (!defined $map{$ac}) { if (!defined $map{$ac}) {
die "Can't map $ac\n"; die "Can't map $ac\n";
......
...@@ -21,20 +21,23 @@ birney@ebi.ac.uk ...@@ -21,20 +21,23 @@ birney@ebi.ac.uk
use Getopt::Long; use Getopt::Long;
my ($ens,$sp,$refseq); my ($ens,$sp,$refseq,$pdb);
&GetOptions( &GetOptions(
'ens:s'=>\$ens, 'ens:s'=>\$ens,
'sp:s'=>\$sp, 'sp:s'=>\$sp,
'refseq:s'=>\$refseq 'refseq:s'=>\$refseq,
'pdb:s'=>\$pdb
); );
&runpmatch(); &runpmatch();
&postprocesspmatch($sp); &postprocesspmatch($sp);
&postprocesspmatch($refseq); &postprocesspmatch($refseq);
&postprocesspmatch($pdb);
&finalprocess($sp); &finalprocess($sp);
&finalprocess($refseq); &finalprocess($refseq);
&finalprocess($pdb);
sub runpmatch { sub runpmatch {
print STDERR "Running pmatch\n"; print STDERR "Running pmatch\n";
...@@ -42,9 +45,13 @@ sub runpmatch { ...@@ -42,9 +45,13 @@ sub runpmatch {
#Run pmatch and store the data in files which will be kept for debugging #Run pmatch and store the data in files which will be kept for debugging
my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $sp $ens > ens_sp_rawpmatch"; my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $sp $ens > ens_sp_rawpmatch";
my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $refseq $ens > ens_refseq_rawpmatch"; my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $refseq $ens > ens_refseq_rawpmatch";
my $pmatch3 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $pdb $ens > ens_pdb_rawpmatch";
system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!"; system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!";
system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!"; system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!";
system($pmatch3); #== 0 or die "$0\Error running '$pmatch2' : $!";
} }
...@@ -61,12 +68,18 @@ sub postprocesspmatch { ...@@ -61,12 +68,18 @@ sub postprocesspmatch {
open (PROC, "ens_sp_rawpmatch") || die "Can't open File\n"; open (PROC, "ens_sp_rawpmatch") || die "Can't open File\n";
} }
else { elsif ($db eq $refseq) {
print STDERR "Postprocessing pmatch for REFSEQ mapping\n"; print STDERR "Postprocessing pmatch for REFSEQ mapping\n";
open (OUT, ">ens_refseq.processed") || die "Can't open File\n";; open (OUT, ">ens_refseq.processed") || die "Can't open File\n";;
open (PROC, "ens_refseq_rawpmatch") || die "Can't open file ens_refseq_rawpmatch\n"; open (PROC, "ens_refseq_rawpmatch") || die "Can't open file ens_refseq_rawpmatch\n";
} }
elsif ($db eq $pdb) {
print STDERR "Postprocessing pmatch for PDB mapping\n";
open (OUT, ">ens_pdb.processed") || die "Can't open File\n";;
open (PROC, "ens_pdb_rawpmatch") || die "Can't open file ens_refseq_rawpmatch\n";
}
while (<PROC>) { while (<PROC>) {
#538 COBP00000033978 1 538 35.3 Q14146 1 538 35.3 #538 COBP00000033978 1 538 35.3 Q14146 1 538 35.3
my ($len,$id,$start,$end,$tperc,$query,$qst,$qend,$perc) = split; my ($len,$id,$start,$end,$tperc,$query,$qst,$qend,$perc) = split;
...@@ -109,12 +122,19 @@ sub finalprocess { ...@@ -109,12 +122,19 @@ sub finalprocess {
open (OUT, ">ens_sp.final"); open (OUT, ">ens_sp.final");
} }
else { elsif ($db eq $refseq) {
print STDERR "Getting final mapping for REFSEQ mapping\n"; print STDERR "Getting final mapping for REFSEQ mapping\n";
open (PROC, "ens_refseq.processed") || die "Can' open file ens_refseq.processed\n"; open (PROC, "ens_refseq.processed") || die "Can' open file ens_refseq.processed\n";
open (OUT, ">refseq.final"); open (OUT, ">ens_refseq.final");
} }
elsif ($db eq $pdb) {
print STDERR "Getting final mapping for PDB mapping\n";
open (PROC, "ens_refseq.processed") || die "Can' open file ens_refseq.processed\n";
open (OUT, ">ens_pdb.final");
}
my %hash2; my %hash2;
while (<PROC>) { while (<PROC>) {
my ($ens,$known,$perc) = split; my ($ens,$known,$perc) = split;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment