Commit 738b6b16 authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Removed useless files which were used for the mapping. Should be more straithforward to run now.

parent 25075c62
use strict;
=head1 get_all_external
=head2 Description
This script will set up a mapping of each primary database (SP, SPTREMBL, Refseq and PDB) to their respective name. This mapping will be then used to give a database name to each ACs.
=head2 Options
-sp
-sptrembl
-refseq
-pdb
=head2 Contact
mongin@ebi.ac.uk
birney@ebi.ac.uk
=cut
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_dbmap.pl -sp ../primary/01_04_hum_sprot.txt -sptrembl ../primary/07_04_hum_sptrembl.txt -refseq ../primary/hs.gnp -scop ../primary/scop.fas
use Getopt::Long;
use Bio::SeqIO;
my ($sp, $sptrembl, $refseq, $scop, $out);
&GetOptions(
'sp:s'=>\$sp,
'sptrembl:s'=>\$sptrembl,
'refseq:s'=>\$refseq,
'scop:s'=>\$scop,
'out:s'=>\$out
);
open (OUT,">mapdb.map");
print STDERR "Getting SP mapping\n";
my $in = Bio::SeqIO->new(-file => $sp, '-format' =>'swiss');
while ( my $seq = $in->next_seq() ) {
print OUT $seq->accession."\tSP\n";
my @secs = $seq->get_secondary_accessions;
if (@secs) {
foreach my $sec (@secs) {
print OUT "$sec\tSP\n";
}
}
}
print STDERR "Getting SPTREMBL mapping\n";
my $in1 = Bio::SeqIO->new(-file => $sptrembl, '-format' =>'swiss');
while ( my $seq1 = $in1->next_seq() ) {
print OUT $seq1->accession."\tSPTREMBL\n";
my @secs = $seq1->get_secondary_accessions;
if (@secs) {
foreach my $sec (@secs) {
print OUT "$sec\tSPTREMBL\n";
}
}
}
print "Getting refseq mapping\n";
open (REFSEQ,"$refseq") || die "Can't open file\n";
$/ = "\/\/\n";
while (<REFSEQ>) {
my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
my ($dna_ac) = $_ =~ /DBSOURCE REFSEQ: accession\s+(\w+)/;
if ($dna_ac) {
print OUT "$dna_ac\tREFSEQ\n";
}
}
close (REFSEQ);
$/ = "\n";
print "Getting Scop mapping\n";
open (SCOP,"$scop") || die "Can't open file\n";
while (<SCOP>) {
my ($scop_ac) = $_ =~ /^>(\S+)/;
if ($scop_ac) {
print OUT "$scop_ac\tSCOP\n";
}
}
close (SCOP);
close (OUT);
use strict;
#Some doc will come ...
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_embl_mim_mapping.pl -sp ../primary/hum_sp_sptrembl.pep -dbmap mapdb.map -output sp_embl_mim.map
use Getopt::Long;
use Bio::SeqIO;
my ($sp,$dbmap,$out);
my %map;
&GetOptions(
'sp:s'=>\$sp,
'output:s'=>\$out,
'dbmap:s'=>\$dbmap
);
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (OUT,">$out") || die "Can't open file\n";
print STDERR "Reading DBmap\n";
while (<DBMAP>) {
chomp;
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
}
print STDERR "Reading SP\n";
my $in1 = Bio::SeqIO->new(-file => $sp, '-format' =>'swiss');
while ( my $seq1 = $in1->next_seq() ) {
my $ac = $seq1->accession;
my @dblink = $seq1->annotation->each_DBLink;
foreach my $link(@dblink) {
if ($link->database eq "EMBL") {
if (!defined $map{$ac}) {
die "Can't map $ac\n";
}
print OUT "$map{$ac}\t$ac\tEMBL_AC\t".$link->primary_id,"\n";
my ($protac) = $link->optional_id =~ /^(\w+).\S+/;
print OUT "$map{$ac}\t$ac\tEMBL_PROT_AC\t$protac\n";
}
if ($link->database eq "MIM") {
if (!defined $map{$ac}) {
die "Can't map $ac\n";
}
print OUT "$map{$ac}\t$ac\tOMIM\t".$link->primary_id,"\n";
}
}
}
use strict;
=head1 Get_ens2ensembl.pl
=head1 Description
=head2 Aims
The aim of thi script is to get from the database the corresponding clones for each Ensembl peptides. This will be then used to postprocess pmatch and get a more sensible mapping.
=cut
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
use Getopt::Long;
use Bio::SeqIO;
my $dbpass = undef;
my $dbuser = 'ensro';
my $ensdbname = 'ensembl080';
my $host = 'ecs1c.sanger.ac.uk';
my $output;
&GetOptions(
'db:s' => \$ensdbname,
'host:s'=> \$host,
'dbuser:s'=> \$dbuser,
'output:s' => \$output
);
my $enslocator = "Bio::EnsEMBL::DBSQL::Obj/host=$host;dbname=$ensdbname;user=$dbuser;pass=$dbpass;perlonlyfeatures=1";
my $ensdb = Bio::EnsEMBL::DBLoader->new($enslocator);
my $sth = $ensdb->prepare ("select t.id,cl.embl_id from transcript as t, exon_transcript as et, clone as cl, contig as c, exon as e where t.id=et.transcript and et.exon = e.id and e.contig = c.internal_id and c.clone = cl.internal_id");
$sth->execute;
my %hash;
my %seen;
print STDERR "Getting data\n";
while (my @row = $sth->fetchrow) {
if (! defined $seen{$row[1]}) {
push(@{$hash{$row[0]}},$row[1]);
$seen{$row[1]} = 1;
}
}
print STDERR "Writing out\n";
open (OUT,">$output");
foreach my $keys (keys %hash) {
my @array = @{$hash{$keys}};
foreach my $arr (@array) {
print OUT "ENSEMBL\t$keys\tEMBL\t$arr\n";
}
}
use strict;
=head1 get_hugo_mapping
=head2 Description
This script reads a set of files produced by HUGO and built a file which will be used to get the DBlink table.The format of the file is the following
SP P27348 HUGO YWHAQ
(known database\t known ac\t hugo or alias\t hugo ac)
=head2 Options
The different options only deal with file names
-nomeid: Hugo file (http://www.gene.ucl.ac.uk/public-files/nomen/nomeids.txt)
-ens1: Hugo file (http://www.gene.ucl.ac.uk/public-files/nomen/ens1.txt)
-ens2: Hugo file (http://www.gene.ucl.ac.uk/public-files/nomen/ens2.txt)
-output: Filename were the output should be written
-dbmap: File which give the corresponding database for each known protein
=cut
use Getopt::Long;
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_hugo_mapping.pl -ens1 ../secondary/ens1.txt -ens2 ../secondary/ens2.txt -ens4 ../secondary/ens4.txt -ens5 ../secondary/ens5.txt -out hugo.map -dbmap mapdb.map
my ($ens1,$ens2,$ens4,$ens5,$out,$dbmap);
my %map;
my %hugo_sp;
my %hugo_refseq;
my %en2;
my %hugohash;
&GetOptions(
'ens1:s'=>\$ens1,
'ens2:s'=>\$ens2,
'ens4:s'=>\$ens4,
'ens5:s'=>\$ens5,
'dbmap:s'=>\$dbmap,
'output:s'=>\$out
);
open (ENS1,"$ens1") || die "Can't open file $ens1\n";
open (ENS2,"$ens2") || die "Can't open file $ens2\n";
open (ENS4,"$ens4") || die "Can't open file $ens4\n";
open (ENS5,"$ens5") || die "Can't open file $ens5\n";
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (OUT,">$out") || die "Can't open output file $out\n";
open (ERROR,">hugo.err") || die "Can't open output file hugo.err\n";
while (<DBMAP>) {
chomp;
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
}
while (<ENS1>) {
chomp;
#Get hugo id
#Get rid of the annoying carriage return!
$_ =~ s/\r//g;
my ($hgnc,$sp,$refseq) = split(/\t/,$_);
if ($sp) {
print OUT "$map{$sp}\t$sp\tHUGOID\t$hgnc\n";
}
if ($refseq) {
print OUT "$map{refseq}\t$refseq\tHUGOID\t$hgnc\n";
}
if ($sp) {
$hugo_sp{$hgnc} = $sp;
}
if ($refseq) {
$hugo_refseq{$hgnc} = $refseq;
}
}
while (<ENS2>) {
chomp;
#Get hugo symbol
$_ =~ s/\r//g;
my ($hgnc1,$hugo) = split(/\t/,$_);
if ($hugo_sp{$hgnc1}) {
print OUT "$map{$hugo_sp{$hgnc1}}\t$hugo_sp{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
}
if ($hugo_refseq{$hgnc1}) {
print OUT "$map{$hugo_refseq{$hgnc1}}\t$hugo_refseq{$hgnc1}\tHUGOSYMBOL\t$hugo\n";
}
if (!defined $en2{$hgnc1}) {
$en2{$hgnc1} = [];
}
$en2{$hgnc1} = $hugo;
}
while (<ENS4>) {
#Get hugo aliases given a hugo primary accession number. For each primary accession number, the aliases are put in a hash of array
chomp;
my ($hgnc2, $symbol, $alias, $withdrawn) = split (/\t/,$_);
if ((defined $hugo_sp{$hgnc2}) && (defined $alias)) {
my @aliases1 = split (/, /,$alias);
foreach my $aliase1 (@aliases1) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOALIAS\t$aliase1\n";
}
}
if ((defined $hugo_sp{$hgnc2}) && ($withdrawn =~ /\S+/)) {
my @withdrawns1 = split (/, /,$withdrawn);
foreach my $withdrawn1 (@withdrawns1) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOWITHDRAWN\t$withdrawn1\n";
}
}
if ((defined $hugo_refseq{$hgnc2}) && (defined $alias)) {
my @aliases2 = split (/, /,$alias);
foreach my $aliase2 (@aliases2) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOALIAS\t$aliase2\n";
}
}
if ((defined $hugo_refseq{$hgnc2}) && ($withdrawn =~ /\S+/)) {
my @withdrawns2 = split (/, /,$withdrawn);
foreach my $withdrawn2 (@withdrawns2) {
print OUT "$map{$hugo_sp{$hgnc2}}\t$hugo_sp{$hgnc2}\tHUGOWITHDRAWN\t$withdrawn2\n";
}
}
}
while (<ENS5>) {
#Use Hugo mapping to get EC numbers
chomp;
my ($hgnc3, $symbol1, $name, $ec, $sp) = split (/\t/,$_);
if ((defined $hugo_sp{$hgnc3}) && (defined $ec)) {
print OUT "$map{$hugo_sp{$hgnc3}}\t$hugo_sp{$hgnc3}\tEC\t$ec\n";
}
if ((defined $hugo_refseq{$hgnc3}) && (defined $ec)) {
print OUT "$map{$hugo_sp{$hgnc3}}\t$hugo_sp{$hgnc3}\tEC\t$ec\n";
}
}
use strict;
#Some doc will come
use Getopt::Long;
my ($refseq,$dbmap,$out);
my %map;
&GetOptions(
'refseq:s'=>\$refseq,
'out:s'=>\$out,
'dbmap:s'=>\$dbmap
);
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (REFSEQ,"$refseq") || die "Can't open file $refseq\n";
open (OUT,">$out") || die "Can't open file $out";
print STDERR "Reading dbmap\n";
while (<DBMAP>) {
chomp;
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
}
#Separate by entry (each entry goes into $_)
$/ = "\/\/\n";
print STDERR "Reading Refseq file\n";
while (<REFSEQ>) {
my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
my ($dna_ac) = $_ =~ /DBSOURCE REFSEQ: accession\s+(\w+)/;
my ($mim) = $_ =~ /\/db_xref=\"MIM:(\d+)/;
my ($locus) = $_ =~ /\/db_xref=\"LocusID:(\d*)/;
if ($mim) {
if (!defined $map{$dna_ac}) {
die "can't map $dna_ac\n";
}
print OUT "$map{$dna_ac}\t$dna_ac\tOMIM\t$mim\n";
}
if ($locus) {
if (!defined $map{$dna_ac}) {
die "can't map $dna_ac\n";
}
print OUT "$map{$dna_ac}\t$dna_ac\tLOCUS\t$locus\n";
}
}
use strict;
#Some doc will come
#The scop mapping which should be used here can be found at:
#http://scop.mrc-lmb.cam.ac.uk/scop/parse/dir.dom.scop.txt_1.53 (for the 1.53 release)
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_scops.pl -scop ../secondary/dir.dom.scop.txt_1.53 -dbmap mapdb.map -out scops.map
use Getopt::Long;
my ($scop,$dbmap,$out);
my %map;
&GetOptions(
'scop:s'=>\$scop,
'out:s'=>\$out,
'dbmap:s'=>\$dbmap
);
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (SCOP,"$scop") || die "Can't open file $scop\n";
open (OUT,">$out") || die "Can't open file $out\n";
open (ERROR, ">scop.err") || die "Can't open file scop.err\n";
print STDERR "Reading dbmap\n";
while (<DBMAP>) {
chomp;
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
}
print STDERR "Reading Scop file\n";
while (<SCOP>) {
chomp;
my ($scopac, $pdb, $chain, $scopnb) = split(/\t/,$_);
if (!defined $map{$scopac}) {
print ERROR "can't map $scopac\n";
}
print OUT "$map{$scopac}\t$scopac\tSCOP1\t$pdb\|\|$chain\n";
print OUT "$map{$scopac}\t$scopac\tSCOP2\t$scopnb\n";
}
use strict;
=head1 get_xrefs
=head2 Description
This script take the post processed pmatch output (see process_pmatch.pl) and a file which contains the links of each known gene to other databases (eg: SP to hugo or EMBL) and put them back together in a format suitable for the DBlink tables.
=head2 Options
-mapping: Name of the file corresponding to postprocessed pmatch
-xrefs: Name of the file linking known genes to other DB
-dbmap: File giving for each known gene its DB
-refseq: If refseq ac is used, file which store for each NP its corresponding NM
-output: Name of the output file
=head2 Contact
mongin@ebi.ac.uk
birney@ebi.ac.uk
=cut
use Getopt::Long;
my ($mapping,$xrefs,$dbmap,$refseq,$out);
my %map;
my %hash;
my %ref_map;
my %ens2embl;
my %sp2embl;
my %embl_clone;
&GetOptions(
'mapping:s'=>\$mapping,
'xrefs:s'=>\$xrefs,
'dbmap:s'=>\$dbmap,
'refseq:s'=>\$refseq,
'output:s'=>\$out
);
#perl ../../../src/ensembl-live/misc-scripts/protein_match/get_xrefs.pl -mapping ../map_outputs/map.total -xrefs ../sec_outputs/xref.total -dbmap ../sec_outputs/mapdb.map -refseq ../primary/hs.gnp -output final.map
open (DBMAP,"$dbmap") || die "Can't open file $dbmap\n";
open (XREF,"$xrefs") || die "Can't open file $xrefs\n";
open (MAP,"$mapping") || die "Can't open file $mapping\n";
if ($refseq) {
open (REFSEQ,"$refseq") || die "Can't open file $refseq\n";
}
open (OUT,">$out") || die "Can't open file $out\n";
#open (CLONE,"clones.txt") || die "Can't open file\n";
#Put in a hash all of the embl clones used by Ensembl
#while (<CLONE>) {
# chomp;
# my ($embl_ac,$id) = split(/\t/,$_);
# print "$embl_ac\n";
# $embl_clone{$embl_ac}=1;
#}
while (<DBMAP>) {
chomp;
#Get put in a hash the corresponding database for an external accession number.Get the infos from a file already processed following this format:
#P31946 SP
my ($mapac,$mapdb) = split(/\t/,$_);
$map{$mapac} = $mapdb;
}
#Read the file by genbank entries (separated by //)
$/ = "\/\/\n";
while (<REFSEQ>) {
#This subroutine store for each NP (refseq protein accession number) its corresponding NM (DNA accession number)