Commit 76cc54ff authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Mapping devellopemments

parent 9f4dccdb
use strict; use strict;
=head1 Get_ens2ensembl.pl
=head1 Description
=head2 Aims
The aim of thi script is to get from the database the corresponding clones for each Ensembl peptides. This will be then used to postprocess pmatch and get a more sensible mapping.
=cut
use Bio::SeqIO; use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj; use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader; use Bio::EnsEMBL::DBLoader;
......
...@@ -28,7 +28,7 @@ my %hash; ...@@ -28,7 +28,7 @@ my %hash;
my %ref_map; my %ref_map;
my %ens2embl; my %ens2embl;
my %sp2embl; my %sp2embl;
my %embl_clone;
&GetOptions( &GetOptions(
...@@ -47,6 +47,17 @@ if ($refseq) { ...@@ -47,6 +47,17 @@ if ($refseq) {
} }
open (OUT,">$out") || die "Can't open file $out\n"; open (OUT,">$out") || die "Can't open file $out\n";
open (CLONE,"clones.txt") || die "Can't open file\n";
#Put in a hash all of the embl clones used by Ensembl
while (<CLONE>) {
chomp;
my ($embl_ac,$id) = split(/\t/,$_);
print "$embl_ac\n";
$embl_clone{$embl_ac}=1;
}
while (<DBMAP>) { while (<DBMAP>) {
chomp; chomp;
#Get put in a hash the corresponding database for an external accession number. Get the infos from a file already processed following this format: #Get put in a hash the corresponding database for an external accession number. Get the infos from a file already processed following this format:
...@@ -82,13 +93,21 @@ while (<XREF>) { ...@@ -82,13 +93,21 @@ while (<XREF>) {
push(@{$hash{$xrac}},$both); push(@{$hash{$xrac}},$both);
} }
if ($xrdb eq "ENSEMBL") {
#Get the embl clone corresponding for each Ensembl peptides
if (($xrdb eq "ENSEMBL")) {
push(@{$ens2embl{$xrac}},$id); push(@{$ens2embl{$xrac}},$id);
} }
if (($xrdb eq "SP") && ($db eq "EMBL")) {
push(@{$sp2embl{$xrac}},$id);
}
#Get the embl ACs for each SP and SPTREMBL proteins
if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) {
#print "$id\n";
if ($embl_clone{$id}) {
push(@{$sp2embl{$xrac}},$id);
}
}
} }
while (<MAP>) { while (<MAP>) {
...@@ -96,35 +115,50 @@ while (<MAP>) { ...@@ -96,35 +115,50 @@ while (<MAP>) {
#P01111 COBP00000000001 100 PRIMARY #P01111 COBP00000000001 100 PRIMARY
my ($xr,$ens,$perc,$tag) = split (/\t/,$_); my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
if (($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) {
#Hack to be taken away
my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/;
my $enst = $en1."T".$en2;
#For now take primary or duplicates and only matches which correspond to more than 25% of the external peptide. These criteria will have to be lowered up.
if ((($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) && ($perc >= 25)) {
#Its a hack an another solution will have to be found, if the external known gene is a refseq protein accession number get back the equivalent refseq DNA accession number #Its a hack an another solution will have to be found, if the external known gene is a refseq protein accession number get back the equivalent refseq DNA accession number
if ($xr =~ /^NP_\d+/) { if ($xr =~ /^NP_\d+/) {
$xr = $ref_map{$xr}; $xr = $ref_map{$xr};
} }
#If the external peptide correspond to an embl clone, we will take the match only if the Ensembl peptide correspond to the same clone (at least one exon)
if ($sp2embl{$xr}) { if ($sp2embl{$xr}) {
print "$xr\t".@{$sp2embl{$xr}}."\n";
my $tot_sp_embl; my $tot_sp_embl;
my $tot_ens_embl; my $tot_ens_embl;
my @sp_embl = @{$sp2embl{$xr}}; my @sp_embl = @{$sp2embl{$xr}};
foreach my $sing1 (@sp_embl) { foreach my $sing1 (@sp_embl) {
#print "$sing1\n";
$tot_sp_embl .= $sing1; $tot_sp_embl .= $sing1;
}
my @ens_embl = @{$ens2embl{$xr}};
foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2;
} }
if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n"; if ($ens2embl{$enst}) {
my @ens_embl = @{$ens2embl{$enst}};
foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2;
}
if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n";
}
else {
#print "no\n";
}
} }
} }
else {
#Print the know gene AC and its database #Print the know gene AC and its database
print OUT "$ens\t$map{$xr}\t$xr\n"; print OUT "$ens\t$map{$xr}\t$xr\n";
}
#Print all of the external database it links to (eg: HUGO) #Print all of the external database it links to (eg: HUGO)
foreach my $both (@{$hash{$xr}}){ foreach my $both (@{$hash{$xr}}){
......
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
use Getopt::Long;
use Bio::SeqIO;
my $dbpass = undef;
my $dbuser = 'ensro';
my $ensdbname = 'ensembl080';
my $host = 'ecs1c.sanger.ac.uk';
my $output;
&GetOptions(
'db:s' => \$ensdbname,
'host:s'=> \$host,
'dbuser:s'=> \$dbuser,
'output:s' => \$output
);
my $enslocator = "Bio::EnsEMBL::DBSQL::Obj/host=$host;dbname=$ensdbname;user=$dbuser;pass=$dbpass;perlonlyfeatures=1";
my $ensdb = Bio::EnsEMBL::DBLoader->new($enslocator);
my $sth = $ensdb->prepare ("select t.id,cl.embl_id from transcript as t, exon_transcript as et, clone as cl, contig as c, exon as e where t.id=et.transcript and et.exon = e.id and e.contig = c.internal_id and c.clone = cl.internal_id");
$sth->execute;
my %hash;
my %seen;
print STDERR "Getting data\n";
while (my @row = $sth->fetchrow) {
if (! defined $seen{$row[1]}) {
push(@{$hash{$row[0]}},$row[1]);
$seen{$row[1]} = 1;
}
}
print STDERR "Writing out\n";
open (OUT,">$output");
foreach my $keys (keys %hash) {
my @array = @{$hash{$keys}};
foreach my $arr (@array) {
print OUT "ENSEMBL\t$keys\tEMBL\t$arr\n";
}
}
...@@ -32,9 +32,9 @@ my ($ens,$sp,$refseq); ...@@ -32,9 +32,9 @@ my ($ens,$sp,$refseq);
&runpmatch(); &runpmatch();
&postprocesspmatch($sp); &postprocesspmatch($sp);
#&postprocesspmatch($refseq); &postprocesspmatch($refseq);
&finalprocess($sp); &finalprocess($sp);
#&finalprocess($refseq); &finalprocess($refseq);
sub runpmatch { sub runpmatch {
print STDERR "Running pmatch\n"; print STDERR "Running pmatch\n";
...@@ -87,7 +87,7 @@ sub postprocesspmatch { ...@@ -87,7 +87,7 @@ sub postprocesspmatch {
#Write out the processed data #Write out the processed data
foreach my $key ( keys %hash1 ) { foreach my $key ( keys %hash1 ) {
if ($hashlength{$key} >= 20) { if (($hashlength{$key} >= 20)) {
($a,$b) = split(/:/,$key); ($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n"; print OUT "$a\t$b\t$hash1{$key}\n";
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment