Commit 76cc54ff authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Mapping devellopemments

parent 9f4dccdb
use strict;
=head1 Get_ens2ensembl.pl
=head1 Description
=head2 Aims
The aim of thi script is to get from the database the corresponding clones for each Ensembl peptides. This will be then used to postprocess pmatch and get a more sensible mapping.
=cut
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
......
......@@ -28,7 +28,7 @@ my %hash;
my %ref_map;
my %ens2embl;
my %sp2embl;
my %embl_clone;
&GetOptions(
......@@ -47,6 +47,17 @@ if ($refseq) {
}
open (OUT,">$out") || die "Can't open file $out\n";
open (CLONE,"clones.txt") || die "Can't open file\n";
#Put in a hash all of the embl clones used by Ensembl
while (<CLONE>) {
chomp;
my ($embl_ac,$id) = split(/\t/,$_);
print "$embl_ac\n";
$embl_clone{$embl_ac}=1;
}
while (<DBMAP>) {
chomp;
#Get put in a hash the corresponding database for an external accession number. Get the infos from a file already processed following this format:
......@@ -82,13 +93,21 @@ while (<XREF>) {
push(@{$hash{$xrac}},$both);
}
if ($xrdb eq "ENSEMBL") {
#Get the embl clone corresponding for each Ensembl peptides
if (($xrdb eq "ENSEMBL")) {
push(@{$ens2embl{$xrac}},$id);
}
if (($xrdb eq "SP") && ($db eq "EMBL")) {
push(@{$sp2embl{$xrac}},$id);
}
#Get the embl ACs for each SP and SPTREMBL proteins
if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) {
#print "$id\n";
if ($embl_clone{$id}) {
push(@{$sp2embl{$xrac}},$id);
}
}
}
while (<MAP>) {
......@@ -96,35 +115,50 @@ while (<MAP>) {
#P01111 COBP00000000001 100 PRIMARY
my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
if (($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) {
#Hack to be taken away
my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/;
my $enst = $en1."T".$en2;
#For now take primary or duplicates and only matches which correspond to more than 25% of the external peptide. These criteria will have to be lowered up.
if ((($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) && ($perc >= 25)) {
#Its a hack an another solution will have to be found, if the external known gene is a refseq protein accession number get back the equivalent refseq DNA accession number
if ($xr =~ /^NP_\d+/) {
$xr = $ref_map{$xr};
}
#If the external peptide correspond to an embl clone, we will take the match only if the Ensembl peptide correspond to the same clone (at least one exon)
if ($sp2embl{$xr}) {
print "$xr\t".@{$sp2embl{$xr}}."\n";
my $tot_sp_embl;
my $tot_ens_embl;
my @sp_embl = @{$sp2embl{$xr}};
foreach my $sing1 (@sp_embl) {
#print "$sing1\n";
$tot_sp_embl .= $sing1;
}
my @ens_embl = @{$ens2embl{$xr}};
foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2;
}
if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n";
if ($ens2embl{$enst}) {
my @ens_embl = @{$ens2embl{$enst}};
foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2;
}
if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n";
}
else {
#print "no\n";
}
}
}
else {
#Print the know gene AC and its database
print OUT "$ens\t$map{$xr}\t$xr\n";
}
#Print all of the external database it links to (eg: HUGO)
foreach my $both (@{$hash{$xr}}){
......
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
use Getopt::Long;
use Bio::SeqIO;
my $dbpass = undef;
my $dbuser = 'ensro';
my $ensdbname = 'ensembl080';
my $host = 'ecs1c.sanger.ac.uk';
my $output;
&GetOptions(
'db:s' => \$ensdbname,
'host:s'=> \$host,
'dbuser:s'=> \$dbuser,
'output:s' => \$output
);
my $enslocator = "Bio::EnsEMBL::DBSQL::Obj/host=$host;dbname=$ensdbname;user=$dbuser;pass=$dbpass;perlonlyfeatures=1";
my $ensdb = Bio::EnsEMBL::DBLoader->new($enslocator);
my $sth = $ensdb->prepare ("select t.id,cl.embl_id from transcript as t, exon_transcript as et, clone as cl, contig as c, exon as e where t.id=et.transcript and et.exon = e.id and e.contig = c.internal_id and c.clone = cl.internal_id");
$sth->execute;
my %hash;
my %seen;
print STDERR "Getting data\n";
while (my @row = $sth->fetchrow) {
if (! defined $seen{$row[1]}) {
push(@{$hash{$row[0]}},$row[1]);
$seen{$row[1]} = 1;
}
}
print STDERR "Writing out\n";
open (OUT,">$output");
foreach my $keys (keys %hash) {
my @array = @{$hash{$keys}};
foreach my $arr (@array) {
print OUT "ENSEMBL\t$keys\tEMBL\t$arr\n";
}
}
......@@ -32,9 +32,9 @@ my ($ens,$sp,$refseq);
&runpmatch();
&postprocesspmatch($sp);
#&postprocesspmatch($refseq);
&postprocesspmatch($refseq);
&finalprocess($sp);
#&finalprocess($refseq);
&finalprocess($refseq);
sub runpmatch {
print STDERR "Running pmatch\n";
......@@ -87,7 +87,7 @@ sub postprocesspmatch {
#Write out the processed data
foreach my $key ( keys %hash1 ) {
if ($hashlength{$key} >= 20) {
if (($hashlength{$key} >= 20)) {
($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n";
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment