Commit 0f1c78d5 authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Some devellopments for the mapping

parent 94375d0e
...@@ -49,3 +49,15 @@ while ( my $seq1 = $in1->next_seq() ) { ...@@ -49,3 +49,15 @@ while ( my $seq1 = $in1->next_seq() ) {
} }
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
use Getopt::Long;
use Bio::SeqIO;
my $dbpass = undef;
my $dbuser = 'ensro';
my $ensdbname = 'ensembl080';
my $host = 'ecs1c.sanger.ac.uk';
my $output;
&GetOptions(
'db:s' => \$ensdbname,
'host:s'=> \$host,
'dbuser:s'=> \$dbuser,
'output:s' => \$output
);
my $enslocator = "Bio::EnsEMBL::DBSQL::Obj/host=$host;dbname=$ensdbname;user=$dbuser;pass=$dbpass;perlonlyfeatures=1";
my $ensdb = Bio::EnsEMBL::DBLoader->new($enslocator);
my $sth = $ensdb->prepare ("select t.id,cl.embl_id from transcript as t, exon_transcript as et, clone as cl, contig as c, exon as e where t.id=et.transcript and et.exon = e.id and e.contig = c.internal_id and c.clone = cl.internal_id");
$sth->execute;
my %hash;
my %seen;
print STDERR "Getting data\n";
while (my @row = $sth->fetchrow) {
if (! defined $seen{$row[1]}) {
push(@{$hash{$row[0]}},$row[1]);
$seen{$row[1]} = 1;
}
}
print STDERR "Writing out\n";
open (OUT,">$output");
foreach my $keys (keys %hash) {
my @array = @{$hash{$keys}};
foreach my $arr (@array) {
print OUT "ENSEMBL\t$keys\tEMBL\t$arr\n";
}
}
...@@ -26,6 +26,9 @@ my ($mapping,$xrefs,$dbmap,$refseq,$out); ...@@ -26,6 +26,9 @@ my ($mapping,$xrefs,$dbmap,$refseq,$out);
my %map; my %map;
my %hash; my %hash;
my %ref_map; my %ref_map;
my %ens2embl;
my %sp2embl;
&GetOptions( &GetOptions(
...@@ -69,13 +72,23 @@ while (<XREF>) { ...@@ -69,13 +72,23 @@ while (<XREF>) {
#SP P31946 EMBL X57346 #SP P31946 EMBL X57346
my ($xrdb,$xrac,$db,$id) = split (/\t/,$_); my ($xrdb,$xrac,$db,$id) = split (/\t/,$_);
my $both = "$db:$id";
if ($xrdb ne "ENSEMBL") {
my $both = "$db:$id";
if( !defined $hash{$xrac} ) { if( !defined $hash{$xrac} ) {
$hash{$xrac} = []; $hash{$xrac} = [];
} }
push(@{$hash{$xrac}},$both); push(@{$hash{$xrac}},$both);
}
if ($xrdb eq "ENSEMBL") {
push(@{$ens2embl{$xrac}},$id);
}
if (($xrdb eq "SP") && ($db eq "EMBL")) {
push(@{$sp2embl{$xrac}},$id);
}
} }
while (<MAP>) { while (<MAP>) {
...@@ -83,13 +96,33 @@ while (<MAP>) { ...@@ -83,13 +96,33 @@ while (<MAP>) {
#P01111 COBP00000000001 100 PRIMARY #P01111 COBP00000000001 100 PRIMARY
my ($xr,$ens,$perc,$tag) = split (/\t/,$_); my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
if ($tag eq "PRIMARY") { if (($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) {
#Its a hack an another solution will have to be found, if the external known gene is a refseq protein accession number get back the equivalent refseq DNA accession number #Its a hack an another solution will have to be found, if the external known gene is a refseq protein accession number get back the equivalent refseq DNA accession number
if ($xr =~ /^NP_\d+/) { if ($xr =~ /^NP_\d+/) {
$xr = $ref_map{$xr}; $xr = $ref_map{$xr};
} }
if ($sp2embl{$xr}) {
my $tot_sp_embl;
my $tot_ens_embl;
my @sp_embl = @{$sp2embl{$xr}};
foreach my $sing1 (@sp_embl) {
$tot_sp_embl .= $sing1;
}
my @ens_embl = @{$ens2embl{$xr}};
foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2;
}
if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n";
}
}
#Print the know gene AC and its database #Print the know gene AC and its database
print OUT "$ens\t$map{$xr}\t$xr\n"; print OUT "$ens\t$map{$xr}\t$xr\n";
......
...@@ -32,16 +32,16 @@ my ($ens,$sp,$refseq); ...@@ -32,16 +32,16 @@ my ($ens,$sp,$refseq);
&runpmatch(); &runpmatch();
&postprocesspmatch($sp); &postprocesspmatch($sp);
&postprocesspmatch($refseq); #&postprocesspmatch($refseq);
&finalprocess($sp); &finalprocess($sp);
&finalprocess($refseq); #&finalprocess($refseq);
sub runpmatch { sub runpmatch {
print STDERR "Running pmatch\n"; print STDERR "Running pmatch\n";
#Run pmatch and store the data in files which will be kept for debugging #Run pmatch and store the data in files which will be kept for debugging
my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch $sp $ens > ens_sp_rawpmatch"; my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $sp $ens > ens_sp_rawpmatch";
my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch $refseq $ens > ens_refseq_rawpmatch"; my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $refseq $ens > ens_refseq_rawpmatch";
system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!"; system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!";
system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!"; system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!";
...@@ -52,7 +52,7 @@ sub runpmatch { ...@@ -52,7 +52,7 @@ sub runpmatch {
sub postprocesspmatch { sub postprocesspmatch {
my ($db) = @_; my ($db) = @_;
my %hash1; my %hash1;
my %hashlength;
#Post process the raw data from pmatch #Post process the raw data from pmatch
if ($db eq $sp) { if ($db eq $sp) {
...@@ -69,7 +69,7 @@ sub postprocesspmatch { ...@@ -69,7 +69,7 @@ sub postprocesspmatch {
while (<PROC>) { while (<PROC>) {
#538 COBP00000033978 1 538 35.3 Q14146 1 538 35.3 #538 COBP00000033978 1 538 35.3 Q14146 1 538 35.3
my ($len,$id,$start,$end,$perc,$query,$qst,$qend,$qperc) = split; my ($len,$id,$start,$end,$tperc,$query,$qst,$qend,$perc) = split;
if ($db eq $refseq) { if ($db eq $refseq) {
#Get only the refseq ac (NP_\d+) #Get only the refseq ac (NP_\d+)
...@@ -81,12 +81,19 @@ sub postprocesspmatch { ...@@ -81,12 +81,19 @@ sub postprocesspmatch {
#Add the percentage of similarity for the Ensembl peptide for a single match #Add the percentage of similarity for the Ensembl peptide for a single match
#There is a bug at this step, some similarities can be over 100% !!! This problem may be solved by changing pmatch source code #There is a bug at this step, some similarities can be over 100% !!! This problem may be solved by changing pmatch source code
$hash1{$uniq} += $perc; $hash1{$uniq} += $perc;
$hashlength{$uniq} += $len;
} }
#Write out the processed data #Write out the processed data
foreach my $key ( keys %hash1 ) { foreach my $key ( keys %hash1 ) {
($a,$b) = split(/:/,$key); if ($hashlength{$key} >= 20) {
print OUT "$a\t$b\t$hash1{$key}\n"; ($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n";
}
#else {
# print "$a\t$b\t$hash1{$key}\t$hashlength{$key}\n";
#}
} }
close (PROC); close (PROC);
close (OUT); close (OUT);
...@@ -134,33 +141,44 @@ sub finalprocess { ...@@ -134,33 +141,44 @@ sub finalprocess {
#The Ensembl match to the known protein is labelled as PRIMARY and will be used later for the mapping #The Ensembl match to the known protein is labelled as PRIMARY and will be used later for the mapping
my $top = shift @array; my $top = shift @array;
print OUT "$know\t",$top->name,"\t",$top->perc,"\tPRIMARY\n";
#if ($top->perc >= 20) {
foreach $ens ( @array ) {
if( $ens->perc > $top->perc ) { print OUT "$know\t",$top->name,"\t",$top->perc,"\tPRIMARY\n";
die "Not good....";
foreach $ens ( @array ) {
if( $ens->perc > $top->perc ) {
die "Not good....";
}
} }
}
#If there is more than 20 Ensembl peptides matching a single known protein, these Ensembl peptides are labelled as REPEAT #If there is more than 20 Ensembl peptides matching a single known protein, these Ensembl peptides are labelled as REPEAT
if (scalar(@array) >= 20) { if (scalar(@array) >= 20) {
foreach my $repeat (@array) { foreach my $repeat (@array) {
print OUT "$know\t",$repeat->name,"\t",$repeat->perc,"\tREPEAT\n"; if( $repeat->perc+1 >= $top->perc ) {
print OUT "$know\t",$repeat->name,"\t",$repeat->perc,"\tDUPLICATE\n";
}
else {
print OUT "$know\t",$repeat->name,"\t",$repeat->perc,"\tREPEAT\n";
}
}
} }
}
#If less than 20, either duplicate if percentage of identity close to the PRIMARY labelled as DUPLICATE or labelled as PSEUDO. DUPLICATEs can also be used for the mapping #If less than 20, either duplicate if percentage of identity close to the PRIMARY labelled as DUPLICATE or labelled as PSEUDO. DUPLICATEs can also be used for the mapping
if (scalar(@array) < 20) { if (scalar(@array) < 20) {
foreach my $duplicate (@array) { foreach my $duplicate (@array) {
if( $duplicate->perc+1 >= $top->perc ) { if( $duplicate->perc+1 >= $top->perc ) {
print OUT "$know\t",$duplicate->name,"\t",$duplicate->perc,"\tDUPLICATE\n"; print OUT "$know\t",$duplicate->name,"\t",$duplicate->perc,"\tDUPLICATE\n";
} }
else { else {
print OUT "$know\t",$duplicate->name,"\t",$duplicate->perc,"\tPSEUDO\n"; print OUT "$know\t",$duplicate->name,"\t",$duplicate->perc,"\tPSEUDO\n";
} }
} }
} }
} }
#}
close (PROC); close (PROC);
close (OUT); close (OUT);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment