The aim of this script is to get from the database the corresponding clones for each Ensembl peptide. These will then be used to post-process the pmatch output and get a more sensible mapping.
#For now, take only primary or duplicate matches, and only those which cover more than 25% of the external peptide. These criteria will have to be revisited.
# HACK (another solution will have to be found): when the external known gene
# is a RefSeq protein accession (NP_...), swap it for the equivalent RefSeq DNA
# accession looked up in %ref_map.
$xr = $ref_map{$xr} if $xr =~ /^NP_\d+/;
# If the external peptide is known to come from one or more EMBL clones, keep
# the match only when the Ensembl transcript lies on at least one of those same
# clones (i.e. at least one exon is on a shared clone).
if ($sp2embl{$xr}) {
    my @sp_embl = @{ $sp2embl{$xr} };

    # Trace on the default output handle: the accession and the number of EMBL
    # clones recorded for it.
    print "$xr\t" . scalar(@sp_embl) . "\n";

    # EMBL clones of the Ensembl transcript. A transcript with no entry simply
    # has no clones; the guard avoids dereferencing a missing hash value.
    my @ens_embl = $ens2embl{$enst} ? @{ $ens2embl{$enst} } : ();

    # Compare accessions one by one and exactly. Matching one concatenated
    # string against the other as a regex would depend on clone order, would
    # treat characters such as '.' as pattern metacharacters, and would match
    # trivially on an empty pattern.
    my %is_sp_clone   = map { $_ => 1 } @sp_embl;
    my $shared_clones = grep { $is_sp_clone{$_} } @ens_embl;

    # Write the mapping row once, and only when a clone is shared.
    if ($shared_clones) {
        print OUT "$ens\t$map{$xr}\t$xr\n";
    }
}
else {
    # No EMBL clone is recorded for this accession, so there is nothing to
    # cross-check: print the known gene accession and its database as they are.
    print OUT "$ens\t$map{$xr}\t$xr\n";
}
#Print all of the external databases it links to (e.g. HUGO)
# Prepare the query that pairs each transcript id with the EMBL id of every
# clone holding one of its exons, by joining
# transcript -> exon_transcript -> exon -> contig -> clone.
my $transcript_clone_sql = "select t.id,cl.embl_id from transcript as t, exon_transcript as et, clone as cl, contig as c, exon as e where t.id=et.transcript and et.exon = e.id and e.contig = c.internal_id and c.clone = cl.internal_id";
my $sth = $ensdb->prepare($transcript_clone_sql);