This script runs pmatch and post-processes the pmatch output to map Ensembl peptides to external databases (currently Swissprot and Refseq, but this may be extended). The first part of the script runs pmatch; the second part gets the percentage of the match between each unique Ensembl peptide and each unique external protein it matches. The third part classifies each Ensembl match as PRIMARY (the longest match, and the one which will be used for the mapping), PSEUDO, DUPLICATE or REPEAT (the criteria are pretty arbitrary but may be useful for quality control).
#This final subroutine will use the postprocessed pmatch file and get back the best Ensembl match (labelled as PRIMARY) for a given external known protein.
# Argument:
#   $db - identifier of the external database whose mapping is finalised.
#         It is compared, in this order, with $sp, $refseq and $pdb
#         (variables defined outside this block); the first match wins.
# Side effects:
#   Opens PROC for reading on <stem>.processed and OUT for writing on
#   <stem>.final, and fills %hash2 from the contents of PROC.
# Dies if $db matches none of the three identifiers or if either file
# cannot be opened.
my ($db) = @_;

# Select the label shown in the progress message and the stem shared by the
# input and output file names.
my ($mapping_label, $file_stem);
if ($db eq $sp) {
    ($mapping_label, $file_stem) = ('SP', 'ens_sp');
}
elsif ($db eq $refseq) {
    ($mapping_label, $file_stem) = ('REFSEQ', 'ens_refseq');
}
elsif ($db eq $pdb) {
    ($mapping_label, $file_stem) = ('PDB', 'ens_pdb');
}
else {
    # Without this branch the code would go on to read from a handle that
    # was never opened and silently produce nothing.
    my $shown = defined $db ? $db : 'undef';
    die "Unknown database '$shown': no .processed file to read\n";
}

print STDERR "Getting final mapping for $mapping_label mapping\n";

# PROC and OUT are kept as bareword (package-level) handles under their
# original names, since code outside this block may refer to them.
# Every open is checked, and each message names the file that actually failed.
open(PROC, '<', "$file_stem.processed")
    or die "Can't open file $file_stem.processed: $!\n";
open(OUT, '>', "$file_stem.final")
    or die "Can't open file $file_stem.final: $!\n";

# Each external (known) protein maps to an array of NamePerc objects, one per
# matching Ensembl peptide, holding the peptide name and its match percentage.
my %hash2;
while (my $line = <PROC>) {
    # Three whitespace-separated fields are read from each line: Ensembl
    # peptide, external protein, percentage.
    my ($ens, $known, $perc) = split ' ', $line;

    # Skip blank or truncated lines rather than filing an entry under an
    # undefined key.
    next unless defined $perc;

    #if ($perc > 100) {
    #    print "$ens\t$known\t$perc\n";
    #}

    my $p = NamePerc->new;
    $p->name($ens);
    $p->perc($perc);

    # push autovivifies the array reference on first use of a key.
    push @{ $hash2{$known} }, $p;
}
foreachmy$know(keys%hash2){
my@array=@{$hash2{$know}};
@array=sort{$b->perc<=>$a->perc}@array;
#The Ensembl match to the known protein is labelled as PRIMARY and will be used later for the mapping
#If less than 20, either duplicate if percentage of identity close to the PRIMARY labelled as DUPLICATE or labelled as PSEUDO. DUPLICATEs can also be used for the mapping