This script takes the post-processed pmatch output (see process_pmatch.pl) and a file which contains the links of each known gene to other databases (e.g. SP to HUGO or EMBL) and puts them back together in a format suitable for the DBlink tables.
=head2 Options
-mapping: Name of the file corresponding to postprocessed pmatch
-xrefs: Name of the file linking known genes to other DB
-dbmap: File giving for each known gene its DB
-refseq: If RefSeq accession numbers are used, file which stores for each NP its corresponding NM
-output: Name of the output file
=head2 Contact
mongin@ebi.ac.uk
birney@ebi.ac.uk
=cut
use Getopt::Long;

# ---------------------------------------------------------------------------
# Script prologue: parse the command line, open every input/output file and
# load the accession -> database lookup table.
#
# File-scoped state set up here:
#   $mapping, $xrefs, $dbmap, $refseq, $out
#             file names taken from the command line options
#   %map      external accession number => database name (read from -dbmap)
#   %hash, %ref_map
#             declared here but not filled in by this section
#   DBMAP, XREF, MAP, REFSEQ, OUT
#             bareword filehandles; the names are kept as-is because the
#             code further down reads from / prints to them by name
#
# Dies if the options cannot be parsed, if a required option is missing or
# if any of the files cannot be opened.
# ---------------------------------------------------------------------------
my ($mapping, $xrefs, $dbmap, $refseq, $out);
my %map;
my %hash;
my %ref_map;

# GetOptions returns false when it meets an unknown or malformed option;
# stop instead of carrying on with a half-parsed command line.
GetOptions(
    'mapping:s' => \$mapping,
    'xrefs:s'   => \$xrefs,
    'dbmap:s'   => \$dbmap,
    'refseq:s'  => \$refseq,
    'output:s'  => \$out,
) or die "Error while parsing the command line options\n";

# -refseq is the only optional file; every other option must carry a value.
for my $required (
    [ mapping => $mapping ],
    [ xrefs   => $xrefs ],
    [ dbmap   => $dbmap ],
    [ output  => $out ],
) {
    my ($option, $value) = @{$required};
    die "Missing required option -$option\n"
        unless defined $value && length $value;
}

# Three-argument open: the file name is never interpreted as a mode or a
# command, whatever characters it contains. The inputs are opened before
# the output so that a missing input does not truncate an existing output.
open(DBMAP, '<', $dbmap)   or die "Can't open file $dbmap: $!\n";
open(XREF,  '<', $xrefs)   or die "Can't open file $xrefs: $!\n";
open(MAP,   '<', $mapping) or die "Can't open file $mapping: $!\n";
if (defined $refseq && length $refseq) {
    open(REFSEQ, '<', $refseq) or die "Can't open file $refseq: $!\n";
}
open(OUT, '>', $out) or die "Can't open file $out: $!\n";

# Store in %map the database of each external accession number. The
# information comes from an already processed, tab-separated file with one
# "accession<TAB>database" pair per line, e.g.:
#   P31946	SP
while (my $line = <DBMAP>) {
    chomp $line;
    my ($mapac, $mapdb) = split(/\t/, $line);

    # A blank line has no accession; skip it rather than create an '' key.
    next unless defined $mapac && length $mapac;

    $map{$mapac} = $mapdb;
}
close(DBMAP);
#Read the file by genbank entries (separated by //)
$/="\/\/\n";
while(<REFSEQ>){
#This subroutine stores for each NP (RefSeq protein accession number) its corresponding NM (DNA accession number)
#It's a hack and another solution will have to be found: if the external known gene is a RefSeq protein accession number, get back the equivalent RefSeq DNA accession number
if($xr=~/^NP_\d+/){
$xr=$ref_map{$xr};
}
#Print the known gene AC and its database
printOUT"$ens\t$map{$xr}\t$xr\n";
#Print all of the external database it links to (eg: HUGO)
#Add the percentage of similarity for the Ensembl peptide for a single match
#There is a bug at this step, some similarities can be over 100% !!! This problem may be solved by changing pmatch source code
$hash1{$uniq}+=$perc;
}
#Write out the processed data
foreachmy$key(keys%hash1){
($a,$b)=split(/:/,$key);
printOUT"$a\t$b\t$hash1{$key}\n";
...
...
@@ -88,6 +93,7 @@ sub postprocesspmatch {
}
sub finalprocess{
#This final subroutine will use the postprocessed pmatch file and get back the best Ensembl match (labelled as PRIMARY) for a given external known protein.
my($db)=@_;
if($dbeq$sp){
...
...
@@ -108,10 +114,13 @@ sub finalprocess {
#if ($perc > 100) {
# print "$ens\t$known\t$perc\n";
#}
if(!defined$hash2{$known}){
$hash2{$known}=[];
}
#Each single external protein corresponds to an array of objects holding the name and the percentage of similarity of the Ensembl peptides matching the known external protein.
my$p=NamePerc->new;
$p->name($ens);
$p->perc($perc);
...
...
@@ -119,11 +128,11 @@ sub finalprocess {
push(@{$hash2{$known}},$p);
}
foreachmy$know(keys%hash2){
my@array=@{$hash2{$know}};
@array=sort{$b->perc<=>$a->perc}@array;
#The Ensembl match to the known protein is labelled as PRIMARY and will be used later for the mapping
#If less than 20, either duplicate if percentage of identity close to the PRIMARY labelled as DUPLICATE or labelled as PSEUDO. DUPLICATEs can also be used for the mapping