Commit 7530dd53 authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Some small important changes...

parent 68f0ec46
#use strict; use strict;
=head1 Process pmatch
=head1 Description
=head2 Aims
This script aims to run pmatch and postprocess pmatch to map Ensembl peptides to external databases (currently Swissprot and Refseq but may be extented). The first part of the script runs pmatch, the second part gets the percentage of a match of a unique Ensembl peptide which match to an unique external protein. The third part classify each ensembl match as PRIMARY match (the longest one and the one which will be used for the mapping, PSEUDO, DUPLICATE and REPEAT (pretty arbitrary criterias but may be useful for quality control).
NB: All of the intermediary files are written.
=head2 Options
-ens : Ensembl peptide fasta file
-sp : SP, SPTREMBL fasta file
-refseq: Refseq peptide fasta file
=cut
use Getopt::Long; use Getopt::Long;
my ($ens,$sp,$refseq); my ($ens,$sp,$refseq);
...@@ -23,8 +39,8 @@ sub runpmatch { ...@@ -23,8 +39,8 @@ sub runpmatch {
my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch $sp $ens > ens_sp_rawpmatch"; my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch $sp $ens > ens_sp_rawpmatch";
my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch $refseq $ens > ens_refseq_rawpmatch"; my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch $refseq $ens > ens_refseq_rawpmatch";
system($pmatch1);# == 0 or die "$0\Error running '$pmatch1' : $!"; system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!";
system($pmatch2);# == 0 or die "$0\Error running '$pmatch2' : $!"; system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!";
} }
...@@ -33,7 +49,7 @@ sub postprocesspmatch { ...@@ -33,7 +49,7 @@ sub postprocesspmatch {
my ($db) = @_; my ($db) = @_;
my %hash1; my %hash1;
#open (OUT, ">../data/$db.processed");
#Post process the raw data from pmatch #Post process the raw data from pmatch
if ($db eq $sp) { if ($db eq $sp) {
print STDERR "Postprocessing pmatch for SP mapping\n"; print STDERR "Postprocessing pmatch for SP mapping\n";
...@@ -48,12 +64,17 @@ sub postprocesspmatch { ...@@ -48,12 +64,17 @@ sub postprocesspmatch {
} }
while (<PROC>) { while (<PROC>) {
## 20 Q9UN99 36 55 8.1 ENSP00000051351 63 82 7.8 #538 COBP00000033978 1 538 35.3 Q14146 1 538 35.3
my ($len,$id,$start,$end,$perc,$query,$qst,$qend,$qperc) = split; my ($len,$id,$start,$end,$perc,$query,$qst,$qend,$qperc) = split;
#print STDERR "$id:$query";
if ($db eq $refseq) {
#Get the refseq ac (NP_\d+)
($query) = $query =~ /\w+\|\d+\|\w+\|(\w+)/;
}
my $uniq = "$id:$query"; my $uniq = "$id:$query";
$hash1{$uniq} += $qperc; $hash1{$uniq} += $perc;
} }
...@@ -62,7 +83,8 @@ sub postprocesspmatch { ...@@ -62,7 +83,8 @@ sub postprocesspmatch {
($a,$b) = split(/:/,$key); ($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n"; print OUT "$a\t$b\t$hash1{$key}\n";
} }
close (OUT); close (PROC);
close (OUT);
} }
sub finalprocess { sub finalprocess {
...@@ -90,7 +112,7 @@ sub finalprocess { ...@@ -90,7 +112,7 @@ sub finalprocess {
$hash2{$known} = []; $hash2{$known} = [];
} }
$p= NamePerc->new; my $p= NamePerc->new;
$p->name($ens); $p->name($ens);
$p->perc($perc); $p->perc($perc);
...@@ -99,10 +121,10 @@ sub finalprocess { ...@@ -99,10 +121,10 @@ sub finalprocess {
foreach my $know ( keys %hash2 ) { foreach my $know ( keys %hash2 ) {
@array = @{$hash2{$know}}; my @array = @{$hash2{$know}};
@array = sort { $b->perc <=> $a->perc } @array; @array = sort { $b->perc <=> $a->perc } @array;
$top = shift @array; my $top = shift @array;
print OUT "$know\t",$top->name,"\t",$top->perc,"\tPRIMARY\n"; print OUT "$know\t",$top->name,"\t",$top->perc,"\tPRIMARY\n";
foreach $ens ( @array ) { foreach $ens ( @array ) {
...@@ -128,6 +150,7 @@ sub finalprocess { ...@@ -128,6 +150,7 @@ sub finalprocess {
} }
} }
} }
close (PROC);
close (OUT); close (OUT);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment