Commit db5ef4e1 authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Removed the last files and added the ones which are now going to be used for the mapping

parent f702f9ca
use strict;
use Getopt::Long;
use Bio::SeqIO;
BEGIN {
unshift (@INC,"/work1/mongin/src/ensembl-live/misc-scripts/protein_match");
require "mapping_conf.pl";
}
my %conf = %::mapping_conf; # configuration options
# global vars
my $sptr_swiss = $conf{'sptr_swiss'};
my $refseq_gnp = $conf{'refseq_gnp'};
my $ens1 = $conf{'ens1'};
my $ens4 = $conf{'ens4'};
my $out = $conf{'x_map'};
my %refseq_map;
my %sp_db;
my %hugo_id;
my %hugo_syn;
open (OUT,">$out") || die "Can't open $out\n";;
#First read the SPTR file in swiss format
print STDERR "Reading SPTR file\n";
my $in = Bio::SeqIO->new(-file => $sptr_swiss, '-format' =>'swiss');
while ( my $seq = $in->next_seq() ) {
#For each SP entry, store the information concerning the entry
my $db;
my $ac = $seq->accession;
my $id = $seq->display_id;
my ($displ_id,$tag) = split(/:/,$id);
if ($tag eq "STANDARD") {
$db = "SP";
}
elsif ($tag eq "PRELIMINARY") {
$db = "SPTREMBL";
}
else {
die "Try to load unknown SPTR database\n";
}
my $un_ac = "$ac:$db";
my @secs = $seq->get_secondary_accessions;
my $syns = join(';',@secs);
print OUT "$ac\tSPTR\t$ac\t$db\t$displ_id\t$syns\n";
$sp_db{$ac} = $db;
#Then get info about the Xref mapping
my @dblink = $seq->annotation->each_DBLink;
foreach my $link(@dblink) {
if ($link->database eq "EMBL") {
print OUT "$ac\tSPTR\t".$link->primary_id."\t".$link->database."\t\t\n";
my ($protac) = $link->optional_id =~ /^(\w+).\S+/;
print OUT "$ac\tSPTR\t".$protac."\tEMBL_PROT_AC\t\t\n";
}
if ($link->database eq "MIM") {
print OUT "$ac\tSPTR\t".$link->primary_id."\t".$link->database."\t\t\n";
}
}
}
#Read the refseq file in gnp format
print STDERR "Reading REFSEQ File\n";
open (REFSEQ,"$refseq_gnp") || die "Can't open $refseq_gnp\n";
$/ = "\/\/\n";
while (<REFSEQ>) {
my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
my ($dna_ac) = $_ =~ /DBSOURCE REFSEQ: accession\s+(\w+)/;
$refseq_map{$dna_ac} = $prot_ac;
print OUT "$dna_ac\tREFSEQ\t$prot_ac\tREFSEQ\t$prot_ac\t\n";
my ($mim) = $_ =~ /\/db_xref=\"MIM:(\d+)/;
my ($locus) = $_ =~ /\/db_xref=\"LocusID:(\d*)/;
if ($mim) {
print OUT "$dna_ac\tREFSEQ\t$mim\tOMIM\t$mim\t\n";
}
if ($locus) {
print OUT "$dna_ac\tREFSEQ\t$locus\tLOCUS\t$locus\t\n";
}
}
close (REFSEQ);
$/ = "\n";
#Read the Hugo files
print STDERR "Reading Hugo files\n";
open (ENS4,"$ens4") || die "Can't open $ens4\n";;
while (<ENS4>) {
chomp;
my @array = split(/\t/,$_);
my $hgnc = $array[0];
my $id = $array[1];
#my $syn1 = $array[2];
#my $syn2 = $array[3];
my $syn1 = join (';',split (/,\s/,$array[2]));
my $syn2 = join (';',split (/,\s/,$array[3]));
my $syn = "$syn1;$syn2";
$hugo_id{$hgnc} = $id;
$hugo_syn{$hgnc} = $syn;
#print $hugo_syn{$hgnc};
}
close (ENS4);
open (ENS1,"$ens1") || die "Can't open $ens1\n";
while (<ENS1>) {
chomp;
my @array = split(/\t/,$_);
my $hgnc = $array[0];
if ($array[1]) {
#my $db = $sp_db{$array[1]};
print OUT "$array[1]\tSPTR\t$hgnc\tHUGO\t$hugo_id{$hgnc}\t$hugo_syn{$hgnc}\n";
}
if ($array[2]) {
#my $db = $sp_db{$array[1]};
print OUT "$array[2]\tREFSEQ\t$hgnc\tHUGO\t$hugo_id{$hgnc}\t$hugo_syn{$hgnc}\n";
}
}
close (ENS1);
=head1 mappping_conf.pl
=head2 Description
This script gives the basic configuration needed by the mapping. This configuration script will be used by each script of the mapping. Each field is described bellow.
+ organism: The name of the organism which has to be mapped. eg: human, mouse, worm
+ sptr: Location of the SPTR file
+ refseq: Location of the Refseq file (only valide for the human organism)
+ (ens1, ens2, ens5) : Various files produced by Hugo (http://www.gene.ucl.ac.uk/public-files/nomen/) which would be used to map SP to hugo and refseq to Hugo (only valide for the human organism)
=cut
BEGIN {
package main;
%mapping_conf = (
# Files location (Input/Output)
'query' => '/work4/mongin/data/ensembl.pep',
#'query' => '',
'sptr_fa' => '/work1/mongin/mapping/primary/refseq.fa',
#'sptr_fa' => '',
'sptr_swiss' => '/work1/mongin/mapping/primary/sptr.swiss',
#'sptr_swiss' => '',
'refseq_fa' => '/work1/mongin/mapping/primary/refseq.fa',
#'refseq' => '',
'refseq_gnp' => '/work1/mongin/mapping/primary/refseq.gnp',
#'refseq_gnp' => '',
'ens1' => '/work1/mongin/mapping/primary/ens1.txt',
#'ens1' => '',
'ens4' => '/work1/mongin/mapping/primary/ens4.txt',
# 'ens4' => '',
'refseq_map' => '/work1/mongin/mapping/tests/refseq_map.txt',
#'refseq_map' => '',
'sp_map' => '/work1/mongin/mapping/tests/refseq_map2.txt',
#'sp_map' => '',
'x_map' => '/work1/mongin/mapping/tests/xmap_out1.txt',
#'x_map_out' => '',
#Database handling
'db' => 'prot_pipeline_test',
#'db' => '',
'host' => 'ecs1b',
#'host' => '',
);
}
1;
use strict;
use DBI;
use Getopt::Long;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::DBLoader;
use Bio::EnsEMBL::DBSQL::DBEntryAdaptor;
use Bio::EnsEMBL::DBEntry;
use Bio::SeqIO;
BEGIN {
unshift (@INC,"/work1/mongin/src/ensembl-live/misc-scripts/protein_match");
require "mapping_conf.pl";
}
my %conf = %::mapping_conf; # configuration options
# global vars
my $refseq_gnp = $conf{'refseq_gnp'};
my $xmap = $conf{'x_map'};
my $map = $conf{'sp_map'};
my $dbname = $conf{'db'};
my $host = $conf{'host'};
my %map;
my %ref_map;
print STDERR "Connecting to the database...\n";
my $enslocator = "Bio::EnsEMBL::DBSQL::DBAdaptor/host=$host;dbname=$dbname;user=ensadmin;perlonlyfeatures=1";
print STDERR "Bio::EnsEMBL::DBSQL::DBAdaptor/host=$host;dbname=$dbname;user=ensadmin;perlonlyfeatures=1\n";
my $db = Bio::EnsEMBL::DBLoader->new($enslocator);
my $adaptor = Bio::EnsEMBL::DBSQL::DBEntryAdaptor->new($db);
open (REFSEQ,"$refseq_gnp") || die "Can't open $refseq_gnp\n";
#Read the file by genbank entries (separated by //)
$/ = "\/\/\n";
while (<REFSEQ>) {
#This subroutine store for each NP (refseq protein accession number) its corresponding NM (DNA accession number)
my ($prot_ac) = $_ =~ /ACCESSION\s+(\S+)/;
my ($dna_ac) = $_ =~ /DBSOURCE REFSEQ: accession\s+(\w+)/;
#print STDERR "$prot_ac\n";
$ref_map{$prot_ac} = $dna_ac;
}
#Put back the default (new line) for reading file
$/ = "\n";
open (XMAP,"$xmap") || die "Can't open $xmap\n";
while (<XMAP>) {
chomp;
my ($targetid,$targetdb,$xac,$xdb,$xid,$xsyn) = split (/\t/,$_);
#print STDERR "$targetid,$targetdb,$xac,$xdb,$xid,$xsyn\n";
my $p= Desc->new;
$p->targetDB($targetdb);
$p->xAC($xac);
$p->xDB($xdb);
$p->xID($xid);
$p->xSYN($xsyn);
push(@{$map{$targetid}},$p);
}
close (XMAP);
open (MAP,"$map") || die "Can't open $map\n";
while (<MAP>) {
#print STDERR "Loading data in the database\n";
chomp;
my ($queryid,$tid,$tag,$queryperc,$targetperc) = split (/\t/,$_);
#print STDERR "$queryid,$tid,$tag,$queryperc,$targetperc\n";
if ($tid ne "orphan") {
if ($tid =~ /^NP_\d+/) {
#($tid) = $tid =~ /^(NP_\d+)/;
#print STDERR "$tid\n";
$tid = $ref_map{$tid};
}
#print STDERR "$tid\n";
my @array = @{$map{$tid}};
foreach my $a(@array) {
#print STDERR $a->xDB."\n";
my $dbentry = Bio::EnsEMBL::DBEntry->new
( -adaptor => $adaptor,
-primary_id => $a->xAC,
-display_id => $a->xID,
-version => 1,
-release => 1,
-dbname => $a->xDB );
my @synonyms = split (/;/,$a->xSYN);
foreach my $syn (@synonyms) {
if ($syn =~ /\S+/) {
$dbentry->add_synonym($syn);
}
}
$adaptor->store($dbentry,$queryid,"Translation");
}
}
}
package Desc;
=head2 new
Title : new
Usage : $obj->new($newval)
Function:
Returns : value of new
Args : newvalue (optional)
=cut
sub new{
my $class= shift;
my $self = {};
bless $self,$class;
return $self;
}
=head2 targetDB
Title : targetDB
Usage : $obj->targetDB($newval)
Function:
Returns : value of targetDB
Args : newvalue (optional)
=cut
sub targetDB{
my $obj = shift;
if( @_ ) {
my $value = shift;
$obj->{'targetDB'} = $value;
}
return $obj->{'targetDB'};
}
=head2 xAC
Title : xAC
Usage : $obj->xAC($newval)
Function:
Returns : value of xAC
Args : newvalue (optional)
=cut
sub xAC{
my $obj = shift;
if( @_ ) {
my $value = shift;
$obj->{'xAC'} = $value;
}
return $obj->{'xAC'};
}
=head2 xDB
Title : xDB
Usage : $obj->xDB($newval)
Function:
Returns : value of xDB
Args : newvalue (optional)
=cut
sub xDB{
my $obj = shift;
if( @_ ) {
my $value = shift;
$obj->{'xDB'} = $value;
}
return $obj->{'xDB'};
}
=head2 xID
Title : xID
Usage : $obj->xID($newval)
Function:
Returns : value of xID
Args : newvalue (optional)
=cut
sub xID{
my $obj = shift;
if( @_ ) {
my $value = shift;
$obj->{'xID'} = $value;
}
return $obj->{'xID'};
}
=head2 xSYN
Title : xSYN
Usage : $obj->xSYN($newval)
Function:
Returns : value of xSYN
Args : newvalue (optional)
=cut
sub xSYN{
my $obj = shift;
if( @_ ) {
my $value = shift;
$obj->{'xSYN'} = $value;
}
return $obj->{'xSYN'};
}
#!/usr/local/bin/perl
# Author: Marc Sohrmann (ms2@sanger.ac.uk)
# Copyright (c) Marc Sohrmann, 2001
# You may distribute this code under the same terms as perl itself
# wrapper around Richard Durbin's pmatch code (fast protein matcher).
# 06/01 Adapted for Ensembl (mongin@ebi.ac.uk)
use strict;
use Getopt::Std;
use vars qw($opt_q $opt_t $opt_l $opt_o $opt_p $opt_w $opt_s $opt_c $opt_d);
BEGIN {
unshift (@INC,"/work1/mongin/src/ensembl-live/misc-scripts/protein_match");
require "mapping_conf.pl";
}
my %conf = %::mapping_conf;
my $sptr_fa = $conf{'sptr_fa'};
my $refseq_fa = $conf{'refseq_fa'};
$opt_p = 66;
$opt_q = $conf{'query'};
$opt_t = $conf{'sptr_fa'};
$opt_o = $conf{'sp_map'};
getopts ("q:t:l:o:p:wsc:d");
my $usage = "pmatch.pl\n";
$usage .= "-q [query fasta db]\n";
$usage .= "-t [target fasta db] OR -l [file listing target fasta db's] OR -w to create tmp file of all worm pep's from SWALL\n";
$usage .= "-o [out file]\n";
$usage .= "-s to calculate the best path of non-overlapping pmatch matches for each query-target pair OPTIONAL\n";
$usage .= "-c to classify the matches (relative to the query sequence) OPTIONAL\n";
$usage .= "-d to keep the intermediate tmp files OPTIONAL\n";
$usage .= "-p minimum percentage off identity accepted OPTIONAL (default value 66%)\n";
#unless ($opt_q && ($opt_t || $opt_l || $opt_w) && $opt_o) {
# die "$usage";
#}
#################################
my $query = $opt_q;
my $target = $opt_t;
#################################
# make worm-specific protein set from SWALL if ($opt_w)
if ($opt_w) {
print STDERR "extract worm sequences from SWALL...\n";
my $getz = "getz -f seq -sf fasta \'[swall-org:Caenorhabditis elegans]\' > $$.swall";
system "$getz";
$target = "$$.swall";
}
#################################
# run pmatch (Richard Durbin's fast protein matcher, rd@sanger.ac.uk)
print STDERR "run pmatch...\n";
if ($opt_l) {
my @files;
open (DB , "$opt_l") || die "cannot read $opt_l\n";
while (<DB>) {
chomp;
my @a = split /\s+/;
push (@files , @a);
}
close DB;
foreach my $file (@files) {
my $pmatch = "/nfs/disk65/ms2/bin/pmatch -T 14 $file $query >> $$.pmatch";
system "$pmatch";
}
}
else {
my $pmatch = "/nfs/disk65/ms2/bin/pmatch -T 14 $target $query > $$.pmatch";
system "$pmatch";
}
if ($opt_w && !$opt_d) {
unlink "$$.swall";
}
#################################
# sort the pmatch output file, based on query and target name
# (necessary since not all the matches relating to a query-target pair cluster)
print STDERR "sort the pmatch output file...\n";
my (@a , @q , @t);
open (TMP , ">$$.sort") || die "cannot create $$.sort\n";
open (PMATCH , "$$.pmatch") || die "cannot read $$.pmatch\n";
while (<PMATCH>) {
my @f = split /\t/;
push @a, $_;
push @q, $f[1];
push @t, $f[6];
}
foreach my $i (sort { $q[$a] cmp $q[$b] or $t[$a] cmp $t[$b] } 0..$#a) { print TMP $a[$i] }
close PMATCH;
close TMP;
rename ("$$.sort" , "$$.pmatch");
#################################
# determine the best path of non-overlapping matching substrings (if $opt_s)
if ($opt_s) {
print STDERR "stitch pmatch substrings...\n";
open (STITCH , ">$$.stitch") || die "cannot create $$.stitch\n";
my @match_list = ();
my $old_query;
my $old_target;
my $old_qlen;
my $old_tlen;
my $previous_line;
open (PMATCH , "$$.pmatch") || die "cannot read $$.pmatch\n";
while (<PMATCH>) {
chomp;
my @a = split /\t/;
my $query = $a[1];
my $qstart = $a[2];
my $qend = $a[3];
my $qlen = $a[5];
my $target = $a[6];
my $tstart = $a[7];
my $tend = $a[8];
my $tlen = $a[10];
# new set of query/target
if (($query ne $old_query || $target ne $old_target) && @match_list) {
if (@match_list == 1) {
print STITCH "$previous_line\n";
}
else {
my ($max , $trace) = stitch_matches (@match_list);
my $qperc = sprintf ("%.1f" , ($max/$old_qlen)*100);
my $tperc = sprintf ("%.1f" , ($max/$old_tlen)*100);