Commit 76cc54ff authored by Emmanuel Mongin's avatar Emmanuel Mongin
Browse files

Mapping developments

parent 9f4dccdb
use strict;
=head1 Description
=head2 Aims
The aim of this script is to get from the database the corresponding clones for each Ensembl peptide. This will then be used to postprocess the pmatch output and get a more sensible mapping.
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
......@@ -28,7 +28,7 @@ my %hash;
my %ref_map;
my %ens2embl;
my %sp2embl;
my %embl_clone;
......@@ -47,6 +47,17 @@ if ($refseq) {
open (OUT,">$out") || die "Can't open file $out\n";
open (CLONE,"clones.txt") || die "Can't open file\n";
#Put in a hash all of the embl clones used by Ensembl
while (<CLONE>) {
my ($embl_ac,$id) = split(/\t/,$_);
print "$embl_ac\n";
while (<DBMAP>) {
#Put in a hash the corresponding database for each external accession number. The information comes from an already-processed file in the following format:
......@@ -82,13 +93,21 @@ while (<XREF>) {
if ($xrdb eq "ENSEMBL") {
#Get the EMBL clone corresponding to each Ensembl peptide
if (($xrdb eq "ENSEMBL")) {
if (($xrdb eq "SP") && ($db eq "EMBL")) {
#Get the embl ACs for each SP and SPTREMBL proteins
if ((($xrdb eq "SP") || ($xrdb eq "SPTREMBL")) && ($db eq "EMBL")) {
#print "$id\n";
if ($embl_clone{$id}) {
while (<MAP>) {
......@@ -96,24 +115,34 @@ while (<MAP>) {
#P01111 COBP00000000001 100 PRIMARY
my ($xr,$ens,$perc,$tag) = split (/\t/,$_);
if (($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) {
#Hack to be taken away
my ($en1,$en2) = $ens =~ /(\w{3})P(\d+)/;
my $enst = $en1."T".$en2;
#For now, take only PRIMARY or DUPLICATE matches which cover at least 25% of the external peptide. These criteria will have to be adjusted later.
if ((($tag eq "PRIMARY") || ($tag eq "DUPLICATE")) && ($perc >= 25)) {
#This is a hack and another solution will have to be found: if the external known gene is a RefSeq protein accession number, get back the equivalent RefSeq DNA accession number
if ($xr =~ /^NP_\d+/) {
$xr = $ref_map{$xr};
#If the external peptide corresponds to an EMBL clone, we take the match only if the Ensembl peptide corresponds to the same clone (at least one exon)
if ($sp2embl{$xr}) {
print "$xr\t".@{$sp2embl{$xr}}."\n";
my $tot_sp_embl;
my $tot_ens_embl;
my @sp_embl = @{$sp2embl{$xr}};
foreach my $sing1 (@sp_embl) {
#print "$sing1\n";
$tot_sp_embl .= $sing1;
my @ens_embl = @{$ens2embl{$xr}};
if ($ens2embl{$enst}) {
my @ens_embl = @{$ens2embl{$enst}};
foreach my $sing2 (@sp_embl) {
$tot_ens_embl .= $sing2;
......@@ -121,10 +150,15 @@ while (<MAP>) {
if ($tot_ens_embl =~ $tot_sp_embl) {
print OUT "$ens\t$map{$xr}\t$xr\n";
else {
#print "no\n";
else {
#Print the known gene AC and its database
print OUT "$ens\t$map{$xr}\t$xr\n";
#Print all of the external databases it links to (e.g. HUGO)
foreach my $both (@{$hash{$xr}}){
use strict;
use Bio::SeqIO;
use Bio::EnsEMBL::DBSQL::Obj;
use Bio::EnsEMBL::DBLoader;
use Getopt::Long;
use Bio::SeqIO;
my $dbpass = undef;
my $dbuser = 'ensro';
my $ensdbname = 'ensembl080';
my $host = '';
my $output;
'db:s' => \$ensdbname,
'host:s'=> \$host,
'dbuser:s'=> \$dbuser,
'output:s' => \$output
my $enslocator = "Bio::EnsEMBL::DBSQL::Obj/host=$host;dbname=$ensdbname;user=$dbuser;pass=$dbpass;perlonlyfeatures=1";
my $ensdb = Bio::EnsEMBL::DBLoader->new($enslocator);
my $sth = $ensdb->prepare ("select,cl.embl_id from transcript as t, exon_transcript as et, clone as cl, contig as c, exon as e where and et.exon = and e.contig = c.internal_id and c.clone = cl.internal_id");
my %hash;
my %seen;
print STDERR "Getting data\n";
while (my @row = $sth->fetchrow) {
if (! defined $seen{$row[1]}) {
$seen{$row[1]} = 1;
print STDERR "Writing out\n";
open (OUT,">$output");
foreach my $keys (keys %hash) {
my @array = @{$hash{$keys}};
foreach my $arr (@array) {
print OUT "ENSEMBL\t$keys\tEMBL\t$arr\n";
......@@ -32,9 +32,9 @@ my ($ens,$sp,$refseq);
sub runpmatch {
print STDERR "Running pmatch\n";
......@@ -87,7 +87,7 @@ sub postprocesspmatch {
#Write out the processed data
foreach my $key ( keys %hash1 ) {
if ($hashlength{$key} >= 20) {
if (($hashlength{$key} >= 20)) {
($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n";
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment