Commit ce19e680 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

plugin architecture for InternalIdMapper

parent e28ad271
......@@ -5,7 +5,7 @@ dry_run = 0
loglevel = DEBUG
; paths
basedir = /lustre/work1/ensembl/pm2/idmapping/perl/2008-04-22c
basedir = /lustre/work1/ensembl/pm2/idmapping/perl/2008-04-28
; prepend this path to your 'log' parameter
; will default to "$basedir/log" if not set
......@@ -27,7 +27,7 @@ targetdbname = pm2_pan_troglodytes_core_41_21
; caching
;cache_method = build_cache_all
build_cache_auto_threshold = 100
build_cache_concurrent_jobs = 200
build_cache_concurrent_jobs = 200
; limit
;region = chromosome:CHIMP1A:1:1:2000000:1
......@@ -50,6 +50,25 @@ transcript_score_threshold = 0
synteny_rescore_jobs = 20
;lsf_opt_synteny_rescore =
; InternalIdMapper
;plugin_internal_id_mappers_gene = \
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id
;plugin_internal_id_mappers_transcript = \
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene
;plugin_internal_id_mappers_exon = \
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic,\
; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript
; StableIdMapper
mapping_types = gene,transcript,translation,exon
;plugin_stable_id_generator = Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric
......
......@@ -87,6 +87,9 @@ $conf->parse_options(
'exonerate_jobs|exoneratejobs=i' => 0,
'exonerate_bytes_per_job|exoneratebytesperjob=f' => 0,
'exonerate_extra_params|exonerateextraparams=s' => 0,
'plugin_internal_id_mappers_gene=s@' => 0,
'plugin_internal_id_mappers_transcript=s@' => 0,
'plugin_internal_id_mappers_exon=s@' => 0,
'mapping_types=s@' => 1,
'plugin_stable_id_generator=s' => 0,
'upload_events|uploadevents=s' => 0,
......
......@@ -96,6 +96,9 @@ $conf->parse_options(
'exonerate_jobs|exoneratejobs=i' => 0,
'exonerate_bytes_per_job|exoneratebytesperjob=f' => 0,
'exonerate_extra_params|exonerateextraparams=s' => 0,
'plugin_internal_id_mappers_gene=s@' => 0,
'plugin_internal_id_mappers_transcript=s@' => 0,
'plugin_internal_id_mappers_exon=s@' => 0,
'mapping_types=s@' => 1,
'plugin_stable_id_generator=s' => 0,
'upload_events|uploadevents=s' => 0,
......
package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::IdMapping::BaseObject;
our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
use Bio::EnsEMBL::IdMapping::MappingList;
# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
use constant SIMILAR_SCORE_RATIO => 0.01;
#
# find the highest unambiguous score for all sources and targets in a scoring
# matrix
#
# Arguments: $matrix       - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#            $mapping_name - base name of the serialised mapping file
# Returns:   a Bio::EnsEMBL::IdMapping::MappingList, either read back from
#            "<basedir>/mapping/<mapping_name>.ser" or freshly computed and
#            written there
# Throws:    if $matrix is missing or of the wrong class, or if no
#            $mapping_name is given
sub basic_mapping {
  my ($self, $matrix, $mapping_name) = @_;

  # argument checks
  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.')
    unless ($matrix and
            $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'));
  throw('Need a name for serialising the mapping.') unless ($mapping_name);

  # AUTO_LOAD is requested so that the list can pick up an existing
  # serialised mapping; ->loaded below tells us whether that happened
  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH  => path_append($self->conf->param('basedir'), 'mapping'),
    -CACHE_FILE => "${mapping_name}.ser",
    -AUTO_LOAD  => 1,
  );

  # checkpoint test: a previously stored MappingList is returned untouched
  if ($mappings->loaded) {
    $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  # internal ids that have already been assigned a mapping
  my %source_mapped;
  my %target_mapped;

  # walk the matrix entries from highest to lowest score
  my @by_score = sort { $b->score <=> $a->score }
    @{ $matrix->get_all_Entries };

  while (my $entry = shift(@by_score)) {
    my $src = $entry->source;
    my $tgt = $entry->target;

    # either end of this entry is already taken
    next if ($source_mapped{$src} or $target_mapped{$tgt});

    # a still-unmapped competitor scores strictly higher
    next if ($self->higher_score_exists($entry, $matrix, \%source_mapped,
                                        \%target_mapped));

    # ambiguous entries are skipped here unless every competitor has
    # already been mapped elsewhere
    my (@rival_sources, @rival_targets);
    if ($self->ambiguous_mapping($entry, $matrix, \@rival_sources,
                                 \@rival_targets)) {
      my $open_sources = $self->filter_sources(\@rival_sources, \%source_mapped);
      my $open_targets = $self->filter_targets(\@rival_targets, \%target_mapped);
      next if (scalar(@$open_sources) or scalar(@$open_targets));
    }

    # this is the best mapping, keep it
    $mappings->add_Entry($entry);
    $source_mapped{$entry->source} = 1;
    $target_mapped{$entry->target} = 1;
  }

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
}
#
# Tell whether a not-yet-mapped competitor beats $entry's score.
#
# Arguments: $entry        - object providing source(), target() and score()
#            $matrix       - object providing get_sources_for_target(),
#                            get_targets_for_source() and get_score()
#            $sources_done - hashref keyed by source ids already mapped
#            $targets_done - hashref keyed by target ids already mapped
# Returns:   1 if another unmapped source for the same target, or another
#            unmapped target for the same source, has a strictly higher
#            score; 0 otherwise
#
sub higher_score_exists {
  my ($self, $entry, $matrix, $sources_done, $targets_done) = @_;

  my ($source, $target, $score) =
    ($entry->source, $entry->target, $entry->score);

  # competitors on the source side (same target)
  for my $rival (@{ $matrix->get_sources_for_target($target) }) {
    next if ($rival == $source);
    next if ($sources_done->{$rival});
    return 1 if ($score < $matrix->get_score($rival, $target));
  }

  # competitors on the target side (same source)
  for my $rival (@{ $matrix->get_targets_for_source($source) }) {
    next if ($rival == $target);
    next if ($targets_done->{$rival});
    return 1 if ($score < $matrix->get_score($source, $rival));
  }

  return 0;
}
#
# find ambiguous mappings (see scores_similar() for definition)
#
# Arguments: $entry         - object providing source(), target() and score()
#            $matrix        - object providing get_sources_for_target(),
#                             get_targets_for_source() and get_score()
#            $other_sources - arrayref; competing source ids are appended
#            $other_targets - arrayref; competing target ids are appended
# Returns:   1 if at least one competitor was appended, 0 otherwise
#
# A competitor is any other source for the same target (or other target for
# the same source) whose score is similar to, or higher than, $entry's score.
sub ambiguous_mapping {
  my ($self, $entry, $matrix, $other_sources, $other_targets) = @_;

  my ($source, $target, $score) =
    ($entry->source, $entry->target, $entry->score);

  my $found = 0;

  for my $rival (@{ $matrix->get_sources_for_target($target) }) {
    my $rival_score = $matrix->get_score($rival, $target);

    next if ($rival == $source);
    next unless ($self->scores_similar($score, $rival_score) or
                 $score < $rival_score);

    push @{ $other_sources }, $rival;
    $found = 1;
  }

  for my $rival (@{ $matrix->get_targets_for_source($source) }) {
    my $rival_score = $matrix->get_score($source, $rival);

    next if ($rival == $target);
    next unless ($self->scores_similar($score, $rival_score) or
                 $score < $rival_score);

    push @{ $other_targets }, $rival;
    $found = 1;
  }

  return $found;
}
#
# rule for similarity taken from java code...
#
# Arguments: $s1, $s2 - the two scores to compare
# Returns:   true if the scores count as similar, false otherwise
#
# Two scores are similar if 2 * |s1 - s2| / (s1 + s2) is below
# SIMILAR_SCORE_RATIO. An exact match ($s1 == 1) is never considered similar
# to a lower score.
sub scores_similar {
  my ($self, $s1, $s2) = @_;

  # always give priority to exact matches over very similar ones
  return 0 if ($s1 == 1 and $s2 < 1);

  my $diff = abs($s1 - $s2);
  my $sum  = $s1 + $s2;

  # A zero denominator (e.g. both scores are 0 or undef) would make the
  # ratio below die with "Illegal division by zero". Identical scores are
  # similar by definition; anything else is not.
  if ($sum == 0) {
    return ($diff == 0) ? 1 : 0;
  }

  my $pc = 2 * $diff / $sum;

  return ($pc < SIMILAR_SCORE_RATIO);
}
#
# Drop source ids that have already been mapped.
#
# Arguments: $other_sources - arrayref of source ids
#            $sources_done  - hashref keyed by source ids already mapped
# Returns:   $other_sources itself if either argument is empty, otherwise a
#            new arrayref holding the ids with no true entry in $sources_done
#
sub filter_sources {
  my ($self, $other_sources, $sources_done) = @_;

  # nothing to filter, or nothing to filter against: hand the input back
  return $other_sources if (!scalar(@$other_sources));
  return $other_sources if (!scalar(keys %$sources_done));

  my @remaining = grep { !$sources_done->{$_} } @{ $other_sources };

  return \@remaining;
}
#
# Drop target ids that have already been mapped.
#
# Arguments: $other_targets - arrayref of target ids
#            $targets_done  - hashref keyed by target ids already mapped
# Returns:   $other_targets itself if either argument is empty, otherwise a
#            new arrayref holding the ids with no true entry in $targets_done
#
sub filter_targets {
  my ($self, $other_targets, $targets_done) = @_;

  # nothing to filter, or nothing to filter against: hand the input back
  return $other_targets if (!scalar(@{ $other_targets }));
  return $other_targets if (!scalar(keys %$targets_done));

  my @remaining = grep { !$targets_done->{$_} } @{ $other_targets };

  return \@remaining;
}
1;
package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http:#www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
#
# basic mapping
#
# Arguments: $num         - counter appended to the checkpoint file names
#            $esb         - object providing create_shrinked_matrix()
#            $mappings    - accepted for a uniform plugin signature; the
#                           incoming value is not used
#            $exon_scores - scoring matrix handed to basic_mapping()
# Returns:   list of (matrix returned by create_shrinked_matrix(), mappings
#            returned by basic_mapping())
sub init_basic {
  my ($self, $num, $esb, $mappings, $exon_scores) = @_;

  $self->logger->info("Basic exon mapping...\n", 0, 'stamped');

  # the mappings passed in are replaced by the freshly computed ones
  $mappings = $self->basic_mapping($exon_scores, "exon_mappings$num");

  # the shrunk matrix is stored under the next counter value
  $num++;

  return (
    $esb->create_shrinked_matrix($exon_scores, $mappings, "exon_matrix$num"),
    $mappings,
  );
}
#
# reduce score for mappings of exons which do not belong to mapped
# transcripts
#
# Arguments: $num         - counter appended to the checkpoint file names
#            $esb         - object providing non_mapped_transcript_rescore()
#                           and create_shrinked_matrix()
#            $mappings    - mappings passed on to the rescoring step
#            $exon_scores - scoring matrix to rescore and map
# Returns:   list of (matrix returned by create_shrinked_matrix(), mappings
#            returned by basic_mapping())
sub mapped_transcript {
  my ($self, $num, $esb, $mappings, $exon_scores) = @_;

  $self->logger->info("Exons in mapped transcript...\n", 0, 'stamped');

  # only rescore (and checkpoint) if the matrix does not report itself
  # as loaded
  if (!$exon_scores->loaded) {
    $esb->non_mapped_transcript_rescore($exon_scores, $mappings);
    $exon_scores->write_to_file;
  }

  # from here on $mappings holds the result of this step
  $mappings = $self->basic_mapping($exon_scores, "exon_mappings$num");

  # the shrunk matrix is stored under the next counter value
  $num++;

  return (
    $esb->create_shrinked_matrix($exon_scores, $mappings, "exon_matrix$num"),
    $mappings,
  );
}
1;
package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric;
=head1 NAME
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric - default Ensembl
InternalIdMapper implementation for genes
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http://www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
#
# basic mapping
#
# Arguments: $num         - counter appended to the checkpoint file names
#            $gsb         - object providing create_shrinked_matrix()
#            $mappings    - accepted for a uniform plugin signature; the
#                           incoming value is not used
#            $gene_scores - scoring matrix handed to basic_mapping()
# Returns:   list of (matrix returned by create_shrinked_matrix(), mappings
#            returned by basic_mapping())
sub init_basic {
  my ($self, $num, $gsb, $mappings, $gene_scores) = @_;

  $self->logger->info("Basic gene mapping...\n", 0, 'stamped');

  # the mappings passed in are replaced by the freshly computed ones
  $mappings = $self->basic_mapping($gene_scores, "gene_mappings$num");

  # the shrunk matrix is stored under the next counter value
  $num++;

  return (
    $gsb->create_shrinked_matrix($gene_scores, $mappings, "gene_matrix$num"),
    $mappings,
  );
}
#
# build the synteny from unambiguous mappings
#
# Arguments: $num         - counter appended to the checkpoint file names
#            $gsb         - object providing create_shrinked_matrix()
#            $mappings    - mappings used to build the synteny framework
#            $gene_scores - scoring matrix to rescore and map
# Returns:   list of (matrix returned by create_shrinked_matrix(), mappings
#            returned by basic_mapping())
sub synteny {
  my $self = shift;
  my $num = shift;
  my $gsb = shift;
  my $mappings = shift;
  my $gene_scores = shift;

  # only rescore (and checkpoint) if the matrix does not report itself
  # as loaded
  unless ($gene_scores->loaded) {

    # This package does not 'use' SyntenyFramework, so load it on demand
    # rather than relying on some other module having pulled it in.
    require Bio::EnsEMBL::IdMapping::SyntenyFramework;

    $self->logger->info("Synteny Framework building...\n", 0, 'stamped');

    my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

    my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
      -DUMP_PATH    => $dump_path,
      -CACHE_FILE   => 'synteny_framework.ser',
      -LOGGER       => $self->logger,
      -CONF         => $self->conf,
      -CACHE        => $self->cache,
    );

    $sf->build_synteny($mappings);

    # use it to rescore the genes
    $self->logger->info("\nSynteny assisted mapping...\n", 0, 'stamped');
    $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);

    # checkpoint
    $gene_scores->write_to_file;
  }

  my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num");

  # the shrunk matrix is stored under the next counter value
  $num++;
  my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings,
    "gene_matrix$num");

  return ($new_scores, $new_mappings);
}
#
# rescore with simple scoring function and try again
#
# Arguments: $num               - counter appended to the checkpoint file names
#            $gsb               - object providing simple_gene_rescore() and
#                                 create_shrinked_matrix()
#            $mappings          - accepted for a uniform plugin signature; not
#                                 used here
#            $gene_scores       - gene scoring matrix to rescore and map
#            $transcript_scores - transcript scoring matrix passed to the
#                                 rescoring step
# Returns:   list of (matrix returned by create_shrinked_matrix(), mappings
#            returned by basic_mapping())
sub best_transcript {
  my ($self, $num, $gsb, $mappings, $gene_scores, $transcript_scores) = @_;

  $self->logger->info("Retry with simple best transcript score...\n", 0, 'stamped');

  # only rescore (and checkpoint) if the matrix does not report itself
  # as loaded
  if (!$gene_scores->loaded) {
    $gsb->simple_gene_rescore($gene_scores, $transcript_scores);
    $gene_scores->write_to_file;
  }

  my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num");

  # the shrunk matrix is stored under the next counter value
  $num++;

  return (
    $gsb->create_shrinked_matrix($gene_scores, $new_mappings, "gene_matrix$num"),
    $new_mappings,
  );
}
#
# rescore by penalising scores between genes with different biotypes
#
# Arguments: $num         - counter appended to the checkpoint file names
#            $gsb         - object providing biotype_gene_rescore() and
#                           create_shrinked_matrix()
#            $mappings    - accepted for a uniform plugin signature; not used
#                           here
#            $gene_scores - gene scoring matrix to rescore and map
# Returns:   list of (matrix returned by create_shrinked_matrix(), mappings
#            returned by basic_mapping())
sub biotype {
  my ($self, $num, $gsb, $mappings, $gene_scores) = @_;

  $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped');

  # only rescore (and checkpoint) if the matrix does not report itself
  # as loaded
  if (!$gene_scores->loaded) {
    $gsb->biotype_gene_rescore($gene_scores);
    $gene_scores->write_to_file;
  }

  my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num");

  # the shrunk matrix is stored under the next counter value
  $num++;

  return (
    $gsb->create_shrinked_matrix($gene_scores, $new_mappings, "gene_matrix$num"),
    $new_mappings,
  );
}
#
# selectively rescore by penalising scores between genes with different
# internalIDs
#
# Arguments: $num         - counter appended to the checkpoint file names
#            $gsb         - object providing internal_id_rescore() and
#                           create_shrinked_matrix()
#            $mappings    - accepted for a uniform plugin signature; not used
#                           here
#            $gene_scores - gene scoring matrix to rescore and map
# Returns:   list of (matrix returned by create_shrinked_matrix(), mappings
#            returned by basic_mapping())
sub internal_id {
  my ($self, $num, $gsb, $mappings, $gene_scores) = @_;

  $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped');

  # only rescore (and checkpoint) if the matrix does not report itself
  # as loaded
  if (!$gene_scores->loaded) {
    $gsb->internal_id_rescore($gene_scores);
    $gene_scores->write_to_file;
  }

  my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num");

  # the shrunk matrix is stored under the next counter value
  $num++;

  return (
    $gsb->create_shrinked_matrix($gene_scores, $new_mappings, "gene_matrix$num"),
    $new_mappings,
  );
}