From ce19e68064d36484e4932fbebd10810c2a6502bd Mon Sep 17 00:00:00 2001 From: Patrick Meidl <pm2@sanger.ac.uk> Date: Mon, 28 Apr 2008 20:59:53 +0000 Subject: [PATCH] plugin architecture for InternalIdMapper --- misc-scripts/id_mapping/default.conf | 23 +- misc-scripts/id_mapping/id_mapping.pl | 3 + misc-scripts/id_mapping/run.pl | 3 + .../Bio/EnsEMBL/IdMapping/InternalIdMapper.pm | 621 ++++-------------- .../IdMapping/InternalIdMapper/BaseMapper.pm | 250 +++++++ .../InternalIdMapper/EnsemblExonGeneric.pm | 91 +++ .../InternalIdMapper/EnsemblGeneGeneric.pm | 187 ++++++ .../EnsemblTranscriptGeneric.pm | 276 ++++++++ 8 files changed, 971 insertions(+), 483 deletions(-) create mode 100644 modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm create mode 100644 modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm create mode 100644 modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm create mode 100644 modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm diff --git a/misc-scripts/id_mapping/default.conf b/misc-scripts/id_mapping/default.conf index 2d90cd045c..8d5b95c597 100644 --- a/misc-scripts/id_mapping/default.conf +++ b/misc-scripts/id_mapping/default.conf @@ -5,7 +5,7 @@ dry_run = 0 loglevel = DEBUG ; paths -basedir = /lustre/work1/ensembl/pm2/idmapping/perl/2008-04-22c +basedir = /lustre/work1/ensembl/pm2/idmapping/perl/2008-04-28 ; prepend this path to your 'log' parameter ; will default to "$basedir/log" if not set @@ -27,7 +27,7 @@ targetdbname = pm2_pan_troglodytes_core_41_21 ; caching ;cache_method = build_cache_all build_cache_auto_threshold = 100 -build_cache_concurrent_jobs = 200 +build_cache_concurrent_jobs = 200 ; limit ;region = chromosome:CHIMP1A:1:1:2000000:1 @@ -50,6 +50,25 @@ transcript_score_threshold = 0 synteny_rescore_jobs = 20 ;lsf_opt_synteny_rescore = +; InternalIdMapper +;plugin_internal_id_mappers_gene = \ +; 
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id + +;plugin_internal_id_mappers_transcript = \ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene + +;plugin_internal_id_mappers_exon = \ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic,\ +; Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript + ; StableIdMapper mapping_types = gene,transcript,translation,exon ;plugin_stable_id_generator = Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric diff --git a/misc-scripts/id_mapping/id_mapping.pl b/misc-scripts/id_mapping/id_mapping.pl index 6030851bf0..f1b4903d2b 100755 --- a/misc-scripts/id_mapping/id_mapping.pl +++ b/misc-scripts/id_mapping/id_mapping.pl @@ -87,6 +87,9 @@ $conf->parse_options( 'exonerate_jobs|exoneratejobs=i' => 0, 'exonerate_bytes_per_job|exoneratebytesperjob=f' => 0, 'exonerate_extra_params|exonerateextraparams=s' => 0, + 'plugin_internal_id_mappers_gene=s@' => 0, + 'plugin_internal_id_mappers_transcript=s@' => 0, + 'plugin_internal_id_mappers_exon=s@' => 0, 'mapping_types=s@' => 1, 'plugin_stable_id_generator=s' => 0, 'upload_events|uploadevents=s' => 0, diff --git a/misc-scripts/id_mapping/run.pl b/misc-scripts/id_mapping/run.pl index ad367d3ad0..f764352b89 100755 --- 
a/misc-scripts/id_mapping/run.pl +++ b/misc-scripts/id_mapping/run.pl @@ -96,6 +96,9 @@ $conf->parse_options( 'exonerate_jobs|exoneratejobs=i' => 0, 'exonerate_bytes_per_job|exoneratebytesperjob=f' => 0, 'exonerate_extra_params|exonerateextraparams=s' => 0, + 'plugin_internal_id_mappers_gene=s@' => 0, + 'plugin_internal_id_mappers_transcript=s@' => 0, + 'plugin_internal_id_mappers_exon=s@' => 0, 'mapping_types=s@' => 1, 'plugin_stable_id_generator=s' => 0, 'upload_events|uploadevents=s' => 0, diff --git a/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm index bd2fe1b497..449b584369 100644 --- a/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm +++ b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper.pm @@ -37,7 +37,7 @@ use Bio::EnsEMBL::IdMapping::BaseObject; our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); use Bio::EnsEMBL::Utils::Exception qw(throw warning); -use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append); use Bio::EnsEMBL::IdMapping::Entry; use Bio::EnsEMBL::IdMapping::MappingList; use Bio::EnsEMBL::IdMapping::SyntenyFramework; @@ -93,110 +93,44 @@ sub map_genes { # create gene mappings $self->logger->info("No gene mappings found. 
Will calculate them now.\n"); - # - # basic mapping - # - $self->logger->info("Basic gene mapping...\n", 0, 'stamped'); - - my $mappings0 = $self->basic_mapping($gene_scores, 'gene_mappings0'); - - my $gene_scores1 = $gsb->create_shrinked_matrix($gene_scores, $mappings0, - 'gene_matrix1'); - - - # - # build the synteny from unambiguous mappings - # - unless ($gene_scores1->loaded) { - $self->logger->info("Synteny Framework building...\n", 0, 'stamped'); - my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( - -DUMP_PATH => $dump_path, - -CACHE_FILE => 'synteny_framework.ser', - -LOGGER => $self->logger, - -CONF => $self->conf, - -CACHE => $self->cache, - ); - $sf->build_synteny($mappings0); - - # use it to rescore the genes - $self->logger->info("\nSynteny assisted mapping...\n", 0, 'stamped'); - $gene_scores1 = $sf->rescore_gene_matrix_lsf($gene_scores1); - - # checkpoint - $gene_scores1->write_to_file; - } - - my $mappings1 = $self->basic_mapping($gene_scores1, 'gene_mappings1'); - - my $gene_scores2 = $gsb->create_shrinked_matrix($gene_scores1, $mappings1, - 'gene_matrix2'); - + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_gene'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'gene_mappings0.ser', + ); + my @mappings = (); + my $i = 0; # - # rescore with simple scoring function and try again + # run the scoring chain # - $self->logger->info("Retry with simple best 
transcript score...\n", 0, 'stamped'); - - unless ($gene_scores2->loaded) { - $gsb->simple_gene_rescore($gene_scores2, $transcript_scores); - $gene_scores2->write_to_file; - } - - my $mappings2 = $self->basic_mapping($gene_scores2, 'gene_mappings2'); - - my $gene_scores3 = $gsb->create_shrinked_matrix($gene_scores2, $mappings2, - 'gene_matrix3'); + foreach my $plugin (@plugins) { + ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++, + $gsb, $new_mappings, $gene_scores, $transcript_scores); - - # - # rescore by penalising scores between genes with different biotypes - # - $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped'); - - unless ($gene_scores3->loaded) { - $gsb->biotype_gene_rescore($gene_scores3); - $gene_scores3->write_to_file; + push(@mappings, $new_mappings); } - my $mappings3 = $self->basic_mapping($gene_scores3, 'gene_mappings3'); - - my $gene_scores4 = $gsb->create_shrinked_matrix($gene_scores3, $mappings3, - 'gene_matrix4'); - - - # - # selectively rescore by penalising scores between genes with different - # internalIDs - # - $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped'); - - unless ($gene_scores4->loaded) { - $gsb->internal_id_rescore($gene_scores4); - $gene_scores4->write_to_file; - } - - my $mappings4 = $self->basic_mapping($gene_scores4, 'gene_mappings4'); - - my $remaining_gene_scores = $gsb->create_shrinked_matrix( - $gene_scores4, $mappings4, 'remaining_gene_matrix'); - - - # # report remaining ambiguities - # - $self->logger->info($remaining_gene_scores->get_source_count. + $self->logger->info($gene_scores->get_source_count. " source genes are ambiguous with ". - $remaining_gene_scores->get_target_count." target genes.\n\n"); - - $self->log_ambiguous($remaining_gene_scores, 'gene'); + $gene_scores->get_target_count." 
target genes.\n\n"); + $self->log_ambiguous($gene_scores, 'gene'); - # # merge mappings and write to file - # - $mappings->add_all($mappings0, $mappings1, $mappings2, $mappings3, - $mappings4); - + $mappings->add_all(@mappings); $mappings->write_to_file; if ($self->logger->loglevel eq 'debug') { @@ -258,103 +192,44 @@ sub map_transcripts { # create transcript mappings $self->logger->info("No transcript mappings found. Will calculate them now.\n"); - # - # basic mapping - # - $self->logger->info("Basic transcript mapping...\n", 0, 'stamped'); - - my $mappings0 = $self->basic_mapping($transcript_scores, - 'transcript_mappings0'); - - my $transcript_scores1 = $tsb->create_shrinked_matrix( - $transcript_scores, $mappings0, 'transcript_matrix1'); - - - # - # handle cases with exact match but different translation - # - $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped'); - - unless ($transcript_scores1->loaded) { - $tsb->different_translation_rescore($transcript_scores1); - $transcript_scores1->write_to_file; - } - - my $mappings1 = $self->basic_mapping($transcript_scores1, - 'transcript_mappings1'); - - my $transcript_scores2 = $tsb->create_shrinked_matrix( - $transcript_scores1, $mappings1, 'transcript_matrix2'); - + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene + )); + + my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript'); + @plugins = @default_plugins unless (defined($plugins[0])); + + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + 
-CACHE_FILE => 'transcript_mappings0.ser', + ); + my @mappings = (); + my $i = 0; # - # reduce score for mappings of transcripts which do not belong to mapped - # genes + # run the scoring chain # - $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped'); - - unless ($transcript_scores2->loaded) { - $tsb->non_mapped_gene_rescore($transcript_scores2, $gene_mappings); - $transcript_scores2->write_to_file; - } - - my $mappings2 = $self->basic_mapping($transcript_scores2, - 'transcript_mappings2'); - - my $transcript_scores3 = $tsb->create_shrinked_matrix( - $transcript_scores2, $mappings2, 'transcript_matrix3'); - + foreach my $plugin (@plugins) { + ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin, + $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings); - # - # selectively rescore by penalising scores between transcripts with - # different internalIDs - # - $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped'); - - unless ($transcript_scores3->loaded) { - $tsb->internal_id_rescore($transcript_scores3); - $transcript_scores3->write_to_file; + push(@mappings, $new_mappings); } - my $mappings3 = $self->basic_mapping($transcript_scores3, - 'transcript_mappings3'); - - my $transcript_scores4 = $tsb->create_shrinked_matrix( - $transcript_scores3, $mappings3, 'transcript_matrix4'); - - - # - # handle ambiguities between transcripts in single genes - # - $self->logger->info("Transcripts in single genes...\n", 0, 'stamped'); - - unless ($transcript_scores4->loaded) { - $transcript_scores4->write_to_file; - } - - my $mappings4 = $self->same_gene_transcript_mapping($transcript_scores4, - 'transcript_mappings4'); - - my $remaining_transcript_scores = $tsb->create_shrinked_matrix( - $transcript_scores4, $mappings4, 'transcript_matrix5'); - - - # # report remaining ambiguities - # - $self->logger->info($remaining_transcript_scores->get_source_count. + $self->logger->info($transcript_scores->get_source_count. 
" source transcripts are ambiguous with ". - $remaining_transcript_scores->get_target_count." target transcripts.\n\n"); + $transcript_scores->get_target_count." target transcripts.\n\n"); - $self->log_ambiguous($remaining_transcript_scores, 'transcript'); + $self->log_ambiguous($transcript_scores, 'transcript'); - - # # merge mappings and write to file - # - $mappings->add_all($mappings0, $mappings1, $mappings2, $mappings3, - $mappings4); - + $mappings->add_all(@mappings); $mappings->write_to_file; if ($self->logger->loglevel eq 'debug') { @@ -417,49 +292,41 @@ sub map_exons { # create exon mappings $self->logger->info("No exon mappings found. Will calculate them now.\n"); - # - # basic mapping - # - $self->logger->info("Basic exon mapping...\n", 0, 'stamped'); + # determine which plugin methods to run + my @default_plugins = (qw( + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic + Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript + )); - my $mappings0 = $self->basic_mapping($exon_scores, 'exon_mappings0'); + my @plugins = $self->conf->param('plugin_internal_id_mappers_exon'); + @plugins = @default_plugins unless (defined($plugins[0])); - my $exon_scores1 = $esb->create_shrinked_matrix( $exon_scores, $mappings0, - 'exon_matrix1'); - + my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'exon_mappings0.ser', + ); + my @mappings = (); + my $i = 0; # - # reduce score for mappings of exons which do not belong to mapped - # transcripts + # run the scoring chain # - $self->logger->info("Exons in mapped transcripts...\n", 0, 'stamped'); - - unless ($exon_scores1->loaded) { - $esb->non_mapped_transcript_rescore($exon_scores1, $transcript_mappings); - $exon_scores1->write_to_file; - } - - my $mappings1 = $self->basic_mapping($exon_scores1, 'exon_mappings1'); - - my $remaining_exon_scores = $esb->create_shrinked_matrix( - $exon_scores1, $mappings1, 
'exon_matrix2'); + foreach my $plugin (@plugins) { + ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++, + $esb, $new_mappings, $exon_scores); + push(@mappings, $new_mappings); + } - # # report remaining ambiguities - # - $self->logger->info($remaining_exon_scores->get_source_count. + $self->logger->info($exon_scores->get_source_count. " source exons are ambiguous with ". - $remaining_exon_scores->get_target_count." target exons.\n\n"); + $exon_scores->get_target_count." target exons.\n\n"); - $self->log_ambiguous($remaining_exon_scores, 'exon'); + $self->log_ambiguous($exon_scores, 'exon'); - - # # merge mappings and write to file - # - $mappings->add_all($mappings0, $mappings1); - + $mappings->add_all(@mappings); $mappings->write_to_file; if ($self->logger->loglevel eq 'debug') { @@ -475,6 +342,10 @@ sub map_exons { } +# +# this is not implemented as a plugin, since a) it's too simple and b) it's +# tied to transcripts so there are no translation scores or score builder. +# sub map_translations { my $self = shift; my $transcript_mappings = shift; @@ -557,304 +428,92 @@ sub map_translations { } -# -# find the highest unambiguous score for all sources and targets in a scoring -# matrix -# -sub basic_mapping { +sub delegate_to_plugin { my $self = shift; - my $matrix = shift; - my $mapping_name = shift; + my $plugin = shift; + my $num = shift; + my $score_builder = shift; + my $mappings = shift; + my $scores = shift; # argument checks - unless ($matrix and - $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { - throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + unless ($score_builder and + $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.'); } - throw('Need a name for serialising the mapping.') unless ($mapping_name); - - # Create a new MappingList object. 
Specify AUTO_LOAD to load serialised - # existing mappings if found - my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); - - my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( - -DUMP_PATH => $dump_path, - -CACHE_FILE => "${mapping_name}.ser", - -AUTO_LOAD => 1, - ); - - # checkpoint test: return a previously stored MappingList - if ($mappings->loaded) { - $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n"); - return $mappings; + unless ($mappings and + $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { + throw('Need a Bio::EnsEMBL::IdMapping::MappingList.'); } - - my $sources_done = {}; - my $targets_done = {}; - - # sort scoring matrix entries by descending score - my @sorted_entries = sort { $b->score <=> $a->score } - @{ $matrix->get_all_Entries }; - - # debug - my $idx = substr($mapping_name, -1); - - while (my $entry = shift(@sorted_entries)) { - - #$self->logger->debug("\nxxx$idx ".$entry->to_string." "); - - # we already found a mapping for either source or target - next if ($sources_done->{$entry->source} or - $targets_done->{$entry->target}); - - #$self->logger->debug('d'); - - # there's a better mapping for either source or target - next if ($self->higher_score_exists($entry, $matrix, $sources_done, - $targets_done)); - - #$self->logger->debug('h'); - - # check for ambiguous mappings; they are dealt with later - my $other_sources = []; - my $other_targets = []; - - if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) { - #$self->logger->debug('a'); - - $other_sources = $self->filter_sources($other_sources, $sources_done); - $other_targets = $self->filter_targets($other_targets, $targets_done); - - next if (scalar(@$other_sources) or scalar(@$other_targets)); - } - - #$self->logger->debug('A'); - - # this is the best mapping, add it - $mappings->add_Entry($entry); - - $sources_done->{$entry->source} = 1; - $targets_done->{$entry->target} = 1; + + unless ($scores and + 
$scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); } - # create checkpoint - $mappings->write_to_file; + # split plugin name into module and method + $plugin =~ /(.*)::(\w+)$/; + my $module = $1; + my $method = $2; - return $mappings; -} - - -sub higher_score_exists { - my ($self, $entry, $matrix, $sources_done, $targets_done) = @_; - - my $source = $entry->source; - my $target = $entry->target; - my $score = $entry->score; - - foreach my $other_source (@{ $matrix->get_sources_for_target($target) }) { - if ($other_source != $source and !$sources_done->{$other_source} and - $score < $matrix->get_score($other_source, $target)) { - return 1; - } + unless ($module and $method) { + throw("Unable to determine module and method name from $plugin.\n"); } - foreach my $other_target (@{ $matrix->get_targets_for_source($source) }) { - if ($other_target != $target and !$targets_done->{$other_target} and - $score < $matrix->get_score($source, $other_target)) { - return 1; - } - } - - return 0; -} - - -# -# find ambiguous mappings (see scores_similar() for definition) -# -sub ambiguous_mapping { - my ($self, $entry, $matrix, $other_sources, $other_targets) = @_; - - my $source = $entry->source; - my $target = $entry->target; - my $score = $entry->score; - - my $retval = 0; - - foreach my $other_source (@{ $matrix->get_sources_for_target($target) }) { - my $other_score = $matrix->get_score($other_source, $target); + # instantiate the plugin unless we already have an instance + my $plugin_instance; + if ($self->has_plugin($module)) { - if ($other_source != $source and - ($self->scores_similar($score, $other_score) or $score < $other_score)) { - $retval = 1; - push @{ $other_sources }, $other_source; - } - } - - foreach my $other_target (@{ $matrix->get_targets_for_source($source) }) { - my $other_score = $matrix->get_score($source, $other_target); - - if ($other_target != $target and - 
($self->scores_similar($score, $other_score) or $score < $other_score)) { - $retval = 1; - push @{ $other_targets }, $other_target; - } - } - - return $retval; -} - - -# -# rule for similarity taken from java code... -# -sub scores_similar { - my ($self, $s1, $s2) = @_; - - # always give priority to exact matches over very similar ones - return 0 if ($s1 == 1 and $s2 < 1); - - my $diff = $s1 -$s2; - $diff = -$diff if ($diff < 0); + # re-use an existing plugin instance + $plugin_instance = $self->get_plugin($module); - my $pc = 2 * $diff / ($s1 + $s2); - - return ($pc < SIMILAR_SCORE_RATIO); -} - - -sub filter_sources { - my ($self, $other_sources, $sources_done) = @_; - - unless (scalar(@$other_sources) and scalar(keys %$sources_done)) { - return $other_sources; - } - - my @tmp = (); + } else { + + # inject and instantiate the plugin module + inject($module); + $plugin_instance = $module->new( + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache + ); + $self->add_plugin($plugin_instance); - foreach my $e (@{ $other_sources }) { - push @tmp, $e unless ($sources_done->{$e}); } - return \@tmp; + # run the method on the plugin + # + # pass in a sequence number (number of method run, used for generating + # checkpoint files), the scores used for determining the mapping, and all + # other arguments passed to this method (these will vary for different object + # types) + # + # return the scores and mappings to feed into the next plugin in the chain + return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_); } -sub filter_targets { - my ($self, $other_targets, $targets_done) = @_; - - unless (scalar(@{ $other_targets }) and scalar(keys %$targets_done)) { - return $other_targets; - } - - my @tmp = (); - - foreach my $e (@{ $other_targets }) { - push @tmp, $e unless ($targets_done->{$e}); - } +sub has_plugin { + my $self = shift; + my $module = shift; - return \@tmp; + defined($self->{'_plugins'}->{$module}) ? 
(return 1) : (return 0); } -# -# modified basic mapper that maps transcripts that are ambiguous within one gene -# -sub same_gene_transcript_mapping { +sub get_plugin { my $self = shift; - my $matrix = shift; - my $mapping_name = shift; - - # argument checks - unless ($matrix and - $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { - throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); - } - - throw('Need a name for serialising the mapping.') unless ($mapping_name); - - # Create a new MappingList object. Specify AUTO_LOAD to load serialised - # existing mappings if found - my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); - - my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( - -DUMP_PATH => $dump_path, - -CACHE_FILE => "${mapping_name}.ser", - -AUTO_LOAD => 1, - ); - - # checkpoint test: return a previously stored MappingList - if ($mappings->loaded) { - $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n"); - return $mappings; - } - - my $sources_done = {}; - my $targets_done = {}; - - # sort scoring matrix entries by descending score - my @sorted_entries = sort { $b->score <=> $a->score || - $a->source <=> $b->source || $a->target <=> $b->target } - @{ $matrix->get_all_Entries }; - - while (my $entry = shift(@sorted_entries)) { - - # $self->logger->debug("\nxxx4 ".$entry->to_string." 
"); - - # we already found a mapping for either source or target yet - next if ($sources_done->{$entry->source} or - $targets_done->{$entry->target}); - - #$self->logger->debug('d'); - - my $other_sources = []; - my $other_targets = []; - my %source_genes = (); - my %target_genes = (); - - if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) { - #$self->logger->debug('a'); - - $other_sources = $self->filter_sources($other_sources, $sources_done); - $other_targets = $self->filter_targets($other_targets, $targets_done); + my $module = shift; - $source_genes{$self->cache->get_by_key('genes_by_transcript_id', - 'source', $entry->source)} = 1; - $target_genes{$self->cache->get_by_key('genes_by_transcript_id', - 'target', $entry->target)} = 1; - - foreach my $other_source (@{ $other_sources }) { - $source_genes{$self->cache->get_by_key('genes_by_transcript_id', - 'source', $other_source)} = 1; - } - - foreach my $other_target (@{ $other_targets }) { - $target_genes{$self->cache->get_by_key('genes_by_transcript_id', - 'target', $other_target)} = 1; - } - - # only add mapping if only one source and target gene involved - if (scalar(keys %source_genes) == 1 and scalar(keys %target_genes) == 1) { - #$self->logger->debug('O'); - $mappings->add_Entry($entry); - } - - } else { - #$self->logger->debug('A'); - - # this is the best mapping, add it - $mappings->add_Entry($entry); - } + return $self->{'_plugins'}->{$module}; +} - $sources_done->{$entry->source} = 1; - $targets_done->{$entry->target} = 1; - } - # create checkpoint - $mappings->write_to_file; +sub add_plugin { + my $self = shift; + my $plugin_instance = shift; - return $mappings; + $self->{'_plugins'}->{ref($plugin_instance)} = $plugin_instance; } diff --git a/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm new file mode 100644 index 0000000000..77a6c8208f --- /dev/null +++ 
b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/BaseMapper.pm @@ -0,0 +1,250 @@ +package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; + +=head1 NAME + + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + + +=head1 METHODS + + +=head1 LICENCE + +This code is distributed under an Apache style licence. Please see +http://www.ensembl.org/info/about/code_licence.html for details. + +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::BaseObject; +our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); +use Bio::EnsEMBL::IdMapping::MappingList; + + +# scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this +use constant SIMILAR_SCORE_RATIO => 0.01; + + +# +# find the highest unambiguous score for all sources and targets in a scoring +# matrix +# +sub basic_mapping { + my $self = shift; + my $matrix = shift; + my $mapping_name = shift; + + # argument checks + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + throw('Need a name for serialising the mapping.') unless ($mapping_name); + + # Create a new MappingList object. 
Specify AUTO_LOAD to load serialised + # existing mappings if found + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => "${mapping_name}.ser", + -AUTO_LOAD => 1, + ); + + # checkpoint test: return a previously stored MappingList + if ($mappings->loaded) { + $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n"); + return $mappings; + } + + my $sources_done = {}; + my $targets_done = {}; + + # sort scoring matrix entries by descending score + my @sorted_entries = sort { $b->score <=> $a->score } + @{ $matrix->get_all_Entries }; + + # debug + #my $idx = substr($mapping_name, -1); + + while (my $entry = shift(@sorted_entries)) { + + #$self->logger->debug("\nxxx$idx ".$entry->to_string." "); + + # we already found a mapping for either source or target + next if ($sources_done->{$entry->source} or + $targets_done->{$entry->target}); + + #$self->logger->debug('d'); + + # there's a better mapping for either source or target + next if ($self->higher_score_exists($entry, $matrix, $sources_done, + $targets_done)); + + #$self->logger->debug('h'); + + # check for ambiguous mappings; they are dealt with later + my $other_sources = []; + my $other_targets = []; + + if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) { + #$self->logger->debug('a'); + + $other_sources = $self->filter_sources($other_sources, $sources_done); + $other_targets = $self->filter_targets($other_targets, $targets_done); + + next if (scalar(@$other_sources) or scalar(@$other_targets)); + } + + #$self->logger->debug('A'); + + # this is the best mapping, add it + $mappings->add_Entry($entry); + + $sources_done->{$entry->source} = 1; + $targets_done->{$entry->target} = 1; + } + + # create checkpoint + $mappings->write_to_file; + + return $mappings; +} + + +sub higher_score_exists { + my ($self, $entry, $matrix, $sources_done, 
$targets_done) = @_; + + my $source = $entry->source; + my $target = $entry->target; + my $score = $entry->score; + + foreach my $other_source (@{ $matrix->get_sources_for_target($target) }) { + if ($other_source != $source and !$sources_done->{$other_source} and + $score < $matrix->get_score($other_source, $target)) { + return 1; + } + } + + foreach my $other_target (@{ $matrix->get_targets_for_source($source) }) { + if ($other_target != $target and !$targets_done->{$other_target} and + $score < $matrix->get_score($source, $other_target)) { + return 1; + } + } + + return 0; +} + + +# +# find ambiguous mappings (see scores_similar() for definition) +# +sub ambiguous_mapping { + my ($self, $entry, $matrix, $other_sources, $other_targets) = @_; + + my $source = $entry->source; + my $target = $entry->target; + my $score = $entry->score; + + my $retval = 0; + + foreach my $other_source (@{ $matrix->get_sources_for_target($target) }) { + my $other_score = $matrix->get_score($other_source, $target); + + if ($other_source != $source and + ($self->scores_similar($score, $other_score) or $score < $other_score)) { + $retval = 1; + push @{ $other_sources }, $other_source; + } + } + + foreach my $other_target (@{ $matrix->get_targets_for_source($source) }) { + my $other_score = $matrix->get_score($source, $other_target); + + if ($other_target != $target and + ($self->scores_similar($score, $other_score) or $score < $other_score)) { + $retval = 1; + push @{ $other_targets }, $other_target; + } + } + + return $retval; +} + + +# +# rule for similarity taken from java code... 
+# +sub scores_similar { + my ($self, $s1, $s2) = @_; + + # always give priority to exact matches over very similar ones + return 0 if ($s1 == 1 and $s2 < 1); + + my $diff = $s1 -$s2; + $diff = -$diff if ($diff < 0); + + my $pc = 2 * $diff / ($s1 + $s2); + + return ($pc < SIMILAR_SCORE_RATIO); +} + + +sub filter_sources { + my ($self, $other_sources, $sources_done) = @_; + + unless (scalar(@$other_sources) and scalar(keys %$sources_done)) { + return $other_sources; + } + + my @tmp = (); + + foreach my $e (@{ $other_sources }) { + push @tmp, $e unless ($sources_done->{$e}); + } + + return \@tmp; +} + + +sub filter_targets { + my ($self, $other_targets, $targets_done) = @_; + + unless (scalar(@{ $other_targets }) and scalar(keys %$targets_done)) { + return $other_targets; + } + + my @tmp = (); + + foreach my $e (@{ $other_targets }) { + push @tmp, $e unless ($targets_done->{$e}); + } + + return \@tmp; +} + + +1; + diff --git a/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm new file mode 100644 index 0000000000..7caae11a37 --- /dev/null +++ b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblExonGeneric.pm @@ -0,0 +1,91 @@ +package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric; + +=head1 NAME + + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + + +=head1 METHODS + + +=head1 LICENCE + +This code is distributed under an Apache style licence. Please see +http://www.ensembl.org/info/about/code_licence.html for details. 
+ +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; +our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +# +# basic mapping +# +sub init_basic { + my $self = shift; + my $num = shift; + my $esb = shift; + my $mappings = shift; + my $exon_scores = shift; + + $self->logger->info("Basic exon mapping...\n", 0, 'stamped'); + + $mappings = $self->basic_mapping($exon_scores, "exon_mappings$num"); + $num++; + my $new_scores = $esb->create_shrinked_matrix($exon_scores, $mappings, + "exon_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# reduce score for mappings of exons which do not belong to mapped +# transcripts +# +sub mapped_transcript { + my $self = shift; + my $num = shift; + my $esb = shift; + my $mappings = shift; + my $exon_scores = shift; + + $self->logger->info("Exons in mapped transcript...\n", 0, 'stamped'); + + unless ($exon_scores->loaded) { + $esb->non_mapped_transcript_rescore($exon_scores, $mappings); + $exon_scores->write_to_file; + } + + $mappings = $self->basic_mapping($exon_scores, "exon_mappings$num"); + $num++; + my $new_scores = $esb->create_shrinked_matrix($exon_scores, $mappings, + "exon_matrix$num"); + + return ($new_scores, $mappings); +} + + +1; + diff --git a/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm new file mode 100644 index 0000000000..c424896f12 --- /dev/null +++ b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblGeneGeneric.pm @@ -0,0 +1,187 @@ +package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric; + +=head1 NAME + 
+Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric - default Ensembl +InternalIdMapper implementation for genes + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + + +=head1 METHODS + + +=head1 LICENCE + +This code is distributed under an Apache style licence. Please see +http://www.ensembl.org/info/about/code_licence.html for details. + +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; +our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +# +# basic mapping +# +sub init_basic { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + $self->logger->info("Basic gene mapping...\n", 0, 'stamped'); + + $mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $mappings, + "gene_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# build the synteny from unambiguous mappings +# +sub synteny { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + unless ($gene_scores->loaded) { + $self->logger->info("Synteny Framework building...\n", 0, 'stamped'); + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => 'synteny_framework.ser', + -LOGGER => $self->logger, + -CONF => $self->conf, + -CACHE => $self->cache, + ); + $sf->build_synteny($mappings); + + # use it to rescore the genes + $self->logger->info("\nSynteny assisted mapping...\n", 0, 'stamped'); + 
$gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores); + + # checkpoint + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +# +# rescore with simple scoring function and try again +# +sub best_transcript { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + my $transcript_scores = shift; + + $self->logger->info("Retry with simple best transcript score...\n", 0, 'stamped'); + + unless ($gene_scores->loaded) { + $gsb->simple_gene_rescore($gene_scores, $transcript_scores); + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +# +# rescore by penalising scores between genes with different biotypes +# +sub biotype { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped'); + + unless ($gene_scores->loaded) { + $gsb->biotype_gene_rescore($gene_scores); + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +# +# selectively rescore by penalising scores between genes with different +# internalIDs +# +sub internal_id { + my $self = shift; + my $num = shift; + my $gsb = shift; + my $mappings = shift; + my $gene_scores = shift; + + $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped'); + + unless 
($gene_scores->loaded) { + $gsb->internal_id_rescore($gene_scores); + $gene_scores->write_to_file; + } + + my $new_mappings = $self->basic_mapping($gene_scores, "gene_mappings$num"); + $num++; + my $new_scores = $gsb->create_shrinked_matrix($gene_scores, $new_mappings, + "gene_matrix$num"); + + return ($new_scores, $new_mappings); +} + + +1; + diff --git a/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm new file mode 100644 index 0000000000..8c7ad9ab33 --- /dev/null +++ b/modules/Bio/EnsEMBL/IdMapping/InternalIdMapper/EnsemblTranscriptGeneric.pm @@ -0,0 +1,276 @@ +package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric; + +=head1 NAME + + +=head1 SYNOPSIS + + +=head1 DESCRIPTION + + +=head1 METHODS + + +=head1 LICENCE + +This code is distributed under an Apache style licence. Please see +http://www.ensembl.org/info/about/code_licence.html for details. + +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper; +our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); + + +# +# basic mapping +# +sub init_basic { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Basic transcript mapping...\n", 0, 'stamped'); + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# handle cases with exact
match but different translation +# +sub non_exact_translation { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $tsb->different_translation_rescore($transcript_scores); + $transcript_scores->write_to_file; + } + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# reduce score for mappings of transcripts which do not belong to mapped +# genes +# +sub mapped_gene { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + my $gene_mappings = shift; + + $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $tsb->non_mapped_gene_rescore($transcript_scores, $gene_mappings); + $transcript_scores->write_to_file; + } + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# selectively rescore by penalising scores between transcripts with +# different internalIDs +# +sub internal_id { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $tsb->internal_id_rescore($transcript_scores); + $transcript_scores->write_to_file; + } + + $mappings = $self->basic_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, 
$mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# handle ambiguities between transcripts in single genes +# +sub single_gene { + my $self = shift; + my $num = shift; + my $tsb = shift; + my $mappings = shift; + my $transcript_scores = shift; + + $self->logger->info("Transcripts in single genes...\n", 0, 'stamped'); + + unless ($transcript_scores->loaded) { + $transcript_scores->write_to_file; + } + + $mappings = $self->same_gene_transcript_mapping($transcript_scores, + "transcript_mappings$num"); + $num++; + my $new_scores = $tsb->create_shrinked_matrix($transcript_scores, $mappings, + "transcript_matrix$num"); + + return ($new_scores, $mappings); +} + + +# +# modified basic mapper that maps transcripts that are ambiguous within one gene +# +sub same_gene_transcript_mapping { + my $self = shift; + my $matrix = shift; + my $mapping_name = shift; + + # argument checks + unless ($matrix and + $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { + throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); + } + + throw('Need a name for serialising the mapping.') unless ($mapping_name); + + # Create a new MappingList object. 
Specify AUTO_LOAD to load serialised + # existing mappings if found + my $dump_path = path_append($self->conf->param('basedir'), 'mapping'); + + my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new( + -DUMP_PATH => $dump_path, + -CACHE_FILE => "${mapping_name}.ser", + -AUTO_LOAD => 1, + ); + + # checkpoint test: return a previously stored MappingList + if ($mappings->loaded) { + $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n"); + return $mappings; + } + + my $sources_done = {}; + my $targets_done = {}; + + # sort scoring matrix entries by descending score + my @sorted_entries = sort { $b->score <=> $a->score || + $a->source <=> $b->source || $a->target <=> $b->target } + @{ $matrix->get_all_Entries }; + + while (my $entry = shift(@sorted_entries)) { + + # $self->logger->debug("\nxxx4 ".$entry->to_string." "); + + # we already found a mapping for either source or target yet + next if ($sources_done->{$entry->source} or + $targets_done->{$entry->target}); + + #$self->logger->debug('d'); + + my $other_sources = []; + my $other_targets = []; + my %source_genes = (); + my %target_genes = (); + + if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) { + #$self->logger->debug('a'); + + $other_sources = $self->filter_sources($other_sources, $sources_done); + $other_targets = $self->filter_targets($other_targets, $targets_done); + + $source_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'source', $entry->source)} = 1; + $target_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'target', $entry->target)} = 1; + + foreach my $other_source (@{ $other_sources }) { + $source_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'source', $other_source)} = 1; + } + + foreach my $other_target (@{ $other_targets }) { + $target_genes{$self->cache->get_by_key('genes_by_transcript_id', + 'target', $other_target)} = 1; + } + + # only add mapping if only one source and target gene involved + if (scalar(keys 
%source_genes) == 1 and scalar(keys %target_genes) == 1) { + #$self->logger->debug('O'); + $mappings->add_Entry($entry); + } + + } else { + #$self->logger->debug('A'); + + # this is the best mapping, add it + $mappings->add_Entry($entry); + } + + $sources_done->{$entry->source} = 1; + $targets_done->{$entry->target} = 1; + } + + # create checkpoint + $mappings->write_to_file; + + return $mappings; +} + + +1; + -- GitLab