From df978e0fb339fef310243e522c40013d8a4564d2 Mon Sep 17 00:00:00 2001 From: ens-bwalts <bwalts@ebi.ac.uk> Date: Wed, 20 Nov 2019 10:01:49 +0000 Subject: [PATCH] implement a simple cacheing system for Mapper interval trees --- modules/Bio/EnsEMBL/Mapper.pm | 119 ++++++++++++++++++++++++---------- modules/t/mapper.t | 9 +++ 2 files changed, 95 insertions(+), 33 deletions(-) diff --git a/modules/Bio/EnsEMBL/Mapper.pm b/modules/Bio/EnsEMBL/Mapper.pm index 801f932ecd..699dca486c 100644 --- a/modules/Bio/EnsEMBL/Mapper.pm +++ b/modules/Bio/EnsEMBL/Mapper.pm @@ -180,6 +180,8 @@ sub new { my $self = bless( { "_pair_$from" => {}, "_pair_$to" => {}, + "_tree_$from" => {}, + "_tree_$to" => {}, 'pair_count' => 0, 'to' => $to, 'from' => $from, @@ -211,6 +213,8 @@ sub flush { $self->{"_pair_$from"} = {}; $self->{"_pair_$to"} = {}; + $self->{"_tree_$from"} = {}; + $self->{"_tree_$to"} = {}; $self->{'pair_count'} = 0; } @@ -301,35 +305,45 @@ sub map_coordinates { my $last_used_pair; + # my best guess is that lr stands for "list of regions" my $lr = $hash->{ uc($id) }; - - # create set of intervals to be checked for overlap - my $from_intervals; - foreach my $i (@{$lr}) { - my $start = $i->{$from}{start}; - my $end = $i->{$from}{end}; + # if we don't already have an interval tree built for this id, + # build one now + if ( !defined( $self->{"_tree_$type"}->{ uc($id) } )) { + # create set of intervals to be checked for overlap + my $from_intervals; + + foreach my $i (@{$lr}) { + my $start = $i->{$from}{start}; + my $end = $i->{$from}{end}; + + if ($end < $start) { + my $tmp = $start; + $start = $end; + $end = $tmp; + } - if ($end < $start) { - my $tmp = $start; - $start = $end; - $end = $tmp; + push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i); } - push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i); + # + # Create the interval tree defined on the above set of intervals + # + # Two options: + # + # 1. Use immutable interval tree implementation + # 2. Use mutable interval tree implementation + # + # As of release/99, we have more experience with the immutable + # interval tree, so we will stick with this one. A future + # refactoring effort may wish to replace this with a dynamically + # maintained mutable interval tree, rather than simply throwing + # trees away and rebuilding when the underlying interval set changes + $self->{"_tree_$type"}->{ uc($id) } = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals); } - - # - # Create and query the interval tree defined on the above set of intervals - # - # Two options: - # - # 1. Use immutable interval tree implementation - # 2. Use mutable interval tree implementation - # - # Opt for the first one as the interval set's not supposed to change - my $itree = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals); - my $overlap = $itree->query($start, $end); + # query the interval tree (either cached or created new) for overlapping intervals + my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end); my $rank = 0; my $orig_start = $start; @@ -695,6 +709,11 @@ sub add_map_coordinates { push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } }, $pair ); push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair ); + # any interval trees for this set of intervals are now invalid + # so remove them + $self->{"_tree_$map_to"}->{ uc($contig_id) } = undef; + $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef; + $self->{'pair_count'}++; $self->{'_is_sorted'} = 0; } ## end sub add_map_coordinates @@ -750,6 +769,11 @@ sub add_indel_coordinates{ push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair ); push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair ); + # any interval trees for this set of intervals are now invalid + # so remove them + $self->{"_tree_$map_to"}->{ uc($chr_name) } = undef; + $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef; + $self->{'pair_count'}++; $self->{'_is_sorted'} = 0; @@ -806,18 +830,42 @@ sub map_indel { my $lr = $hash->{ uc($id) }; - # create set of intervals to be checked for overlap - my $from_intervals; - foreach my $i (@{$lr}) { - my $start = $i->{$from}{start}<=$i->{$from}{end}?$i->{$from}{start}:$i->{$from}{end}; - my $end = $i->{$from}{start}<=$i->{$from}{end}?$i->{$from}{end}:$i->{$from}{start}; + # if we don't already have an interval tree built for this id, + # build one now + if ( !defined $self->{"_tree_$type"}->{ uc($id) } ) { + # create set of intervals to be checked for overlap + my $from_intervals; - push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i); - } + foreach my $i (@{$lr}) { + my $start = $i->{$from}{start}; + my $end = $i->{$from}{end}; + + if ($end < $start) { + my $tmp = $start; + $start = $end; + $end = $tmp; + } + + push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i); + } - # create and query the interval tree defined on the above set of intervals - my $itree = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals); - my $overlap = $itree->query($start, $end); + # + # Create the interval tree defined on the above set of intervals + # + # Two options: + # + # 1. Use immutable interval tree implementation + # 2. Use mutable interval tree implementation + # + # As of release/99, we have more experience with the immutable + # interval tree, so we will stick with this one. A future + # refactoring effort may wish to replace this with a dynamically + # maintained mutable interval tree, rather than simply throwing + # trees away and rebuilding when the underlying interval set changes + $self->{"_tree_$type"}->{ uc($id) } = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals); +} + # query the interval tree (either cached or created new) for overlapping intervals + my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end); foreach my $i (@{$overlap}) { my $pair = $i->data; @@ -866,12 +914,14 @@ sub add_Mapper{ foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) { push(@{$self->{"_pair_$mapper_to"}->{$seq_name}}, @{$mapper->{"_pair_$mapper_to"}->{$seq_name}}); + $self->{"_tree_$mapper_to"}->{ uc($seq_name) } = undef; $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}}); } my $count_b = 0; foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) { push(@{$self->{"_pair_$mapper_from"}->{$seq_name}}, @{$mapper->{"_pair_$mapper_from"}->{$seq_name}}); + $self->{"_tree_$mapper_from"}->{ uc($seq_name) } = undef; $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}}); } @@ -1099,6 +1149,9 @@ sub _merge_pairs { $self->{'pair_count'} = 0; for my $key ( keys %{$self->{"_pair_$map_to"}} ) { + # merging pairs means all interval trees for + # this set of intervals are now invalid, so delete them + $self->{"_tree_$map_to"}->{ uc($key) } = undef; $lr = $self->{"_pair_$map_to"}->{$key}; my $i = 0; diff --git a/modules/t/mapper.t b/modules/t/mapper.t index caf88ff710..ca1678bfdd 100644 --- a/modules/t/mapper.t +++ b/modules/t/mapper.t @@ -80,6 +80,8 @@ test_transform ($mapper, # # check if the mapper can do merging +# at the same time, check that IntervalTrees are correctly cached +# and that IntervalTree caches are correctly invalidated # $mapper = Bio::EnsEMBL::Mapper->new( "asm1", "asm2" ); @@ -88,10 +90,17 @@ $mapper->add_map_coordinates( "1", 1, 10, 1, "1", 101, 110 ); $mapper->add_map_coordinates( "1", 21, 30, 1, "1", 121, 130 ); $mapper->add_map_coordinates( "1", 11, 20, 1, "1", 111, 120 ); +is($mapper->{_tree_asm1}->{1}, undef, "tree not yet created for asm1 interval 1"); + test_transform( $mapper, [ "1", 5, 25, 1, "asm1" ], [ "1", 105, 125, 1 ] ); +is(defined($mapper->{_tree_asm1}->{1}), 1, "tree created and cached for asm1 interval 1"); + + +$mapper->add_map_coordinates( "1", 31, 40, 1, "1", 131, 140); +is($mapper->{_tree_asm1}->{1}, undef, "tree deleted after modifying asm1 interval 1"); # -- GitLab