From df978e0fb339fef310243e522c40013d8a4564d2 Mon Sep 17 00:00:00 2001
From: ens-bwalts <bwalts@ebi.ac.uk>
Date: Wed, 20 Nov 2019 10:01:49 +0000
Subject: [PATCH] implement a simple cacheing system for Mapper interval trees

---
 modules/Bio/EnsEMBL/Mapper.pm | 119 ++++++++++++++++++++++++----------
 modules/t/mapper.t            |   9 +++
 2 files changed, 95 insertions(+), 33 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Mapper.pm b/modules/Bio/EnsEMBL/Mapper.pm
index 801f932ecd..699dca486c 100644
--- a/modules/Bio/EnsEMBL/Mapper.pm
+++ b/modules/Bio/EnsEMBL/Mapper.pm
@@ -180,6 +180,8 @@ sub new {
 
   my $self = bless( { "_pair_$from" => {},
                       "_pair_$to"   => {},
+                      "_tree_$from" => {},
+                      "_tree_$to"   => {},
                       'pair_count'  => 0,
                       'to'          => $to,
                       'from'        => $from,
@@ -211,6 +213,8 @@ sub flush {
 
   $self->{"_pair_$from"} = {};
   $self->{"_pair_$to"} = {};
+  $self->{"_tree_$from"} = {};
+  $self->{"_tree_$to"} = {};
 
   $self->{'pair_count'} = 0;
 }
@@ -301,35 +305,45 @@ sub map_coordinates {
 
   my $last_used_pair;
 
+  # my best guess is that lr stands for "list of regions"
   my $lr = $hash->{ uc($id) };
-  
-  # create set of intervals to be checked for overlap
-  my $from_intervals;
 
-  foreach my $i (@{$lr}) {
-    my $start = $i->{$from}{start};
-    my $end = $i->{$from}{end};
+  # if we don't already have an interval tree built for this id,
+  # build one now
+  if ( !defined( $self->{"_tree_$type"}->{ uc($id) } )) {
+    # create set of intervals to be checked for overlap
+    my $from_intervals;
+
+    foreach my $i (@{$lr}) {
+      my $start = $i->{$from}{start};
+      my $end = $i->{$from}{end};
+
+      if ($end < $start) {
+        my $tmp = $start;
+        $start = $end;
+        $end = $tmp;
+      }
 
-    if ($end < $start) {
-      my $tmp = $start;
-      $start = $end;
-      $end = $tmp;
+      push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i);
     }
 
-    push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i);
+    #
+    # Create the interval tree defined on the above set of intervals
+    #
+    # Two options:
+    #
+    # 1. Use immutable interval tree implementation
+    # 2. Use mutable interval tree implementation
+    #
+    # As of release/99, we have more experience with the immutable
+    # interval tree, so we will stick with this one. A future
+    # refactoring effort may wish to replace this with a dynamically
+    # maintained mutable interval tree, rather than simply throwing
+    # trees away and rebuilding when the underlying interval set changes
+    $self->{"_tree_$type"}->{ uc($id) } = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals);
   }
-
-  #
-  # Create and query the interval tree defined on the above set of intervals
-  #
-  # Two options:
-  #
-  # 1. Use immutable interval tree implementation
-  # 2. Use mutable interval tree implementation
-  #
-  # Opt for the first one as the interval set's not supposed to change
-  my $itree = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals);
-  my $overlap = $itree->query($start, $end);
+  # query the interval tree (either cached or created new) for overlapping intervals
+  my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end);
 
   my $rank              = 0;
   my $orig_start        = $start;
@@ -695,6 +709,11 @@ sub add_map_coordinates {
   push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } },    $pair );
   push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair );
 
+  # any interval trees for this set of intervals are now invalid
+  # so remove them
+  $self->{"_tree_$map_to"}->{ uc($contig_id) } = undef;
+  $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef;
+
   $self->{'pair_count'}++;
   $self->{'_is_sorted'} = 0;
 } ## end sub add_map_coordinates
@@ -750,6 +769,11 @@ sub add_indel_coordinates{
   push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair );
   push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair );
 
+  # any interval trees for this set of intervals are now invalid
+  # so remove them
+  $self->{"_tree_$map_to"}->{ uc($chr_name) } = undef;
+  $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef;
+
   $self->{'pair_count'}++;
 
   $self->{'_is_sorted'} = 0;
@@ -806,18 +830,42 @@ sub map_indel {
 
   my $lr = $hash->{ uc($id) };
 
-  # create set of intervals to be checked for overlap
-  my $from_intervals;
-  foreach my $i (@{$lr}) {
-    my $start = $i->{$from}{start}<=$i->{$from}{end}?$i->{$from}{start}:$i->{$from}{end};
-    my $end = $i->{$from}{start}<=$i->{$from}{end}?$i->{$from}{end}:$i->{$from}{start};
+  # if we don't already have an interval tree built for this id,
+  # build one now
+  if ( !defined $self->{"_tree_$type"}->{ uc($id) } ) {
+    # create set of intervals to be checked for overlap
+    my $from_intervals;
 
-    push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i);
-  }
+    foreach my $i (@{$lr}) {
+      my $start = $i->{$from}{start};
+      my $end = $i->{$from}{end};
+
+      if ($end < $start) {
+        my $tmp = $start;
+        $start = $end;
+        $end = $tmp;
+      }
+
+      push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i);
+    }
 
-  # create and query the interval tree defined on the above set of intervals
-  my $itree = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals);
-  my $overlap = $itree->query($start, $end);
+    #
+    # Create the interval tree defined on the above set of intervals
+    #
+    # Two options:
+    #
+    # 1. Use immutable interval tree implementation
+    # 2. Use mutable interval tree implementation
+    #
+    # As of release/99, we have more experience with the immutable
+    # interval tree, so we will stick with this one. A future
+    # refactoring effort may wish to replace this with a dynamically
+    # maintained mutable interval tree, rather than simply throwing
+    # trees away and rebuilding when the underlying interval set changes
+    $self->{"_tree_$type"}->{ uc($id) } = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals);
+}
+  # query the interval tree (either cached or created new) for overlapping intervals
+  my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end);
 
   foreach my $i (@{$overlap}) {
     my $pair = $i->data;
@@ -866,12 +914,14 @@ sub add_Mapper{
   foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) {
     push(@{$self->{"_pair_$mapper_to"}->{$seq_name}},
         @{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
+    $self->{"_tree_$mapper_to"}->{ uc($seq_name) } = undef;
     $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
   }
   my $count_b = 0;
   foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) {
     push(@{$self->{"_pair_$mapper_from"}->{$seq_name}},
         @{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
+    $self->{"_tree_$mapper_from"}->{ uc($seq_name) } = undef;
     $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
   }
 
@@ -1099,6 +1149,9 @@ sub _merge_pairs {
   $self->{'pair_count'} = 0;
 
   for my $key ( keys %{$self->{"_pair_$map_to"}} ) {
+    # merging pairs means all interval trees for
+    # this set of intervals are now invalid, so delete them
+    $self->{"_tree_$map_to"}->{ uc($key) } = undef;
     $lr = $self->{"_pair_$map_to"}->{$key}; 
     
     my $i = 0;
diff --git a/modules/t/mapper.t b/modules/t/mapper.t
index caf88ff710..ca1678bfdd 100644
--- a/modules/t/mapper.t
+++ b/modules/t/mapper.t
@@ -80,6 +80,8 @@ test_transform ($mapper,
 
 #
 # check if the mapper can do merging
+# at the same time, check that IntervalTrees are correctly cached
+# and that IntervalTree caches are correctly invalidated
 #
 
 $mapper = Bio::EnsEMBL::Mapper->new( "asm1", "asm2" );
@@ -88,10 +90,17 @@ $mapper->add_map_coordinates( "1", 1, 10, 1, "1", 101, 110 );
 $mapper->add_map_coordinates( "1", 21, 30, 1, "1", 121, 130 );
 $mapper->add_map_coordinates( "1", 11, 20, 1, "1", 111, 120 );
 
+is($mapper->{_tree_asm1}->{1}, undef, "tree not yet created for asm1 interval 1");
+
 test_transform( $mapper, 
 		[ "1", 5, 25, 1, "asm1" ],
 		[ "1", 105, 125, 1 ] );
 
+is(defined($mapper->{_tree_asm1}->{1}), 1, "tree created and cached for asm1 interval 1");
+
+
+$mapper->add_map_coordinates( "1", 31, 40, 1, "1", 131, 140);
+is($mapper->{_tree_asm1}->{1}, undef, "tree deleted after modifying asm1 interval 1");
 
 
 #
-- 
GitLab