added optimisation: Transcripts and Exons can now be loaded immediately on...

added optimisation: Transcripts and Exons can now be loaded immediately on Gene fetch rather than lazy-loading later

added optimisation: Transcripts and Exons can now be loaded immediately on...
added optimisation: Transcripts and Exons can now be loaded immediately on Gene fetch rather than lazy-loading later
acea98ea · Graham McVicker · d1f6562d · acea98ea · acea98ea · acea98ea
Commit acea98ea authored 20 years ago by Graham McVicker
--- a/modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm
@@ -268,6 +268,110 @@ sub fetch_all_by_domain {



+=head2 fetch_all_by_Slice
+
+  Arg [1]    : Bio::EnsEMBL::Slice $slice
+               The slice to fetch genes on.
+  Arg [2]    : (optional) string $logic_name
+               passed unchanged to the superclass fetch_all_by_Slice.
+  Arg [3]    : (optional) boolean $load_transcripts
+               if true, transcripts will be loaded immediately rather than
+               lazy loaded later.
+  Example    : @genes = @{$gene_adaptor->fetch_all_by_Slice($slice)};
+  Description: Overrides superclass method to optionally load transcripts
+               immediately rather than lazy-loading them later.  This
+               is more efficient when there are a lot of genes whose
+               transcripts are going to be used.
+  Returntype : reference to list of genes
+  Exceptions : thrown if a transcript cannot be transferred onto the gene slice
+  Caller     : Slice::get_all_Genes
+
+=cut
+
+sub fetch_all_by_Slice {
+  my $self  = shift;
+  my $slice = shift;
+  my $logic_name = shift;
+  my $load_transcripts = shift;
+
+  my $genes = $self->SUPER::fetch_all_by_Slice($slice, $logic_name);
+
+  # if there are 0 or 1 genes still do lazy-loading
+  if(!$load_transcripts || @$genes < 2) {
+    return $genes;
+  }
+
+  # preload all of the transcripts now, instead of lazy loading later
+  # faster than 1 query per transcript
+
+  # get extent of region spanned by the genes (in slice coordinates)
+  my ($min_start, $max_end);
+  foreach my $g (@$genes) {
+    if(!defined($min_start) || $g->start() < $min_start) {
+      $min_start = $g->start();
+    }
+    if(!defined($max_end) || $g->end() > $max_end) {
+      $max_end   = $g->end();
+    }
+  }
+
+  # NOTE(review): this offset assumes a forward-strand slice -- confirm
+  $min_start += $slice->start() - 1;
+  $max_end   += $slice->start() - 1;
+
+  my $ext_slice;
+  if($min_start >= $slice->start() && $max_end <= $slice->end()) {
+    $ext_slice = $slice;
+  } else {
+    my $sa = $self->db()->get_SliceAdaptor();
+    $ext_slice = $sa->fetch_by_region
+      ($slice->coord_system->name(), $slice->seq_region_name(),
+       $min_start,$max_end, $slice->strand(), $slice->coord_system->version());
+  }
+
+  # associate transcript identifiers with genes
+  my %g_hash = map {$_->dbID => $_} @$genes;
+
+  my $g_id_str = '(' . join(',', keys %g_hash) . ')';
+
+  my $sth = $self->prepare("SELECT gene_id, transcript_id " .
+                           "FROM   transcript " .
+                           "WHERE  gene_id IN $g_id_str");
+
+  $sth->execute();
+
+  my ($g_id, $tr_id);
+  $sth->bind_columns(\$g_id, \$tr_id);
+
+  my %tr_g_hash;
+
+  while($sth->fetch()) {
+    $tr_g_hash{$tr_id} = $g_hash{$g_id};
+  }
+
+  $sth->finish();
+
+  my $ta = $self->db()->get_TranscriptAdaptor();
+  my $transcripts = $ta->fetch_all_by_Slice($ext_slice,1);
+
+  # move transcripts onto gene slice, and add them to genes
+  foreach my $tr (@$transcripts) {
+    # $ext_slice may also carry transcripts of genes that were not fetched
+    # (e.g. excluded by $logic_name); they have no gene to attach to
+    next if(!exists $tr_g_hash{$tr->dbID()});
+
+    $tr = $tr->transfer($slice) if($slice != $ext_slice);
+
+    if(!$tr) {
+      throw("Unexpected. Transcript could not be transfered onto Gene slice.");
+    }
+
+    $tr_g_hash{$tr->dbID()}->add_Transcript($tr);
+  }
+
+  return $genes;
+}
+
 =head2 fetch_by_transcript_id

  Arg [1]    : int $transid

--- a/modules/Bio/EnsEMBL/DBSQL/TranscriptAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/TranscriptAdaptor.pm
@@ -327,7 +327,7 @@ sub fetch_all_by_Slice {
    $ext_slice = $slice;
  } else {
    my $sa = $self->db()->get_SliceAdaptor();
-    my $ext_slice = $sa->fetch_by_region
+    $ext_slice = $sa->fetch_by_region
      ($slice->coord_system->name(), $slice->seq_region_name(),
       $min_start,$max_end, $slice->strand(), $slice->coord_system->version());
  }

--- a/modules/Bio/EnsEMBL/Slice.pm
+++ b/modules/Bio/EnsEMBL/Slice.pm
@@ -1250,9 +1250,18 @@ sub get_all_SNPs_transcripts {
 =head2 get_all_Genes

  Arg [1]    : (optional) string $logic_name
-               The name of the analysis used to generate the genes to retrieve 
+               The name of the analysis used to generate the genes to retrieve
+  Arg [2]    : (optional) string $dbtype
+               The dbtype of genes to obtain.  This assumes that the db has
+               been added to the DBAdaptor under this name (using the
+               DBConnection::add_db_adaptor method).
+  Arg [3]    : (optional) boolean $load_transcripts
+               If set to true, transcripts will be loaded immediately rather
+               than being lazy-loaded on request.  This will result in a
+               significant speed up if the Transcripts and Exons are going to
+               be used (but a slow down if they are not).
  Example    : @genes = @{$slice->get_all_Genes};
-  Description: Retrieves all genes that overlap this slice.  
+  Description: Retrieves all genes that overlap this slice.
  Returntype : listref of Bio::EnsEMBL::Genes
  Exceptions : none
  Caller     : none
@@ -1260,7 +1269,7 @@ sub get_all_SNPs_transcripts {
 =cut

 sub get_all_Genes{
-   my ($self, $logic_name, $dbtype) = @_;
+   my ($self, $logic_name, $dbtype, $load_transcripts) = @_;

  if(!$self->adaptor()) {
    warning('Cannot get Genes without attached adaptor');
@@ -1278,7 +1287,9 @@ sub get_all_Genes{
     $db = $self->adaptor->db;
   }

-   return $db->get_GeneAdaptor()->fetch_all_by_Slice( $self, $logic_name );
+   my $ga = $db->get_GeneAdaptor();
+
+   return $ga->fetch_all_by_Slice( $self, $logic_name, $load_transcripts);
 }

 =head2 get_all_Genes_by_type
@@ -1286,6 +1297,11 @@ sub get_all_Genes{

  Arg [1]    : string $type 
  Arg [2]    : (optional) string $logic_name
+  Arg [3]    : (optional) boolean $load_transcripts
+               If set to true, transcripts will be loaded immediately rather
+               than being lazy-loaded on request.  This will result in a
+               significant speed up if the Transcripts and Exons are going to
+               be used (but a slow down if they are not).
  Example    : @genes = @{$slice->get_all_Genes_by_type($type, 'ensembl')};
  Description: Retrieves genes that overlap this slice of type $type.
               This is primarily used by the genebuilding code when several
@@ -1301,14 +1317,15 @@ sub get_all_Genes{
 =cut

 sub get_all_Genes_by_type{
-  my ($self, $type, $logic_name) = @_;
+  my ($self, $type, $logic_name, $load_transcripts) = @_;

  if(!$self->adaptor()) {
    warning('Cannot get Genes without attached adaptor');
    return [];
  }

-  my @out = grep { $_->type eq $type } @{ $self->get_all_Genes($logic_name)};
+  my @out = grep { $_->type eq $type }  # undef fills get_all_Genes' $dbtype
+    @{ $self->get_all_Genes($logic_name, undef, $load_transcripts)};

  return \@out;
 }