Cache.pm 37 KB
Newer Older
1
=head1 LICENSE
2

3
  Copyright (c) 1999-2013 The European Bioinformatics Institute and
4
  Genome Research Limited.  All rights reserved.
5

6 7
  This software is distributed under a modified Apache license.
  For license details, please see
8

9
    http://www.ensembl.org/info/about/code_licence.html
10

11
=head1 CONTACT
12

13
  Please email comments or questions to the public Ensembl
14
  developers list at <dev@ensembl.org>.
15

16 17
  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.
18

19
=cut
20

21
=head1 NAME
22

23 24
Bio::EnsEMBL::IdMapping::Cache - a cache to hold data objects used by the 
IdMapping application
25

26
=head1 DESCRIPTION
27

28
=head1 METHODS
29 30 31 32

=cut


33 34
package Bio::EnsEMBL::IdMapping::Cache;

35 36 37 38 39 40
use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
41
use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
42
use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
43 44 45 46
use Bio::EnsEMBL::IdMapping::TinyGene;
use Bio::EnsEMBL::IdMapping::TinyTranscript;
use Bio::EnsEMBL::IdMapping::TinyTranslation;
use Bio::EnsEMBL::IdMapping::TinyExon;
47
use Bio::EnsEMBL::DBSQL::DBAdaptor;
48
use Storable qw(nstore retrieve);
49
use Digest::MD5 qw(md5_hex);
50

Patrick Meidl's avatar
Patrick Meidl committed
51 52 53
# define available cache names here
my @cache_names = qw(
    exons_by_id
54 55
    transcripts_by_id
    transcripts_by_exon_id
56
    translations_by_id
57 58
    genes_by_id
    genes_by_transcript_id
Patrick Meidl's avatar
Patrick Meidl committed
59 60 61
);


62 63
=head2 new

Patrick Meidl's avatar
Patrick Meidl committed
64 65 66 67 68 69
  Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
  Arg [CONF]  : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
  Example     : my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
                  -LOGGER => $logger,
                  -CONF   => $conf,
                );
70 71
  Description : constructor
  Return type : Bio::EnsEMBL::IdMapping::Cache object
Patrick Meidl's avatar
Patrick Meidl committed
72
  Exceptions  : thrown on wrong or missing arguments
73
  Caller      : general
Patrick Meidl's avatar
Patrick Meidl committed
74 75
  Status      : At Risk
              : under development
76 77 78 79

=cut

sub new {
Patrick Meidl's avatar
Patrick Meidl committed
80 81
  my $caller = shift;
  my $class = ref($caller) || $caller;
82

Patrick Meidl's avatar
Patrick Meidl committed
83 84
  my ($logger, $conf, $load_instance) =
    rearrange(['LOGGER', 'CONF', 'LOAD_INSTANCE'], @_);
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99

  unless ($logger->isa('Bio::EnsEMBL::Utils::Logger')) {
    throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
  }
  
  unless ($conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
    throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
  }
  
  my $self = {};
  bless ($self, $class);

  # initialise
  $self->logger($logger);
  $self->conf($conf);
Patrick Meidl's avatar
Patrick Meidl committed
100 101 102 103

  if ($load_instance) {
    $self->read_instance_from_file;
  }
104 105 106 107 108
  
  return $self;
}


109
=head2 build_cache_by_slice
Patrick Meidl's avatar
Patrick Meidl committed
110 111 112 113

  Arg[1]      : String $dbtype - db type (source|target)
  Arg[2]      : String $slice_name - the name of a slice (format as returned by
                Bio::EnsEMBL::Slice->name)
114 115
  Example     : my ($num_genes, $filesize) = $cache->build_cache_by_slice(
                  'source', 'chromosome:NCBI36:X:1:1000000:-1');
Patrick Meidl's avatar
Patrick Meidl committed
116 117
  Description : Builds a cache of genes, transcripts, translations and exons
                needed by the IdMapping application and serialises the resulting
118
                cache object to a file, one slice at a time.
Patrick Meidl's avatar
Patrick Meidl committed
119 120 121 122 123 124 125 126 127
  Return type : list of the number of genes processed and the size of the
                serialised cache file
  Exceptions  : thrown on invalid slice name
  Caller      : general
  Status      : At Risk
              : under development

=cut

128
sub build_cache_by_slice {
129 130
  my $self       = shift;
  my $dbtype     = shift;
Patrick Meidl's avatar
Patrick Meidl committed
131
  my $slice_name = shift;
132

Patrick Meidl's avatar
bug fix  
Patrick Meidl committed
133 134 135
  # set cache method (required for loading cache later)
  $self->cache_method('BY_SEQ_REGION');

Patrick Meidl's avatar
Patrick Meidl committed
136
  my $dba = $self->get_DBAdaptor($dbtype);
137 138
  my $sa  = $dba->get_SliceAdaptor;

Patrick Meidl's avatar
Patrick Meidl committed
139
  my $slice = $sa->fetch_by_name($slice_name);
Patrick Meidl's avatar
Patrick Meidl committed
140 141 142
  unless ($slice) {
    throw("Could not retrieve slice $slice_name.");
  }
143 144

  my $genes = $slice->get_all_Genes( undef, undef, 1 );
Patrick Meidl's avatar
Patrick Meidl committed
145

Patrick Meidl's avatar
Patrick Meidl committed
146
  # find common coord_system
147
  my $common_cs_found = $self->find_common_coord_systems;
Patrick Meidl's avatar
Patrick Meidl committed
148

149 150 151
  # find out whether native coord_system is a common coord_system.
  # if so, you don't need to project.
  # also don't project if no common coord_system present
Patrick Meidl's avatar
Patrick Meidl committed
152
  my $need_project = 1;
153

154 155 156 157 158
  my $csid = join( ':',
                   $slice->coord_system_name,
                   $slice->coord_system->version );

  if ( $self->is_common_cs($csid) or !$self->highest_common_cs ) {
159 160
    $need_project = 0;
  }
161

Patrick Meidl's avatar
Patrick Meidl committed
162 163
  # build cache
  my $type = "$dbtype.$slice_name";
164 165
  my $num_genes =
    $self->build_cache_from_genes( $type, $genes, $need_project );
Patrick Meidl's avatar
Patrick Meidl committed
166 167 168 169 170
  undef $genes;

  # write cache to file, then flush cache to reclaim memory
  my $size = $self->write_all_to_file($type);

171
  return $num_genes, $size;
172
} ## end sub build_cache_by_slice
173 174


175
=head2 build_cache_all
Patrick Meidl's avatar
Patrick Meidl committed
176

177 178 179 180 181 182 183 184 185 186 187 188
  Arg[1]      : String $dbtype - db type (source|target)
  Example     : my ($num_genes, $filesize) = $cache->build_cache_all('source');
  Description : Builds a cache of genes, transcripts, translations and exons
                needed by the IdMapping application and serialises the
                resulting cache object to a file. All genes across the genome
                are processed in one go. This method should be used when
                build_cache_by_seq_region can't be used due to a large number
                of toplevel seq_regions (e.g. 2x genomes).
  Return type : list of the number of genes processed and the size of the
                serialised cache file
  Exceptions  : thrown on invalid slice name
  Caller      : general
Patrick Meidl's avatar
Patrick Meidl committed
189 190 191 192 193
  Status      : At Risk
              : under development

=cut

194
sub build_cache_all {
Patrick Meidl's avatar
Patrick Meidl committed
195
  my $self = shift;
196 197
  my $dbtype = shift;
  
Patrick Meidl's avatar
bug fix  
Patrick Meidl committed
198 199 200
  # set cache method (required for loading cache later)
  $self->cache_method('ALL');

201 202 203 204
  my $dba = $self->get_DBAdaptor($dbtype);
  my $ga = $dba->get_GeneAdaptor;
  
  my $genes = $ga->fetch_all;
Patrick Meidl's avatar
Patrick Meidl committed
205

206 207
  # find common coord_system
  my $common_cs_found = $self->find_common_coord_systems;
208

209
  # Build cache. Setting $need_project to 'CHECK' will cause
210 211
  # build_cache_from_genes() to check the coordinate system for each
  # gene.
212 213
  my $type         = "$dbtype.ALL";
  my $need_project = 'CHECK';
214 215 216
  my $num_genes =
    $self->build_cache_from_genes( $type, $genes, $need_project );

217
  undef $genes;
Patrick Meidl's avatar
Patrick Meidl committed
218

219 220 221 222
  # write cache to file, then flush cache to reclaim memory
  my $size = $self->write_all_to_file($type);

  return $num_genes, $size;
223 224 225
}


Patrick Meidl's avatar
Patrick Meidl committed
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
=head2 build_cache_from_genes 

  Arg[1]      : String $type - cache type
  Arg[2]      : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache
                from
  Arg[3]      : Boolean $need_project - indicate if we need to project exons to
                common coordinate system
  Example     : $cache->build_cache_from_genes(
                  'source.chromosome:NCBI36:X:1:100000:1', \@genes);
  Description : Builds the cache by fetching transcripts, translations and exons
                for a list of genes from the database, and creating lightweight
                Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the
                data needed by the IdMapping application. These objects are
                attached to a name cache in this cache object. Exons only need
                to be projected to a commond coordinate system if their native
                coordinate system isn't common to source and target assembly
                itself.
243
  Return type : int - number of genes after filtering
Patrick Meidl's avatar
Patrick Meidl committed
244 245 246 247 248 249 250
  Exceptions  : thrown on wrong or missing arguments
  Caller      : internal
  Status      : At Risk
              : under development

=cut

251
sub build_cache_from_genes {
252 253 254
  my $self         = shift;
  my $type         = shift;
  my $genes        = shift;
Patrick Meidl's avatar
Patrick Meidl committed
255
  my $need_project = shift;
256

Patrick Meidl's avatar
Patrick Meidl committed
257
  throw("You must provide a type.") unless $type;
258 259
  throw("You must provide a listref of genes.")
    unless ( ref($genes) eq 'ARRAY' );
260

261
  # biotype filter
262 263 264
  if ( $self->conf()->param('biotypes') ||
       $self->conf()->param('biotypes_include') ||
       $self->conf()->param('biotypes_exclude') )
265
  {
266 267
    $genes = $self->filter_biotypes($genes);
  }
268 269
  my $num_genes = scalar(@$genes);

270
  # initialise cache for the given type.
271
  $self->{'cache'}->{$type} = {};
272

Patrick Meidl's avatar
Patrick Meidl committed
273 274
  #my $i = 0;
  #my $num_genes = scalar(@$genes);
Patrick Meidl's avatar
Patrick Meidl committed
275
  #my $progress_id = $self->logger->init_progress($num_genes);
276

277 278 279 280
 # loop over genes sorted by gene location.
 # the sort will hopefully improve assembly mapper cache performance and
 # therefore speed up exon sequence retrieval
  foreach my $gene ( sort { $a->start <=> $b->start } @$genes ) {
Patrick Meidl's avatar
Patrick Meidl committed
281
    #$self->logger->log_progressbar($progress_id, ++$i, 2);
Patrick Meidl's avatar
Patrick Meidl committed
282
    #$self->logger->log_progress($num_genes, ++$i, 20, 3, 1);
Patrick Meidl's avatar
Patrick Meidl committed
283

284
    if ( $need_project eq 'CHECK' ) {
285 286 287
      # find out whether native coord_system is a common coord_system.
      # if so, you don't need to project.
      # also don't project if no common coord_system present
288 289 290 291 292
      if ( $self->highest_common_cs ) {
        my $csid = join( ':',
                         $gene->slice->coord_system_name,
                         $gene->slice->coord_system->version );
        if ( $self->is_common_cs($csid) ) {
293 294
          $need_project = 0;
        }
295 296
      }
      else {
297 298 299 300
        $need_project = 0;
      }
    }

Patrick Meidl's avatar
Patrick Meidl committed
301
    # create lightweigt gene
302 303 304 305 306 307 308 309 310
    my $lgene =
      Bio::EnsEMBL::IdMapping::TinyGene->new_fast( [
                          $gene->dbID,          $gene->stable_id,
                          $gene->version,       $gene->created_date,
                          $gene->modified_date, $gene->start,
                          $gene->end,           $gene->strand,
                          $gene->slice->seq_region_name, $gene->biotype,
                          $gene->status, $gene->analysis->logic_name,
                          ( $gene->is_known ? 1 : 0 ), ] );
Patrick Meidl's avatar
Patrick Meidl committed
311

312
    # build gene caches
313 314
    $self->add( 'genes_by_id', $type, $gene->dbID, $lgene );

315
    # transcripts
316 317 318 319 320 321 322 323
    foreach my $tr ( @{ $gene->get_all_Transcripts } ) {
      my $ltr =
        Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast( [
                               $tr->dbID,          $tr->stable_id,
                               $tr->version,       $tr->created_date,
                               $tr->modified_date, $tr->start,
                               $tr->end,           $tr->strand,
                               $tr->length, md5_hex( $tr->spliced_seq ),
324
                               ( $tr->is_known ? 1 : 0 ) ] );
Patrick Meidl's avatar
Patrick Meidl committed
325

326
      $ltr->biotype( $tr->biotype() );
327
      $lgene->add_Transcript($ltr);
Patrick Meidl's avatar
Patrick Meidl committed
328

329
      # build transcript caches
330 331
      $self->add( 'transcripts_by_id',      $type, $tr->dbID, $ltr );
      $self->add( 'genes_by_transcript_id', $type, $tr->dbID, $lgene );
332 333

      # translation (if there is one)
334 335 336 337 338 339 340 341
      if ( my $tl = $tr->translation ) {
        my $ltl =
          Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast( [
                         $tl->dbID,          $tl->stable_id,
                         $tl->version,       $tl->created_date,
                         $tl->modified_date, $tr->dbID,
                         $tr->translate->seq, ( $tr->is_known ? 1 : 0 ),
                       ] );
342

343
        $ltr->add_Translation($ltl);
344

345
        $self->add( 'translations_by_id', $type, $tl->dbID, $ltl );
346

Patrick Meidl's avatar
Patrick Meidl committed
347
        undef $tl;
348 349
      }

Patrick Meidl's avatar
Patrick Meidl committed
350
      # exons
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
      foreach my $exon ( @{ $tr->get_all_Exons } ) {
        my $lexon =
          Bio::EnsEMBL::IdMapping::TinyExon->new_fast( [
                         $exon->dbID,
                         $exon->stable_id,
                         $exon->version,
                         $exon->created_date,
                         $exon->modified_date,
                         $exon->start,
                         $exon->end,
                         $exon->strand,
                         $exon->slice->seq_region_name,
                         $exon->slice->coord_system_name,
                         $exon->slice->coord_system->version,
                         $exon->slice->subseq( $exon->start, $exon->end,
                                               $exon->strand ),
                         $exon->phase,
                         $need_project, ] );
Patrick Meidl's avatar
Patrick Meidl committed
369

Patrick Meidl's avatar
Patrick Meidl committed
370 371
        # get coordinates in common coordinate system if needed
        if ($need_project) {
372 373 374
          my @seg = @{
            $exon->project( $self->highest_common_cs,
                            $self->highest_common_cs_version ) };
Patrick Meidl's avatar
Patrick Meidl committed
375

376
          if ( scalar(@seg) == 1 ) {
Patrick Meidl's avatar
Patrick Meidl committed
377
            my $sl = $seg[0]->to_Slice;
378 379 380 381
            $lexon->common_start( $sl->start );
            $lexon->common_end( $sl->end );
            $lexon->common_strand( $sl->strand );
            $lexon->common_sr_name( $sl->seq_region_name );
Patrick Meidl's avatar
Patrick Meidl committed
382 383
          }
        }
384

Patrick Meidl's avatar
Patrick Meidl committed
385 386
        $ltr->add_Exon($lexon);

387 388 389
        $self->add( 'exons_by_id', $type, $exon->dbID, $lexon );
        $self->add_list( 'transcripts_by_exon_id',
                         $type, $exon->dbID, $ltr );
Patrick Meidl's avatar
Patrick Meidl committed
390 391

        undef $exon;
392
      } ## end foreach my $exon ( @{ $tr->get_all_Exons...})
Patrick Meidl's avatar
Patrick Meidl committed
393 394

      undef $tr;
395
    } ## end foreach my $tr ( @{ $gene->get_all_Transcripts...})
396

Patrick Meidl's avatar
Patrick Meidl committed
397
    undef $gene;
398
  } ## end foreach my $gene ( sort { $a...})
399

400
  return $num_genes;
401
} ## end sub build_cache_from_genes
402 403 404 405 406 407


=head2 filter_biotypes

  Arg[1]      : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter
  Example     : my @filtered = @{ $cache->filter_biotypes(\@genes) };
408 409 410

  Description : Filters a list of genes by biotype.  Biotypes are
                taken from the IdMapping configuration parameter
411
                'biotypes_include' or 'biotypes_exclude'.
412

413
                If the configuration parameter 'biotypes_exclude' is
414 415
                defined, then rather than returning the genes whose
                biotype is listed in the configuration parameter
416 417
                'biotypes_include' the method will return the genes
                whose biotype is *not* listed in the 'biotypes_exclude'
418 419 420 421 422
                configuration parameter.

                It is an error to define both these configuration
                parameters.

423 424 425
                The old parameter 'biotypes' is equivalent to
                'biotypes_include'.

426 427 428 429 430 431 432 433 434
  Return type : Listref of Bio::EnsEMBL::Genes (or empty list)
  Exceptions  : none
  Caller      : internal
  Status      : At Risk
              : under development

=cut

sub filter_biotypes {
435
  my ( $self, $genes ) = @_;
436

437 438 439
  my @filtered;
  my @biotypes;
  my $opt_reverse;
440

441 442 443 444
  if ( defined( $self->conf()->param('biotypes_include') ) ||
       defined( $self->conf()->param('biotypes') ) )
  {
    if ( defined( $self->conf()->param('biotypes_exclude') ) ) {
445
      $self->logger()
446 447
        ->error( "You may not use both " .
                 "'biotypes_include' and 'biotypes_exclude' " .
448 449 450
                 "in the configuration" );
    }

451 452 453 454 455 456
    if ( defined( $self->conf()->param('biotypes_include') ) ) {
      @biotypes = $self->conf()->param('biotypes_include');
    }
    else {
      @biotypes = $self->conf()->param('biotypes');
    }
457 458 459
    $opt_reverse = 0;
  }
  else {
460
    @biotypes    = $self->conf()->param('biotypes_exclude');
461
    $opt_reverse = 1;
462 463
  }

464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
  foreach my $gene ( @{$genes} ) {
    my $keep_gene;

    foreach my $biotype (@biotypes) {
      if ( $gene->biotype() eq $biotype ) {
        if   ($opt_reverse) { $keep_gene = 0 }
        else                { $keep_gene = 1 }
        last;
      }
    }

    if ( defined($keep_gene) ) {
      if ($keep_gene) {
        push( @filtered, $gene );
      }
    }
    elsif ($opt_reverse) {
      push( @filtered, $gene );
    }
  }

  return \@filtered;
} ## end sub filter_biotypes
487 488


Patrick Meidl's avatar
Patrick Meidl committed
489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505
=head2 add

  Arg[1]      : String $name - a cache name (e.g. 'genes_by_id')
  Arg[2]      : String type - a cache type (e.g. "source.$slice_name")
  Arg[3]      : String $key - key of this entry (e.g. a gene dbID)
  Arg[4]      : Bio::EnsEMBL::IdMappping::TinyFeature $val - value to cache
  Example     : $cache->add('genes_by_id',
                  'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene);
  Description : Adds a TinyFeature object to a named cache.
  Return type : Bio::EnsEMBL::IdMapping::TinyFeature
  Exceptions  : thrown on wrong or missing arguments
  Caller      : internal
  Status      : At Risk
              : under development

=cut

506 507 508
sub add {
  my $self = shift;
  my $name = shift;
Patrick Meidl's avatar
Patrick Meidl committed
509
  my $type = shift;
510 511 512 513
  my $key = shift;
  my $val = shift;

  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
Patrick Meidl's avatar
Patrick Meidl committed
514
  throw("You must provide a cache type.") unless $type;
515 516
  throw("You must provide a cache key (e.g. a gene dbID).") unless $key;

517
  $self->{'cache'}->{$type}->{$name}->{$key} = $val;
518

519
  return $self->{'cache'}->{$type}->{$name}->{$key};
520 521
}

Patrick Meidl's avatar
Patrick Meidl committed
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
=head2 add_list

  Arg[1]      : String $name - a cache name (e.g. 'genes_by_id')
  Arg[2]      : String type - a cache type (e.g. "source.$slice_name")
  Arg[3]      : String $key - key of this entry (e.g. a gene dbID)
  Arg[4]      : List of Bio::EnsEMBL::IdMappping::TinyFeature @val - values
                to cache
  Example     : $cache->add_list('transcripts_by_exon_id',
                  'source.chromosome:NCBI36:X:1:1000000:1', '1234',
                  $tiny_transcript1, $tiny_transcript2);
  Description : Adds a list of TinyFeature objects to a named cache.
  Return type : Listref of Bio::EnsEMBL::IdMapping::TinyFeature objects
  Exceptions  : thrown on wrong or missing arguments
  Caller      : internal
  Status      : At Risk
              : under development

=cut

541 542 543
sub add_list {
  my $self = shift;
  my $name = shift;
Patrick Meidl's avatar
Patrick Meidl committed
544
  my $type = shift;
545 546 547 548
  my $key = shift;
  my @vals = @_;

  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
Patrick Meidl's avatar
Patrick Meidl committed
549
  throw("You must provide a cache type.") unless $type;
550 551
  throw("You must provide a cache key (e.g. a gene dbID).") unless $key;

552
  push @{ $self->{'cache'}->{$type}->{$name}->{$key} }, @vals;
553

554
  return $self->{'cache'}->{$type}->{$name}->{$key};
555 556 557 558 559
}

sub get_by_key {
  my $self = shift;
  my $name = shift;
Patrick Meidl's avatar
Patrick Meidl committed
560
  my $type = shift;
561 562 563
  my $key = shift;

  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
Patrick Meidl's avatar
Patrick Meidl committed
564
  throw("You must provide a cache type.") unless $type;
565 566
  throw("You must provide a cache key (e.g. a gene dbID).") unless $key;

567
  # transparently load cache from file unless already loaded
568 569
  unless ($self->{'instance'}->{'loaded'}->{"$type"}) {
    $self->read_and_merge($type);
570 571
  }

572
  return $self->{'cache'}->{$type}->{$name}->{$key};
573 574 575 576 577
}

sub get_by_name {
  my $self = shift;
  my $name = shift;
Patrick Meidl's avatar
Patrick Meidl committed
578
  my $type = shift;
579 580

  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
Patrick Meidl's avatar
Patrick Meidl committed
581
  throw("You must provide a cache type.") unless $type;
582
  
583
  # transparently load cache from file unless already loaded
584 585
  unless ($self->{'instance'}->{'loaded'}->{$type}) {
    $self->read_and_merge($type);
586 587
  }

588
  return $self->{'cache'}->{$type}->{$name} || {};
589 590 591
}


592
sub get_count_by_name {
593
  my $self = shift;
Patrick Meidl's avatar
Patrick Meidl committed
594
  my $name = shift;
595
  my $type = shift;
596

Patrick Meidl's avatar
Patrick Meidl committed
597
  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
598 599 600
  throw("You must provide a cache type.") unless $type;
  
  # transparently load cache from file unless already loaded
601 602
  unless ($self->{'instance'}->{'loaded'}->{$type}) {
    $self->read_and_merge($type);
Patrick Meidl's avatar
Patrick Meidl committed
603
  }
604 605

  return scalar(keys %{ $self->get_by_name($name, $type) });
Patrick Meidl's avatar
Patrick Meidl committed
606 607 608 609 610 611 612
}


sub find_common_coord_systems {
  my $self = shift;

  # get adaptors for source db
613
  my $s_dba = $self->get_DBAdaptor('source');
Patrick Meidl's avatar
Patrick Meidl committed
614
  my $s_csa = $s_dba->get_CoordSystemAdaptor;
615
  my $s_sa  = $s_dba->get_SliceAdaptor;
Patrick Meidl's avatar
Patrick Meidl committed
616 617

  # get adaptors for target db
618
  my $t_dba = $self->get_DBAdaptor('target');
Patrick Meidl's avatar
Patrick Meidl committed
619
  my $t_csa = $t_dba->get_CoordSystemAdaptor;
620
  my $t_sa  = $t_dba->get_SliceAdaptor;
Patrick Meidl's avatar
Patrick Meidl committed
621 622

  # find common coord_systems
623 624
  my @s_coord_systems = @{ $s_csa->fetch_all };
  my @t_coord_systems = @{ $t_csa->fetch_all };
625
  my $found_highest   = 0;
Patrick Meidl's avatar
Patrick Meidl committed
626

627
SOURCE:
Patrick Meidl's avatar
Patrick Meidl committed
628
  foreach my $s_cs (@s_coord_systems) {
629 630
    if ( !$s_cs->is_default() ) { next SOURCE }

631
  TARGET:
Patrick Meidl's avatar
Patrick Meidl committed
632
    foreach my $t_cs (@t_coord_systems) {
633 634
      if ( !$t_cs->is_default() ) { next TARGET }

635
      if ( $s_cs->name eq $t_cs->name ) {
Patrick Meidl's avatar
Patrick Meidl committed
636 637

        # test for identical coord_system version
638
        if ( $s_cs->version and ( $s_cs->version ne $t_cs->version ) ) {
Patrick Meidl's avatar
Patrick Meidl committed
639 640 641 642
          next TARGET;
        }

        # test for at least 50% identical seq_regions
643
        if ( $self->seq_regions_compatible( $s_cs, $s_sa, $t_sa ) ) {
Patrick Meidl's avatar
Patrick Meidl committed
644
          $self->add_common_cs($s_cs);
645

646
          unless ($found_highest) {
647 648
            $self->highest_common_cs( $s_cs->name );
            $self->highest_common_cs_version( $s_cs->version );
Patrick Meidl's avatar
Patrick Meidl committed
649 650 651 652 653 654 655
          }

          $found_highest = 1;

          next SOURCE;
        }
      }
656 657 658
    } ## end foreach my $t_cs (@t_coord_systems)
  } ## end foreach my $s_cs (@s_coord_systems)

659
  return $found_highest;
660
} ## end sub find_common_coord_systems
Patrick Meidl's avatar
Patrick Meidl committed
661 662 663 664 665 666 667


sub seq_regions_compatible {
  my $self = shift;
  my $cs = shift;
  my $s_sa = shift;
  my $t_sa = shift;
Patrick Meidl's avatar
Patrick Meidl committed
668

Patrick Meidl's avatar
Patrick Meidl committed
669 670
  unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
    throw('You must provide a CoordSystem');
Patrick Meidl's avatar
Patrick Meidl committed
671
  }
Patrick Meidl's avatar
Patrick Meidl committed
672 673 674 675 676 677 678 679 680 681 682 683 684 685

  unless ($s_sa and $t_sa and $s_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')
          and $t_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) {
    throw('You must provide a source and target SliceAdaptor');
  }

  my %sr_match;
  my $equal = 0;

  my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version);
  my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version);
  
  # sanity check to prevent divison by zero
  my $s_count = scalar(@$s_seq_regions);
686
  my $t_count = scalar(@$t_seq_regions);
Patrick Meidl's avatar
Patrick Meidl committed
687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704
  return(0) if ($s_count == 0 or $t_count == 0);
  
  foreach my $s_sr (@$s_seq_regions) {
    $sr_match{$s_sr->seq_region_name} = $s_sr->length;
  }

  foreach my $t_sr (@$t_seq_regions) {
    if (exists($sr_match{$t_sr->seq_region_name})) {
      $equal++;

      # return false if we have a region with same name but different length
      return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length);
    }
  }

  if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) {
    return(1);
  } else {
705
    $self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n");
Patrick Meidl's avatar
Patrick Meidl committed
706 707 708
    return(0);
  }
  
709 710 711
}


712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
sub check_db_connection {
  my $self = shift;
  my $dbtype = shift;
  
  my $err = 0;
  
  eval {
    my $dba = $self->get_DBAdaptor($dbtype);
    $dba->dbc->connect;
  };
  
  if ($@) {
    $self->logger->warning("Can't connect to $dbtype db: $@\n");
    $err++;
  } else {
    $self->logger->debug("Connection to $dbtype db ok.\n");
    $self->{'_db_conn_ok'}->{$dbtype} = 1;
  }

  return $err;
}

  
sub check_db_read_permissions {
  my $self = shift;
  my $dbtype = shift;

  # skip this check if db connection failed (this prevents re-throwing
  # exceptions).
  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
  
  my $err = 0;
  my %privs = %{ $self->get_db_privs($dbtype) };
  
  unless ($privs{'SELECT'} or $privs{'ALL PRIVILEGES'}) {
    $self->logger->warning("User doesn't have read permission on $dbtype db.\n");
    $err++;
  } else {
    $self->logger->debug("Read permission on $dbtype db ok.\n");
  }

  return $err;
}

  
sub check_db_write_permissions {
  my $self = shift;
  my $dbtype = shift;
  
  # skip this check if db connection failed (this prevents re-throwing
  # exceptions).
  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
  
  my $err = 0;

  unless ($self->do_upload) {
    $self->logger->debug("No uploads, so write permission on $dbtype db not required.\n");
    return $err;
  }

  my %privs = %{ $self->get_db_privs($dbtype) };

  unless ($privs{'INSERT'} or $privs{'ALL PRIVILEGES'}) {
    $self->logger->warning("User doesn't have write permission on $dbtype db.\n");
    $err++;
  } else {
    $self->logger->debug("Write permission on $dbtype db ok.\n");
  }

  return $err;
}


sub do_upload {
  my $self = shift;

  if ($self->conf->param('dry_run') or
    ! ($self->conf->param('upload_events') or
       $self->conf->param('upload_stable_ids') or
       $self->conf->param('upload_archive'))) {
    return 0;
  } else {
    return 1;
  }
}   


sub get_db_privs {
800
  my ( $self, $dbtype ) = @_;
801 802

  my %privs = ();
803
  my $rs;
804 805 806

  # get privileges from mysql db
  eval {
807 808
    my $dbc = $self->get_DBAdaptor($dbtype)->dbc();
    my $sql = qq(SHOW GRANTS FOR ) . $dbc->username();
809
    my $sth = $dbc->prepare($sql);
810 811
    $sth->execute();
    $rs = $sth->fetchall_arrayref();
812
    #$sth->finish();
813 814 815
  };

  if ($@) {
816 817
    $self->logger->warning(
      "Error obtaining privileges from $dbtype db: $@\n");
818 819 820 821
    return {};
  }

  # parse the output
822 823 824 825 826 827 828 829
  foreach my $r ( map { $_->[0] } @{$rs} ) {
    $r =~ s/GRANT (.*) ON .*/$1/i;
    foreach my $p ( split( ',', $r ) ) {
      # trim leading and trailing whitespace
      $p =~ s/^\s+//;
      $p =~ s/\s+$//;
      $privs{ uc($p) } = 1;
    }
830 831 832
  }

  return \%privs;
833
} ## end sub get_db_privs
834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852


sub check_empty_tables {
  my $self = shift;
  my $dbtype = shift;
  
  # skip this check if db connection failed (this prevents re-throwing
  # exceptions).
  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
  
  my $err = 0;
  my $c = 0;

  if ($self->conf->param('no_check_empty_tables') or !$self->do_upload) {
    $self->logger->debug("Won't check for empty stable ID and archive tables in $dbtype db.\n");
    return $err;
  }

  eval {
853 854
    my @tables =
      qw(
855 856 857 858 859 860 861 862 863 864 865 866
      gene_stable_id
      transcript_stable_id
      translation_stable_id
      exon_stable_id
      stable_id_event
      mapping_session
      gene_archive
      peptide_archive
    );

    my $dba = $self->get_DBAdaptor($dbtype);
    foreach my $table (@tables) {
867 868 869 870 871 872 873 874 875 876 877 878
      if ( $table =~ /^([^_]+)_stable_id/ ) {
        $table = $1;
        if ( $c =
             $self->fetch_value_from_db(
               $dba,
               "SELECT COUNT(*) FROM $table WHERE stable_id IS NOT NULL"
             ) )
        {
          $self->logger->warning(
                        "$table table in $dbtype db has $c stable IDs.\n");
          $err++;
        }
879
      }
880 881 882 883 884 885 886 887 888 889 890 891
      else {
        if ( $c =
             $self->fetch_value_from_db(
                                     $dba, "SELECT COUNT(*) FROM $table"
             ) )
        {
          $self->logger->warning(
                        "$table table in $dbtype db has $c entries.\n");
          $err++;
        }
      }
    } ## end foreach my $table (@tables)
892 893 894
  };

  if ($@) {
895 896 897
    $self->logger->warning(
"Error retrieving stable ID and archive table row counts from $dbtype db: $@\n"
    );
898
    $err++;
899 900 901 902
  }
  elsif ( !$err ) {
    $self->logger->debug(
         "All stable ID and archive tables in $dbtype db are empty.\n");
903 904 905 906 907 908
  }
  return $err;
}


sub check_sequence {
909 910
  my ( $self, $dbtype ) = @_;

911 912
  # skip this check if db connection failed (this prevents re-throwing
  # exceptions).
913 914
  return 1 unless ( $self->{'_db_conn_ok'}->{$dbtype} );

915
  my $err = 0;
916 917
  my $c   = 0;

918 919
  eval {
    my $dba = $self->get_DBAdaptor($dbtype);
920 921 922 923 924
    unless ( $c =
             $self->fetch_value_from_db(
                               $dba->dnadb(), "SELECT COUNT(*) FROM dna"
             ) )
    {
925 926 927
      $err++;
    }
  };
928

929
  if ($@) {
930 931
    $self->logger->warning(   "Error retrieving dna table row count "
                            . "from $dbtype database: $@\n" );
932 933
    $err++;
  } elsif ($err) {
934
    $self->logger->warning("No sequence found in $dbtype database.\n");
935
  } else {
936 937
    $self->logger->debug(
                ucfirst($dbtype) . " db has sequence ($c entries).\n" );
938 939 940
  }

  return $err;
941
} ## end sub check_sequence
942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969


sub check_meta_entries {
  my $self = shift;
  my $dbtype = shift;
  
  # skip this check if db connection failed (this prevents re-throwing
  # exceptions).
  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
  
  my $err = 0;
  my $assembly_default;
  my $schema_version;
  
  eval {
    my $dba = $self->get_DBAdaptor($dbtype);
    $assembly_default = $self->fetch_value_from_db($dba,
      qq(SELECT meta_value FROM meta WHERE meta_key = 'assembly.default'));
    $schema_version = $self->fetch_value_from_db($dba,
      qq(SELECT meta_value FROM meta WHERE meta_key = 'schema_version'));
  };
  
  if ($@) {
    $self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n");
    return ++$err;
  }
  
  unless ($assembly_default) {
970
    $self->logger->warning("No meta.assembly.default value found in $dbtype db.\n");
971 972
    $err++;
  } else {
973
    $self->logger->debug("meta.assembly.default value found ($assembly_default).\n");
974 975 976
  }

  unless ($schema_version) {
977
    $self->logger->warning("No meta.schema_version value found in $dbtype db.\n");
978 979
    $err++;
  } else {
980
    $self->logger->debug("meta.schema_version value found ($schema_version).\n");
981 982 983 984 985 986 987
  }

  return $err;
}


sub fetch_value_from_db {