DensityFeatureAdaptor.pm 19.9 KB
Newer Older
1
=head1 LICENSE
2

Andy Yates's avatar
Andy Yates committed
3
  Copyright (c) 1999-2012 The European Bioinformatics Institute and
4
5
6
7
8
9
10
11
12
13
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
14
  developers list at <dev@ensembl.org>.
15
16
17
18
19

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut
20
21
22
23
24
25
26

=head1 NAME

Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor

=head1 SYNOPSIS

27
  my $dfa = $database_adaptor->get_DensityFeatureAdaptor();
28

29
30
  my $interpolate   = 1;
  my $blocks_wanted = 50;
31

32
33
34
35
  @dense_feats = @{
    $dfa->fetch_all_by_Slice( $slice, 'SNPDensity', $blocks_wanted,
      $interpolate );
    }
36
37
38
39
40
41
42
43
44
45
46
47
48
49

=head1 DESCRIPTION

Density Feature Adaptor - An adaptor responsible for the creation of density
features from the database.

=head1 METHODS

=cut

package Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor;
use vars qw(@ISA);
use strict;

50
51

use POSIX;
52
use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
53
use Bio::EnsEMBL::Utils::Cache;
54
use Bio::EnsEMBL::DensityFeature;
Web Admin's avatar
Web Admin committed
55
use Bio::EnsEMBL::DensityFeatureSet;
56
use Bio::EnsEMBL::DensityType;
57
58
59
60
use Bio::EnsEMBL::Utils::Exception qw(throw warning);

@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor);

61
our $DENSITY_FEATURE_CACHE_SIZE = 20;
62

63
64
65
66
67
68
69
70
71
=head2 new

  Arg [1]    : list of args @args
               Superclass constructor arguments
  Example    : none
  Description: Constructor which just initializes internal cache structures
  Returntype : Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
  Exceptions : none
  Caller     : implementing subclass constructors
72
  Status     : Stable
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_);

  #initialize an LRU cache
  my %cache;
  tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE);
  $self->{'_density_feature_cache'} = \%cache;

  return $self;
}
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

=head2 fetch_all_by_Slice

  Arg [1]    : Bio::EnsEMBL::Slice $slice - The slice representing the region
               to retrieve density features from.
  Arg [2]    : string $logic_name - The logic name of the density features to
               retrieve.
  Arg [3]    : int $num_blocks (optional; default = 50) - The number of
               features that are desired. The ratio between the size of these
               features and the size of the features in the database will be
               used to determine which database features will be used.
  Arg [4]    : boolean $interpolate (optional; default = 0) - A flag indicating
               whether the features in the database should be interpolated to
               fit them to the requested number of features.  If true the
               features will be interpolated to provide $num_blocks features.
               This will not guarantee that exactly $num_blocks features are
               returned due to rounding etc. but it will be close.
  Arg [5]    : float $max_ratio - The maximum ratio between the size of the
               requested features (as determined by $num_blocks) and the actual
               size of the features in the database.  If this value is exceeded
               then an empty list will be returned.  This can be used to
               prevent excessive interpolation of the database values.
  Example    : #interpolate:
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 10, 1);
               #do not interpoloate, get what is in the database:
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 50);
               #interpolate, but not too much
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity',50,1,5.0);
  Description: Retrieves a set of density features which overlap the region
               of this slice. Density features are a discrete representation
               of a continuous value along a sequence, such as a density or
               percent coverage.  Density Features may be stored in chunks of
               different sizes in the database, and interpolated to fit the
               desired size for the requested region.  For example the database
               may store a single density value for each 1KB and also for each
               1MB.  When fetching for an entire chromosome the 1MB density
               chunks will be used if the requested number of blocks is not
               very high.
128
129
               Note that features which have been interpolated are not stored
               in the database and as such will have no dbID or adaptor set.
130
131
132
133
134
  Returntype : Bio::EnsEMBL::DensityFeature
  Exceptions : warning on invalid $num_blocks argument
               warning if no features with logic_name $logic_name exist
               warning if density_type table has invalid block_size value
  Caller     : general
135
  Status     : Stable
136
137
138
139
140
141
142
143
144
145
146
147
148
149

=cut

sub fetch_all_by_Slice {
  my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;

  if(defined($num_blocks) && $num_blocks < 1) {
    warning("Invalid number of density blocks [$num_blocks] requested.\n" .
	   "Returning empty list.");
    return [];
  }

  $num_blocks ||= 50;
  my $length = $slice->length();
150
151

  my $wanted_block_size = POSIX::ceil($length/$num_blocks);
152

153
154
155
156
  #
  # get out all of the density types and choose the one with the
  # block size closest to our desired block size
  #
157

158
  my $dta = $self->db()->get_DensityTypeAdaptor();
159

160
  my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)};
161
  if( ! @dtypes ){
162
163
164
165
166
    my @all_dtypes =  @{ $dta->fetch_all() };
    @all_dtypes or warning( "No DensityTypes in $dta" ) && return [];
    my $valid_list = join( ", ", map{$_->analysis->logic_name} @all_dtypes );
    warning( "No DensityTypes for logic name $logic_name. ".
	     "Select from $valid_list" );
167
168
    return [];
  }
169

170
171
  my $best_ratio   = undef;
  my $density_type = undef;
172
173
  my $best_ratio_large = undef;
  my $density_type_large = undef;
Arne Stabenau's avatar
Arne Stabenau committed
174
  my %dt_ratio_hash;
175

176
  foreach my $dt (@dtypes) {
177
178
179
180
181
182
183
184
185
186
187

    my $ratio;
    if( $dt->block_size() > 0 ) {
      $ratio = $wanted_block_size/$dt->block_size();
    } else {
      # This is only valid if the requested seq_region is the one the 
      # features are stored on. Please use sensibly. Or find better implementation.

      my $block_size = $slice->seq_region_length() / $dt->region_features();
      $ratio = $wanted_block_size / $block_size;
    }
Arne Stabenau's avatar
Arne Stabenau committed
188
    
189
    # we prefer to use a block size that's smaller than the required one
Arne Stabenau's avatar
Arne Stabenau committed
190
191
192
193
    # (better results on interpolation).
    # give larger bits a disadvantage and make them comparable
    if( $ratio < 1 ) {
      $ratio = 5/$ratio;
194
    }
Arne Stabenau's avatar
Arne Stabenau committed
195
196

    $dt_ratio_hash{ $ratio } = $dt;
197
  }
198

Arne Stabenau's avatar
Arne Stabenau committed
199
200
  $best_ratio = (sort {$a<=>$b} (keys %dt_ratio_hash))[0];
  
201
202
203
204
205
206
207
  #the ratio was not good enough, or this logic name was not in the
  #density type table, return an empty list
  if(!defined($best_ratio) ||
     (defined($max_ratio) && $best_ratio > $max_ratio)) {
    return [];
  }

Arne Stabenau's avatar
Arne Stabenau committed
208
209
  $density_type = $dt_ratio_hash{$best_ratio};

210
  my $constraint = "df.density_type_id = " . $density_type->dbID();
211
212

  my @features =
213
    @{$self->fetch_all_by_Slice_constraint($slice,$constraint)};
214
215
216
217
218
219

  return \@features if(!$interpolate);

  #interpolate the features into new features of a different size
  my @out;
  #sort the features on start position
220
  @features = sort( { $a->start() <=> $b->start() } @features );
221
222
223
224
225

  #resize the features that were returned
  my $start = 1;
  my $end   = $start+$wanted_block_size-1;

226
227
228
229
230
231
232
233
234
  my $value_type = $density_type->value_type();

  # create a new density type object for the interpolated features that
  # is not stored in the database
  $density_type = Bio::EnsEMBL::DensityType->new
    (-VALUE_TYPE => $value_type,
     -ANALYSIS   => $density_type->analysis(),
     -BLOCK_SIZE => $wanted_block_size);

235
  while($start < $length) {
236
#    $end = $length if($end > $length);
237
238
239
240
241
242
243
244
245
246
247
248
249
250

    my $density_value = 0.0;
    my ($f, $fstart, $fend, $portion);
    my @dvalues;

    #construct a new feature using all of the old density features that
    #we overlapped
    while(($f = shift(@features)) && $end > $f->{'start'}) {

      #what portion of this feature are we using to construct our new block?
      $fstart = ($f->{'start'} < $start) ? $start : $f->{'start'};
      $fend   = ($f->{'end'}   > $end  ) ? $end   : $f->{'end'};
      $fend   = $length if($fend > $length);

251
      if($value_type eq 'sum') {
252
253
254
255
256
257
258

        $portion = ($fend-$fstart+1)/$f->length();

        #take a percentage of density value, depending on how much of the
        #feature we overlapped
        $density_value += $portion * $f->{'density_value'};

259
      } elsif($value_type eq 'ratio') {
260
261
262
263
264
265

        #maintain a running total of the length used from each feature
        #and its value
        push(@dvalues,[$fend-$fstart+1,$f->{'density_value'}]);

      } else {
266
        throw("Unknown density value type [$value_type].");
267
268
269
270
271
272
273
274
      }

      #do not want to look at next feature if we only used part of this one:
      last if($fend < $f->{'end'});
    }

    #if we did not completely overlap the last feature, put it back on so
    #it can be partially used by the next block
Graham McVicker's avatar
Graham McVicker committed
275
    if(defined($f) && (!defined($fend) || $fend < $f->{'end'})) {
276
277
278
      unshift(@features, $f);
    }

279
    if($value_type eq 'ratio') {
280
281
282
283
284
285
286
287
288
289
290
      #take a weighted average of the all the density values of the features
      #used to construct this one
      my $total_len = $end - $start + 1;
      if($total_len > 0) {
        foreach my $pair (@dvalues) {
          my ($dlen, $dval) = @$pair;
          $density_value += $dval * ($dlen/$total_len);
        }
      }
    }

291
292
    # interpolated features are not stored in the db so do not set the dbID
    # or the adaptor
293
294
295
296
    push @out, Bio::EnsEMBL::DensityFeature->new
      (-seq_region    => $slice,
       -start         => $start,
       -end           => $end,
297
       -density_type  => $density_type,
298
       -density_value => $density_value);
299
300
301
302
303
304
305
306
307
308
309
310

    $start = $end + 1;
    $end  += $wanted_block_size;
  }

  return \@out;
}


sub _tables {
  my $self = shift;

311
  return (['density_feature', 'df']);
312
313
314
315
316
317
318
319
}


sub _columns {
  my $self = shift;

  return qw( df.density_feature_id
             df.seq_region_id df.seq_region_start df.seq_region_end
320
             df.density_value df.density_type_id );
321
322
323
324
325
326
327
328
329
330
331
332
333
334
}



sub _objs_from_sth {
  my ($self, $sth, $mapper, $dest_slice) = @_;

  #
  # This code is ugly because an attempt has been made to remove as many
  # function calls as possible for speed purposes.  Thus many caches,
  # and a fair bit of gymnastics have been used.
  #

  my $sa = $self->db()->get_SliceAdaptor();
335
  my $dta = $self->db()->get_DensityTypeAdaptor();
336
337

  my @features;
338
  my %dtype_hash;
339
340
341
342
343
  my %slice_hash;
  my %sr_name_hash;
  my %sr_cs_hash;

  my($density_feature_id, $seq_region_id, $seq_region_start, $seq_region_end,
344
     $density_value, $density_type_id );
345
346

  $sth->bind_columns(\$density_feature_id, \$seq_region_id, \$seq_region_start,
347
                     \$seq_region_end, \$density_value, \$density_type_id);
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367

  my $asm_cs;
  my $cmp_cs;
  my $asm_cs_vers;
  my $asm_cs_name;
  my $cmp_cs_vers;
  my $cmp_cs_name;
  if($mapper) {
    $asm_cs = $mapper->assembled_CoordSystem();
    $cmp_cs = $mapper->component_CoordSystem();
    $asm_cs_name = $asm_cs->name();
    $asm_cs_vers = $asm_cs->version();
    $cmp_cs_name = $cmp_cs->name();
    $cmp_cs_vers = $cmp_cs->version();
  }

  my $dest_slice_start;
  my $dest_slice_end;
  my $dest_slice_strand;
  my $dest_slice_length;
368
  my $dest_slice_sr_name;
369
  my $dest_slice_sr_id;
370

371
372
373
374
375
  if($dest_slice) {
    $dest_slice_start  = $dest_slice->start();
    $dest_slice_end    = $dest_slice->end();
    $dest_slice_strand = $dest_slice->strand();
    $dest_slice_length = $dest_slice->length();
376
    $dest_slice_sr_name = $dest_slice->seq_region_name();
377
    $dest_slice_sr_id  = $dest_slice->get_seq_region_id();
378
379
380
  }

  FEATURE: while($sth->fetch()) {
381
382
383
    #get the density type object
    my $density_type = $dtype_hash{$density_type_id} ||=
      $dta->fetch_by_dbID($density_type_id);
384
385

    #get the slice object
386
387
    #need to get the internal_seq_region, if present
    $seq_region_id = $self->get_seq_region_id_internal($seq_region_id);
388
389
390
391
392
393
394
395
    my $slice = $slice_hash{"ID:".$seq_region_id};
    if(!$slice) {
      $slice = $sa->fetch_by_seq_region_id($seq_region_id);
      $slice_hash{"ID:".$seq_region_id} = $slice;
      $sr_name_hash{$seq_region_id} = $slice->seq_region_name();
      $sr_cs_hash{$seq_region_id} = $slice->coord_system();
    }

396
397
398
399
    my $sr_name = $sr_name_hash{$seq_region_id};
    my $sr_cs   = $sr_cs_hash{$seq_region_id};
 
   #
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
    # remap the feature coordinates to another coord system
    # if a mapper was provided
    #
    if($mapper) {

      my $len = $seq_region_end - $seq_region_start + 1;

      my @coords =
        $mapper->map($sr_name, $seq_region_start, $seq_region_end,1, $sr_cs);

      #filter out gaps
      @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords;

      #throw out density features mapped to gaps, or split
      next FEATURE if(@coords != 1);

      $seq_region_start  = $coords[0]->{'start'};
      $seq_region_end    = $coords[0]->{'end'};
418
      $seq_region_id     = $coords[0]->{'id'};
419

420
      if($density_type->value_type() eq 'sum') {
421
422
423
424
425
426
        #adjust density value so it reflects length of feature actually used
        my $newlen = $seq_region_end - $seq_region_start +1;
        $density_value *= $newlen/$len if($newlen != $len);
      }

      #get a slice in the coord system we just mapped to
427
428
429
430
431
432
433
434
#      if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) {
        $slice = $slice_hash{"ID:".$seq_region_id} ||=
          $sa->fetch_by_seq_region_id($seq_region_id);
#      } else {
#        $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||=
#          $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef,
#                               $asm_cs_vers);
#      }
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
    }

    #
    # If a destination slice was provided convert the coords
    # If the dest_slice starts at 1 and is foward strand, nothing needs doing
    #
    if($dest_slice) {
      if($dest_slice_start != 1 || $dest_slice_strand != 1) {
        if($dest_slice_strand == 1) {
          $seq_region_start = $seq_region_start - $dest_slice_start + 1;
          $seq_region_end   = $seq_region_end   - $dest_slice_start + 1;
        } else {
          my $tmp_seq_region_start = $seq_region_start;
          $seq_region_start = $dest_slice_end - $seq_region_end + 1;
          $seq_region_end   = $dest_slice_end - $tmp_seq_region_start + 1;
        }
451
      }
452

453
454
      #throw away features entirely off the end of the requested slice
      if($seq_region_end < 1 || $seq_region_start > $dest_slice_length ||
455
	 ( $dest_slice_sr_id ne $seq_region_id )) {
456
	next FEATURE;
457
458
459
460
      }
      $slice = $dest_slice;
    }

461
462
463
464
465
466
467
468
469
470
471
    push( @features,
          $self->_create_feature( 'Bio::EnsEMBL::DensityFeature', {
                                    -dbID       => $density_feature_id,
                                    -adaptor    => $self,
                                    -start      => $seq_region_start,
                                    -end        => $seq_region_end,
                                    -seq_region => $slice,
                                    -density_value => $density_value,
                                    -density_type  => $density_type
                                  } ) );

472
  }
473

474
475
476
477
478
479
480
481
482
483
  return \@features;
}


=head2 list_dbIDs

  Arg [1]    : none
  Example    : @feature_ids = @{$density_feature_adaptor->list_dbIDs()};
  Description: Gets an array of internal ids for all density features in the
               current db
484
  Arg[1]     : <optional> int. not set to 0 for the ids to be sorted by the seq_region.
485
486
487
  Returntype : list of ints
  Exceptions : none
  Caller     : ?
488
  Status     : Stable
489
490
491
492

=cut

sub list_dbIDs {
493
   my ($self, $ordered) = @_;
494

495
   return $self->_list_dbIDs("density_feature",undef, $ordered);
496
497
}

498
499
500
501
502
503
504
505
506
507
508
509

=head2 store

  Arg [1]    : list of Bio::EnsEMBL::DensityFeatures @df
               the simple features to store in the database
  Example    : $density_feature_adaptor->store(1234, @density_feats);
  Description: Stores a list of density feature objects in the database
  Returntype : none
  Exceptions : thrown if @df is not defined, if any of the features do not
               have an attached slice.
               or if any elements of @df are not Bio::EnsEMBL::SeqFeatures 
  Caller     : general
510
  Status     : Stable
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546

=cut

sub store{
  my ($self,@df) = @_;

  if( scalar(@df) == 0 ) {
    throw("Must call store with list of DensityFeatures");
  }
#mysql> desc density_feature;
#+--------------------+---------+------+-----+---------+----------------+
#| Field              | Type    | Null | Key | Default | Extra          |
#+--------------------+---------+------+-----+---------+----------------+
#| density_feature_id | int(11) |      | PRI | NULL    | auto_increment |
#| density_type_id    | int(11) |      | MUL | 0       |                |
#| seq_region_id      | int(11) |      |     | 0       |                |
#| seq_region_start   | int(11) |      |     | 0       |                |
#| seq_region_end     | int(11) |      |     | 0       |                |
#| density_value      | float   |      |     | 0       |                |
#+--------------------+---------+------+-----+---------+----------------+

  my $sth = $self->prepare
    ("INSERT INTO density_feature (seq_region_id, seq_region_start, " .
                                  "seq_region_end, density_type_id, " .
                                  "density_value) " .
     "VALUES (?,?,?,?,?)");

  my $db = $self->db();
  my $analysis_adaptor = $db->get_AnalysisAdaptor();

 FEATURE: foreach my $df ( @df ) {

    if( !ref $df || !$df->isa("Bio::EnsEMBL::DensityFeature") ) {
      throw("DensityFeature must be an Ensembl DensityFeature, " .
            "not a [".ref($df)."]");
    }
547
548
549
    
    # we dont store 0 value density features
    next if( $df->density_value == 0 );
550
551
552
553
554
555
556
557
558
559
560
    if($df->is_stored($db)) {
      warning("DensityFeature [".$df->dbID."] is already stored" .
              " in this database.");
      next FEATURE;
    }

    if(!defined($df->density_type)) {
      throw("A density type must be attached to the features to be stored.");
    }

    #store the density_type if it has not been stored yet
561

562
    if(!$df->density_type->is_stored($db)) {
563
564
      my $dta = $db->get_DensityTypeAdaptor();
      $dta->store($df->density_type());
565
566
567
568
569
570
    }

    my $original = $df;
    my $seq_region_id;
    ($df, $seq_region_id) = $self->_pre_store($df);

571
572
573
574
575
576
    $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
    $sth->bind_param(2,$df->start,SQL_INTEGER);
    $sth->bind_param(3,$df->end,SQL_INTEGER);
    $sth->bind_param(4,$df->density_type->dbID,SQL_INTEGER);
    $sth->bind_param(5,$df->density_value,SQL_FLOAT);
    $sth->execute();
577
578
579
580
581
582

    $original->dbID($sth->{'mysql_insertid'});
    $original->adaptor($self);
  }
}

583
584
585
586
587
588
589
590
591
592
593
594
595
=head2 fetch_Featureset_by_Slice 

  Arg [1-5]  : see
               Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
               for argument documentation
  Example    : $featureset = $dfa->fetch_FeatureSet_by_Slice($slice,'SNPDensity', 10, 1);
  Description: wrapper around
               Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
               which returns a Bio::EnsEMBL::DensityFeatureSet and also caches
               results
  Returntype : Bio::EnsEMBL::DensityFeatureSet
  Exceptions : none
  Caller     : general
596
  Status     : Stable
597
598

=cut
599

Web Admin's avatar
Web Admin committed
600
sub fetch_Featureset_by_Slice {
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
  my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;

  my $key = join(":", $slice->name,
                      $logic_name,
                      $num_blocks || 50,
                      $interpolate || 0,
                      $max_ratio);
                      
  unless ($self->{'_density_feature_cache'}->{$key}) {
    my $dfeats = $self->fetch_all_by_Slice($slice, $logic_name, $num_blocks,
                                           $interpolate, $max_ratio);
    $self->{'_density_feature_cache'}->{$key} =
      new Bio::EnsEMBL::DensityFeatureSet($dfeats);
  }
  return $self->{'_density_feature_cache'}->{$key};
}

618

619
1;