DensityFeatureAdaptor.pm 18.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#
# Ensembl module for Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
#
# Copyright EMBL/EBI
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor

=head1 SYNOPSIS

my $dfa = $database_adaptor->get_DensityFeatureAdaptor();

my $interpolate = 1;
my $blocks_wanted = 50;

@dense_feats = @{$dfa->fetch_all_by_Slice($slice,'SNPDensity',
                                             $blocks_wanted, $interpolate);}

=head1 DESCRIPTION

Density Feature Adaptor - An adaptor responsible for the creation of density
features from the database.

=head1 CONTACT

Post questions to the Ensembl developer list.

=head1 METHODS

=cut

package Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor;
use vars qw(@ISA);
use strict;

41
42

use POSIX;
43
use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
44
use Bio::EnsEMBL::Utils::Cache;
45
use Bio::EnsEMBL::DensityFeature;
Web Admin's avatar
Web Admin committed
46
use Bio::EnsEMBL::DensityFeatureSet;
47
use Bio::EnsEMBL::DensityType;
48
49
50
51
use Bio::EnsEMBL::Utils::Exception qw(throw warning);

@ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor);

52
our $DENSITY_FEATURE_CACHE_SIZE = 20;
53

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
=head2 new

  Arg [1]    : list of args @args
               Superclass constructor arguments
  Example    : none
  Description: Constructor which just initializes internal cache structures
  Returntype : Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor
  Exceptions : none
  Caller     : implementing subclass constructors

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_);

  #initialize an LRU cache
  my %cache;
  tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE);
  $self->{'_density_feature_cache'} = \%cache;

  return $self;
}
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

=head2 fetch_all_by_Slice

  Arg [1]    : Bio::EnsEMBL::Slice $slice - The slice representing the region
               to retrieve density features from.
  Arg [2]    : string $logic_name - The logic name of the density features to
               retrieve.
  Arg [3]    : int $num_blocks (optional; default = 50) - The number of
               features that are desired. The ratio between the size of these
               features and the size of the features in the database will be
               used to determine which database features will be used.
  Arg [4]    : boolean $interpolate (optional; default = 0) - A flag indicating
               whether the features in the database should be interpolated to
               fit them to the requested number of features.  If true the
               features will be interpolated to provide $num_blocks features.
               This will not guarantee that exactly $num_blocks features are
               returned due to rounding etc. but it will be close.
  Arg [5]    : float $max_ratio - The maximum ratio between the size of the
               requested features (as determined by $num_blocks) and the actual
               size of the features in the database.  If this value is exceeded
               then an empty list will be returned.  This can be used to
               prevent excessive interpolation of the database values.
  Example    : #interpolate:
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 10, 1);
               #do not interpoloate, get what is in the database:
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity', 50);
               #interpolate, but not too much
               $feats = $dfa->fetch_all_by_Slice($slice,'SNPDensity',50,1,5.0);
  Description: Retrieves a set of density features which overlap the region
               of this slice. Density features are a discrete representation
               of a continuous value along a sequence, such as a density or
               percent coverage.  Density Features may be stored in chunks of
               different sizes in the database, and interpolated to fit the
               desired size for the requested region.  For example the database
               may store a single density value for each 1KB and also for each
               1MB.  When fetching for an entire chromosome the 1MB density
               chunks will be used if the requested number of blocks is not
               very high.
118
119
               Note that features which have been interpolated are not stored
               in the database and as such will have no dbID or adaptor set.
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
  Returntype : Bio::EnsEMBL::DensityFeature
  Exceptions : warning on invalid $num_blocks argument
               warning if no features with logic_name $logic_name exist
               warning if density_type table has invalid block_size value
  Caller     : general

=cut

sub fetch_all_by_Slice {
  my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;

  if(defined($num_blocks) && $num_blocks < 1) {
    warning("Invalid number of density blocks [$num_blocks] requested.\n" .
	   "Returning empty list.");
    return [];
  }

  $num_blocks ||= 50;
  my $length = $slice->length();
139
140

  my $wanted_block_size = POSIX::ceil($length/$num_blocks);
141

142
143
144
145
  #
  # get out all of the density types and choose the one with the
  # block size closest to our desired block size
  #
146

147
  my $dta = $self->db()->get_DensityTypeAdaptor();
148

149
  my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)};
150
  if( ! @dtypes ){
151
152
153
154
155
    my @all_dtypes =  @{ $dta->fetch_all() };
    @all_dtypes or warning( "No DensityTypes in $dta" ) && return [];
    my $valid_list = join( ", ", map{$_->analysis->logic_name} @all_dtypes );
    warning( "No DensityTypes for logic name $logic_name. ".
	     "Select from $valid_list" );
156
157
    return [];
  }
158

159
160
  my $best_ratio   = undef;
  my $density_type = undef;
161

162
  foreach my $dt (@dtypes) {
163
    my $ratio;
164
165
    if($dt->block_size() < $wanted_block_size) {
      $ratio = $wanted_block_size/$dt->block_size();
166
    } else {
167
      $ratio = $dt->block_size()/$wanted_block_size;
168
169
170
171
    }

    if(!defined($best_ratio) || $ratio < $best_ratio) {
      $best_ratio = $ratio;
172
      $density_type = $dt;
173
174
175
176
177
178
179
180
181
182
    }
  }

  #the ratio was not good enough, or this logic name was not in the
  #density type table, return an empty list
  if(!defined($best_ratio) ||
     (defined($max_ratio) && $best_ratio > $max_ratio)) {
    return [];
  }

183
  my $constraint = "df.density_type_id = " . $density_type->dbID();
184
185

  my @features =
186
    @{$self->fetch_all_by_Slice_constraint($slice,$constraint)};
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

  #we don't want to interpolate if the ratio was very close
  $interpolate = 0 if($best_ratio < 1.05);

  return \@features if(!$interpolate);

  #interpolate the features into new features of a different size
  my @out;
  #sort the features on start position
  @features = sort({$a->start() <=> $b->end()} @features);

  #resize the features that were returned
  my $start = 1;
  my $end   = $start+$wanted_block_size-1;

202
203
204
205
206
207
208
209
210
  my $value_type = $density_type->value_type();

  # create a new density type object for the interpolated features that
  # is not stored in the database
  $density_type = Bio::EnsEMBL::DensityType->new
    (-VALUE_TYPE => $value_type,
     -ANALYSIS   => $density_type->analysis(),
     -BLOCK_SIZE => $wanted_block_size);

211
  while($start < $length) {
212
#    $end = $length if($end > $length);
213
214
215
216
217
218
219
220
221
222
223
224
225
226

    my $density_value = 0.0;
    my ($f, $fstart, $fend, $portion);
    my @dvalues;

    #construct a new feature using all of the old density features that
    #we overlapped
    while(($f = shift(@features)) && $end > $f->{'start'}) {

      #what portion of this feature are we using to construct our new block?
      $fstart = ($f->{'start'} < $start) ? $start : $f->{'start'};
      $fend   = ($f->{'end'}   > $end  ) ? $end   : $f->{'end'};
      $fend   = $length if($fend > $length);

227
      if($value_type eq 'sum') {
228
229
230
231
232
233
234

        $portion = ($fend-$fstart+1)/$f->length();

        #take a percentage of density value, depending on how much of the
        #feature we overlapped
        $density_value += $portion * $f->{'density_value'};

235
      } elsif($value_type eq 'ratio') {
236
237
238
239
240
241

        #maintain a running total of the length used from each feature
        #and its value
        push(@dvalues,[$fend-$fstart+1,$f->{'density_value'}]);

      } else {
242
        throw("Unknown density value type [$value_type].");
243
244
245
246
247
248
249
250
      }

      #do not want to look at next feature if we only used part of this one:
      last if($fend < $f->{'end'});
    }

    #if we did not completely overlap the last feature, put it back on so
    #it can be partially used by the next block
Graham McVicker's avatar
Graham McVicker committed
251
    if(defined($f) && (!defined($fend) || $fend < $f->{'end'})) {
252
253
254
      unshift(@features, $f);
    }

255
    if($value_type eq 'ratio') {
256
257
258
259
260
261
262
263
264
265
266
      #take a weighted average of the all the density values of the features
      #used to construct this one
      my $total_len = $end - $start + 1;
      if($total_len > 0) {
        foreach my $pair (@dvalues) {
          my ($dlen, $dval) = @$pair;
          $density_value += $dval * ($dlen/$total_len);
        }
      }
    }

267
268
    # interpolated features are not stored in the db so do not set the dbID
    # or the adaptor
269
270
271
272
    push @out, Bio::EnsEMBL::DensityFeature->new
      (-seq_region    => $slice,
       -start         => $start,
       -end           => $end,
273
       -density_type  => $density_type,
274
       -density_value => $density_value);
275
276
277
278
279
280
281
282
283
284
285
286

    $start = $end + 1;
    $end  += $wanted_block_size;
  }

  return \@out;
}


sub _tables {
  my $self = shift;

287
  return (['density_feature', 'df']);
288
289
290
291
292
293
294
295
}


sub _columns {
  my $self = shift;

  return qw( df.density_feature_id
             df.seq_region_id df.seq_region_start df.seq_region_end
296
             df.density_value df.density_type_id );
297
298
299
300
301
302
303
304
305
306
307
308
309
310
}



sub _objs_from_sth {
  my ($self, $sth, $mapper, $dest_slice) = @_;

  #
  # This code is ugly because an attempt has been made to remove as many
  # function calls as possible for speed purposes.  Thus many caches,
  # and a fair bit of gymnastics have been used.
  #

  my $sa = $self->db()->get_SliceAdaptor();
311
  my $dta = $self->db()->get_DensityTypeAdaptor();
312
313

  my @features;
314
  my %dtype_hash;
315
316
317
318
319
  my %slice_hash;
  my %sr_name_hash;
  my %sr_cs_hash;

  my($density_feature_id, $seq_region_id, $seq_region_start, $seq_region_end,
320
     $density_value, $density_type_id );
321
322

  $sth->bind_columns(\$density_feature_id, \$seq_region_id, \$seq_region_start,
323
                     \$seq_region_end, \$density_value, \$density_type_id);
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343

  my $asm_cs;
  my $cmp_cs;
  my $asm_cs_vers;
  my $asm_cs_name;
  my $cmp_cs_vers;
  my $cmp_cs_name;
  if($mapper) {
    $asm_cs = $mapper->assembled_CoordSystem();
    $cmp_cs = $mapper->component_CoordSystem();
    $asm_cs_name = $asm_cs->name();
    $asm_cs_vers = $asm_cs->version();
    $cmp_cs_name = $cmp_cs->name();
    $cmp_cs_vers = $cmp_cs->version();
  }

  my $dest_slice_start;
  my $dest_slice_end;
  my $dest_slice_strand;
  my $dest_slice_length;
344
345
  my $dest_slice_sr_name;

346
347
348
349
350
  if($dest_slice) {
    $dest_slice_start  = $dest_slice->start();
    $dest_slice_end    = $dest_slice->end();
    $dest_slice_strand = $dest_slice->strand();
    $dest_slice_length = $dest_slice->length();
351
    $dest_slice_sr_name = $dest_slice->seq_region_name();
352
353
354
  }

  FEATURE: while($sth->fetch()) {
355
356
357
    #get the density type object
    my $density_type = $dtype_hash{$density_type_id} ||=
      $dta->fetch_by_dbID($density_type_id);
358
359
360
361
362
363
364
365
366
367
368

    #get the slice object
    my $slice = $slice_hash{"ID:".$seq_region_id};

    if(!$slice) {
      $slice = $sa->fetch_by_seq_region_id($seq_region_id);
      $slice_hash{"ID:".$seq_region_id} = $slice;
      $sr_name_hash{$seq_region_id} = $slice->seq_region_name();
      $sr_cs_hash{$seq_region_id} = $slice->coord_system();
    }

369
370
371
372
    my $sr_name = $sr_name_hash{$seq_region_id};
    my $sr_cs   = $sr_cs_hash{$seq_region_id};
 
   #
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
    # remap the feature coordinates to another coord system
    # if a mapper was provided
    #
    if($mapper) {

      my $len = $seq_region_end - $seq_region_start + 1;

      my @coords =
        $mapper->map($sr_name, $seq_region_start, $seq_region_end,1, $sr_cs);

      #filter out gaps
      @coords = grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @coords;

      #throw out density features mapped to gaps, or split
      next FEATURE if(@coords != 1);

      $seq_region_start  = $coords[0]->{'start'};
      $seq_region_end    = $coords[0]->{'end'};
      $sr_name           = $coords[0]->{'id'};

393
      if($density_type->value_type() eq 'sum') {
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
        #adjust density value so it reflects length of feature actually used
        my $newlen = $seq_region_end - $seq_region_start +1;
        $density_value *= $newlen/$len if($newlen != $len);
      }

      #get a slice in the coord system we just mapped to
      if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) {
        $slice = $slice_hash{"NAME:$sr_name:$cmp_cs_name:$cmp_cs_vers"} ||=
          $sa->fetch_by_region($cmp_cs_name, $sr_name,undef, undef, undef,
                               $cmp_cs_vers);
      } else {
        $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||=
          $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef,
                               $asm_cs_vers);
      }
    }

    #
    # If a destination slice was provided convert the coords
    # If the dest_slice starts at 1 and is foward strand, nothing needs doing
    #
    if($dest_slice) {
      if($dest_slice_start != 1 || $dest_slice_strand != 1) {
        if($dest_slice_strand == 1) {
          $seq_region_start = $seq_region_start - $dest_slice_start + 1;
          $seq_region_end   = $seq_region_end   - $dest_slice_start + 1;
        } else {
          my $tmp_seq_region_start = $seq_region_start;
          $seq_region_start = $dest_slice_end - $seq_region_end + 1;
          $seq_region_end   = $dest_slice_end - $tmp_seq_region_start + 1;
        }
425
      }
426

427
428
429
430
      #throw away features entirely off the end of the requested slice
      if($seq_region_end < 1 || $seq_region_start > $dest_slice_length ||
	 ( $dest_slice_sr_name ne $sr_name )) {
	next FEATURE;
431
432
433
434
      }
      $slice = $dest_slice;
    }

435
    push @features, Bio::EnsEMBL::DensityFeature->new
436
437
438
439
      (-dbID          => $density_feature_id,
       -adaptor       => $self,
       -start         => $seq_region_start,
       -end           => $seq_region_end,
440
       -seq_region    => $slice,
441
442
       -density_value => $density_value,
       -density_type  => $density_type);
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
  }
  return \@features;
}


=head2 list_dbIDs

  Arg [1]    : none
  Example    : @feature_ids = @{$density_feature_adaptor->list_dbIDs()};
  Description: Gets an array of internal ids for all density features in the
               current db
  Returntype : list of ints
  Exceptions : none
  Caller     : ?

=cut

sub list_dbIDs {
   my ($self) = @_;

   return $self->_list_dbIDs("density_feature");
}

466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525

=head2 store

  Arg [1]    : list of Bio::EnsEMBL::DensityFeatures @df
               the simple features to store in the database
  Example    : $density_feature_adaptor->store(1234, @density_feats);
  Description: Stores a list of density feature objects in the database
  Returntype : none
  Exceptions : thrown if @df is not defined, if any of the features do not
               have an attached slice.
               or if any elements of @df are not Bio::EnsEMBL::SeqFeatures 
  Caller     : general

=cut

sub store{
  my ($self,@df) = @_;

  if( scalar(@df) == 0 ) {
    throw("Must call store with list of DensityFeatures");
  }
#mysql> desc density_feature;
#+--------------------+---------+------+-----+---------+----------------+
#| Field              | Type    | Null | Key | Default | Extra          |
#+--------------------+---------+------+-----+---------+----------------+
#| density_feature_id | int(11) |      | PRI | NULL    | auto_increment |
#| density_type_id    | int(11) |      | MUL | 0       |                |
#| seq_region_id      | int(11) |      |     | 0       |                |
#| seq_region_start   | int(11) |      |     | 0       |                |
#| seq_region_end     | int(11) |      |     | 0       |                |
#| density_value      | float   |      |     | 0       |                |
#+--------------------+---------+------+-----+---------+----------------+

  my $sth = $self->prepare
    ("INSERT INTO density_feature (seq_region_id, seq_region_start, " .
                                  "seq_region_end, density_type_id, " .
                                  "density_value) " .
     "VALUES (?,?,?,?,?)");

  my $db = $self->db();
  my $analysis_adaptor = $db->get_AnalysisAdaptor();

 FEATURE: foreach my $df ( @df ) {

    if( !ref $df || !$df->isa("Bio::EnsEMBL::DensityFeature") ) {
      throw("DensityFeature must be an Ensembl DensityFeature, " .
            "not a [".ref($df)."]");
    }

    if($df->is_stored($db)) {
      warning("DensityFeature [".$df->dbID."] is already stored" .
              " in this database.");
      next FEATURE;
    }

    if(!defined($df->density_type)) {
      throw("A density type must be attached to the features to be stored.");
    }

    #store the density_type if it has not been stored yet
526

527
    if(!$df->density_type->is_stored($db)) {
528
529
      my $dta = $db->get_DensityTypeAdaptor();
      $dta->store($df->density_type());
530
531
532
533
534
535
536
537
538
539
540
541
542
543
    }

    my $original = $df;
    my $seq_region_id;
    ($df, $seq_region_id) = $self->_pre_store($df);

    $sth->execute($seq_region_id, $df->start, $df->end,
                  $df->density_type->dbID, $df->density_value);

    $original->dbID($sth->{'mysql_insertid'});
    $original->adaptor($self);
  }
}

544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
=head2 fetch_Featureset_by_Slice 

  Arg [1-5]  : see
               Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
               for argument documentation
  Example    : $featureset = $dfa->fetch_FeatureSet_by_Slice($slice,'SNPDensity', 10, 1);
  Description: wrapper around
               Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
               which returns a Bio::EnsEMBL::DensityFeatureSet and also caches
               results
  Returntype : Bio::EnsEMBL::DensityFeatureSet
  Exceptions : none
  Caller     : general

=cut
559

Web Admin's avatar
Web Admin committed
560
sub fetch_Featureset_by_Slice {
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
  my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;

  my $key = join(":", $slice->name,
                      $logic_name,
                      $num_blocks || 50,
                      $interpolate || 0,
                      $max_ratio);
                      
  unless ($self->{'_density_feature_cache'}->{$key}) {
    my $dfeats = $self->fetch_all_by_Slice($slice, $logic_name, $num_blocks,
                                           $interpolate, $max_ratio);
    $self->{'_density_feature_cache'}->{$key} =
      new Bio::EnsEMBL::DensityFeatureSet($dfeats);
  }
  return $self->{'_density_feature_cache'}->{$key};
}

578

579
1;