From 30ffe150a19bcad1588c56988c25e8cfeacbacc6 Mon Sep 17 00:00:00 2001
From: Nathan Johnson <njohnson@ebi.ac.uk>
Date: Wed, 7 Jul 2010 10:06:16 +0000
Subject: [PATCH] updated RPKM methods set_config now only test 0
 wsize/skip_zero conflict for wsize passed as params added some more pod

---
 modules/Bio/EnsEMBL/Utils/Collector.pm | 213 ++++++++++++++-----------
 1 file changed, 119 insertions(+), 94 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Utils/Collector.pm b/modules/Bio/EnsEMBL/Utils/Collector.pm
index ab21d72ca2..20e0375c67 100644
--- a/modules/Bio/EnsEMBL/Utils/Collector.pm
+++ b/modules/Bio/EnsEMBL/Utils/Collector.pm
@@ -97,6 +97,8 @@ our $max_view_width = 1000000;   # Max bp width in location/detailed view
 our $max_data_type_size = 16777216;    # Default is 16MB for long BLOB
 # This is really a guide value as this should be set in the inheriting
 # Collector class by deducting the rest of the row size from this value.
+# Is is upto the inheritor to handle checking whether this size has been 
+# exceeded.
 
 # NOTE: Theoretically the min window size is: slice_length/(16777216/2)
 # So for human chr1:  249,250,621/(16777216/2) = 29.7 => 30.  However,
@@ -135,7 +137,7 @@ sub new {
   return bless {}, $_[0];   # Simply blesses this class as an empty hash.
 
   # Do not set anything here, as will not be first in ISA for
-  # FeatureAdaptors.  Hence not guaranteed to be called.
+  # FeatureAdaptors.  Hence, not guaranteed to be called.
 }
 
 =head2 new_assembly
@@ -168,7 +170,7 @@ sub new_assembly {
 #   in Adaptor/Collector.
 
 # Package variables used here instead of attrs to enable easy
-# default config in inheriting class/run script method provided
+# default config in inheriting class/script method. Provided
 # for easy/standardised fetch access outside of this package
 # i.e. Collectors/Adaptors
 
@@ -789,10 +791,44 @@ sub _get_Slice_chunks {
   return $self->{'_slice_chunks'};
 } ## end sub _get_Slice_chunks
 
-# set_config basically replaces the constructor as 'new' will not be
-# called for Adaptor based Collectors.  Separating the confg like this
-# from the store method is largely redundant, as we normally submit
-# separate Slice based jobs.
+
+
+
+=head2 set_config
+
+  Arg[0]     : optional hash - parameter hash(see above methods for more info):
+
+        WINDOW_SIZES        => array ref - subset of defined window
+                               sizes
+        BIN_METHOD          => string
+        MAX_VIEW_WIDTH      => int
+        MAX_DATA_TYPE_SIZE  => int
+        PACK_TEMPLATE       => string
+        PACKED_SIZE         => int
+        BIN_MODEL           => string
+        NEW_ASSEMBLY        => string
+        METHOD_CONFIG       => hash of method specific config params
+        SKIP_ZERO_WINDOW    => boolean - skips generation of 0 wsize
+                               this is used if already generated
+                               from an assembly projection.
+
+               NOTE: Over-riding any of the default config may cause
+               problems when storing or retrieving Collection data,
+               except sub sets of default window sizes.
+
+  Description: This method replaces the constructor as new will not be
+               called for Adaptor based Collectors.
+               Separating this from the store method is currently 
+               redundant as jobs are normally submitetd in Slice based 
+               jobs. However, this will be required if the store method 
+               is further seaprated into fetch/generate and store methods
+  Returntype : None
+  Exceptions : Throws if no window sizes or max_view_width defined
+  Caller     : Inheritor Collector e.g. Bio::EnsEMBL::Funcgen:Collector::ResultFeature
+               or script.
+  Status     : At Risk
+
+=cut
 
 sub set_config {
   my ( $self, %config ) = @_;
@@ -833,7 +869,7 @@ sub set_config {
   # Other vars
   $self->new_assembly($new_assm);
   $self->{'_only_natural'} = 0;
-  $self->{'_store_natural'} = grep /^0/, @$window_sizes;
+  $self->{'_store_natural'} = grep /^0$/, @$window_sizes;
 
   ### Set window_sizes
 
@@ -842,6 +878,17 @@ sub set_config {
       . "for large Collections, "
       . "defaulting to window_sizes = (0)\n";
 
+
+	if ( $skip_zero_window ) {
+	  throw(   "You cannot -skip_zero_window or "
+			   . "omit 0 from -window_sizes "
+			   . "when projecting to a new assembly($new_assm) "
+			   . "which should only be generated using window_size=0" );
+	}
+	
+
+
+
     # Then build the bins on the projected 0 level single Features
 
     # Test we haven't explicity set window_sizes to be something else
@@ -857,29 +904,23 @@ sub set_config {
     $self->window_sizes( [0] );
   } else {
 
-    if ( $window_sizes && $skip_zero_window && grep /^0$/,
-         @$window_sizes )
-    {
+    if ( $wsizes && $skip_zero_window && 
+		 ( grep /^0$/, @$wsizes )) {
+	  #Only test passed params not default config
+
       throw(   "You have specied skip_zero_window "
-             . "and window_size 0 in your config, "
+             . "and window_size 0 in your parameters, "
              . "please remove one of these" );
-    } elsif ( defined($window_sizes) && !grep /^0$/, @$window_sizes ) {
+    } 
+	elsif ( defined($window_sizes) && !grep /^0$/, @$window_sizes ) {
       $skip_zero_window = 1;
       # re-add 0 window as we need this to build the collections
+	  # see ...
       unshift( @{$window_sizes}, 0 );
     }
-
-    # $window_sizes = $self->window_sizes($window_sizes);
-  }
-
-  # This is already done in the script
-  if ( $skip_zero_window && $new_assm ) {
-    throw(   "You cannot -skip_zero_window or "
-           . "omit 0 from -window_sizes "
-           . "when projecting to a new assembly($new_assm) "
-           . "which should only be generated using window_size=0" );
   }
 
+ 
   if ( $self->{'_store_natural'} && scalar( @{$window_sizes} ) == 1 ) {
     $self->{'_only_natural'} = 1;
   }
@@ -893,26 +934,7 @@ sub set_config {
 =head2 store_window_bin_by_Slice
 
   Arg[0]     : Bio::EnsEMBL:Slice
-  Arg[1]     : optional hash - parameter hash(see above methods for more info):
-
-        WINDOW_SIZES        => array ref - subset of defined window
-                               sizes
-        BIN_METHOD          => string
-        MAX_VIEW_WIDTH      => int
-        MAX_DATA_TYPE_SIZE  => int
-        PACK_TEMPLATE       => string
-        PACKED_SIZE         => int
-        BIN_MODEL           => string
-        NEW_ASSEMBLY        => string
-        SKIP_ZERO_WINDOW    => boolean - skips generation of 0 wsize
-                               this is used if already generated
-                               from an assembly projection.
-
-               NOTE: Over-riding any of the default config may cause
-               problems when storing or retrieving Collection data,
-               except sub sets of default window sizes.
-
-  Example    : $collector->store_window_bins_by_Slice($slice, %config);
+  Example    : $collector->store_window_bins_by_Slice($slice);
   Description: This is the main run method, it loops through
                optimal slice chunks from _define_window_chunks,
                calls _bin_features_by_Slice as appropriate and
@@ -1435,7 +1457,7 @@ sub _calculate_max_magnitude {
 } ## end sub _calculate_max_magnitude
 
 
-=head2 _post_process_mac_magnitude
+=head2 _post_process_max_magnitude
 
   Args[0]    : hashref - score bins
   Args[1]    : hashref - count bins
@@ -1522,34 +1544,35 @@ sub _calculate_RPKM {
 sub _post_process_RPKM {
   my ( $self, $bins_ref ) = @_;
 
-  # 10^9 x C / NGB
-  # C = Reads overlapping bin
-  # N = Total reads in the experiment
-  # G = Length of genome in bps
-  # (don't really have to account for non-ref/HAPs or gender here
-  # as should be close enough, CellTypes/gender differences will be
-  # miniscule)
-  # B = length of each bin
-
-  foreach my $wsize ( keys %{$bins_ref} ) {
-    foreach my $bin_index ( 0 .. $#{ $bins_ref->{$wsize} } ) {
-      $bins_ref->{$wsize}->[$bin_index] =
-        ( ( 10**9 )*$bins_ref->{$wsize}->[$bin_index] )/
-        ( ( $self->_RPKM_factor )*$wsize );
-    }
+  #10^9 x C / NGB
+  #C = Reads overlapping bin
+  #N = Total reads in the experiment
+  #G = Length of bin in bps 
+  #(don't really have to account for non-ref/HAPs or gender here 
+  #as should be close enough, CellTypes/gender differences will be miniscule)
+  #B = length of each bin
+
+  foreach my $wsize(keys %{$bins_ref}){
+	
+	foreach my $bin_index(0..$#{$bins_ref->{$wsize}}){
+	  $bins_ref->{$wsize}->[$bin_index] =  
+		((10**9) *  
+		 $bins_ref->{$wsize}->[$bin_index])/(($self->_RPKM_factor($wsize)) * $wsize);
+	}
   }
 
   return;
+
 }
 
 
 =head2 _set_up_RPKM
 
   Args[0]    : hashref - method config e.g
-
-                { DNADB         => Bio::EnsEMBL::DBSQL::DBAdaptor,
+                 {
+                  DNADB         => Bio::EnsEMBL::DBSQL::DBAdaptor,
                   TOTAL_FEATURE => $total_feature_count,
-                  GENDER        => 'female', }
+                 }
 
   Example    : $self->$set_up_method($config);
   Description: Sets the RPKM factor
@@ -1560,37 +1583,30 @@ sub _post_process_RPKM {
 
 =cut	
 
-sub _set_up_RPKM {
-  my ( $self, $config ) = @_;
-
-  my ( $dnadb, $total_features, $gender ) =
-    rearrange( [ 'DNADB', 'TOTAL_FEATURES', 'GENDER' ], %{$config} );
-
-  if ( !defined($self->{'_RPKM_factor'}) ) {
-
-    # Test specifically here to notify about config hash
-    if ( !defined($total_features) ) {
-      throw(   "For RPKM you must pass a valid 'total_features' "
-             . "as part of the method config hash." );
-    }
 
-    if ( !defined($gender) ) {
-      warn("No gender specified from cell. Defaulting to 'female'");
-      $gender = 'female';
-    }
+sub _set_up_RPKM{
+  my ($self, $config) = @_;
 
-    if ( !defined($dnadb) ) {
-      throw(   "For RPKM you must pass 'dnadb' "
-             . "as part of the method config hash." );
-    }
-    # Further validation done in get_diploid_genome_length_by_gender
+  my ($dnadb, $total_features) = rearrange([ 'DNADB', 'TOTAL_FEATURES'], %{$config});
+  
+  #Test specifically here to notify about config hash
+  if(! $total_features){
+	throw("For RPKM you must pass a valid 'total_features' ".
+		  "as part of the method config hash.");
+  }
+  
+  if(! $dnadb){
+	throw("For RPKM you must pass 'dnadb' as part of the method config hash.");
+  }
 
-    my $dip_genome_length =
-      &get_diploid_genome_length_by_gender( $dnadb, $gender );
+  foreach my $wsize(@{$self->window_sizes}){
+	#Should never have 0 here
+	$self->_RPKM_factor($wsize, ($wsize * $total_features));	#N*G
 
-    $self->_RPKM_factor( $dip_genome_length*$total_features );    # N*G
+	warn "setting $wsize RPKM factor($wsize * $total_features) to ".	
+	  $self->_RPKM_factor($wsize);
   }
-
+  
   return;
 } ## end sub _set_up_RPKM
 
@@ -1599,7 +1615,7 @@ sub _set_up_RPKM {
 
   Args[0]    : int - RPKM factor i.e.  (Total reads in the experiment *
                Genome length)
-  Example    : $self->_RPKM_factor$set_up_method($config);
+  Example    : $self->_RPKM_factor($wsize, $factor);
   Description: Gets/Sets the RPKM factor
   Returntype : int
   Exceptions : None
@@ -1608,16 +1624,25 @@ sub _set_up_RPKM {
 
 =cut	
 
-sub _RPKM_factor {
-  my ( $self, $factor ) = @_;
+sub _RPKM_factor{
+  my ($self, $wsize, $factor) = @_;
 
-  if ( defined($factor) ) {
-    $self->{'RPKM_factor'} = $factor;
+  if (! defined $wsize){
+	throw("You must pass at least window_size to get or set the RPKM factor");
   }
 
-  return $self->{'RPKM_factor'};
-}
+  if(defined $factor){
+	$self->{'RPKM_factor'}{$wsize} = $factor;
+  }
+  elsif(! exists $self->{'RPKM_factor'}{$wsize}){
+	#This should never happen unless the window sizes 
+	#are redefined after initialisation
+	throw("You have requested an RPKM factor for a window_size".
+		  " which has not been set:\t$wsize");
+  }
 
+  return $self->{'RPKM_factor'}{$wsize};
+}
 
 =head2 get_diploid_genome_length_by_gender
 
-- 
GitLab