From 536a92c9087ebf5bc9fc0af756f1923337d9f162 Mon Sep 17 00:00:00 2001
From: Dan Staines <dstaines@ebi.ac.uk>
Date: Mon, 18 Mar 2013 09:39:38 +0000
Subject: [PATCH] New method store_batch_on_ for faster bulk storage of
 attributes. New modules GeneGCBatch and PepStatsBatch use this method.
 Updated test to include use of this method.

---
 modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm | 36 +++++++++++
 .../Pipeline/Production/GeneGCBatch.pm        | 60 +++++++++++++++++
 .../Pipeline/Production/PepStatsBatch.pm      | 64 +++++++++++++++++++
 modules/t/attributeAdaptor.t                  | 25 ++++++++
 4 files changed, 185 insertions(+)
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm

diff --git a/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
index 9b855af26c..0afb3bea4d 100644
--- a/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
@@ -101,6 +101,42 @@ sub AUTOLOAD {
   return undef;
 }
 
+sub store_batch_on_ {
+  my ($self, $type, $attributes, $batch_size) = @_;
+  # Bulk-store attributes for many objects of one $type.
+  #   $attributes - hashref: object ID => arrayref of Attribute objects
+  #   $batch_size - rows per call to _store_batch_rows(); when false it
+  #                 defaults to the number of object IDs in $attributes
+  # Returns nothing.
+  $batch_size ||= scalar(values(%$attributes));
+  my %type_id_for;     # attrib code => attrib_type_id, cached after first lookup
+  my $pending = [];    # formatted value tuples waiting to be written
+  while (my ($object_id, $attrib_list) = each %{$attributes}) {
+    foreach my $attribute (@{$attrib_list}) {
+      my $code = $attribute->code();
+      unless (defined $type_id_for{$code}) {
+        $type_id_for{$code} = $self->_store_type($attribute);
+      }
+      my $quoted_value = $self->dbc()->db_handle()->quote($attribute->value());
+      push @{$pending}, sprintf('(%d, %d, %s)', $object_id, $type_id_for{$code}, $quoted_value);
+      # flush as soon as a full batch has accumulated
+      $pending = $self->_store_batch_rows($type, $pending) if scalar(@{$pending}) == $batch_size;
+    }
+  }
+  # write whatever remains from the final, partial batch
+  $self->_store_batch_rows($type, $pending);
+  return;
+} ## end sub store_batch_on_
+
+# Insert the value tuples in $rows into "<lc type>_attrib" with one INSERT; always returns a new empty arrayref.
+sub _store_batch_rows {
+  my ($self, $type, $rows) = @_;
+  return [] unless scalar(@{$rows}) > 0;    # nothing to write
+  my $sql = 'INSERT INTO ' . lc($type) . '_attrib() VALUES' . join(',', @{$rows});
+  $self->dbc()->sql_helper()->execute_update(-SQL => $sql);
+  return [];
+}
+
 sub store_on_ {
   my $self       = shift;
   my $type       = shift;
diff --git a/modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm b/modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm
new file mode 100644
index 0000000000..bcbac672a7
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm
@@ -0,0 +1,60 @@
+package Bio::EnsEMBL::Pipeline::Production::GeneGCBatch;
+
+use strict;
+use warnings;
+
+use base qw/Bio::EnsEMBL::Pipeline::Production::GeneGC/;
+
+# Batch variant of GeneGC: deletes the old 'GeneGC' attributes, reads %gc for
+# every fetched gene and stores the values in bulk via store_batch_on_Gene.
+sub run {
+  my ($self) = @_;
+  my $species = $self->param('species');
+  my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'core');
+  my $attrib_code = 'GeneGC';
+  $self->delete_old_attrib($dba, $attrib_code);
+
+  my $genes = Bio::EnsEMBL::Registry->get_adaptor($species, 'core', 'gene')->fetch_all();
+  my $aa = Bio::EnsEMBL::Registry->get_adaptor($species, 'core', 'Attribute');
+
+  # Select exactly the two columns unpacked below; an extra leading column
+  # would shift the code into $name and the name into $description.
+  my $prod_helper = $self->get_production_DBAdaptor()->dbc()->sql_helper();
+  my ($name, $description) = @{
+    $prod_helper->execute(
+      -SQL => q{
+    SELECT name, description
+    FROM attrib_type
+    WHERE code = ? },
+      -PARAMS => [$attrib_code])->[0]};
+
+  my $attributes = {};    # gene dbID => arrayref of Attribute objects
+  while (my $gene = shift @$genes) {
+    my $count = $gene->feature_Slice()->get_base_count->{'%gc'};
+    next unless $count > 0;    # only positive %gc values are stored
+    push @{$attributes->{$gene->dbID()}},
+      Bio::EnsEMBL::Attribute->new(-NAME        => $name,
+                                   -CODE        => $attrib_code,
+                                   -VALUE       => $count,
+                                   -DESCRIPTION => $description);
+  }
+  $aa->store_batch_on_Gene($attributes);
+  return;
+} ## end sub run
+
+# Delete the gene_attrib rows of the given attrib code for genes on seq_regions of $dba's species_id.
+sub delete_old_attrib {
+  my ($self, $dba, $attrib_code) = @_;
+  my $delete_sql = q{
+    DELETE ga
+    FROM gene_attrib ga, attrib_type at, gene g, seq_region s, coord_system cs
+    WHERE s.seq_region_id = g.seq_region_id
+    AND g.gene_id = ga.gene_id
+    AND cs.coord_system_id = s.coord_system_id
+    AND at.attrib_type_id = ga.attrib_type_id
+    AND cs.species_id = ?
+    AND at.code = ? };
+  $dba->dbc()->sql_helper()->execute_update(-SQL => $delete_sql, -PARAMS => [$dba->species_id(), $attrib_code]);
+}
+
+1;
+
diff --git a/modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm b/modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm
new file mode 100644
index 0000000000..f8a3e9ec28
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm
@@ -0,0 +1,64 @@
+package Bio::EnsEMBL::Pipeline::Production::PepStatsBatch;
+
+use strict;
+use warnings;
+use Log::Log4perl qw(get_logger :levels);
+use base qw/Bio::EnsEMBL::Pipeline::Production::PepStats/;
+
+use Bio::EnsEMBL::Attribute;
+
+# Deletes the old attributes for the codes from get_attrib_codes(), then passes
+# "<tmpdir>/<pid>.pep" to dump_translation() and run_pepstats() and stores the results.
+sub run {
+  my ($self) = @_;
+  my $species = $self->param('species');
+  my $dbtype  = $self->param('dbtype');
+  my $dba     = Bio::EnsEMBL::Registry->get_DBAdaptor($species, $dbtype);
+  # database types matching 'vega' or 'otherf' get the core database set as their dnadb
+  $dba->dnadb(Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'core'))
+    if $dbtype =~ /vega/ || $dbtype =~ /otherf/;
+  my $log    = get_logger();
+  my $helper = $dba->dbc()->sql_helper();    # not used below
+  my @attrib_codes = $self->get_attrib_codes();
+  $log->info("Deleting old codes");
+  $self->delete_old_attrib($dba, @attrib_codes);
+
+  my $tmpdir  = $self->param('tmpdir');
+  my $tmpfile = "$tmpdir/$$.pep";
+  $log->info("Dumping translations");
+  $self->dump_translation($dba, $tmpfile);
+  $log->info("Running pepstats");
+  my $results = $self->run_pepstats($tmpfile);
+  $log->info("Storing attribs");
+  $self->store_attribs($dba, $results);
+  $log->info("Completed");
+} ## end sub run
+
+# Turn pepstats results into Attribute objects and bulk-store them.
+#   $dba     - object providing get_AttributeAdaptor()
+#   $results - hashref: translation ID => { attrib code => value }
+# Returns nothing.
+sub store_attribs {
+  my ($self, $dba, $results) = @_;
+  my $type_for = $self->get_attrib_types();    # attrib code => { name, description }
+
+  # object ID => arrayref of Attribute objects, the structure handed to
+  # store_batch_on_Translation below
+  my %attribs_for;
+  for my $translation_id (keys %{$results}) {
+    for my $code (keys %{$results->{$translation_id}}) {
+      my $type      = $type_for->{$code};
+      my $attribute = Bio::EnsEMBL::Attribute->new(
+        -NAME        => $type->{name},
+        -CODE        => $code,
+        -VALUE       => $results->{$translation_id}{$code},
+        -DESCRIPTION => $type->{description});
+      push @{$attribs_for{$translation_id}}, $attribute;
+    }
+  }
+
+  # the second argument (1000) is passed on as the batch size
+  $dba->get_AttributeAdaptor()->store_batch_on_Translation(\%attribs_for, 1000);
+  return;
+} ## end sub store_attribs
+1;
diff --git a/modules/t/attributeAdaptor.t b/modules/t/attributeAdaptor.t
index 092a85430c..cadf6148bd 100644
--- a/modules/t/attributeAdaptor.t
+++ b/modules/t/attributeAdaptor.t
@@ -11,6 +11,7 @@ our $verbose = 1;
 our $clean   = 0;
 
 my $multi = Bio::EnsEMBL::Test::MultiTestDB->new;
+
 # get a core DBAdaptor
 my $db = $multi->get_DBAdaptor("core");
 
@@ -327,8 +328,32 @@ ok($count == 0);
   $aa->store_on_Gene($gene, [Bio::EnsEMBL::Attribute->new(%args, -VALUE => 0)]);
   my $new_rows = count_rows($db, 'gene_attrib');
   cmp_ok($new_rows, '>', $current_rows, 'Asserting the storage of undefined attributes will always store them');
+  # now remove again
+  $aa->remove_from_Gene($gene);
+  $count = $db->dbc->db_handle->selectall_arrayref("SELECT count(*) FROM gene_attrib " . "WHERE gene_id = " . $gene->dbID())->[0]->[0];
+
+  ok($count == 0);
+
 }
 
+#
+# Test batch storage: store_batch_on_Gene takes a hashref of gene dbID => arrayref of Attributes
+#
+
+my $gene2 = $ga->fetch_by_stable_id('ENSG00000131044');
+my %batch_args = (-NAME => 'test_name2', -CODE => 'test_code2', -DESCRIPTION => 'test_desc2');
+my $batch = {$gene->dbID()  => [Bio::EnsEMBL::Attribute->new(%batch_args, -VALUE => 'val1'), Bio::EnsEMBL::Attribute->new(%batch_args, -VALUE => 'val2')],
+             $gene2->dbID() => [Bio::EnsEMBL::Attribute->new(%batch_args, -VALUE => 'val3')]};
+my $current_rows = count_rows($db, 'gene_attrib');
+$aa->store_batch_on_Gene($batch);
+my $new_rows = count_rows($db, 'gene_attrib');
+# '==' compares the counts; a bare '=' is assignment and cannot detect a wrong row count
+cmp_ok($new_rows, '==', $current_rows + 3, 'Asserting the storage of multiple attributes will always store them');
+@attribs = @{$aa->fetch_all_by_Gene($gene)};
+is(scalar(@attribs), 2, 'Both batch-stored attributes are fetched for the first gene');
+@attribs = @{$aa->fetch_all_by_Gene($gene2)};
+is(scalar(@attribs), 1, 'The single batch-stored attribute is fetched for the second gene');
+
 $multi->restore('core', 'misc_attrib', 'seq_region_attrib', 'attrib_type');
 
 done_testing();
-- 
GitLab