From 536a92c9087ebf5bc9fc0af756f1923337d9f162 Mon Sep 17 00:00:00 2001
From: Dan Staines <dstaines@ebi.ac.uk>
Date: Mon, 18 Mar 2013 09:39:38 +0000
Subject: [PATCH] New method store_batch_on_ for faster bulk storage of
 attributes. New modules GeneGCBatch and PepStatsBatch use this method.
 Updated test to include use of this method.

---
 modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm | 47 ++++++++++++
 .../Pipeline/Production/GeneGCBatch.pm        | 72 ++++++++++++++++++
 .../Pipeline/Production/PepStatsBatch.pm      | 72 ++++++++++++++++++
 modules/t/attributeAdaptor.t                  | 30 ++++++++
 4 files changed, 221 insertions(+)
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm

diff --git a/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
index 9b855af26c..0afb3bea4d 100644
--- a/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
+++ b/modules/Bio/EnsEMBL/DBSQL/AttributeAdaptor.pm
@@ -101,6 +101,53 @@ sub AUTOLOAD {
   return undef;
 }
 
+# Store attributes for many objects of one type with multi-row INSERTs.
+#   $type       - object type; its lower-cased form prefixes the
+#                 '_attrib' table name
+#   $attributes - hashref where the key is the object ID and the value
+#                 is an array ref of Attribute objects
+#   $batch_size - optional number of rows per INSERT statement; defaults
+#                 to the number of object IDs in $attributes
+sub store_batch_on_ {
+  my ($self, $type, $attributes, $batch_size) = @_;
+
+  # maintain a hash of attrib type IDs by code so we don't have to keep looking them up...
+  my $attrib_type_ids = {};
+  # create an arrayref of the values to store
+  my $rows = [];
+  $batch_size ||= scalar(values(%$attributes));
+  while (my ($obj_id, $attribs) = each %{$attributes}) {
+    for my $attrib (@{$attribs}) {
+      my $attrib_type_id = $attrib_type_ids->{$attrib->code()};
+      if (!defined $attrib_type_id) {
+        $attrib_type_id = $self->_store_type($attrib);
+        $attrib_type_ids->{$attrib->code()} = $attrib_type_id;
+      }
+      # IDs are forced to integers by %d; the value is escaped with quote()
+      push @$rows, sprintf('(%d, %d, %s)', $obj_id, $attrib_type_id, $self->dbc()->db_handle()->quote($attrib->value()));
+      # flush as soon as a full batch has accumulated
+      if (scalar(@$rows) >= $batch_size) {
+        $rows = $self->_store_batch_rows($type, $rows);
+      }
+    }
+  }
+  # store whatever is left over from the last, partial batch
+  $self->_store_batch_rows($type, $rows);
+  return;
+} ## end sub store_batch_on_
+
+# Insert the pre-formatted value tuples in $rows into the <type>_attrib
+# table with a single statement (nothing is executed for an empty list).
+# Always returns a fresh empty arrayref for the caller to refill.
+sub _store_batch_rows {
+  my ($self, $type, $rows) = @_;
+  my $table = lc($type);
+  if (scalar(@$rows) > 0) {
+    $self->dbc()->sql_helper()->execute_update(-SQL => 'INSERT INTO ' . $table . '_attrib() VALUES' . join(',', @$rows));
+  }
+  return [];
+}
+
 sub store_on_ {
   my $self = shift;
   my $type = shift;
diff --git a/modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm b/modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm
new file mode 100644
index 0000000000..bcbac672a7
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/Production/GeneGCBatch.pm
@@ -0,0 +1,72 @@
+package Bio::EnsEMBL::Pipeline::Production::GeneGCBatch;
+
+use strict;
+use warnings;
+
+use base qw/Bio::EnsEMBL::Pipeline::Production::GeneGC/;
+
+use Bio::EnsEMBL::Attribute;
+
+# modified version of GeneGC that uses the faster store_batch_on_Gene method from AttributeAdaptor
+#
+# Recomputes the 'GeneGC' attribute for every gene of the species named by
+# the 'species' parameter: old values are deleted, then new ones are stored
+# with a single store_batch_on_Gene call.
+sub run {
+  my ($self) = @_;
+  my $species = $self->param('species');
+  my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'core');
+
+  my $attrib_code = 'GeneGC';
+  $self->delete_old_attrib($dba, $attrib_code);
+
+  my $genes = Bio::EnsEMBL::Registry->get_adaptor($species, 'core', 'gene')->fetch_all();
+  my $aa = Bio::EnsEMBL::Registry->get_adaptor($self->param('species'), 'core', 'Attribute');
+
+  # fetch the attribute's name and description from the production database;
+  # the SELECT list must match the two variables unpacked from the row
+  my $prod_helper = $self->get_production_DBAdaptor()->dbc()->sql_helper();
+  my ($name, $description) = @{
+    $prod_helper->execute(
+      -SQL => q{
+        SELECT name, description
+        FROM attrib_type
+        WHERE code = ? },
+      -PARAMS => [$attrib_code])->[0]};
+
+  # gene dbID => array ref of Attribute objects
+  my $attributes = {};
+  while (my $gene = shift @$genes) {
+    my $count = $gene->feature_Slice()->get_base_count->{'%gc'};
+    # only genes with a positive %gc value get an attribute
+    if ($count > 0) {
+      push @{$attributes->{$gene->dbID()}},
+        Bio::EnsEMBL::Attribute->new(-NAME        => $name,
+                                     -CODE        => $attrib_code,
+                                     -VALUE       => $count,
+                                     -DESCRIPTION => $description);
+    }
+  }
+  $aa->store_batch_on_Gene($attributes);
+
+} ## end sub run
+
+# Delete every gene_attrib row carrying $attrib_code for genes on seq
+# regions whose coord system belongs to the species of $dba.
+sub delete_old_attrib {
+  my ($self, $dba, $attrib_code) = @_;
+  my $helper = $dba->dbc()->sql_helper();
+  my $sql = q{
+    DELETE ga
+    FROM gene_attrib ga, attrib_type at, gene g, seq_region s, coord_system cs
+    WHERE s.seq_region_id = g.seq_region_id
+    AND g.gene_id = ga.gene_id
+    AND cs.coord_system_id = s.coord_system_id
+    AND at.attrib_type_id = ga.attrib_type_id
+    AND cs.species_id = ?
+    AND at.code = ? };
+  $helper->execute_update(-SQL => $sql, -PARAMS => [$dba->species_id(), $attrib_code]);
+}
+
+1;
+
diff --git a/modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm b/modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm
new file mode 100644
index 0000000000..f8a3e9ec28
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/Production/PepStatsBatch.pm
@@ -0,0 +1,72 @@
+package Bio::EnsEMBL::Pipeline::Production::PepStatsBatch;
+
+use strict;
+use warnings;
+use Log::Log4perl qw(get_logger :levels);
+use base qw/Bio::EnsEMBL::Pipeline::Production::PepStats/;
+
+use Bio::EnsEMBL::Attribute;
+
+# Subclass of PepStats whose store_attribs uses the batch method
+# store_batch_on_Translation from AttributeAdaptor
+sub run {
+  my ($self) = @_;
+  my $species = $self->param('species');
+  my $dbtype  = $self->param('dbtype');
+  my $dba     = Bio::EnsEMBL::Registry->get_DBAdaptor($species, $dbtype);
+  # dbtypes matching 'vega' or 'otherf' take their DNA from the core database
+  if ($dbtype =~ /vega/ || $dbtype =~ /otherf/) {
+    my $core_dba = Bio::EnsEMBL::Registry->get_DBAdaptor($species, 'core');
+    $dba->dnadb($core_dba);
+  }
+  my $log = get_logger();
+
+  my @attrib_codes = $self->get_attrib_codes();
+  $log->info("Deleting old codes");
+  $self->delete_old_attrib($dba, @attrib_codes);
+
+  # the dump file is named after the current process ID ($$)
+  my $tmpfile = $self->param('tmpdir') . "/$$.pep";
+  $log->info("Dumping translations");
+  $self->dump_translation($dba, $tmpfile);
+  $log->info("Running pepstats");
+  my $results = $self->run_pepstats($tmpfile);
+
+  $log->info("Storing attribs");
+  $self->store_attribs($dba, $results);
+  $log->info("Completed");
+} ## end sub run
+
+# Convert the pepstats results into Attribute objects and store them in
+# batches of 1000 rows.
+#   $dba     - database adaptor supplying the AttributeAdaptor
+#   $results - hashref keyed by translation, then by attrib code
+sub store_attribs {
+
+  my ($self, $dba, $results) = @_;
+
+  my $attrib_types = $self->get_attrib_types();
+
+  # transform results into a hash
+  # $attributes is a hashref where the key is the object ID
+  # and the value is an array ref of Attribute objects
+  my $attributes = {};
+  foreach my $translation (keys %$results) {
+    foreach my $key (keys %{$results->{$translation}}) {
+
+      my $value       = $results->{$translation}{$key};
+      my $attrib_type = $attrib_types->{$key};
+
+      push @{$attributes->{$translation}},
+        Bio::EnsEMBL::Attribute->new(-NAME        => $attrib_type->{name},
+                                     -CODE        => $key,
+                                     -VALUE       => $value,
+                                     -DESCRIPTION => $attrib_type->{description});
+    }
+  }
+  my $aa = $dba->get_AttributeAdaptor();
+  $aa->store_batch_on_Translation($attributes, 1000);
+  return;
+} ## end sub store_attribs
+
+1;
diff --git a/modules/t/attributeAdaptor.t b/modules/t/attributeAdaptor.t
index 092a85430c..cadf6148bd 100644
--- a/modules/t/attributeAdaptor.t
+++ b/modules/t/attributeAdaptor.t
@@ -11,6 +11,7 @@ our $verbose = 1;
 our $clean = 0;
 
 my $multi = Bio::EnsEMBL::Test::MultiTestDB->new;
+
 # get a core DBAdaptor
 my $db = $multi->get_DBAdaptor("core");
 
@@ -327,8 +328,37 @@ ok($count == 0);
   $aa->store_on_Gene($gene, [Bio::EnsEMBL::Attribute->new(%args, -VALUE => 0)]);
   my $new_rows = count_rows($db, 'gene_attrib');
   cmp_ok($new_rows, '>', $current_rows, 'Asserting the storage of undefined attributes will always store them');
+  # now remove again
+  $aa->remove_from_Gene($gene);
+  $count = $db->dbc->db_handle->selectall_arrayref("SELECT count(*) FROM gene_attrib " . "WHERE gene_id = " . $gene->dbID())->[0]->[0];
+
+  ok($count == 0);
+
 }
 
+#
+# Test batch storage
+#
+
+my $gene2 = $ga->fetch_by_stable_id('ENSG00000131044');
+my %batch_args = (-NAME => 'test_name2', -CODE => 'test_code2', -DESCRIPTION => 'test_desc2');
+# two attributes for $gene and one for $gene2: three rows in total
+my $batch = {
+  $gene->dbID()  => [Bio::EnsEMBL::Attribute->new(%batch_args, -VALUE => 'val1'),
+                     Bio::EnsEMBL::Attribute->new(%batch_args, -VALUE => 'val2')],
+  $gene2->dbID() => [Bio::EnsEMBL::Attribute->new(%batch_args, -VALUE => 'val3')],
+};
+my $current_rows = count_rows($db, 'gene_attrib');
+$aa->store_batch_on_Gene($batch);
+my $new_rows = count_rows($db, 'gene_attrib');
+cmp_ok($new_rows, '==', $current_rows + 3, 'Asserting the storage of multiple attributes will always store them');
+
+@attribs = @{$aa->fetch_all_by_Gene($gene)};
+cmp_ok(scalar(@attribs), '==', 2, 'Asserting both batch attributes were stored on the first gene');
+
+@attribs = @{$aa->fetch_all_by_Gene($gene2)};
+cmp_ok(scalar(@attribs), '==', 1, 'Asserting the single batch attribute was stored on the second gene');
+
 $multi->restore('core', 'misc_attrib', 'seq_region_attrib', 'attrib_type');
 
 done_testing();
-- 
GitLab