From 1b43ab58f7e9b5a62fcf6e02b7f82b60e1c44de3 Mon Sep 17 00:00:00 2001 From: Dan Staines <dstaines@ebi.ac.uk> Date: Mon, 5 Nov 2012 10:01:40 +0000 Subject: [PATCH] additional count methods to support gene density calculations --- modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm | 109 +++++++++++++++++++---- modules/t/gene.t | 14 +++ 2 files changed, 108 insertions(+), 15 deletions(-) diff --git a/modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm index 0919439709..93f5551f98 100644 --- a/modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/GeneAdaptor.pm @@ -265,7 +265,26 @@ sub fetch_by_stable_id { sub fetch_all_by_biotype { my ($self, $biotype) = @_; + my @genes = @{$self->generic_fetch($self->biotype_constraint($biotype))}; + return \@genes; +} + +=head2 biotype_constraint + + Arg [1] : String $biotype + listref of $biotypes + The biotype of the gene to retrieve. You can have as an argument a reference + to a list of biotypes + Description: Used internally to generate a SQL constraint to restrict a gene query by biotype + Returntype : String + Exceptions : If biotype is not supplied + Caller : general + Status : Stable + +=cut +sub biotype_constraint { + my ($self, $biotype) = @_; if (!defined $biotype) { throw("Biotype or listref of biotypes expected"); } @@ -283,13 +302,31 @@ sub fetch_all_by_biotype { $constraint = "g.biotype = ? and g.is_current = 1"; $self->bind_param_generic_fetch($biotype, SQL_VARCHAR); } - my @genes = @{$self->generic_fetch($constraint)}; - return \@genes; + return $constraint; } -sub fetch_all { - my ($self) = @_; +=head2 count_all_by_biotype + Arg [1] : String $biotype + listref of $biotypes + The biotype of the gene to retrieve. You can have as an argument a reference + to a list of biotypes + Example : $cnt = $gene_adaptor->count_all_by_biotype('protein_coding'); + $cnt = $gene_adaptor->count_all_by_biotypes(['protein_coding', 'sRNA', 'miRNA']); + Description : Retrieves count of gene objects from the database via its biotype or biotypes. + Returntype : integer + Caller : general + Status : Stable + +=cut + +sub count_all_by_biotype { + my ($self, $biotype) = @_; + return $self->generic_count($self->biotype_constraint($biotype)); +} + +sub fetch_all { + my ($self) = @_; my $constraint = 'g.biotype != "LRG_gene" and g.is_current = 1'; my @genes = @{$self->generic_fetch($constraint)}; return \@genes; @@ -616,6 +653,36 @@ sub fetch_all_by_Slice { return $genes; } ## end sub fetch_all_by_Slice +=head2 count_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to count genes on. + Arg [2] : (optional) biotype(s) string or arrayref of strings + the biotype of the features to count. + Arg [1] : (optional) string $source + the source name of the features to count. + Example : $cnt = $gene_adaptor->count_all_by_Slice(); + Description: Method to count genes on a given slice, filtering by biotype and source + Returntype : integer + Exceptions : thrown if exon cannot be placed on transcript slice + Status : Stable + Caller : general +=cut + +sub count_all_by_Slice { + my ($self, $slice, $biotype, $source) = @_; + + my $constraint = 'g.is_current = 1'; + if (defined($source)) { + $constraint .= " and g.source = '$source'"; + } + if (defined($biotype)) { + $constraint .= " and " . $self->biotype_constraint($biotype); + } + + return $self->count_by_Slice_constraint($slice, $constraint); +} + =head2 fetch_by_transcript_id Arg [1] : Int $trans_id @@ -1562,7 +1629,9 @@ FEATURE: while ($sth->fetch()) { # if ($mapper) { - if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper')) { + if (defined $dest_slice + && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper')) + { ($seq_region_id, $seq_region_start, $seq_region_end, $seq_region_strand) = $mapper->map($sr_name, $seq_region_start, $seq_region_end, $seq_region_strand, $sr_cs, 1, $dest_slice); } else { @@ -1790,7 +1859,8 @@ sub fetch_all_by_exon_supporting_evidence { } my $anal_from = ", analysis a " if ($analysis); - my $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " if ($analysis); + my $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " + if ($analysis); my $sql = qq( SELECT DISTINCT(g.gene_id) @@ -1854,7 +1924,8 @@ sub fetch_all_by_transcript_supporting_evidence { } my $anal_from = ", analysis a " if ($analysis); - my $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " if ($analysis); + my $anal_where = "AND a.analysis_id = f.analysis_id AND a.analysis_id=? " + if ($analysis); my $sql = qq( SELECT DISTINCT(g.gene_id) @@ -1938,8 +2009,10 @@ sub fetch_nearest_Gene_by_Feature { # MAYBE set the result of prepare to be static in case lots of calls. # my $sql1_sth = $self->prepare($sql1) || die "Could not prepare $sql1"; - $sql1_sth->execute($start, $seq_region_id, $start) || die "Could not execute sql"; - $sql1_sth->bind_columns(\$gene_id, \$min_dist) || die "Could mot bin columns"; + $sql1_sth->execute($start, $seq_region_id, $start) + || die "Could not execute sql"; + $sql1_sth->bind_columns(\$gene_id, \$min_dist) + || die "Could mot bin columns"; my $last_dist = 99999999999999999; while ($sql1_sth->fetch()) { @@ -1959,8 +2032,10 @@ sub fetch_nearest_Gene_by_Feature { my $sql2_sth = $self->prepare($sql2) || die "could not prepare $sql2"; my ($tmp_min_dist, $tmp_gene_id); - $sql2_sth->execute($end, $seq_region_id, $end) || die "Could not execute sql"; - $sql2_sth->bind_columns(\$tmp_gene_id, \$tmp_min_dist) || die "Could mot bin columns"; + $sql2_sth->execute($end, $seq_region_id, $end) + || die "Could not execute sql"; + $sql2_sth->bind_columns(\$tmp_gene_id, \$tmp_min_dist) + || die "Could mot bin columns"; my $first = 1; while ($sql2_sth->fetch()) { if ($tmp_min_dist <= $last_dist) { @@ -1987,8 +2062,10 @@ sub fetch_nearest_Gene_by_Feature { # MAYBE set the result of prepare to be static in case lots of calls. # my $sql1_sth = $self->prepare($sql1) || die "Could not prepare $sql1"; - $sql1_sth->execute($start, $seq_region_id, $start) || die "Could not execute sql"; - $sql1_sth->bind_columns(\$gene_id, \$min_dist) || die "Could mot bin columns"; + $sql1_sth->execute($start, $seq_region_id, $start) + || die "Could not execute sql"; + $sql1_sth->bind_columns(\$gene_id, \$min_dist) + || die "Could mot bin columns"; my $last_dist; my $first = 1; @@ -2013,8 +2090,10 @@ sub fetch_nearest_Gene_by_Feature { my $sql2_sth = $self->prepare($sql2) || die "could not prepare $sql2"; my ($tmp_min_dist, $tmp_gene_id); - $sql2_sth->execute($end, $seq_region_id, $end) || die "Could not execute sql"; - $sql2_sth->bind_columns(\$tmp_gene_id, \$tmp_min_dist) || die "Could mot bin columns"; + $sql2_sth->execute($end, $seq_region_id, $end) + || die "Could not execute sql"; + $sql2_sth->bind_columns(\$tmp_gene_id, \$tmp_min_dist) + || die "Could mot bin columns"; my $first = 1; my $last_dist; while ($sql2_sth->fetch()) { diff --git a/modules/t/gene.t b/modules/t/gene.t index 37b4c7848b..cdff69f47e 100644 --- a/modules/t/gene.t +++ b/modules/t/gene.t @@ -369,6 +369,16 @@ for my $gene (@$genes) { } } +# try and count the genes on the slice +my $geneCount = $ga->count_all_by_Slice($slice); +ok(scalar(@$genes) == $geneCount); +$geneCount = $ga->count_all_by_Slice($slice, 'protein_coding'); +ok(scalar(@$genes) == $geneCount); +$geneCount = $ga->count_all_by_Slice($slice, 'banana'); +ok($geneCount == 0); +$geneCount = $ga->count_all_by_Slice($slice, ['banana', 'protein_coding']); +ok($geneCount == scalar(@$genes)); + debug("known: $known Unknown: $unknown\n"); ok($known == 17); @@ -478,6 +488,10 @@ debug("Test fetch_all_by_biotype"); ok(@genes == 20); @genes = @{$ga->fetch_all_by_biotype(['protein_coding', 'sRNA'])}; ok(@genes == 20); +$geneCount = $ga->count_all_by_biotype('protein_coding'); +ok($geneCount == 20); +$geneCount = $ga->count_all_by_biotype(['protein_coding', 'sRNA']); +ok($geneCount == 20); # # test Gene: get_all_alt_alleles -- GitLab