From de4616b46accbfa6ea6c5135b4c2f1ff65183db9 Mon Sep 17 00:00:00 2001 From: Patrick Meidl <pm2@sanger.ac.uk> Date: Thu, 3 Nov 2005 14:08:35 +0000 Subject: [PATCH] fixed to work with biotype/status; added check for new biotype/status pairs in database --- .../density_feature/vega_gene_density.pl | 77 +++++++++++++------ 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/misc-scripts/density_feature/vega_gene_density.pl b/misc-scripts/density_feature/vega_gene_density.pl index a07ede213d..af030247a1 100755 --- a/misc-scripts/density_feature/vega_gene_density.pl +++ b/misc-scripts/density_feature/vega_gene_density.pl @@ -32,7 +32,8 @@ Specific options: =head1 DESCRIPTION This script calculates Vega gene densities and total numbers per chromosome -for use in mapview. +for use in mapview. It also checks for new biotype/status pairs and warns to +adapt the appropriate modules to deal with them. The block size is determined so that you have 150 bins for the smallest chromosome over 5 Mb in length. For chromosomes smaller than 5 Mb, an @@ -109,28 +110,57 @@ my $dfa = $dba->get_DensityFeatureAdaptor; my $dta = $dba->get_DensityTypeAdaptor; my $aa = $dba->get_AnalysisAdaptor; my $attrib_adaptor = $dba->get_AttributeAdaptor; +my $dbh = $dba->dbc->db_handle; # split chromosomes by size and determine block size my $chr_slices = $support->split_chromosomes_by_size(5000000); -# create Analysis and DensityType objects +# known biotype/status pairs and associated density type my %gene_types = ( - "Known" => "knownGeneDensity", - "Novel_CDS" => "novelCDSDensity", - "Novel_Transcript" => "novelTransDensity", - "Putative" => "putativeTransDensity", - "Predicted_Gene" => "predictedTransDensity", - "Ig_Pseudogene_Segment" => "IgPseudoSegDensity", - "Ig_Segment" => "IgSegDensity", - "Processed_pseudogene" => "pseudoGeneDensity", - "Pseudogene" => "pseudoGeneDensity", - "Unprocessed_pseudogene" => "pseudoGeneDensity", - "Known_in_progress" => "knownGeneDensity", - "Novel_CDS_in_progress" => "novelCDSDensity", + "protein_coding_KNOWN" => "knownGeneDensity", + "protein_coding_NOVEL" => "novelCDSDensity", + "processed_transcript_NOVEL" => "novelTransDensity", + "processed_transcript_PUTATIVE" => "putativeTransDensity", + "protein_coding_PREDICTED" => "predictedTransDensity", + "Ig_pseudogene_segment_NOVEL" => "IgPseudoSegDensity", + "Ig_segment_NOVEL" => "IgSegDensity", + "pseudogene_NOVEL" => "pseudoGeneDensity", + "processed_pseudogene_NOVEL" => "pseudoGeneDensity", + "unprocessed_pseudogene_NOVEL" => "pseudoGeneDensity", + "protein_coding_in_progress_KNOWN" => "knownGeneDensity", + "protein_coding_in_progress_NOVEL" => "novelCDSDensity", ); -$support->log("Available gene types:\n "); -$support->log(join("\n ", sort keys %gene_types)."\n"); +# check for new biotype/status pairs +my $sql = qq( + SELECT concat(biotype, '_', status) as c + FROM transcript + GROUP by biotype, status +); +my $sth = $dbh->prepare($sql); +$sth->execute; +my (%type_status, $new); +while (my ($type) = $sth->fetchrow_array) { + if ($gene_types{$type}) { + $type_status{$type} = 'no'; + } else { + $type_status{$type} = 'YES'; + $new = 1; + } +} +my $FMT = "%-50s%-20s\n"; +$support->log("Checking for new biotype/status pairs...\n\n"); +$support->log(sprintf($FMT, qw(BIOTYPE/STATUS NEW)), 1); +$support->log(('-'x70)."\n", 1); +map { $support->log(sprintf($FMT, $_, $type_status{$_}), 1) } + sort keys %type_status; +$support->log("\n"); +if ($new) { + $support->log_warning("There are new biotype/status pairs! You might need to adapt Bio::EnsEMBL::ColourMap, EnsEMBL::Sanger_vega::Component::Chromosome and configure mapview to show them.\n\n"); +} + + +# create Analysis and DensityType objects my (%density_types, $dtcache); foreach my $type (keys %gene_types) { $density_types{$gene_types{$type}} = 1; @@ -159,7 +189,7 @@ foreach my $block_size (keys %{ $chr_slices }) { $support->log(join("\n ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n"); # looping over chromosomes - $support->log_stamped("Looping over chromosomes...\n"); + $support->log_stamped("\nLooping over chromosomes...\n"); my ($current_start, $current_end); foreach my $slice (@{ $chr_slices->{$block_size} }) { $current_start = 1; @@ -191,10 +221,11 @@ foreach my $block_size (keys %{ $chr_slices }) { foreach my $gene (@{$genes}) { # only count genes that don't overlap the subslice start # (since these were already counted in the last bin) + my $gene_type = $gene->biotype . '_' . $gene->status; if ($gene->start >= 1) { - $total{$gene->type}++; + $total{$gene_type}++; } - $num{$gene_types{$gene->type}}++; + $num{$gene_types{$gene_type}}++; } # create DensityFeature objects for each type @@ -321,12 +352,10 @@ foreach my $block_size (keys %{ $chr_slices }) { # log stats $support->log("\n"); $support->log("Totals for chr $chr:\n", 1); - my $fmt = "%-30s%-40s\n"; - $support->log(sprintf($fmt, qw(TYPE COUNT)), 2); + $support->log(sprintf($FMT, qw(TYPE COUNT)), 2); $support->log(('-'x70)."\n", 2); - map { $support->log(sprintf($fmt, $_, $total{$_}), 2) } sort keys %total; - $support->log_stamped("Done.\n", 1); - $support->log("\n"); + map { $support->log(sprintf($FMT, $_, $total{$_}), 2) } sort keys %total; + $support->log_stamped("\nDone.\n\n", 1); } $support->log_stamped("Done.\n"); } -- GitLab