From d5acf64433d356382d99212b1067f0b3ad1728fd Mon Sep 17 00:00:00 2001 From: Steve Trevanion <st3@sanger.ac.uk> Date: Mon, 12 Mar 2007 10:46:05 +0000 Subject: [PATCH] merge from vega-41-dev --- .../density_feature/vega_gene_density.pl | 269 ++++++++---------- .../density_feature/vega_percent_gc_calc.pl | 35 +-- .../vega_repeat_coverage_calc.pl | 5 +- 3 files changed, 135 insertions(+), 174 deletions(-) diff --git a/misc-scripts/density_feature/vega_gene_density.pl b/misc-scripts/density_feature/vega_gene_density.pl index 7a227ee392..cb864d941f 100755 --- a/misc-scripts/density_feature/vega_gene_density.pl +++ b/misc-scripts/density_feature/vega_gene_density.pl @@ -28,6 +28,7 @@ General options: Specific options: --chromosomes, --chr=LIST only process LIST chromosomes + --prune delete data in the database from a previous run --limit_file only process gene_ids in this file =head1 DESCRIPTION @@ -80,7 +81,6 @@ use Bio::EnsEMBL::Utils::ConversionSupport; use Bio::EnsEMBL::DensityType; use Bio::EnsEMBL::DensityFeature; use POSIX; -use Data::Dumper; $| = 1; @@ -88,12 +88,8 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); # parse options $support->parse_common_options(@_); -$support->parse_extra_options( - 'chromosomes|chr=s@', - 'limit_file=s'); -$support->allowed_params($support->get_common_params, - 'chromosomes', - 'limit_file'); +$support->parse_extra_options('chromosomes|chr=s@', 'prune', 'limit_file=s'); +$support->allowed_params($support->get_common_params, 'chromosomes', 'prune', 'limit_file'); if ($support->param('help') or $support->error) { warn $support->error if $support->error; @@ -120,6 +116,73 @@ my $dbh = $dba->dbc->db_handle; # split chromosomes by size and determine block size my $chr_slices = $support->split_chromosomes_by_size(5000000); +# known biotype/status pairs and associated density type + +# Note: the types starting with 'total_...' should only be used for the stats stuff and are composed of other types, +# which are in the array pointed to by the 'parts' key + + +my %gene_types= ( + "protein_coding_KNOWN" => {logic_name => "knownPCodDensity", code => "KnownPCCount", description => "Number of Known Protein Coding"}, + "protein_coding_in_progress_KNOWN" => {logic_name => "knownPCodDensity", code => "KnwnPCProgCount", description => "Number of Known Protein coding genes in progress" }, + "processed_transcript_KNOWN" => {logic_name => "knownPTransDensity", code => "KnownPTCount", description => "Number of Known Processed Transcripts" }, + "protein_coding_NOVEL" => {logic_name => "novelPCodDensity", code => "NovelPCCount", description => "Number of Novel Protein Coding"}, + "protein_coding_in_progress_NOVEL" => {logic_name => "novelPCodDensity", code => "NovPCProgCount", description => "Number of Novel Protein coding genes in progress"}, + "processed_transcript_NOVEL" => {logic_name => "novelPTransDensity", code => "NovelPTCount", description => "Number of Novel transcripts"}, + "processed_transcript_PUTATIVE" => {logic_name => "putativePTransDensity", code => "PutPTCount", description => "Number of Putative processed transcripts"}, + "protein_coding_PREDICTED" => {logic_name => "predictedPCodDensity", code => "PredPCCount", description => "Number of Predicted transcripts"}, + "Ig_pseudogene_segment_UNKNOWN" => {logic_name => "IgPseudoSegDensity", code => "IgPsSegCount", description => "Number of Ig Pseudogene Segments"}, + "Ig_segment_NOVEL" => {logic_name => "IgSegDensity", code => "", description => ""}, + "Ig_segment_KNOWN" => {logic_name => "IgSegDensity", code => "", description => ""}, + "pseudogene_UNKNOWN" => {logic_name => "pseudoGeneDensity", code => "", description => ""}, + "processed_pseudogene_UNKNOWN" => {logic_name => "pseudoGeneDensity", code => "ProcPsCount", description => "Number of Processed pseudogenes"}, + "unprocessed_pseudogene_UNKNOWN" => {logic_name => "pseudoGeneDensity", code => "UnprocPsCount", description => "Number of Unprocessed pseudogenes"}, + "total_pseudogene_UNKNOWN" => {logic_name => "", code => "TotPsCount", description => "Total Number of Pseudogenes", parts => ['pseudogene_UNKNOWN', 'processed_pseudogene_UNKNOWN', 'unprocessed_pseudogene_UNKNOWN']}, + "total_Ig_segment_UNKNOWN" => {logic_name => "", code => "IgSegCount", description => "Number of Ig Segments", parts => ['Ig_segment_NOVEL','Ig_segment_KNOWN']}, + "processed_transcript_UNKNOWN" => {logic_name => "PTransDensity", code => "PTCount", description => "Number of Processed Transcripts"}, + "processed_transcript_PREDICTED" => {logic_name => "PredPTransDensity", code => "PredPTCount", description => "Number of Predicted Processed Transcripts"} +); + +if($support->param('prune')){ + + exit unless ($support->user_proceed('About to delete all data from previous runs of this script. Proceed ?')); + + # delete the results in the database from a previous run + + $support->log("running in prune mode\n"); + $support->log("pruning 1 (tables analysis, density_type, density_feature)\n"); + + # Note: Need to use a left-join in the multitable delete otherwise rows in e.g. analysis which have program= vega_gene_density.pl, but no corresponding rows in the density_type or density_feature table will NOT be deleted. + + my $query= "delete analysis, density_type, density_feature + from analysis + left join density_type on analysis.analysis_id= density_type.analysis_id + left join density_feature on density_type.density_type_id= density_feature.density_type_id + where analysis.program= 'vega_gene_density.pl'"; + + if(my $c = $dbh->do($query)){ + $support->log("prune 1 was successfull: $c previous entries in the database generated by this script have been deleted\n"); + } + else { + $support->log_error("prune 1 failed: any previous entries in the database generated by this script have NOT been deleted\n"); + exit; + } + + foreach my $name(keys %gene_types){ + my $href= $gene_types{$name}; + my $code= $href->{'code'}; + if($code){ + $query= "delete attrib_type, seq_region_attrib from attrib_type left join seq_region_attrib on attrib_type.attrib_type_id= seq_region_attrib.attrib_type_id where attrib_type.code= '$code'"; + if(my $c = $dbh->do($query)){ + $support->log("$c entries removed successfull for code $code\n"); + } + else{ + $support->log_error("prune 2 failed for code $code\n"); + } + } + } +} + #limit to gene_stable_ids ? my (%gene_ids, %found_genes); if ($support->param('limit_file')) { @@ -130,26 +193,6 @@ if ($support->param('limit_file')) { } } -# biotype/status pairs and associated density type -my %gene_types = ( - "protein_coding_KNOWN" => "knownPCodDensity", - "protein_coding_in_progress_KNOWN" => "knownPCodDensity", - "processed_transcript_KNOWN" => "knownPTransDensity", - "protein_coding_NOVEL" => "novelPCodDensity", - "protein_coding_in_progress_NOVEL" => "novelPCodDensity", - "processed_transcript_NOVEL" => "novelPTransDensity", - "processed_transcript_PUTATIVE" => "putativePTransDensity", - "protein_coding_PREDICTED" => "predictedPCodDensity", - "Ig_pseudogene_segment_UNKNOWN" => "IgPseudoSegDensity", - "Ig_segment_NOVEL" => "IgSegDensity", - "Ig_segment_KNOWN" => "IgSegDensity", - "pseudogene_UNKNOWN" => "pseudoGeneDensity", - "processed_pseudogene_UNKNOWN" => "pseudoGeneDensity", - "unprocessed_pseudogene_UNKNOWN" => "pseudoGeneDensity", -# "processed_transcript_UNKNOWN" => "PTransDensity", - "processed_transcript_PREDICTED" => "PredPTransDensity", -); - # check for new biotype/status pairs my $sql = qq( SELECT biotype, status @@ -177,20 +220,24 @@ map { $support->log(sprintf($FMT, $_, $type_status{$_}), 1) } $support->log("\n"); if ($new) { $support->log_warning("There are new biotype/status pairs! You might need to adapt Bio::EnsEMBL::ColourMap, EnsEMBL::Sanger_vega::Component::Chromosome and configure mapview to show them.\n\n"); - exit; } # create Analysis and DensityType objects my (%density_types, $dtcache); foreach my $type (keys %gene_types) { - $density_types{$gene_types{$type}} = 1; + # ignore any type starting with 'total_' + if($type =~/^total_/){ + next; + } + + $density_types{$gene_types{$type}{'logic_name'}} = 1; my $analysis = new Bio::EnsEMBL::Analysis ( -program => "vega_gene_density.pl", -database => "ensembl", -gff_source => "vega_gene_density.pl", -gff_feature => "density", - -logic_name => $gene_types{$type} + -logic_name => $gene_types{$type}{'logic_name'} ); $aa->store($analysis) unless ($support->param('dry_run')); foreach my $block_size (keys %{ $chr_slices }) { @@ -200,10 +247,13 @@ foreach my $type (keys %gene_types) { -value_type => 'sum' ); $dta->store($dt) unless ($support->param('dry_run')); - $dtcache->{$block_size}->{$gene_types{$type}} = $dt; + $dtcache->{$block_size}->{$gene_types{$type}{'logic_name'}} = $dt; } } +# need a hash to stop duplcations at line +my %chromosome_done=(); + # loop over block sizes foreach my $block_size (keys %{ $chr_slices }) { @@ -231,7 +281,6 @@ foreach my $block_size (keys %{ $chr_slices }) { } my $sub_slice = $slice->sub_Slice($current_start, $current_end); my %num = (); - # count genes by type my $genes; eval { $genes = $sub_slice->get_all_Genes; }; @@ -240,22 +289,25 @@ foreach my $block_size (keys %{ $chr_slices }) { $current_start = $current_end + 1; next; } - GENE: foreach my $gene (@{$genes}) { - if ($support->param('limit_file')) { + GENE: foreach my $gene (@{$genes}) { + if ($support->param('limit_file')) { my $gsi = $gene->stable_id; next GENE unless ($gene_ids{$gsi}); $gene_ids{$gsi} |= 2; } + #ignore otter_eucomm genes (mouse) + next GENE if ($gene->analysis->logic_name eq 'otter_eucomm'); + # only count genes that don't overlap the subslice start # (since these were already counted in the last bin) my $gene_type = $gene->biotype . '_' . $gene->status; if ($gene->start >= 1) { $total{$gene_type}++; } - $num{$gene_types{$gene_type}}++; + $num{$gene_types{$gene_type}{'logic_name'}}++; } - + # create DensityFeature objects for each type foreach my $type (keys %density_types) { push @density_features, Bio::EnsEMBL::DensityFeature->new( @@ -267,127 +319,43 @@ foreach my $block_size (keys %{ $chr_slices }) { ); } $current_start = $current_end + 1; - + # logging $support->log_verbose("Chr: $chr | Bin: $i/$bins | Counts: ". - join(",", map { $num{$gene_types{$_}} || 0 } + join(",", map { $num{$gene_types{$_}{'logic_name'}} || 0 } sort keys %gene_types)."\n", 2); } - + # store DensityFeatures for the chromosome $dfa->store(@density_features) unless ($support->param('dry_run')); - # stats my @attribs; - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'protein_coding_KNOWN', - -CODE => 'KnownPCCount', - -VALUE => $total{'protein_coding_KNOWN'} || 0, - -DESCRIPTION => 'Number of Known Protein Coding', - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'processed_transcript_KNOWN', - -CODE => 'KnownPTCount', - -VALUE => $total{'processed_transcript_KNOWN'} || 0, - -DESCRIPTION => 'Number of Known Processed Transcripts', - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'processed_transcript_PREDICTED', - -CODE => 'PredPTCount', - -VALUE => $total{'processed_transcript_PREDICTED'} || 0, - -DESCRIPTION => 'Number of Predicted Processed Transcripts', - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'processed_transcript_UNKNOWN', - -CODE => 'PTCount', - -VALUE => $total{'processed_transcript_UNKNOWN'} || 0, - -DESCRIPTION => 'Number of Processed Transcripts', - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'protein_coding_NOVEL', - -CODE => 'NovelPCCount', - -VALUE => $total{'protein_coding_NOVEL'} || 0, - -DESCRIPTION => 'Number of Novel Protein Coding' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'processed_transcript_NOVEL', - -CODE => 'NovelPTCount', - -VALUE => $total{'processed_transcript_NOVEL'} || 0, - -DESCRIPTION => 'Number of Novel transcripts' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'processed_transcript_PUTATIVE', - -CODE => 'PutPTCount', - -VALUE => $total{'processed_transcript_PUTATIVE'} || 0, - -DESCRIPTION => 'Number of Putative processed transcripts' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'protein_coding_PREDICTED', - -CODE => 'PredPCCount', - -VALUE => $total{'protein_coding_PREDICTED'} || 0, - -DESCRIPTION => 'Number of Predicted transcripts' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'total_Ig_segment_UNKNOWN', - -CODE => 'IgSegCount', - -VALUE => $total{'Ig_segment_NOVEL'} - + $total{'Ig_segment_KNOWN'} || 0, - -DESCRIPTION => 'Number of Ig Segments' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'Ig_pseudogene_segment_UNKNOWN', - -CODE => 'IgPsSegCount', - -VALUE => $total{'Ig_pseudogene_segment_UNKNOWN'} || 0, - -DESCRIPTION => 'Number of Ig Pseudogene Segments' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'total_pseudogene_UNKNOWN', - -CODE => 'TotPsCount', - -VALUE => ($total{'pseudogene_UNKNOWN'} - + $total{'processed_pseudogene_UNKNOWN'} - + $total{'unprocessed_pseudogene_UNKNOWN'}) || 0, - -DESCRIPTION => 'Total Number of Pseudogenes' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'processed_pseudogene_UNKNOWN', - -CODE => 'ProcPsCount', - -VALUE => $total{'processed_pseudogene_UNKNOWN'} || 0, - -DESCRIPTION => 'Number of Processed pseudogenes' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'unprocessed_pseudogene_UNKNOWN', - -CODE => 'UnprocPsCount', - -VALUE => $total{'unprocessed_pseudogene_UNKNOWN'} || 0, - -DESCRIPTION => 'Number of Unprocessed pseudogenes' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'protein_coding_in_progress_KNOWN', - -CODE => 'KnwnPCProgCount', - -VALUE => $total{'protein_coding_in_progress_KNOWN'} || 0, - -DESCRIPTION => 'Number of Known Protein coding genes in progress' - ); - - push @attribs, Bio::EnsEMBL::Attribute->new( - -NAME => 'protein_coding_in_progress_NOVEL', - -CODE => 'NovPCProgCount', - -VALUE => $total{'protein_coding_in_progress_NOVEL'} || 0, - -DESCRIPTION => 'Number of Novel Protein coding genes in progress' - ); - - $attrib_adaptor->store_on_Slice($slice, \@attribs) unless ($support->param('dry_run')); + foreach my $name(keys %gene_types){ + my $href= $gene_types{$name}; + my $code= $href->{'code'}; + next unless $code; + my $description= $href->{'description'}; + my $value=0; + if(exists $href->{'parts'}){ + #contains multiple elements + foreach my $part(@{$href->{'parts'}}){ + $value += ($total{$part} || 0); + } + }else{ + #contains a single element + $value= $total{$name} || 0; + } + push @attribs, Bio::EnsEMBL::Attribute->new( + -NAME => $name, + -CODE => $code, + -VALUE => $value, + -DESCRIPTION => $description + ); + } + if(! $chromosome_done{$chr}){ + $attrib_adaptor->store_on_Slice($slice, \@attribs) unless ($support->param('dry_run')); + $chromosome_done{$chr}=1; + } # log stats $support->log("\n"); @@ -397,12 +365,15 @@ foreach my $block_size (keys %{ $chr_slices }) { map { $support->log(sprintf($FMT, $_, $total{$_}), 2) } sort keys %total; $support->log_stamped("\nDone.\n\n", 1); } + $support->log_stamped("Done.\n"); } + if (%gene_ids) { my $no_genes_to_look_for = keys %gene_ids; my (@genes_not_found) = grep {$gene_ids{$_} == 1} keys %gene_ids; $support->log("Of the $no_genes_to_look_for genes in ".$support->param('limit_file')." , the following were missing: @genes_not_found\n"); } + $support->log_stamped("Done.\n"); # finish logfile diff --git a/misc-scripts/density_feature/vega_percent_gc_calc.pl b/misc-scripts/density_feature/vega_percent_gc_calc.pl index fd517fc2d8..8e49821a01 100644 --- a/misc-scripts/density_feature/vega_percent_gc_calc.pl +++ b/misc-scripts/density_feature/vega_percent_gc_calc.pl @@ -104,24 +104,17 @@ my $dbh= $dba->dbc->db_handle; # check for prune option (undo) if($support->param('prune')){ + exit unless ($support->user_proceed('About to delete all data from previous runs of this script. Proceed ?')); - my $query= "delete analysis, density_type, density_feature from analysis, density_type, density_feature where (analysis.program= 'percent_gc_calc.pl') and (analysis.analysis_id= density_type.analysis_id) and (density_type.density_type_id= density_feature.density_type_id)"; - - if($dbh->do($query)){ - - $support->log("prune was successfull: any previous entries in the database generated by this script have been deleted\n"); - + my $query= "delete analysis, density_type, density_feature from analysis left join density_type on analysis.analysis_id= density_type.analysis_id left join density_feature on density_type.density_type_id= density_feature.density_type_id where analysis.program= 'percent_gc_calc.pl'"; + if(my $c = $dbh->do($query)){ + $support->log("prune was successfull: $c previous entries in the database generated by this script have been deleted\n"); } else{ - $support->log_error("prune failed: any previous entries in the database generated by this script have NOT been deleted\n"); + exit; } - -} - -else{ - # Create new Analysis object my $analysis = Bio::EnsEMBL::Analysis->new( -program => "percent_gc_calc.pl", @@ -131,15 +124,15 @@ else{ -logic_name => "PercentGC", ); $aa->store($analysis) unless ($support->param('dry_run')); - + # split chromosomes by size and determine block size my $chr_slices = $support->split_chromosomes_by_size(5000000); - + # loop over block sizes foreach my $block_size (keys %{ $chr_slices }) { $support->log("Available chromosomes using block size of $block_size:\n "); $support->log(join("\n ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n"); - + # create DensityType objects my $density_type = Bio::EnsEMBL::DensityType->new( -analysis => $analysis, @@ -147,7 +140,7 @@ else{ -value_type => 'ratio', ); $dta->store($density_type) unless ($support->param('dry_run')); - + # loop over chromosomes $support->log_stamped("Looping over chromosomes...\n"); my ($current_start, $current_end); @@ -156,9 +149,9 @@ else{ my $chr = $slice->seq_region_name; my $i; my $bins = POSIX::ceil($slice->end/$block_size); - + $support->log_stamped("Chromosome $chr with block size $block_size...\n", 1); - + # loop over blocks while($current_start <= $slice->end) { $i++; @@ -183,14 +176,8 @@ else{ } $support->log_stamped("Done.\n"); } - - } - - - - # finish logfile $support->finish_log; diff --git a/misc-scripts/density_feature/vega_repeat_coverage_calc.pl b/misc-scripts/density_feature/vega_repeat_coverage_calc.pl index 9cd0626dba..09e6e50efe 100644 --- a/misc-scripts/density_feature/vega_repeat_coverage_calc.pl +++ b/misc-scripts/density_feature/vega_repeat_coverage_calc.pl @@ -107,7 +107,10 @@ my $dbh= $dba->dbc->db_handle; # check for prune option (undo) if($support->param('prune')){ - my $query= "delete analysis, density_type, density_feature from analysis, density_type, density_feature where (analysis.program= 'repeat_coverage_calc.pl') and (analysis.analysis_id= density_type.analysis_id) and (density_type.density_type_id= density_feature.density_type_id)"; + #my $query= "delete analysis, density_type, density_feature from analysis, density_type, density_feature where (analysis.program= 'repeat_coverage_calc.pl') and (analysis.analysis_id= density_type.analysis_id) and (density_type.density_type_id= density_feature.density_type_id)"; + + my $query= "delete analysis, density_type, density_feature from analysis left join density_type on analysis.analysis_id= density_type.analysis_id left join density_feature on density_type.density_type_id= density_feature.density_type_id where analysis.program= 'repeat_coverage_calc.pl'"; + if($dbh->do($query)){ -- GitLab