#!/usr/local/bin/perl =head1 NAME vega_percent_gc_calc.pl - calculate GC content =head1 SYNOPSIS vega_percent_gc_calc.pl [options] General options: --conffile, --conf=FILE read parameters from FILE (default: conf/Conversion.ini) --dbname, db_name=NAME use database NAME --host, --dbhost, --db_host=HOST use database host HOST --port, --dbport, --db_port=PORT use database port PORT --user, --dbuser, --db_user=USER use database username USER --pass, --dbpass, --db_pass=PASS use database passwort PASS --logfile, --log=FILE log to FILE (default: *STDOUT) --logpath=PATH write logfile to PATH (default: .) --logappend, --log_append append to logfile (default: truncate) -v, --verbose verbose logging (default: false) -i, --interactive=0|1 run script interactively (default: true) -n, --dry_run, --dry=0|1 don't write results to database -h, --help, -? print help (this message) --prune undo, i.e. delete from the database changes caused by running the script =head1 DESCRIPTION This script calculates GC content per chromosomes. The block size is determined so that you have 150 bins for the smallest chromosome over 5 Mb in length. For chromosomes smaller than 5 Mb, an additional smaller block size is used to yield 150 bins for the overall smallest chromosome. This will result in reasonable resolution for small chromosomes and high performance for big ones. =head1 LICENCE This code is distributed under an Apache style licence: Please see http://www.ensembl.org/code_licence.html for details =head1 AUTHOR Patrick Meidl =head1 CONTACT Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk =cut use strict; use warnings; no warnings 'uninitialized'; use FindBin qw($Bin); use vars qw($SERVERROOT); BEGIN { $SERVERROOT = "$Bin/../../.."; unshift(@INC, "$SERVERROOT/ensembl/modules"); unshift(@INC, "$SERVERROOT/bioperl-live"); } use Getopt::Long; use Pod::Usage; use Bio::EnsEMBL::Utils::ConversionSupport; use Bio::EnsEMBL::DensityType; use Bio::EnsEMBL::DensityFeature; use POSIX; $| = 1; my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); # parse options $support->parse_common_options(@_); $support->parse_extra_options('prune'); $support->allowed_params($support->get_common_params, 'prune'); if ($support->param('help') or $support->error) { warn $support->error if $support->error; pod2usage(1); } # ask user to confirm parameters to proceed $support->confirm_params; # get log filehandle and print heading and parameters to logfile $support->init_log; # connect to database and get adaptors my $dba = $support->get_database('ensembl'); my $dfa = $dba->get_DensityFeatureAdaptor; my $dta = $dba->get_DensityTypeAdaptor; my $aa = $dba->get_AnalysisAdaptor; my $dbh= $dba->dbc->db_handle; # check for prune option (undo) if($support->param('prune')){ exit unless ($support->user_proceed('About to delete all data from previous runs of this script. Proceed ?')); my $query= "delete analysis, density_type, density_feature from analysis left join density_type on analysis.analysis_id= density_type.analysis_id left join density_feature on density_type.density_type_id= density_feature.density_type_id where analysis.program= 'percent_gc_calc.pl'"; if(my $c = $dbh->do($query)){ $support->log("prune was successfull: $c previous entries in the database generated by this script have been deleted\n"); } else{ $support->log_error("prune failed: any previous entries in the database generated by this script have NOT been deleted\n"); exit; } # Create new Analysis object my $analysis = Bio::EnsEMBL::Analysis->new( -program => "percent_gc_calc.pl", -database => "ensembl", -gff_source => "percent_gc_calc.pl", -gff_feature => "density", -logic_name => "PercentGC", ); $aa->store($analysis) unless ($support->param('dry_run')); # split chromosomes by size and determine block size my $chr_slices = $support->split_chromosomes_by_size(5000000); # loop over block sizes foreach my $block_size (keys %{ $chr_slices }) { $support->log("Available chromosomes using block size of $block_size:\n "); $support->log(join("\n ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n"); # create DensityType objects my $density_type = Bio::EnsEMBL::DensityType->new( -analysis => $analysis, -block_size => $block_size, -value_type => 'ratio', ); $dta->store($density_type) unless ($support->param('dry_run')); # loop over chromosomes $support->log_stamped("Looping over chromosomes...\n"); my ($current_start, $current_end); foreach my $slice (@{ $chr_slices->{$block_size} }) { $current_start = 1; my $chr = $slice->seq_region_name; my $i; my $bins = POSIX::ceil($slice->end/$block_size); $support->log_stamped("Chromosome $chr with block size $block_size...\n", 1); # loop over blocks while($current_start <= $slice->end) { $i++; $current_end = $current_start + $block_size - 1; if ($current_end > $slice->end) { $current_end = $slice->end; } my $sub_slice = $slice->sub_Slice( $current_start, $current_end ); my $gc = $sub_slice->get_base_count->{'%gc'}; $support->log_verbose("Chr: $chr | Bin: $i/$bins | \%GC: $gc\n", 2); my $df = Bio::EnsEMBL::DensityFeature->new( -seq_region => $slice, -start => $current_start, -end => $current_end, -density_type => $density_type, -density_value => $gc, ); $dfa->store($df) unless ($support->param('dry_run')); $current_start = $current_end + 1; } $support->log_stamped("Done.\n", 1); } $support->log_stamped("Done.\n"); } } # finish logfile $support->finish_log;