vega_repeat_coverage_calc.pl 7.08 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
#!/usr/local/bin/perl

=head1 NAME

vega_repeat_coverage_calc.pl - calculate the repeat coverage

=head1 SYNOPSIS

vega_repeat_coverage_calc.pl [options]

General options:
    --conffile, --conf=FILE             read parameters from FILE
                                        (default: conf/Conversion.ini)

    --dbname, db_name=NAME              use database NAME
    --host, --dbhost, --db_host=HOST    use database host HOST
    --port, --dbport, --db_port=PORT    use database port PORT
    --user, --dbuser, --db_user=USER    use database username USER
    --pass, --dbpass, --db_pass=PASS    use database passwort PASS
    --logfile, --log=FILE               log to FILE (default: *STDOUT)
    --logpath=PATH                      write logfile to PATH (default: .)
    --logappend, --log_append           append to logfile (default: truncate)
    -v, --verbose                       verbose logging (default: false)
    -i, --interactive=0|1               run script interactively (default: true)
    -n, --dry_run, --dry=0|1            don't write results to database
    -h, --help, -?                      print help (this message)

Steve Trevanion's avatar
Steve Trevanion committed
28 29 30
    --prune				undo, i.e. delete from the database changes caused by running the script			


31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
=head1 DESCRIPTION

This script calculates the repeat coverage for given database.

The block size is determined so that you have 150 bins for the smallest
chromosome over 5 Mb in length. For chromosomes smaller than 5 Mb, an
additional smaller block size is used to yield 150 bins for the overall
smallest chromosome. This will result in reasonable resolution for small
chromosomes and high performance for big ones.

=head1 LICENCE

This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details

=head1 AUTHOR

Patrick Meidl <pm2@sanger.ac.uk>

=head1 CONTACT

Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk

=cut

56
use strict;
57 58 59 60 61 62
use warnings;
no warnings 'uninitialized';

use FindBin qw($Bin);
use vars qw($SERVERROOT);

63
BEGIN {
64 65 66
    $SERVERROOT = "$Bin/../../..";
    unshift(@INC, "$SERVERROOT/ensembl/modules");
    unshift(@INC, "$SERVERROOT/bioperl-live");
67 68 69
}

use Getopt::Long;
70 71
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
72 73 74 75 76
use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature;
use Bio::EnsEMBL::Mapper::RangeRegistry;
use POSIX;

77
$| = 1;
78

79
my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
80

81 82
# parse options
$support->parse_common_options(@_);
Steve Trevanion's avatar
Steve Trevanion committed
83 84 85 86
$support->parse_extra_options('prune');

$support->allowed_params($support->get_common_params, 'prune');

87

88 89 90
if ($support->param('help') or $support->error) {
    warn $support->error if $support->error;
    pod2usage(1);
91 92
}

93 94 95 96 97 98 99 100 101 102 103 104
# ask user to confirm parameters to proceed
$support->confirm_params;

# get log filehandle and print heading and parameters to logfile
$support->init_log;

# connect to database and get adaptors
my $dba = $support->get_database('ensembl');
my $dfa = $dba->get_DensityFeatureAdaptor;
my $dta = $dba->get_DensityTypeAdaptor;
my $aa = $dba->get_AnalysisAdaptor;

Steve Trevanion's avatar
Steve Trevanion committed
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
my $dbh= $dba->dbc->db_handle;

# check for prune option (undo)
if($support->param('prune')){

	my $query= "delete analysis, density_type, density_feature from analysis, density_type, density_feature where (analysis.program= 'repeat_coverage_calc.pl') and (analysis.analysis_id= density_type.analysis_id) and (density_type.density_type_id= density_feature.density_type_id)";

	if($dbh->do($query)){

		$support->log("prune was successfull: any previous entries in the database generated by this script have been deleted\n");

	}
	else{

		$support->log_error("prune failed: any previous entries in the database generated by this script have NOT been deleted\n");
	}


}

else{

    # Create Analysis object
    my $analysis = new Bio::EnsEMBL::Analysis (
            -program     => "repeat_coverage_calc.pl",
            -database    => "ensembl",
            -gff_source  => "repeat_coverage_calc.pl",
            -gff_feature => "density",
            -logic_name  => "PercentageRepeat",
134
    );
Steve Trevanion's avatar
Steve Trevanion committed
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
    $aa->store($analysis) unless ($support->param('dry_run'));
    
    # split chromosomes by size and determine block size
    my $chr_slices = $support->split_chromosomes_by_size(5000000);
    
    # loop over block sizes
    foreach my $block_size (keys %{ $chr_slices }) {
        $support->log("Available chromosomes using block size of $block_size:\n    ");
        $support->log(join("\n    ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n");
    
        # create DensityType objects
        my $density_type = Bio::EnsEMBL::DensityType->new(
             -analysis   => $analysis,
             -block_size => $block_size,
             -value_type => 'ratio',
        );
        $dta->store($density_type) unless ($support->param('dry_run'));
    
        # loop over chromosomes
        $support->log_stamped("Looping over chromosomes...\n");
        my ($current_start, $current_end);
        foreach my $slice (@{ $chr_slices->{$block_size} }) {
            $current_start = 1;
            my $chr = $slice->seq_region_name;
            my $i;
            my $bins = POSIX::ceil($slice->end/$block_size);
    
            $support->log_stamped("Chromosome $chr with block size $block_size...\n", 1);
    
            # loop over blocks
            while($current_start <= $slice->end) {
                $i++;
                $current_end = $current_start + $block_size - 1;
                if ($current_end > $slice->end) {
                    $current_end = $slice->end;
                }
                my $this_block_size = $current_end - $current_start + 1;
                my $sub_slice = $slice->sub_Slice($current_start, $current_end);
                my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new or die;
                foreach my $repeat (@{ $sub_slice->get_all_RepeatFeatures }) {
                    $rr->check_and_register("1", $repeat->start, $repeat->end);
176
                }
Steve Trevanion's avatar
Steve Trevanion committed
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
                my $count = 0;
                my $non_repeats = $rr->check_and_register("1", 1, $this_block_size);
                if (defined $non_repeats) {
                    foreach my $non_repeat (@{ $non_repeats }) {
                        $count += ($non_repeat->[1] - $non_repeat->[0]) + 1;
                    }
                }
                my $percentage_repeat = (($this_block_size-$count)/$this_block_size)*100;
                $support->log_verbose("Chr: $chr | Bin: $i/$bins | ", 2);
                $support->log_verbose("\%repeat ".sprintf("%.2f", $percentage_repeat)."\n");
                my $df = Bio::EnsEMBL::DensityFeature->new(
                    -seq_region    => $slice,
                    -start         => $current_start,
                    -end           => $current_end,
                    -density_type  => $density_type,
                    -density_value => $percentage_repeat,
                );
                $dfa->store($df) unless ($support->param('dry_run'));
                $current_start = $current_end + 1;
196
            }
Steve Trevanion's avatar
Steve Trevanion committed
197
            $support->log_stamped("Done.\n", 1);
198
        }
Steve Trevanion's avatar
Steve Trevanion committed
199
        $support->log_stamped("Done.\n");
200
    }
201
}
202 203
# finish logfile
$support->finish_log;
204