vega_repeat_coverage_calc.pl 6.04 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
#!/usr/local/bin/perl

=head1 NAME

vega_repeat_coverage_calc.pl - calculate the repeat coverage

=head1 SYNOPSIS

vega_repeat_coverage_calc.pl [options]

General options:
    --conffile, --conf=FILE             read parameters from FILE
                                        (default: conf/Conversion.ini)

    --dbname, db_name=NAME              use database NAME
    --host, --dbhost, --db_host=HOST    use database host HOST
    --port, --dbport, --db_port=PORT    use database port PORT
    --user, --dbuser, --db_user=USER    use database username USER
    --pass, --dbpass, --db_pass=PASS    use database passwort PASS
    --logfile, --log=FILE               log to FILE (default: *STDOUT)
    --logpath=PATH                      write logfile to PATH (default: .)
    --logappend, --log_append           append to logfile (default: truncate)
    -v, --verbose                       verbose logging (default: false)
    -i, --interactive=0|1               run script interactively (default: true)
    -n, --dry_run, --dry=0|1            don't write results to database
    -h, --help, -?                      print help (this message)

=head1 DESCRIPTION

This script calculates the repeat coverage for given database.

The block size is determined so that you have 150 bins for the smallest
chromosome over 5 Mb in length. For chromosomes smaller than 5 Mb, an
additional smaller block size is used to yield 150 bins for the overall
smallest chromosome. This will result in reasonable resolution for small
chromosomes and high performance for big ones.

=head1 LICENCE

This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details

=head1 AUTHOR

Patrick Meidl <pm2@sanger.ac.uk>

=head1 CONTACT

Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk

=cut

53
use strict;
54 55 56 57 58 59
use warnings;
no warnings 'uninitialized';

use FindBin qw($Bin);
use vars qw($SERVERROOT);

60
BEGIN {
61 62 63 64
    $SERVERROOT = "$Bin/../../..";
    unshift(@INC, "$SERVERROOT/ensembl-otter/modules");
    unshift(@INC, "$SERVERROOT/ensembl/modules");
    unshift(@INC, "$SERVERROOT/bioperl-live");
65 66 67
}

use Getopt::Long;
68 69
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
70 71 72 73 74
use Bio::EnsEMBL::DensityType;
use Bio::EnsEMBL::DensityFeature;
use Bio::EnsEMBL::Mapper::RangeRegistry;
use POSIX;

75
$| = 1;
76

77
my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
78

79 80 81
# parse options
$support->parse_common_options(@_);
$support->allowed_params($support->get_common_params);
82

83 84 85
if ($support->param('help') or $support->error) {
    warn $support->error if $support->error;
    pod2usage(1);
86 87
}

88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
# ask user to confirm parameters to proceed
$support->confirm_params;

# get log filehandle and print heading and parameters to logfile
$support->init_log;

# connect to database and get adaptors
my $dba = $support->get_database('ensembl');
my $dfa = $dba->get_DensityFeatureAdaptor;
my $dta = $dba->get_DensityTypeAdaptor;
my $aa = $dba->get_AnalysisAdaptor;

# Create Analysis object
my $analysis = new Bio::EnsEMBL::Analysis (
        -program     => "repeat_coverage_calc.pl",
        -database    => "ensembl",
        -gff_source  => "repeat_coverage_calc.pl",
        -gff_feature => "density",
        -logic_name  => "PercentageRepeat",
);
$aa->store($analysis) unless ($support->param('dry_run'));

# split chromosomes by size and determine block size
my $chr_slices = $support->split_chromosomes_by_size(5000000);

# loop over block sizes
foreach my $block_size (keys %{ $chr_slices }) {
    $support->log("Available chromosomes using block size of $block_size:\n    ");
    $support->log(join("\n    ", map { $_->seq_region_name } @{ $chr_slices->{$block_size} })."\n");

    # create DensityType objects
    my $density_type = Bio::EnsEMBL::DensityType->new(
         -analysis   => $analysis,
         -block_size => $block_size,
         -value_type => 'ratio',
    );
    $dta->store($density_type) unless ($support->param('dry_run'));

    # loop over chromosomes
    $support->log_stamped("Looping over chromosomes...\n");
    my ($current_start, $current_end);
    foreach my $slice (@{ $chr_slices->{$block_size} }) {
        $current_start = 1;
        my $chr = $slice->seq_region_name;
        my $i;
        my $bins = POSIX::ceil($slice->end/$block_size);

        $support->log_stamped("Chromosome $chr with block size $block_size...\n", 1);

        # loop over blocks
        while($current_start <= $slice->end) {
            $i++;
            $current_end = $current_start + $block_size - 1;
            if ($current_end > $slice->end) {
                $current_end = $slice->end;
            }
            my $this_block_size = $current_end - $current_start + 1;
            my $sub_slice = $slice->sub_Slice($current_start, $current_end);
            my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new or die;
            foreach my $repeat (@{ $sub_slice->get_all_RepeatFeatures }) {
                $rr->check_and_register("1", $repeat->start, $repeat->end);
            }
            my $count = 0;
            my $non_repeats = $rr->check_and_register("1", 1, $this_block_size);
            if (defined $non_repeats) {
                foreach my $non_repeat (@{ $non_repeats }) {
                    $count += ($non_repeat->[1] - $non_repeat->[0]) + 1;
                }
            }
            my $percentage_repeat = (($this_block_size-$count)/$this_block_size)*100;
            $support->log_verbose("Chr: $chr | Bin: $i/$bins | ", 2);
            $support->log_verbose("\%repeat ".sprintf("%.2f", $percentage_repeat)."\n");
            my $df = Bio::EnsEMBL::DensityFeature->new(
                -seq_region    => $slice,
                -start         => $current_start,
                -end           => $current_end,
                -density_type  => $density_type,
                -density_value => $percentage_repeat,
            );
            $dfa->store($df) unless ($support->param('dry_run'));
            $current_start = $current_end + 1;
        }
        $support->log_stamped("Done.\n", 1);
    }
    $support->log_stamped("Done.\n");
173 174
}

175 176
# finish logfile
$support->finish_log;
177