mapping_stats.pl 8.68 KB
Newer Older
Monika Komorowska's avatar
Monika Komorowska committed
1
#!/usr/bin/env perl
Magali Ruffier's avatar
Magali Ruffier committed
2
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Tiago Grego's avatar
Tiago Grego committed
3
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
4 5 6 7 8 9 10 11 12 13 14 15 16
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45

=head1 NAME

mapping_stats.pl - print some statistics about a whole genome alignment between
two assemblies.

=head1 SYNOPSIS

mapping_stats.pl [arguments]

Required arguments:

  --dbname, db_name=NAME              database name NAME
  --host, --dbhost, --db_host=HOST    database host HOST
  --port, --dbport, --db_port=PORT    database port PORT
  --user, --dbuser, --db_user=USER    database username USER
  --pass, --dbpass, --db_pass=PASS    database passwort PASS
  --assembly=ASSEMBLY                 assembly version ASSEMBLY

  --altdbname=NAME                    alternative database NAME
  --altassembly=ASSEMBLY              alternative assembly version ASSEMBLY

Optional arguments:

  --althost=HOST                      alternative databases host HOST
  --altport=PORT                      alternative database port PORT
  --altuser=USER                      alternative database username USER
  --altpass=PASS                      alternative database password PASS

46
  --chromosomes, --chr=LIST           only process LIST toplevel seq_regions
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95

  --conffile, --conf=FILE             read parameters from FILE
                                      (default: conf/Conversion.ini)

  --logfile, --log=FILE               log to FILE (default: *STDOUT)
  --logpath=PATH                      write logfile to PATH (default: .)
  --logappend, --log_append           append to logfile (default: truncate)

  -v, --verbose=0|1                   verbose logging (default: false)
  -i, --interactive=0|1               run script interactively (default: true)
  -n, --dry_run, --dry=0|1            don't write results to database
  -h, --help, -?                      print help (this message)

=head1 DESCRIPTION

This script prints some statistics about a whole genome alignment between two
assemblies, like the alignment coverage and length of alignment blocks.

=cut

use strict;
use warnings;
no warnings 'uninitialized';

use FindBin qw($Bin);
use vars qw($SERVERROOT);

BEGIN {
    $SERVERROOT = "$Bin/../../..";
    unshift(@INC, "$SERVERROOT/ensembl/modules");
    unshift(@INC, "$SERVERROOT/bioperl-live");
}

use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;

$| = 1;

my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);

# parse options
$support->parse_common_options(@_);
$support->parse_extra_options(
    'assembly=s',
    'altdbname=s',
    'altassembly=s',
    'althost=s',
    'altport=n',
96
    'altpass=s',
97 98 99 100 101 102 103 104 105 106
    'chromosomes|chr=s@',
);
$support->allowed_params(
    $support->get_common_params,
    'assembly',
    'altdbname',
    'altassembly',
    'althost',
    'altport',
    'chromosomes',
107
    'altpass',
108 109 110 111 112 113 114 115 116 117
);

if ($support->param('help') or $support->error) {
    warn $support->error if $support->error;
    pod2usage(1);
}

$support->comma_to_list('chromosomes');

# ask user to confirm parameters to proceed
118
# $support->confirm_params;
119 120 121 122 123 124 125 126 127 128 129 130

# get log filehandle and print heading and parameters to logfile
$support->init_log;

$support->check_required_params(
    'assembly',
    'altdbname',
    'altassembly'
);

# first set connection parameters for alternative db if not different from
# reference db
131
map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
132 133 134 135 136 137 138 139 140 141 142 143

# reference database
my $R_dba = $support->get_database('ensembl');
my $R_sa = $R_dba->get_SliceAdaptor;

# database containing the alternative assembly
my $A_dba = $support->get_database('core', 'alt');
my $A_sa = $A_dba->get_SliceAdaptor;

my $fmt1 = "%-40s%12s\n";
my $fmt2 = "%-40s%11.1f%%\n";
my $fmt3 = "%-44s%8.0f (%2.0f%%)\n";
144
my $fmt4 = "%-40s%12s%10s\n";
145

146
$support->log("Looping over toplevel seq_regions...\n\n");
147

148
foreach my $chr ($support->sort_chromosomes(undef, $support->param('assembly'), 1)) {
149
  $support->log_stamped("Toplevel seq_region $chr...\n", 1);
150

151 152 153 154 155 156 157 158 159
  # determine non-N sequence length of alternative toplevel seq_region
  my $A_slice = $A_sa->fetch_by_region('toplevel', $chr);

  unless ($A_slice) {
    $support->log("Not found in alternative db. Skipping.\n", 2);
    next;
  }
  
  my $cs_name = $A_slice->coord_system_name;
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177

  my $start = 1;
  my $A_chr_length = $A_slice->length;
  my $n;
  my $end;
  
  while ($start < $A_chr_length) {
    $end = $start + 10000000;
    $end = $A_chr_length if ($end > $A_chr_length);
    my $sub_slice = $A_slice->sub_Slice($start, $end);
    my $seq = $sub_slice->seq;
    $n += $seq =~ tr/N/N/;
    $start = $end + 1;
  }

  my $A_length = $A_chr_length - $n;

  # determine total length of mapping to alternative assembly
Patrick Meidl's avatar
Patrick Meidl committed
178
  my $mapping_length = 0;
179
  my %blocks;
180
  my %blocklength;
181

182 183 184
  my %cont_mapping_blocks;
  my %cont_mapping_length;

185
  # toplevel seq_region length order of magnitude
186
  my $oom = length($A_length);
187 188 189
 
  #seq region on the alternative assembly
  my $R_slice = $R_sa->fetch_by_region($cs_name, $chr,undef, undef, undef,$support->param('altassembly'));
190
  
191 192 193 194 195
  #seq region on the latest assembly
  my $R_new_assembly_slice = $R_sa->fetch_by_region($cs_name, $chr);

  #map alternative assembly to latest
  my @segments = @{ $R_slice->project($cs_name, $support->param('assembly')) };
196

197
  my $alignments = 0;
198
  my $alignment_runs = 0;
199

200 201
  my $previous_sl;
  my $cont_mapping_length = 0;
202
  foreach my $seg (@segments) {
203 204 205
    my $sl = $seg->to_Slice;

    # ignore PAR region (i.e. we project to the symlinked seq_region)
206 207
    #next if ($sl->seq_region_name ne $chr);
  
208
    my $l = $sl->length;
209
    $mapping_length += $l;
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230

    if ($previous_sl) {
	#if current slice is on the same chromosome and within 10bps of the previous slice
	#add it to the continuous mapping length
	if ($previous_sl->seq_region_name eq $sl->seq_region_name && abs ($previous_sl->end - $sl->start) <= 10) {
	    $cont_mapping_length += $l;
	} else {

	    my $c_oom = $oom;
    
	    while ($c_oom) {
		if ($cont_mapping_length > 10**($c_oom-1) and $cont_mapping_length <= 10**$c_oom) {
		    $cont_mapping_blocks{10**$c_oom}++;
		    $cont_mapping_length{10**$c_oom} += $cont_mapping_length;
		}
		$c_oom--;
	    }
	    if ($cont_mapping_length == 1) {
		$cont_mapping_blocks{10}++;
		$cont_mapping_length{10}++
	    }	    
231
	    
232
	    $cont_mapping_length = $l;
233 234

	    $alignment_runs ++;
235 236 237 238
	}
    } else {
	$cont_mapping_length = $l;
    }
239 240 241 242
    
    my $c_oom = $oom;
    
    while ($c_oom) {
243 244 245 246
      if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) {
        $blocks{10**$c_oom}++;
        $blocklength{10**$c_oom} += $l;
      }
247 248
      $c_oom--;
    }
249 250 251 252
    if ($l == 1) {
	$blocks{10}++ ;
	$blocklength{10}++;
    }
253

254
    $previous_sl = $sl;
255
    $alignments++;
256 257 258 259 260
  }  
  
  # print stats
  $support->log("\n");
  
261
  $support->log(sprintf($fmt1, "Reference toplevel seq_region length:",
262
      $support->commify($R_new_assembly_slice->length)), 2);
263
  $support->log(sprintf($fmt1, "Alternative toplevel seq_region length:",
264 265 266 267 268 269 270 271 272 273
      $support->commify($A_chr_length)), 2);
  $support->log(sprintf($fmt1, "Non-N sequence length (alternative):",
      $support->commify($A_length)), 2);
  $support->log(sprintf($fmt1, "Alternative mapping length:",
      $support->commify($mapping_length)), 2);
  $support->log(sprintf($fmt2, "Mapping percentage:",
      $mapping_length/$A_length*100), 2);
  
  $support->log("\n");
  
274
  $support->log(sprintf($fmt4, "Total alignments:", $alignments, "Mapping %"), 2);
275
  
276
  if ($alignments) {
Patrick Meidl's avatar
Patrick Meidl committed
277 278 279
    for (my $i = 0; $i < $oom; $i++) {
      my $from = 10**$i;
      my $to = 10**($i+1);
280
      $support->log(sprintf($fmt3, "    ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocklength{$to}/$A_length*100), 2);
Patrick Meidl's avatar
Patrick Meidl committed
281
    }
282 283 284
  }
  
  $support->log("\n");
285 286


287
  $support->log(sprintf($fmt4, "Continuous alignment runs:", $alignment_runs, "Mapping %"), 2);
288 289
  $support->log("(gaps up to 10bp)\n",2);  

290 291 292 293 294 295 296 297 298
  if ($alignments) {
    for (my $i = 0; $i < $oom; $i++) {
      my $from = 10**$i;
      my $to = 10**($i+1);
      $support->log(sprintf($fmt3, "    ".$support->commify($from)." - ".$support->commify($to)." bp:", $cont_mapping_blocks{$to}, $cont_mapping_length{$to}/$A_length*100), 2);
    }
  }
  
  $support->log("\n");
299 300 301 302 303 304
  
}

# finish logfile
$support->finish_log;