From 422be3cf5ccf3ac5b02c7e76182501d92a36beb2 Mon Sep 17 00:00:00 2001 From: Patrick Meidl <pm2@sanger.ac.uk> Date: Wed, 10 May 2006 12:43:48 +0000 Subject: [PATCH] scripts to assess quality of assembly mapping --- misc-scripts/assembly/check_mapping.pl | 209 +++++++++++++++++ misc-scripts/assembly/compare_assemblies.pl | 234 ++++++++++++++++++++ misc-scripts/assembly/mapping_stats.pl | 231 +++++++++++++++++++ 3 files changed, 674 insertions(+) create mode 100755 misc-scripts/assembly/check_mapping.pl create mode 100755 misc-scripts/assembly/compare_assemblies.pl create mode 100755 misc-scripts/assembly/mapping_stats.pl diff --git a/misc-scripts/assembly/check_mapping.pl b/misc-scripts/assembly/check_mapping.pl new file mode 100755 index 0000000000..8b94390243 --- /dev/null +++ b/misc-scripts/assembly/check_mapping.pl @@ -0,0 +1,209 @@ +#!/usr/local/bin/perl + +=head1 NAME + +check_mapping.pl - script to check whole genome alignment between two +assemblies. + +=head1 SYNOPSIS + +check_mapping.pl [arguments] + +Required arguments: + + --dbname, db_name=NAME database name NAME + --host, --dbhost, --db_host=HOST database host HOST + --port, --dbport, --db_port=PORT database port PORT + --user, --dbuser, --db_user=USER database username USER + --pass, --dbpass, --db_pass=PASS database passwort PASS + --assembly=ASSEMBLY assembly version ASSEMBLY + + --altdbname=NAME alternative database NAME + --altassembly=ASSEMBLY alternative assembly version ASSEMBLY + +Optional arguments: + + --althost=HOST alternative databases host HOST + --altport=PORT alternative database port PORT + --altuser=USER alternative database username USER + --altpass=PASS alternative database password PASS + + --chromosomes, --chr=LIST only process LIST chromosomes + + --conffile, --conf=FILE read parameters from FILE + (default: conf/Conversion.ini) + + --logfile, --log=FILE log to FILE (default: *STDOUT) + --logpath=PATH write logfile to PATH (default: .) + --logappend, --log_append append to logfile (default: truncate) + + -v, --verbose=0|1 verbose logging (default: false) + -i, --interactive=0|1 run script interactively (default: true) + -n, --dry_run, --dry=0|1 don't write results to database + -h, --help, -? print help (this message) + +=head1 DESCRIPTION + +This script checks if the whole genome alignment between two assemblies is +correct. It does so by comparing the sequence in the reference database with +the sequence of the projected fragments in the alternative database. + +=head1 RELATED FILES + +The whole process of creating a whole genome alignment between two assemblies +is done by a series of scripts. Please see + + ensembl/misc-scripts/assembly/README + +for a high-level description of this process, and POD in the individual scripts +for the details. + +=head1 LICENCE + +This code is distributed under an Apache style licence: +Please see http://www.ensembl.org/code_licence.html for details + +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + +use strict; +use warnings; +no warnings 'uninitialized'; + +use FindBin qw($Bin); +use vars qw($SERVERROOT); + +BEGIN { + $SERVERROOT = "$Bin/../../.."; + unshift(@INC, "$SERVERROOT/ensembl/modules"); + unshift(@INC, "$SERVERROOT/bioperl-live"); +} + +use Getopt::Long; +use Pod::Usage; +use Bio::EnsEMBL::Utils::ConversionSupport; + +$| = 1; + +my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); + +# parse options +$support->parse_common_options(@_); +$support->parse_extra_options( + 'assembly=s', + 'altdbname=s', + 'altassembly=s', + 'althost=s', + 'altport=n', + 'chromosomes|chr=s@', +); +$support->allowed_params( + $support->get_common_params, + 'assembly', + 'altdbname', + 'altassembly', + 'althost', + 'altport', + 'chromosomes', +); + +if ($support->param('help') or $support->error) { + warn $support->error if $support->error; + pod2usage(1); +} + +$support->comma_to_list('chromosomes'); + +# ask user to confirm parameters to proceed +$support->confirm_params; + +# get log filehandle and print heading and parameters to logfile +$support->init_log; + +$support->check_required_params( + 'assembly', + 'altdbname', + 'altassembly' +); + +# first set connection parameters for alternative db if not different from +# reference db +map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass); + +# reference database +my $R_dba = $support->get_database('ensembl'); +my $R_sa = $R_dba->get_SliceAdaptor; + +# database containing the alternative assembly +my $A_dba = $support->get_database('core', 'alt'); +my $A_sa = $A_dba->get_SliceAdaptor; + +$support->log("Looping over chromosomes...\n\n"); + +foreach my $chr ($support->sort_chromosomes) { + $support->log_stamped("Chromosome $chr...\n", 1); + + my $R_slice = $R_sa->fetch_by_region('chromosome', $chr); + my $A_slice = $A_sa->fetch_by_region('chromosome', $chr); + + # compare reference and alternative sequence + my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) }; + + foreach my $seg (@segments) { + # reference sequence + my $R_sub_slice = $R_slice->sub_Slice($seg->from_start, $seg->from_end); + my $R_seq = $R_sub_slice->seq; + + # alternative sequence + my $A_proj_slice = $seg->to_Slice; + my $A_sub_slice = $A_slice->sub_Slice($A_proj_slice->start, $A_proj_slice->end, $A_proj_slice->strand); + my $A_seq = $A_sub_slice->seq; + + # compare + my $i; + + if ($R_seq eq $A_seq) { + # sequences are identical -> ok + $support->log_verbose("Sequence match at ".$R_sub_slice->name."\n", 2); + + } else { + # not identical -> something is wrong + $support->log_warning("Sequence mismatch at ".$R_sub_slice->name."\n", 2); + + my $R_sub_seq; + my $A_sub_seq; + + if ($R_sub_slice->length > 20) { + $R_sub_seq = substr($R_seq, 0, 10)."...".substr($R_seq, -10, 10); + $A_sub_seq = substr($A_seq, 0, 10)."...".substr($A_seq, -10, 10); + } else { + $R_sub_seq = substr($R_seq, 0, 20); + $A_sub_seq = substr($A_seq, 0, 20); + } + + $support->log("Ref: $R_sub_seq\n", 3); + $support->log("Alt: $A_sub_seq\n", 3); + + $i++; + } + } + + if ($i) { + $support->log("Total: $i problematic alignments.\n", 2); + } else { + $support->log("All alignments ok.\n", 2); + + $support->log_stamped("Done.\n\n", 1); +} + +# finish logfile +$support->finish_log; + diff --git a/misc-scripts/assembly/compare_assemblies.pl b/misc-scripts/assembly/compare_assemblies.pl new file mode 100755 index 0000000000..1b97ba58df --- /dev/null +++ b/misc-scripts/assembly/compare_assemblies.pl @@ -0,0 +1,234 @@ +#!/usr/local/bin/perl + +=head1 NAME + +compare_assemblies.pl - compare two assemblies + +=head1 SYNOPSIS + +compare_assemblies.pl [arguments] + +Required arguments: + + --dbname, db_name=NAME database name NAME + --host, --dbhost, --db_host=HOST database host HOST + --port, --dbport, --db_port=PORT database port PORT + --user, --dbuser, --db_user=USER database username USER + --pass, --dbpass, --db_pass=PASS database passwort PASS + --assembly=ASSEMBLY assembly version ASSEMBLY + + --altdbname=NAME alternative database NAME + --altassembly=ASSEMBLY alternative assembly version ASSEMBLY + +Optional arguments: + + --althost=HOST alternative databases host HOST + --altport=PORT alternative database port PORT + --altuser=USER alternative database username USER + --altpass=PASS alternative database password PASS + + --chromosomes, --chr=LIST only process LIST chromosomes + + --conffile, --conf=FILE read parameters from FILE + (default: conf/Conversion.ini) + + --logfile, --log=FILE log to FILE (default: *STDOUT) + --logpath=PATH write logfile to PATH (default: .) + --logappend, --log_append append to logfile (default: truncate) + + -v, --verbose=0|1 verbose logging (default: false) + -i, --interactive=0|1 run script interactively (default: true) + -n, --dry_run, --dry=0|1 don't write results to database + -h, --help, -? print help (this message) + +=head1 DESCRIPTION + +This script compares two assemblies. At the moment all it does is to list +clones that are on different chromosomes in the two assemblies. + +=head1 RELATED FILES + +The whole process of creating a whole genome alignment between two assemblies +is done by a series of scripts. Please see + + ensembl/misc-scripts/assembly/README + +for a high-level description of this process, and POD in the individual scripts +for the details. + +=head1 LICENCE + +This code is distributed under an Apache style licence: +Please see http://www.ensembl.org/code_licence.html for details + +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + +use strict; +use warnings; +no warnings 'uninitialized'; + +use FindBin qw($Bin); +use vars qw($SERVERROOT); + +BEGIN { + $SERVERROOT = "$Bin/../../.."; + unshift(@INC, "$SERVERROOT/ensembl/modules"); + unshift(@INC, "$SERVERROOT/bioperl-live"); +} + +use Getopt::Long; +use Pod::Usage; +use Bio::EnsEMBL::Utils::ConversionSupport; + +$| = 1; + +my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); + +# parse options +$support->parse_common_options(@_); +$support->parse_extra_options( + 'assembly=s', + 'altdbname=s', + 'altassembly=s', + 'althost=s', + 'altport=n', + 'chromosomes|chr=s@', +); +$support->allowed_params( + $support->get_common_params, + 'assembly', + 'altdbname', + 'altassembly', + 'althost', + 'altport', + 'chromosomes', +); + +if ($support->param('help') or $support->error) { + warn $support->error if $support->error; + pod2usage(1); +} + +$support->comma_to_list('chromosomes'); + +# ask user to confirm parameters to proceed +$support->confirm_params; + +# get log filehandle and print heading and parameters to logfile +$support->init_log; + +$support->check_required_params( + 'assembly', + 'altdbname', + 'altassembly' +); + +# first set connection parameters for alternative db if not different from +# reference db +map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass); + +# reference database +my $R_dba = $support->get_database('ensembl'); +my $R_sa = $R_dba->get_SliceAdaptor; +my $R_sa = $R_dba->get_SliceAdaptor; + +# database containing the alternative assembly +my $A_dba = $support->get_database('core', 'alt'); +my $A_sa = $A_dba->get_SliceAdaptor; +my $A_sa = $A_dba->get_SliceAdaptor; + +my $fmt1 = "%-20s%-12s%-12s\n"; +my $fmt2 = "%-35s%10.0f\n"; +my @diff_total; +my %stats_total; + +$support->log_stamped("Looping over chromosomes...\n"); + +foreach my $R_chr ($support->sort_chromosomes) { + $support->log_stamped("\nChromosome $R_chr...\n", 1); + + # fetch chromosome slice and project to clones + my $R_slice = $R_sa->fetch_by_region('chromosome', $R_chr); + my @R_clones = @{ $R_slice->project('clone') }; + + # loop over reference clones + my @diff; + my %stats; + foreach my $R_seg (@R_clones) { + $stats{'num_clones'}++; + my $R_clone = $R_seg->to_Slice; + my $R_clone_name = $R_clone->seq_region_name; + + # fetch clone from alternative db and project to chromosome + my $A_clone = $A_sa->fetch_by_region('clone', $R_clone_name); + if ($A_clone) { + my ($A_seg) = @{ $A_clone->project('chromosome') }; + if ($A_seg) { + my $A_slice = $A_seg->to_Slice; + + # compare chromosome names + my $A_chr = $A_slice->seq_region_name; + unless ($R_chr eq $A_chr) { + push @diff, [$R_clone_name, $R_chr, $A_chr]; + } + } else { + $stats{'does_not_project'}++; + $support->log_verbose("Clone $R_clone_name doesn't project to chromosome.\n", 2); + } + } else { + $stats{'not_in_alt'}++; + $support->log_verbose("Clone $R_clone_name not in alternative db.\n", 2); + } + } + push @diff_total, @diff; + map { $stats_total{$_} += $stats{$_} } + qw(num_clones does_not_project not_in_alt); + + # print results for this chromosome + $support->log("\nStats for chromosome $R_chr:\n\n", 2); + $support->log(sprintf($fmt2, "Total number of clones:", $stats{'num_clones'}), 3); + $support->log(sprintf($fmt2, "Clones not in alternative db:", $stats{'not_in_alt'}), 3); + $support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats{'does_not_project'}), 3); + if (@diff) { + $support->log("\nClones on different chromosomes in the 2 assemblies:\n\n", 3); + $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 4); + $support->log(('-'x44)."\n", 4); + foreach my $d (@diff) { + $support->log(sprintf($fmt1, @{ $d }), 4); + } + } else { + $support->log("\nAll matching clones on same chromosome in the 2 assemblies.\n", 3); + } +} + +$support->log_stamepd("Done.\n\n"); + +# print overall results +$support->log("\nOverall stats:\n"); +$support->log(sprintf($fmt2, "Total number of clones:", $stats_total{'num_clones'}), 1); +$support->log(sprintf($fmt2, "Clones not in alternative db:", $stats_total{'not_in_alt'}), 1); +$support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats_total{'does_not_project'}), 1); +if (@diff_total) { + $support->log("\nClones on different chromosomes in the 2 assemblies:\n\n", 1); + $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 2); + $support->log(('-'x44)."\n", 2); + foreach my $d (@diff_total) { + $support->log(sprintf($fmt1, @{ $d }), 2); + } +} else { + $support->log("\nAll clones on same chromosome in the 2 assemblies.\n", 2); +} +$support->log("\n"); + +# finish logfile +$support->finish_log; + diff --git a/misc-scripts/assembly/mapping_stats.pl b/misc-scripts/assembly/mapping_stats.pl new file mode 100755 index 0000000000..358c9a3662 --- /dev/null +++ b/misc-scripts/assembly/mapping_stats.pl @@ -0,0 +1,231 @@ +#!/usr/local/bin/perl + +=head1 NAME + +mapping_stats.pl - print some statistics about a whole genome alignment between +two assemblies. + +=head1 SYNOPSIS + +mapping_stats.pl [arguments] + +Required arguments: + + --dbname, db_name=NAME database name NAME + --host, --dbhost, --db_host=HOST database host HOST + --port, --dbport, --db_port=PORT database port PORT + --user, --dbuser, --db_user=USER database username USER + --pass, --dbpass, --db_pass=PASS database passwort PASS + --assembly=ASSEMBLY assembly version ASSEMBLY + + --altdbname=NAME alternative database NAME + --altassembly=ASSEMBLY alternative assembly version ASSEMBLY + +Optional arguments: + + --althost=HOST alternative databases host HOST + --altport=PORT alternative database port PORT + --altuser=USER alternative database username USER + --altpass=PASS alternative database password PASS + + --chromosomes, --chr=LIST only process LIST chromosomes + + --conffile, --conf=FILE read parameters from FILE + (default: conf/Conversion.ini) + + --logfile, --log=FILE log to FILE (default: *STDOUT) + --logpath=PATH write logfile to PATH (default: .) + --logappend, --log_append append to logfile (default: truncate) + + -v, --verbose=0|1 verbose logging (default: false) + -i, --interactive=0|1 run script interactively (default: true) + -n, --dry_run, --dry=0|1 don't write results to database + -h, --help, -? print help (this message) + +=head1 DESCRIPTION + +This script prints some statistics about a whole genome alignment between two +assemblies, like the alignment coverage and length of alignment blocks. + +=head1 RELATED FILES + +The whole process of creating a whole genome alignment between two assemblies +is done by a series of scripts. Please see + + ensembl/misc-scripts/assembly/README + +for a high-level description of this process, and POD in the individual scripts +for the details. + +=head1 LICENCE + +This code is distributed under an Apache style licence: +Please see http://www.ensembl.org/code_licence.html for details + +=head1 AUTHOR + +Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<ensembl-dev@ebi.ac.uk> + +=cut + +use strict; +use warnings; +no warnings 'uninitialized'; + +use FindBin qw($Bin); +use vars qw($SERVERROOT); + +BEGIN { + $SERVERROOT = "$Bin/../../.."; + unshift(@INC, "$SERVERROOT/ensembl/modules"); + unshift(@INC, "$SERVERROOT/bioperl-live"); +} + +use Getopt::Long; +use Pod::Usage; +use Bio::EnsEMBL::Utils::ConversionSupport; + +$| = 1; + +my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); + +# parse options +$support->parse_common_options(@_); +$support->parse_extra_options( + 'assembly=s', + 'altdbname=s', + 'altassembly=s', + 'althost=s', + 'altport=n', + 'chromosomes|chr=s@', +); +$support->allowed_params( + $support->get_common_params, + 'assembly', + 'altdbname', + 'altassembly', + 'althost', + 'altport', + 'chromosomes', +); + +if ($support->param('help') or $support->error) { + warn $support->error if $support->error; + pod2usage(1); +} + +$support->comma_to_list('chromosomes'); + +# ask user to confirm parameters to proceed +$support->confirm_params; + +# get log filehandle and print heading and parameters to logfile +$support->init_log; + +$support->check_required_params( + 'assembly', + 'altdbname', + 'altassembly' +); + +# first set connection parameters for alternative db if not different from +# reference db +map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass); + +# reference database +my $R_dba = $support->get_database('ensembl'); +my $R_sa = $R_dba->get_SliceAdaptor; + +# database containing the alternative assembly +my $A_dba = $support->get_database('core', 'alt'); +my $A_sa = $A_dba->get_SliceAdaptor; + +my $fmt1 = "%-40s%12s\n"; +my $fmt2 = "%-40s%11.1f%%\n"; +my $fmt3 = "%-44s%8.0f (%2.0f%%)\n"; + +$support->log("Looping over chromosomes...\n\n"); + +foreach my $chr ($support->sort_chromosomes) { + $support->log_stamped("Chromosome $chr...\n", 1); + + # determine non-N sequence length of alternative chromosome + my $A_slice = $A_sa->fetch_by_region('chromosome', $chr); + + my $start = 1; + my $A_chr_length = $A_slice->length; + my $n; + my $end; + + while ($start < $A_chr_length) { + $end = $start + 10000000; + $end = $A_chr_length if ($end > $A_chr_length); + my $sub_slice = $A_slice->sub_Slice($start, $end); + my $seq = $sub_slice->seq; + $n += $seq =~ tr/N/N/; + $start = $end + 1; + } + + my $A_length = $A_chr_length - $n; + + # determine total length of mapping to alternative assembly + my $mapping_length; + my %blocks; + + # chromosome length order of magnitude + my $oom = length($A_length); + + my $R_slice = $R_sa->fetch_by_region('chromosome', $chr); + my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) }; + + foreach my $seg (@segments) { + my $l = $seg->to_Slice->length; + $mapping_length += $l; + + my $c_oom = $oom; + + while ($c_oom) { + if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) { $blocks{10**$c_oom}++; } + $c_oom--; + } + $blocks{10}++ if ($l == 1); + } + + # print stats + $support->log("\n"); + + $support->log(sprintf($fmt1, "Reference chromosome length:", + $support->commify($R_slice->length)), 2); + $support->log(sprintf($fmt1, "Alternative chromosome length:", + $support->commify($A_chr_length)), 2); + $support->log(sprintf($fmt1, "Non-N sequence length (alternative):", + $support->commify($A_length)), 2); + $support->log(sprintf($fmt1, "Alternative mapping length:", + $support->commify($mapping_length)), 2); + $support->log(sprintf($fmt2, "Mapping percentage:", + $mapping_length/$A_length*100), 2); + + $support->log("\n"); + + my $segs = scalar(@segments); + + $support->log(sprintf($fmt1, "Total alignments:", $segs), 2); + + for (my $i = 0; $i < $oom; $i++) { + my $from = 10**$i; + my $to = 10**($i+1); + $support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocks{$to}/$segs*100), 2); + } + + $support->log("\n"); + +} + +# finish logfile +$support->finish_log; + -- GitLab