Skip to content
Snippets Groups Projects
Commit 422be3cf authored by Patrick Meidl's avatar Patrick Meidl
Browse files

scripts to assess quality of assembly mapping

parent edf42bd4
No related branches found
No related tags found
No related merge requests found
#!/usr/local/bin/perl
=head1 NAME
check_mapping.pl - script to check whole genome alignment between two
assemblies.
=head1 SYNOPSIS
check_mapping.pl [arguments]
Required arguments:
--dbname, db_name=NAME database name NAME
--host, --dbhost, --db_host=HOST database host HOST
--port, --dbport, --db_port=PORT database port PORT
--user, --dbuser, --db_user=USER database username USER
--pass, --dbpass, --db_pass=PASS database passwort PASS
--assembly=ASSEMBLY assembly version ASSEMBLY
--altdbname=NAME alternative database NAME
--altassembly=ASSEMBLY alternative assembly version ASSEMBLY
Optional arguments:
--althost=HOST alternative databases host HOST
--altport=PORT alternative database port PORT
--altuser=USER alternative database username USER
--altpass=PASS alternative database password PASS
--chromosomes, --chr=LIST only process LIST chromosomes
--conffile, --conf=FILE read parameters from FILE
(default: conf/Conversion.ini)
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--logappend, --log_append append to logfile (default: truncate)
-v, --verbose=0|1 verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
=head1 DESCRIPTION
This script checks if the whole genome alignment between two assemblies is
correct. It does so by comparing the sequence in the reference database with
the sequence of the projected fragments in the alternative database.
=head1 RELATED FILES
The whole process of creating a whole genome alignment between two assemblies
is done by a series of scripts. Please see
ensembl/misc-scripts/assembly/README
for a high-level description of this process, and POD in the individual scripts
for the details.
=head1 LICENCE
This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use vars qw($SERVERROOT);
BEGIN {
$SERVERROOT = "$Bin/../../..";
unshift(@INC, "$SERVERROOT/ensembl/modules");
unshift(@INC, "$SERVERROOT/bioperl-live");
}
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
$| = 1;
my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options(
'assembly=s',
'altdbname=s',
'altassembly=s',
'althost=s',
'altport=n',
'chromosomes|chr=s@',
);
$support->allowed_params(
$support->get_common_params,
'assembly',
'altdbname',
'altassembly',
'althost',
'altport',
'chromosomes',
);
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
pod2usage(1);
}
$support->comma_to_list('chromosomes');
# ask user to confirm parameters to proceed
$support->confirm_params;
# get log filehandle and print heading and parameters to logfile
$support->init_log;
$support->check_required_params(
'assembly',
'altdbname',
'altassembly'
);
# first set connection parameters for alternative db if not different from
# reference db
map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
# reference database
my $R_dba = $support->get_database('ensembl');
my $R_sa = $R_dba->get_SliceAdaptor;
# database containing the alternative assembly
my $A_dba = $support->get_database('core', 'alt');
my $A_sa = $A_dba->get_SliceAdaptor;
$support->log("Looping over chromosomes...\n\n");
foreach my $chr ($support->sort_chromosomes) {
$support->log_stamped("Chromosome $chr...\n", 1);
my $R_slice = $R_sa->fetch_by_region('chromosome', $chr);
my $A_slice = $A_sa->fetch_by_region('chromosome', $chr);
# compare reference and alternative sequence
my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) };
foreach my $seg (@segments) {
# reference sequence
my $R_sub_slice = $R_slice->sub_Slice($seg->from_start, $seg->from_end);
my $R_seq = $R_sub_slice->seq;
# alternative sequence
my $A_proj_slice = $seg->to_Slice;
my $A_sub_slice = $A_slice->sub_Slice($A_proj_slice->start, $A_proj_slice->end, $A_proj_slice->strand);
my $A_seq = $A_sub_slice->seq;
# compare
my $i;
if ($R_seq eq $A_seq) {
# sequences are identical -> ok
$support->log_verbose("Sequence match at ".$R_sub_slice->name."\n", 2);
} else {
# not identical -> something is wrong
$support->log_warning("Sequence mismatch at ".$R_sub_slice->name."\n", 2);
my $R_sub_seq;
my $A_sub_seq;
if ($R_sub_slice->length > 20) {
$R_sub_seq = substr($R_seq, 0, 10)."...".substr($R_seq, -10, 10);
$A_sub_seq = substr($A_seq, 0, 10)."...".substr($A_seq, -10, 10);
} else {
$R_sub_seq = substr($R_seq, 0, 20);
$A_sub_seq = substr($A_seq, 0, 20);
}
$support->log("Ref: $R_sub_seq\n", 3);
$support->log("Alt: $A_sub_seq\n", 3);
$i++;
}
}
if ($i) {
$support->log("Total: $i problematic alignments.\n", 2);
} else {
$support->log("All alignments ok.\n", 2);
$support->log_stamped("Done.\n\n", 1);
}
# finish logfile
$support->finish_log;
#!/usr/local/bin/perl
=head1 NAME
compare_assemblies.pl - compare two assemblies
=head1 SYNOPSIS
compare_assemblies.pl [arguments]
Required arguments:
--dbname, db_name=NAME database name NAME
--host, --dbhost, --db_host=HOST database host HOST
--port, --dbport, --db_port=PORT database port PORT
--user, --dbuser, --db_user=USER database username USER
--pass, --dbpass, --db_pass=PASS database passwort PASS
--assembly=ASSEMBLY assembly version ASSEMBLY
--altdbname=NAME alternative database NAME
--altassembly=ASSEMBLY alternative assembly version ASSEMBLY
Optional arguments:
--althost=HOST alternative databases host HOST
--altport=PORT alternative database port PORT
--altuser=USER alternative database username USER
--altpass=PASS alternative database password PASS
--chromosomes, --chr=LIST only process LIST chromosomes
--conffile, --conf=FILE read parameters from FILE
(default: conf/Conversion.ini)
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--logappend, --log_append append to logfile (default: truncate)
-v, --verbose=0|1 verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
=head1 DESCRIPTION
This script compares two assemblies. At the moment all it does is to list
clones that are on different chromosomes in the two assemblies.
=head1 RELATED FILES
The whole process of creating a whole genome alignment between two assemblies
is done by a series of scripts. Please see
ensembl/misc-scripts/assembly/README
for a high-level description of this process, and POD in the individual scripts
for the details.
=head1 LICENCE
This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use vars qw($SERVERROOT);
BEGIN {
$SERVERROOT = "$Bin/../../..";
unshift(@INC, "$SERVERROOT/ensembl/modules");
unshift(@INC, "$SERVERROOT/bioperl-live");
}
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
$| = 1;
my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options(
'assembly=s',
'altdbname=s',
'altassembly=s',
'althost=s',
'altport=n',
'chromosomes|chr=s@',
);
$support->allowed_params(
$support->get_common_params,
'assembly',
'altdbname',
'altassembly',
'althost',
'altport',
'chromosomes',
);
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
pod2usage(1);
}
$support->comma_to_list('chromosomes');
# ask user to confirm parameters to proceed
$support->confirm_params;
# get log filehandle and print heading and parameters to logfile
$support->init_log;
$support->check_required_params(
'assembly',
'altdbname',
'altassembly'
);
# first set connection parameters for alternative db if not different from
# reference db
map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
# reference database
my $R_dba = $support->get_database('ensembl');
my $R_sa = $R_dba->get_SliceAdaptor;
my $R_sa = $R_dba->get_SliceAdaptor;
# database containing the alternative assembly
my $A_dba = $support->get_database('core', 'alt');
my $A_sa = $A_dba->get_SliceAdaptor;
my $A_sa = $A_dba->get_SliceAdaptor;
my $fmt1 = "%-20s%-12s%-12s\n";
my $fmt2 = "%-35s%10.0f\n";
my @diff_total;
my %stats_total;
$support->log_stamped("Looping over chromosomes...\n");
foreach my $R_chr ($support->sort_chromosomes) {
$support->log_stamped("\nChromosome $R_chr...\n", 1);
# fetch chromosome slice and project to clones
my $R_slice = $R_sa->fetch_by_region('chromosome', $R_chr);
my @R_clones = @{ $R_slice->project('clone') };
# loop over reference clones
my @diff;
my %stats;
foreach my $R_seg (@R_clones) {
$stats{'num_clones'}++;
my $R_clone = $R_seg->to_Slice;
my $R_clone_name = $R_clone->seq_region_name;
# fetch clone from alternative db and project to chromosome
my $A_clone = $A_sa->fetch_by_region('clone', $R_clone_name);
if ($A_clone) {
my ($A_seg) = @{ $A_clone->project('chromosome') };
if ($A_seg) {
my $A_slice = $A_seg->to_Slice;
# compare chromosome names
my $A_chr = $A_slice->seq_region_name;
unless ($R_chr eq $A_chr) {
push @diff, [$R_clone_name, $R_chr, $A_chr];
}
} else {
$stats{'does_not_project'}++;
$support->log_verbose("Clone $R_clone_name doesn't project to chromosome.\n", 2);
}
} else {
$stats{'not_in_alt'}++;
$support->log_verbose("Clone $R_clone_name not in alternative db.\n", 2);
}
}
push @diff_total, @diff;
map { $stats_total{$_} += $stats{$_} }
qw(num_clones does_not_project not_in_alt);
# print results for this chromosome
$support->log("\nStats for chromosome $R_chr:\n\n", 2);
$support->log(sprintf($fmt2, "Total number of clones:", $stats{'num_clones'}), 3);
$support->log(sprintf($fmt2, "Clones not in alternative db:", $stats{'not_in_alt'}), 3);
$support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats{'does_not_project'}), 3);
if (@diff) {
$support->log("\nClones on different chromosomes in the 2 assemblies:\n\n", 3);
$support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 4);
$support->log(('-'x44)."\n", 4);
foreach my $d (@diff) {
$support->log(sprintf($fmt1, @{ $d }), 4);
}
} else {
$support->log("\nAll matching clones on same chromosome in the 2 assemblies.\n", 3);
}
}
$support->log_stamepd("Done.\n\n");
# print overall results
$support->log("\nOverall stats:\n");
$support->log(sprintf($fmt2, "Total number of clones:", $stats_total{'num_clones'}), 1);
$support->log(sprintf($fmt2, "Clones not in alternative db:", $stats_total{'not_in_alt'}), 1);
$support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats_total{'does_not_project'}), 1);
if (@diff_total) {
$support->log("\nClones on different chromosomes in the 2 assemblies:\n\n", 1);
$support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 2);
$support->log(('-'x44)."\n", 2);
foreach my $d (@diff_total) {
$support->log(sprintf($fmt1, @{ $d }), 2);
}
} else {
$support->log("\nAll clones on same chromosome in the 2 assemblies.\n", 2);
}
$support->log("\n");
# finish logfile
$support->finish_log;
#!/usr/local/bin/perl
=head1 NAME
mapping_stats.pl - print some statistics about a whole genome alignment between
two assemblies.
=head1 SYNOPSIS
mapping_stats.pl [arguments]
Required arguments:
--dbname, db_name=NAME database name NAME
--host, --dbhost, --db_host=HOST database host HOST
--port, --dbport, --db_port=PORT database port PORT
--user, --dbuser, --db_user=USER database username USER
--pass, --dbpass, --db_pass=PASS database passwort PASS
--assembly=ASSEMBLY assembly version ASSEMBLY
--altdbname=NAME alternative database NAME
--altassembly=ASSEMBLY alternative assembly version ASSEMBLY
Optional arguments:
--althost=HOST alternative databases host HOST
--altport=PORT alternative database port PORT
--altuser=USER alternative database username USER
--altpass=PASS alternative database password PASS
--chromosomes, --chr=LIST only process LIST chromosomes
--conffile, --conf=FILE read parameters from FILE
(default: conf/Conversion.ini)
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--logappend, --log_append append to logfile (default: truncate)
-v, --verbose=0|1 verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
=head1 DESCRIPTION
This script prints some statistics about a whole genome alignment between two
assemblies, like the alignment coverage and length of alignment blocks.
=head1 RELATED FILES
The whole process of creating a whole genome alignment between two assemblies
is done by a series of scripts. Please see
ensembl/misc-scripts/assembly/README
for a high-level description of this process, and POD in the individual scripts
for the details.
=head1 LICENCE
This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use vars qw($SERVERROOT);
BEGIN {
$SERVERROOT = "$Bin/../../..";
unshift(@INC, "$SERVERROOT/ensembl/modules");
unshift(@INC, "$SERVERROOT/bioperl-live");
}
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
$| = 1;
my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options(
'assembly=s',
'altdbname=s',
'altassembly=s',
'althost=s',
'altport=n',
'chromosomes|chr=s@',
);
$support->allowed_params(
$support->get_common_params,
'assembly',
'altdbname',
'altassembly',
'althost',
'altport',
'chromosomes',
);
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
pod2usage(1);
}
$support->comma_to_list('chromosomes');
# ask user to confirm parameters to proceed
$support->confirm_params;
# get log filehandle and print heading and parameters to logfile
$support->init_log;
$support->check_required_params(
'assembly',
'altdbname',
'altassembly'
);
# first set connection parameters for alternative db if not different from
# reference db
map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
# reference database
my $R_dba = $support->get_database('ensembl');
my $R_sa = $R_dba->get_SliceAdaptor;
# database containing the alternative assembly
my $A_dba = $support->get_database('core', 'alt');
my $A_sa = $A_dba->get_SliceAdaptor;
my $fmt1 = "%-40s%12s\n";
my $fmt2 = "%-40s%11.1f%%\n";
my $fmt3 = "%-44s%8.0f (%2.0f%%)\n";
$support->log("Looping over chromosomes...\n\n");
foreach my $chr ($support->sort_chromosomes) {
$support->log_stamped("Chromosome $chr...\n", 1);
# determine non-N sequence length of alternative chromosome
my $A_slice = $A_sa->fetch_by_region('chromosome', $chr);
my $start = 1;
my $A_chr_length = $A_slice->length;
my $n;
my $end;
while ($start < $A_chr_length) {
$end = $start + 10000000;
$end = $A_chr_length if ($end > $A_chr_length);
my $sub_slice = $A_slice->sub_Slice($start, $end);
my $seq = $sub_slice->seq;
$n += $seq =~ tr/N/N/;
$start = $end + 1;
}
my $A_length = $A_chr_length - $n;
# determine total length of mapping to alternative assembly
my $mapping_length;
my %blocks;
# chromosome length order of magnitude
my $oom = length($A_length);
my $R_slice = $R_sa->fetch_by_region('chromosome', $chr);
my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) };
foreach my $seg (@segments) {
my $l = $seg->to_Slice->length;
$mapping_length += $l;
my $c_oom = $oom;
while ($c_oom) {
if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) { $blocks{10**$c_oom}++; }
$c_oom--;
}
$blocks{10}++ if ($l == 1);
}
# print stats
$support->log("\n");
$support->log(sprintf($fmt1, "Reference chromosome length:",
$support->commify($R_slice->length)), 2);
$support->log(sprintf($fmt1, "Alternative chromosome length:",
$support->commify($A_chr_length)), 2);
$support->log(sprintf($fmt1, "Non-N sequence length (alternative):",
$support->commify($A_length)), 2);
$support->log(sprintf($fmt1, "Alternative mapping length:",
$support->commify($mapping_length)), 2);
$support->log(sprintf($fmt2, "Mapping percentage:",
$mapping_length/$A_length*100), 2);
$support->log("\n");
my $segs = scalar(@segments);
$support->log(sprintf($fmt1, "Total alignments:", $segs), 2);
for (my $i = 0; $i < $oom; $i++) {
my $from = 10**$i;
my $to = 10**($i+1);
$support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocks{$to}/$segs*100), 2);
}
$support->log("\n");
}
# finish logfile
$support->finish_log;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment