From 422be3cf5ccf3ac5b02c7e76182501d92a36beb2 Mon Sep 17 00:00:00 2001
From: Patrick Meidl <pm2@sanger.ac.uk>
Date: Wed, 10 May 2006 12:43:48 +0000
Subject: [PATCH] scripts to assess quality of assembly mapping

---
 misc-scripts/assembly/check_mapping.pl      | 209 +++++++++++++++++
 misc-scripts/assembly/compare_assemblies.pl | 234 ++++++++++++++++++++
 misc-scripts/assembly/mapping_stats.pl      | 231 +++++++++++++++++++
 3 files changed, 674 insertions(+)
 create mode 100755 misc-scripts/assembly/check_mapping.pl
 create mode 100755 misc-scripts/assembly/compare_assemblies.pl
 create mode 100755 misc-scripts/assembly/mapping_stats.pl

diff --git a/misc-scripts/assembly/check_mapping.pl b/misc-scripts/assembly/check_mapping.pl
new file mode 100755
index 0000000000..8b94390243
--- /dev/null
+++ b/misc-scripts/assembly/check_mapping.pl
@@ -0,0 +1,209 @@
+#!/usr/local/bin/perl
+
+=head1 NAME
+
+check_mapping.pl - script to check whole genome alignment between two
+assemblies.
+
+=head1 SYNOPSIS
+
+check_mapping.pl [arguments]
+
+Required arguments:
+
+  --dbname, db_name=NAME              database name NAME
+  --host, --dbhost, --db_host=HOST    database host HOST
+  --port, --dbport, --db_port=PORT    database port PORT
+  --user, --dbuser, --db_user=USER    database username USER
+  --pass, --dbpass, --db_pass=PASS    database passwort PASS
+  --assembly=ASSEMBLY                 assembly version ASSEMBLY
+
+  --altdbname=NAME                    alternative database NAME
+  --altassembly=ASSEMBLY              alternative assembly version ASSEMBLY
+
+Optional arguments:
+
+  --althost=HOST                      alternative databases host HOST
+  --altport=PORT                      alternative database port PORT
+  --altuser=USER                      alternative database username USER
+  --altpass=PASS                      alternative database password PASS
+
+  --chromosomes, --chr=LIST           only process LIST chromosomes
+
+  --conffile, --conf=FILE             read parameters from FILE
+                                      (default: conf/Conversion.ini)
+
+  --logfile, --log=FILE               log to FILE (default: *STDOUT)
+  --logpath=PATH                      write logfile to PATH (default: .)
+  --logappend, --log_append           append to logfile (default: truncate)
+
+  -v, --verbose=0|1                   verbose logging (default: false)
+  -i, --interactive=0|1               run script interactively (default: true)
+  -n, --dry_run, --dry=0|1            don't write results to database
+  -h, --help, -?                      print help (this message)
+
+=head1 DESCRIPTION
+
+This script checks if the whole genome alignment between two assemblies is
+correct. It does so by comparing the sequence in the reference database with
+the sequence of the projected fragments in the alternative database.
+
+=head1 RELATED FILES
+
+The whole process of creating a whole genome alignment between two assemblies
+is done by a series of scripts. Please see
+
+  ensembl/misc-scripts/assembly/README
+
+for a high-level description of this process, and POD in the individual scripts
+for the details.
+
+=head1 LICENCE
+
+This code is distributed under an Apache style licence:
+Please see http://www.ensembl.org/code_licence.html for details
+
+=head1 AUTHOR
+
+Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
+
+=head1 CONTACT
+
+Please post comments/questions to the Ensembl development list
+<ensembl-dev@ebi.ac.uk>
+
+=cut
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use FindBin qw($Bin);
+use vars qw($SERVERROOT);
+
+BEGIN {
+    $SERVERROOT = "$Bin/../../..";
+    unshift(@INC, "$SERVERROOT/ensembl/modules");
+    unshift(@INC, "$SERVERROOT/bioperl-live");
+}
+
+use Getopt::Long;
+use Pod::Usage;
+use Bio::EnsEMBL::Utils::ConversionSupport;
+
+$| = 1;
+
+my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
+
+# parse options
+$support->parse_common_options(@_);
+$support->parse_extra_options(
+    'assembly=s',
+    'altdbname=s',
+    'altassembly=s',
+    'althost=s',
+    'altport=n',
+    'chromosomes|chr=s@',
+);
+$support->allowed_params(
+    $support->get_common_params,
+    'assembly',
+    'altdbname',
+    'altassembly',
+    'althost',
+    'altport',
+    'chromosomes',
+);
+
+if ($support->param('help') or $support->error) {
+    warn $support->error if $support->error;
+    pod2usage(1);
+}
+
+$support->comma_to_list('chromosomes');
+
+# ask user to confirm parameters to proceed
+$support->confirm_params;
+
+# get log filehandle and print heading and parameters to logfile
+$support->init_log;
+
+$support->check_required_params(
+    'assembly',
+    'altdbname',
+    'altassembly'
+);
+
+# first set connection parameters for alternative db if not different from
+# reference db
+map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
+
+# reference database
+my $R_dba = $support->get_database('ensembl');
+my $R_sa = $R_dba->get_SliceAdaptor;
+
+# database containing the alternative assembly
+my $A_dba = $support->get_database('core', 'alt');
+my $A_sa = $A_dba->get_SliceAdaptor;
+
+$support->log("Looping over chromosomes...\n\n");
+
+foreach my $chr ($support->sort_chromosomes) {
+  $support->log_stamped("Chromosome $chr...\n", 1);
+
+  my $R_slice = $R_sa->fetch_by_region('chromosome', $chr);
+  my $A_slice = $A_sa->fetch_by_region('chromosome', $chr);
+
+  # compare reference and alternative sequence
+  my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) };
+
+  foreach my $seg (@segments) {
+    # reference sequence
+    my $R_sub_slice = $R_slice->sub_Slice($seg->from_start, $seg->from_end);
+    my $R_seq = $R_sub_slice->seq;
+    
+    # alternative sequence
+    my $A_proj_slice = $seg->to_Slice;
+    my $A_sub_slice = $A_slice->sub_Slice($A_proj_slice->start, $A_proj_slice->end, $A_proj_slice->strand);
+    my $A_seq = $A_sub_slice->seq;
+
+    # compare
+    my $i;
+
+    if ($R_seq eq $A_seq) {
+      # sequences are identical -> ok
+      $support->log_verbose("Sequence match at ".$R_sub_slice->name."\n", 2);
+
+    } else {
+      # not identical -> something is wrong
+      $support->log_warning("Sequence mismatch at ".$R_sub_slice->name."\n", 2);
+
+      my $R_sub_seq;
+      my $A_sub_seq;
+      
+      if ($R_sub_slice->length > 20) {
+        $R_sub_seq = substr($R_seq, 0, 10)."...".substr($R_seq, -10, 10);
+        $A_sub_seq = substr($A_seq, 0, 10)."...".substr($A_seq, -10, 10);
+      } else {
+        $R_sub_seq = substr($R_seq, 0, 20);
+        $A_sub_seq = substr($A_seq, 0, 20);
+      }
+      
+      $support->log("Ref: $R_sub_seq\n", 3);
+      $support->log("Alt: $A_sub_seq\n", 3);
+
+      $i++;
+    }
+  }
+
+  if ($i) {
+    $support->log("Total: $i problematic alignments.\n", 2);
+  } else {
+    $support->log("All alignments ok.\n", 2);
+
+  $support->log_stamped("Done.\n\n", 1);
+}
+
+# finish logfile
+$support->finish_log;
+
diff --git a/misc-scripts/assembly/compare_assemblies.pl b/misc-scripts/assembly/compare_assemblies.pl
new file mode 100755
index 0000000000..1b97ba58df
--- /dev/null
+++ b/misc-scripts/assembly/compare_assemblies.pl
@@ -0,0 +1,234 @@
+#!/usr/local/bin/perl
+
+=head1 NAME
+
+compare_assemblies.pl - compare two assemblies
+
+=head1 SYNOPSIS
+
+compare_assemblies.pl [arguments]
+
+Required arguments:
+
+  --dbname, db_name=NAME              database name NAME
+  --host, --dbhost, --db_host=HOST    database host HOST
+  --port, --dbport, --db_port=PORT    database port PORT
+  --user, --dbuser, --db_user=USER    database username USER
+  --pass, --dbpass, --db_pass=PASS    database passwort PASS
+  --assembly=ASSEMBLY                 assembly version ASSEMBLY
+
+  --altdbname=NAME                    alternative database NAME
+  --altassembly=ASSEMBLY              alternative assembly version ASSEMBLY
+
+Optional arguments:
+
+  --althost=HOST                      alternative databases host HOST
+  --altport=PORT                      alternative database port PORT
+  --altuser=USER                      alternative database username USER
+  --altpass=PASS                      alternative database password PASS
+
+  --chromosomes, --chr=LIST           only process LIST chromosomes
+
+  --conffile, --conf=FILE             read parameters from FILE
+                                      (default: conf/Conversion.ini)
+
+  --logfile, --log=FILE               log to FILE (default: *STDOUT)
+  --logpath=PATH                      write logfile to PATH (default: .)
+  --logappend, --log_append           append to logfile (default: truncate)
+
+  -v, --verbose=0|1                   verbose logging (default: false)
+  -i, --interactive=0|1               run script interactively (default: true)
+  -n, --dry_run, --dry=0|1            don't write results to database
+  -h, --help, -?                      print help (this message)
+
+=head1 DESCRIPTION
+
+This script compares two assemblies. At the moment all it does is to list
+clones that are on different chromosomes in the two assemblies.
+
+=head1 RELATED FILES
+
+The whole process of creating a whole genome alignment between two assemblies
+is done by a series of scripts. Please see
+
+  ensembl/misc-scripts/assembly/README
+
+for a high-level description of this process, and POD in the individual scripts
+for the details.
+
+=head1 LICENCE
+
+This code is distributed under an Apache style licence:
+Please see http://www.ensembl.org/code_licence.html for details
+
+=head1 AUTHOR
+
+Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
+
+=head1 CONTACT
+
+Please post comments/questions to the Ensembl development list
+<ensembl-dev@ebi.ac.uk>
+
+=cut
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use FindBin qw($Bin);
+use vars qw($SERVERROOT);
+
+BEGIN {
+    $SERVERROOT = "$Bin/../../..";
+    unshift(@INC, "$SERVERROOT/ensembl/modules");
+    unshift(@INC, "$SERVERROOT/bioperl-live");
+}
+
+use Getopt::Long;
+use Pod::Usage;
+use Bio::EnsEMBL::Utils::ConversionSupport;
+
+$| = 1;
+
+my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
+
+# parse options
+$support->parse_common_options(@_);
+$support->parse_extra_options(
+    'assembly=s',
+    'altdbname=s',
+    'altassembly=s',
+    'althost=s',
+    'altport=n',
+    'chromosomes|chr=s@',
+);
+$support->allowed_params(
+    $support->get_common_params,
+    'assembly',
+    'altdbname',
+    'altassembly',
+    'althost',
+    'altport',
+    'chromosomes',
+);
+
+if ($support->param('help') or $support->error) {
+    warn $support->error if $support->error;
+    pod2usage(1);
+}
+
+$support->comma_to_list('chromosomes');
+
+# ask user to confirm parameters to proceed
+$support->confirm_params;
+
+# get log filehandle and print heading and parameters to logfile
+$support->init_log;
+
+$support->check_required_params(
+    'assembly',
+    'altdbname',
+    'altassembly'
+);
+
+# first set connection parameters for alternative db if not different from
+# reference db
+map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
+
+# reference database
+my $R_dba = $support->get_database('ensembl');
+my $R_sa = $R_dba->get_SliceAdaptor;
+my $R_sa = $R_dba->get_SliceAdaptor;
+
+# database containing the alternative assembly
+my $A_dba = $support->get_database('core', 'alt');
+my $A_sa = $A_dba->get_SliceAdaptor;
+my $A_sa = $A_dba->get_SliceAdaptor;
+
+my $fmt1 = "%-20s%-12s%-12s\n";
+my $fmt2 = "%-35s%10.0f\n";
+my @diff_total;
+my %stats_total;
+
+$support->log_stamped("Looping over chromosomes...\n");
+
+foreach my $R_chr ($support->sort_chromosomes) {
+    $support->log_stamped("\nChromosome $R_chr...\n", 1);
+
+    # fetch chromosome slice and project to clones
+    my $R_slice = $R_sa->fetch_by_region('chromosome', $R_chr);
+    my @R_clones = @{ $R_slice->project('clone') };
+
+    # loop over reference clones
+    my @diff;
+    my %stats;
+    foreach my $R_seg (@R_clones) {
+        $stats{'num_clones'}++;
+        my $R_clone = $R_seg->to_Slice;
+        my $R_clone_name = $R_clone->seq_region_name;
+
+        # fetch clone from alternative db and project to chromosome
+        my $A_clone = $A_sa->fetch_by_region('clone', $R_clone_name);
+        if ($A_clone) {
+            my ($A_seg) = @{ $A_clone->project('chromosome') };
+            if ($A_seg) {
+                my $A_slice = $A_seg->to_Slice;
+
+                # compare chromosome names
+                my $A_chr = $A_slice->seq_region_name;
+                unless ($R_chr eq $A_chr) {
+                    push @diff, [$R_clone_name, $R_chr, $A_chr];
+                }
+            } else {
+                $stats{'does_not_project'}++;
+                $support->log_verbose("Clone $R_clone_name doesn't project to chromosome.\n", 2);
+            }
+        } else {
+            $stats{'not_in_alt'}++;
+            $support->log_verbose("Clone $R_clone_name not in alternative db.\n", 2);
+        }
+    }
+    push @diff_total, @diff;
+    map { $stats_total{$_} += $stats{$_} }
+        qw(num_clones does_not_project not_in_alt);
+
+    # print results for this chromosome
+    $support->log("\nStats for chromosome $R_chr:\n\n", 2);
+    $support->log(sprintf($fmt2, "Total number of clones:", $stats{'num_clones'}), 3);
+    $support->log(sprintf($fmt2, "Clones not in alternative db:", $stats{'not_in_alt'}), 3);
+    $support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats{'does_not_project'}), 3);
+    if (@diff) {
+        $support->log("\nClones on different chromosomes in the 2 assemblies:\n\n", 3);
+        $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 4);
+        $support->log(('-'x44)."\n", 4);
+        foreach my $d (@diff) {
+            $support->log(sprintf($fmt1, @{ $d }), 4);
+        }
+    } else {
+        $support->log("\nAll matching clones on same chromosome in the 2 assemblies.\n", 3);
+    }
+}
+
+$support->log_stamepd("Done.\n\n");
+
+# print overall results
+$support->log("\nOverall stats:\n");
+$support->log(sprintf($fmt2, "Total number of clones:", $stats_total{'num_clones'}), 1);
+$support->log(sprintf($fmt2, "Clones not in alternative db:", $stats_total{'not_in_alt'}), 1);
+$support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats_total{'does_not_project'}), 1);
+if (@diff_total) {
+    $support->log("\nClones on different chromosomes in the 2 assemblies:\n\n", 1);
+    $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 2);
+    $support->log(('-'x44)."\n", 2);
+    foreach my $d (@diff_total) {
+        $support->log(sprintf($fmt1, @{ $d }), 2);
+    }
+} else {
+    $support->log("\nAll clones on same chromosome in the 2 assemblies.\n", 2);
+}
+$support->log("\n");
+
+# finish logfile
+$support->finish_log;
+
diff --git a/misc-scripts/assembly/mapping_stats.pl b/misc-scripts/assembly/mapping_stats.pl
new file mode 100755
index 0000000000..358c9a3662
--- /dev/null
+++ b/misc-scripts/assembly/mapping_stats.pl
@@ -0,0 +1,231 @@
+#!/usr/local/bin/perl
+
+=head1 NAME
+
+mapping_stats.pl - print some statistics about a whole genome alignment between
+two assemblies.
+
+=head1 SYNOPSIS
+
+mapping_stats.pl [arguments]
+
+Required arguments:
+
+  --dbname, db_name=NAME              database name NAME
+  --host, --dbhost, --db_host=HOST    database host HOST
+  --port, --dbport, --db_port=PORT    database port PORT
+  --user, --dbuser, --db_user=USER    database username USER
+  --pass, --dbpass, --db_pass=PASS    database passwort PASS
+  --assembly=ASSEMBLY                 assembly version ASSEMBLY
+
+  --altdbname=NAME                    alternative database NAME
+  --altassembly=ASSEMBLY              alternative assembly version ASSEMBLY
+
+Optional arguments:
+
+  --althost=HOST                      alternative databases host HOST
+  --altport=PORT                      alternative database port PORT
+  --altuser=USER                      alternative database username USER
+  --altpass=PASS                      alternative database password PASS
+
+  --chromosomes, --chr=LIST           only process LIST chromosomes
+
+  --conffile, --conf=FILE             read parameters from FILE
+                                      (default: conf/Conversion.ini)
+
+  --logfile, --log=FILE               log to FILE (default: *STDOUT)
+  --logpath=PATH                      write logfile to PATH (default: .)
+  --logappend, --log_append           append to logfile (default: truncate)
+
+  -v, --verbose=0|1                   verbose logging (default: false)
+  -i, --interactive=0|1               run script interactively (default: true)
+  -n, --dry_run, --dry=0|1            don't write results to database
+  -h, --help, -?                      print help (this message)
+
+=head1 DESCRIPTION
+
+This script prints some statistics about a whole genome alignment between two
+assemblies, like the alignment coverage and length of alignment blocks.
+
+=head1 RELATED FILES
+
+The whole process of creating a whole genome alignment between two assemblies
+is done by a series of scripts. Please see
+
+  ensembl/misc-scripts/assembly/README
+
+for a high-level description of this process, and POD in the individual scripts
+for the details.
+
+=head1 LICENCE
+
+This code is distributed under an Apache style licence:
+Please see http://www.ensembl.org/code_licence.html for details
+
+=head1 AUTHOR
+
+Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
+
+=head1 CONTACT
+
+Please post comments/questions to the Ensembl development list
+<ensembl-dev@ebi.ac.uk>
+
+=cut
+
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use FindBin qw($Bin);
+use vars qw($SERVERROOT);
+
+BEGIN {
+    $SERVERROOT = "$Bin/../../..";
+    unshift(@INC, "$SERVERROOT/ensembl/modules");
+    unshift(@INC, "$SERVERROOT/bioperl-live");
+}
+
+use Getopt::Long;
+use Pod::Usage;
+use Bio::EnsEMBL::Utils::ConversionSupport;
+
+$| = 1;
+
+my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
+
+# parse options
+$support->parse_common_options(@_);
+$support->parse_extra_options(
+    'assembly=s',
+    'altdbname=s',
+    'altassembly=s',
+    'althost=s',
+    'altport=n',
+    'chromosomes|chr=s@',
+);
+$support->allowed_params(
+    $support->get_common_params,
+    'assembly',
+    'altdbname',
+    'altassembly',
+    'althost',
+    'altport',
+    'chromosomes',
+);
+
+if ($support->param('help') or $support->error) {
+    warn $support->error if $support->error;
+    pod2usage(1);
+}
+
+$support->comma_to_list('chromosomes');
+
+# ask user to confirm parameters to proceed
+$support->confirm_params;
+
+# get log filehandle and print heading and parameters to logfile
+$support->init_log;
+
+$support->check_required_params(
+    'assembly',
+    'altdbname',
+    'altassembly'
+);
+
+# first set connection parameters for alternative db if not different from
+# reference db
+map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
+
+# reference database
+my $R_dba = $support->get_database('ensembl');
+my $R_sa = $R_dba->get_SliceAdaptor;
+
+# database containing the alternative assembly
+my $A_dba = $support->get_database('core', 'alt');
+my $A_sa = $A_dba->get_SliceAdaptor;
+
+my $fmt1 = "%-40s%12s\n";
+my $fmt2 = "%-40s%11.1f%%\n";
+my $fmt3 = "%-44s%8.0f (%2.0f%%)\n";
+
+$support->log("Looping over chromosomes...\n\n");
+
+foreach my $chr ($support->sort_chromosomes) {
+  $support->log_stamped("Chromosome $chr...\n", 1);
+
+  # determine non-N sequence length of alternative chromosome
+  my $A_slice = $A_sa->fetch_by_region('chromosome', $chr);
+
+  my $start = 1;
+  my $A_chr_length = $A_slice->length;
+  my $n;
+  my $end;
+  
+  while ($start < $A_chr_length) {
+    $end = $start + 10000000;
+    $end = $A_chr_length if ($end > $A_chr_length);
+    my $sub_slice = $A_slice->sub_Slice($start, $end);
+    my $seq = $sub_slice->seq;
+    $n += $seq =~ tr/N/N/;
+    $start = $end + 1;
+  }
+
+  my $A_length = $A_chr_length - $n;
+
+  # determine total length of mapping to alternative assembly
+  my $mapping_length;
+  my %blocks;
+
+  # chromosome length order of magnitude
+  my $oom = length($A_length);
+  
+  my $R_slice = $R_sa->fetch_by_region('chromosome', $chr);
+  my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) };
+
+  foreach my $seg (@segments) {
+    my $l = $seg->to_Slice->length;
+    $mapping_length += $l;
+    
+    my $c_oom = $oom;
+    
+    while ($c_oom) {
+      if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) { $blocks{10**$c_oom}++; }
+      $c_oom--;
+    }
+    $blocks{10}++ if ($l == 1);
+  }  
+  
+  # print stats
+  $support->log("\n");
+  
+  $support->log(sprintf($fmt1, "Reference chromosome length:",
+      $support->commify($R_slice->length)), 2);
+  $support->log(sprintf($fmt1, "Alternative chromosome length:",
+      $support->commify($A_chr_length)), 2);
+  $support->log(sprintf($fmt1, "Non-N sequence length (alternative):",
+      $support->commify($A_length)), 2);
+  $support->log(sprintf($fmt1, "Alternative mapping length:",
+      $support->commify($mapping_length)), 2);
+  $support->log(sprintf($fmt2, "Mapping percentage:",
+      $mapping_length/$A_length*100), 2);
+  
+  $support->log("\n");
+  
+  my $segs = scalar(@segments);
+  
+  $support->log(sprintf($fmt1, "Total alignments:", $segs), 2);
+  
+  for (my $i = 0; $i < $oom; $i++) {
+    my $from = 10**$i;
+    my $to = 10**($i+1);
+    $support->log(sprintf($fmt3, "    ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocks{$to}/$segs*100), 2);
+  }
+  
+  $support->log("\n");
+  
+}
+
+# finish logfile
+$support->finish_log;
+
-- 
GitLab