From 998c9df90646898b5fbcffe9e60b1838359f4cab Mon Sep 17 00:00:00 2001
From: Patrick Meidl <pm2@sanger.ac.uk>
Date: Fri, 15 Jun 2007 12:19:38 +0000
Subject: [PATCH] changes from latest run (mouse NCBIM36 to 37):

  - allow mismatches
  - fixes to cope with changes to ConversionSupport->sort_chromosomes()
  - allow most scripts (with the exception of load_alternative_assembly.pl) to
    use source and target dbs on different hosts
  - various other improvements
---
 .../assembly/AssemblyMapper/BlastzAligner.pm  | 10 ++++++----
 misc-scripts/assembly/README                  |  3 +++
 .../assembly/align_by_clone_identity.pl       | 19 +++++++++++++------
 .../assembly/align_nonident_regions.pl        | 15 +++++++++++----
 .../align_nonident_regions_wrapper.pl         | 10 +++++++++-
 misc-scripts/assembly/check_mapping.pl        | 13 ++++++++-----
 misc-scripts/assembly/compare_assemblies.pl   |  2 +-
 misc-scripts/assembly/fix_overlaps.pl         |  2 +-
 misc-scripts/assembly/mapping_stats.pl        | 10 +++++++---
 9 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm b/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm
index 6525df8f3d..f9015c5a86 100644
--- a/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm
+++ b/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm
@@ -279,7 +279,9 @@ sub find_best_alignment {
 
   Example     : $aligner->parse_blastz_output;
   Description : Reads a blastz alignment result from an axt file and creates
-                a datastructure containing ungapped alignments from it.
+                a datastructure containing ungapped alignments from it. Note
+                that mismatches are allowed, but separate stats will be
+                collected for them.
   Return type : none
   Exceptions  : none
   Caller      : general
@@ -329,15 +331,15 @@ sub parse_blastz_output {
                     $self->stats_incr('R_gap', 1) if ($R_arr[$j] eq '-');
                     $match_flag = 0;
                 } else {
+                    $self->found_match($match_flag, $j, \%coords);
+                    $match_flag = 1;
+
                     # match
                     if ($A_arr[$j] eq $R_arr[$j]) {
-                        $self->found_match($match_flag, $j, \%coords);
                         $self->stats_incr('match', 1);
-                        $match_flag = 1;
                     # mismatch
                     } else {
                         $self->stats_incr('mismatch', 1);
-                        $match_flag = 0;
                     }
                 }
             }
diff --git a/misc-scripts/assembly/README b/misc-scripts/assembly/README
index 6441d3d761..fec6a4a27e 100644
--- a/misc-scripts/assembly/README
+++ b/misc-scripts/assembly/README
@@ -90,6 +90,9 @@ chromosome at a time over lsf:
 
   $ ensembl/misc-scripts/assembly/align_nonident_regions_wrapper.pl
 
+After running this, check all logfiles for errors (also the lsf logs in the lsf/
+subdirectory of your logpath).
+
 
 4. Remove overlaps:
 ===================
diff --git a/misc-scripts/assembly/align_by_clone_identity.pl b/misc-scripts/assembly/align_by_clone_identity.pl
index 9ae7183a31..7be0ee22ca 100755
--- a/misc-scripts/assembly/align_by_clone_identity.pl
+++ b/misc-scripts/assembly/align_by_clone_identity.pl
@@ -120,6 +120,10 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
 $support->parse_common_options(@_);
 $support->parse_extra_options(
     'assembly=s',
+    'althost=s',
+    'altport=i',
+    'altuser=s',
+    'altpass=s',
     'altdbname=s',
     'altassembly=s',
     'chromosomes|chr=s@',
@@ -128,6 +132,10 @@ $support->parse_extra_options(
 $support->allowed_params(
     $support->get_common_params,
     'assembly',
+    'althost',
+    'altport',
+    'altuser',
+    'altpass',
     'altdbname',
     'altassembly',
     'chromosomes',
@@ -167,10 +175,9 @@ if ($support->param('verbose') and $support->user_proceed($txt)) {
 #
 my ($dba, $dbh, $sql, $sth);
 
-# first set connection parameters for alternative db
-# both databases have to be on the same host, so we don't need to configure
-# them separately
-map { $support->param("alt$_", $support->param($_)) } qw(host port user pass);
+# first set connection parameters for alternative db if not different from
+# reference db
+map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
 
 # reference database
 my $R_dba = $support->get_database('ensembl');
@@ -206,8 +213,8 @@ unless ($support->param('dry_run')) {
 #####
 # get reference and alternative chromosomes
 #
-my $R_chrlength = $support->get_chrlength($R_dba, $support->param('assembly'));
-my $A_chrlength = $support->get_chrlength($R_dba, $support->param('altassembly'));
+my $R_chrlength = $support->get_chrlength($R_dba, $support->param('assembly'), 'chromosome');
+my $A_chrlength = $support->get_chrlength($R_dba, $support->param('altassembly'), 'chromosome');
 my $ref_chr_map = $support->get_ensembl_chr_mapping($R_dba, $support->param('assembly'));
 
 #####
diff --git a/misc-scripts/assembly/align_nonident_regions.pl b/misc-scripts/assembly/align_nonident_regions.pl
index c37a4358a7..42dd6a1199 100755
--- a/misc-scripts/assembly/align_nonident_regions.pl
+++ b/misc-scripts/assembly/align_nonident_regions.pl
@@ -119,6 +119,10 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
 $support->parse_common_options(@_);
 $support->parse_extra_options(
     'assembly=s',
+    'althost=s',
+    'altport=i',
+    'altuser=s',
+    'altpass=s',
     'altdbname=s',
     'altassembly=s',
     'bindir=s',
@@ -128,6 +132,10 @@ $support->parse_extra_options(
 $support->allowed_params(
     $support->get_common_params,
     'assembly',
+    'althost',
+    'altport',
+    'altuser',
+    'altpass',
     'altdbname',
     'altassembly',
     'bindir',
@@ -157,10 +165,9 @@ $support->check_required_params(
 #
 my ($dba, $dbh, $sql, $sth);
 
-# first set connection parameters for alternative db
-# both databases have to be on the same host, so we don't need to configure
-# them separately
-map { $support->param("alt$_", $support->param($_)) } qw(host port user pass);
+# first set connection parameters for alternative db if not different from
+# reference db
+map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
 
 # reference database
 my $R_dba = $support->get_database('ensembl');
diff --git a/misc-scripts/assembly/align_nonident_regions_wrapper.pl b/misc-scripts/assembly/align_nonident_regions_wrapper.pl
index af629ab316..6e6c8016e3 100755
--- a/misc-scripts/assembly/align_nonident_regions_wrapper.pl
+++ b/misc-scripts/assembly/align_nonident_regions_wrapper.pl
@@ -97,6 +97,10 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
 $support->parse_common_options(@_);
 $support->parse_extra_options(
     'assembly=s',
+    'althost=s',
+    'altport=i',
+    'altuser=s',
+    'altpass=s',
     'altdbname=s',
     'altassembly=s',
     'bindir=s',
@@ -106,6 +110,10 @@ $support->parse_extra_options(
 $support->allowed_params(
     $support->get_common_params,
     'assembly',
+    'althost',
+    'altport',
+    'altuser',
+    'altpass',
     'altdbname',
     'altassembly',
     'bindir',
@@ -140,7 +148,7 @@ my $R_dba = $support->get_database('ensembl');
 # loop over chromosomes
 $support->log_stamped("Looping over chromosomes...\n");
 
-foreach my $chr ($support->sort_chromosomes) {
+foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) {
     $support->log_stamped("Chromosome $chr...\n", 1);
 
     # run align_nonident_regions.pl via lsf
diff --git a/misc-scripts/assembly/check_mapping.pl b/misc-scripts/assembly/check_mapping.pl
index de0e7aa915..45788cb9f5 100755
--- a/misc-scripts/assembly/check_mapping.pl
+++ b/misc-scripts/assembly/check_mapping.pl
@@ -148,7 +148,7 @@ my $A_sa = $A_dba->get_SliceAdaptor;
 
 $support->log("Looping over chromosomes...\n\n");
 
-foreach my $chr ($support->sort_chromosomes) {
+foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) {
   $support->log_stamped("Chromosome $chr...\n", 1);
 
   my $R_slice = $R_sa->fetch_by_region('chromosome', $chr);
@@ -158,6 +158,7 @@ foreach my $chr ($support->sort_chromosomes) {
   my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) };
   
   my $i;
+  my $k;
 
   foreach my $seg (@segments) {
     # reference sequence
@@ -180,7 +181,7 @@ foreach my $chr ($support->sort_chromosomes) {
 
     } else {
       # not identical -> something is wrong
-      $support->log_warning("Sequence mismatch at ".$R_sub_slice->name."\n", 2);
+      $support->log("Sequence mismatch at ".$R_sub_slice->name."\n", 2);
 
       my $R_sub_seq;
       my $A_sub_seq;
@@ -194,16 +195,18 @@ foreach my $chr ($support->sort_chromosomes) {
       }
       
       $support->log("Ref: $R_sub_seq\n", 3);
-      $support->log("Alt: $A_sub_seq\n", 3);
+      $support->log("Alt: $A_sub_seq\n\n", 3);
 
       $i++;
     }
+
+    $k++;
   }
 
   if ($i) {
-    $support->log("Total: $i problematic alignments.\n", 2);
+    $support->log("Total: $i (of $k) alignments contain sequence mismatches.\n", 2);
   } else {
-    $support->log("All alignments ok.\n", 2);
+    $support->log("All $k alignments ok.\n", 2);
   }
 
   $support->log_stamped("Done.\n\n", 1);
diff --git a/misc-scripts/assembly/compare_assemblies.pl b/misc-scripts/assembly/compare_assemblies.pl
index b240d13125..ee29d9e9d5 100755
--- a/misc-scripts/assembly/compare_assemblies.pl
+++ b/misc-scripts/assembly/compare_assemblies.pl
@@ -151,7 +151,7 @@ my %stats_total;
 
 $support->log_stamped("Looping over chromosomes...\n");
 
-foreach my $R_chr ($support->sort_chromosomes) {
+foreach my $R_chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) {
     $support->log_stamped("\nChromosome $R_chr...\n", 1);
 
     # fetch chromosome slice and project to clones
diff --git a/misc-scripts/assembly/fix_overlaps.pl b/misc-scripts/assembly/fix_overlaps.pl
index 310b91f226..1d29e947d2 100755
--- a/misc-scripts/assembly/fix_overlaps.pl
+++ b/misc-scripts/assembly/fix_overlaps.pl
@@ -142,7 +142,7 @@ my $sth = $dbh->prepare($sql);
 my $fmt1 = "%10s %10s %10s %10s %3s\n";
 my @rows = ();
 
-foreach my $chr ($support->sort_chromosomes) {
+foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) {
   $support->log_stamped("\nChromosome $chr...\n");
 
   $sth->execute($chr);
diff --git a/misc-scripts/assembly/mapping_stats.pl b/misc-scripts/assembly/mapping_stats.pl
index 3cbba69563..29fa040535 100755
--- a/misc-scripts/assembly/mapping_stats.pl
+++ b/misc-scripts/assembly/mapping_stats.pl
@@ -151,7 +151,7 @@ my $fmt3 = "%-44s%8.0f (%2.0f%%)\n";
 
 $support->log("Looping over chromosomes...\n\n");
 
-foreach my $chr ($support->sort_chromosomes) {
+foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) {
   $support->log_stamped("Chromosome $chr...\n", 1);
 
   # determine non-N sequence length of alternative chromosome
@@ -176,6 +176,7 @@ foreach my $chr ($support->sort_chromosomes) {
   # determine total length of mapping to alternative assembly
   my $mapping_length = 0;
   my %blocks;
+  my %blocklength;
 
   # chromosome length order of magnitude
   my $oom = length($A_length);
@@ -197,7 +198,10 @@ foreach my $chr ($support->sort_chromosomes) {
     my $c_oom = $oom;
     
     while ($c_oom) {
-      if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) { $blocks{10**$c_oom}++; }
+      if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) {
+        $blocks{10**$c_oom}++;
+        $blocklength{10**$c_oom} += $l;
+      }
       $c_oom--;
     }
     $blocks{10}++ if ($l == 1);
@@ -227,7 +231,7 @@ foreach my $chr ($support->sort_chromosomes) {
     for (my $i = 0; $i < $oom; $i++) {
       my $from = 10**$i;
       my $to = 10**($i+1);
-      $support->log(sprintf($fmt3, "    ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocks{$to}/$alignments*100), 2);
+      $support->log(sprintf($fmt3, "    ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocklength{$to}/$R_slice->length*100), 2);
     }
   }
   
-- 
GitLab