From 998c9df90646898b5fbcffe9e60b1838359f4cab Mon Sep 17 00:00:00 2001 From: Patrick Meidl <pm2@sanger.ac.uk> Date: Fri, 15 Jun 2007 12:19:38 +0000 Subject: [PATCH] changes from latest run (mouse NCBIM36 to 37): - allow mismatches - fixes to cope with changes to ConversionSupport->sort_chromosomes() - allow most scripts (with the exception of load_alternative_assembly.pl) to use source and target dbs on different hosts - various other improvements --- .../assembly/AssemblyMapper/BlastzAligner.pm | 10 ++++++---- misc-scripts/assembly/README | 3 +++ .../assembly/align_by_clone_identity.pl | 19 +++++++++++++------ .../assembly/align_nonident_regions.pl | 15 +++++++++++---- .../align_nonident_regions_wrapper.pl | 10 +++++++++- misc-scripts/assembly/check_mapping.pl | 13 ++++++++----- misc-scripts/assembly/compare_assemblies.pl | 2 +- misc-scripts/assembly/fix_overlaps.pl | 2 +- misc-scripts/assembly/mapping_stats.pl | 10 +++++++--- 9 files changed, 59 insertions(+), 25 deletions(-) diff --git a/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm b/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm index 6525df8f3d..f9015c5a86 100644 --- a/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm +++ b/misc-scripts/assembly/AssemblyMapper/BlastzAligner.pm @@ -279,7 +279,9 @@ sub find_best_alignment { Example : $aligner->parse_blastz_output; Description : Reads a blastz alignment result from an axt file and creates - a datastructure containing ungapped alignments from it. + a datastructure containing ungapped alignments from it. Note + that mismatches are allowed, but separate stats will be + collected for them. Return type : none Exceptions : none Caller : general @@ -329,15 +331,15 @@ sub parse_blastz_output { $self->stats_incr('R_gap', 1) if ($R_arr[$j] eq '-'); $match_flag = 0; } else { + $self->found_match($match_flag, $j, \%coords); + $match_flag = 1; + # match if ($A_arr[$j] eq $R_arr[$j]) { - $self->found_match($match_flag, $j, \%coords); $self->stats_incr('match', 1); - $match_flag = 1; # mismatch } else { $self->stats_incr('mismatch', 1); - $match_flag = 0; } } } diff --git a/misc-scripts/assembly/README b/misc-scripts/assembly/README index 6441d3d761..fec6a4a27e 100644 --- a/misc-scripts/assembly/README +++ b/misc-scripts/assembly/README @@ -90,6 +90,9 @@ chromosome at a time over lsf: $ ensembl/misc-scripts/assembly/align_nonident_regions_wrapper.pl +After running this, check all logfiles for errors (also the lsf logs in the lsf/ +subdirectory of your logpath). + 4. Remove overlaps: =================== diff --git a/misc-scripts/assembly/align_by_clone_identity.pl b/misc-scripts/assembly/align_by_clone_identity.pl index 9ae7183a31..7be0ee22ca 100755 --- a/misc-scripts/assembly/align_by_clone_identity.pl +++ b/misc-scripts/assembly/align_by_clone_identity.pl @@ -120,6 +120,10 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); $support->parse_common_options(@_); $support->parse_extra_options( 'assembly=s', + 'althost=s', + 'altport=i', + 'altuser=s', + 'altpass=s', 'altdbname=s', 'altassembly=s', 'chromosomes|chr=s@', @@ -128,6 +132,10 @@ $support->parse_extra_options( $support->allowed_params( $support->get_common_params, 'assembly', + 'althost', + 'altport', + 'altuser', + 'altpass', 'altdbname', 'altassembly', 'chromosomes', @@ -167,10 +175,9 @@ if ($support->param('verbose') and $support->user_proceed($txt)) { # my ($dba, $dbh, $sql, $sth); -# first set connection parameters for alternative db -# both databases have to be on the same host, so we don't need to configure -# them separately -map { $support->param("alt$_", $support->param($_)) } qw(host port user pass); +# first set connection parameters for alternative db if not different from +# reference db +map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user); # reference database my $R_dba = $support->get_database('ensembl'); @@ -206,8 +213,8 @@ unless ($support->param('dry_run')) { ##### # get reference and alternative chromosomes # -my $R_chrlength = $support->get_chrlength($R_dba, $support->param('assembly')); -my $A_chrlength = $support->get_chrlength($R_dba, $support->param('altassembly')); +my $R_chrlength = $support->get_chrlength($R_dba, $support->param('assembly'), 'chromosome'); +my $A_chrlength = $support->get_chrlength($R_dba, $support->param('altassembly'), 'chromosome'); my $ref_chr_map = $support->get_ensembl_chr_mapping($R_dba, $support->param('assembly')); ##### diff --git a/misc-scripts/assembly/align_nonident_regions.pl b/misc-scripts/assembly/align_nonident_regions.pl index c37a4358a7..42dd6a1199 100755 --- a/misc-scripts/assembly/align_nonident_regions.pl +++ b/misc-scripts/assembly/align_nonident_regions.pl @@ -119,6 +119,10 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); $support->parse_common_options(@_); $support->parse_extra_options( 'assembly=s', + 'althost=s', + 'altport=i', + 'altuser=s', + 'altpass=s', 'altdbname=s', 'altassembly=s', 'bindir=s', @@ -128,6 +132,10 @@ $support->parse_extra_options( $support->allowed_params( $support->get_common_params, 'assembly', + 'althost', + 'altport', + 'altuser', + 'altpass', 'altdbname', 'altassembly', 'bindir', @@ -157,10 +165,9 @@ $support->check_required_params( # my ($dba, $dbh, $sql, $sth); -# first set connection parameters for alternative db -# both databases have to be on the same host, so we don't need to configure -# them separately -map { $support->param("alt$_", $support->param($_)) } qw(host port user pass); +# first set connection parameters for alternative db if not different from +# reference db +map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user); # reference database my $R_dba = $support->get_database('ensembl'); diff --git a/misc-scripts/assembly/align_nonident_regions_wrapper.pl b/misc-scripts/assembly/align_nonident_regions_wrapper.pl index af629ab316..6e6c8016e3 100755 --- a/misc-scripts/assembly/align_nonident_regions_wrapper.pl +++ b/misc-scripts/assembly/align_nonident_regions_wrapper.pl @@ -97,6 +97,10 @@ my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); $support->parse_common_options(@_); $support->parse_extra_options( 'assembly=s', + 'althost=s', + 'altport=i', + 'altuser=s', + 'altpass=s', 'altdbname=s', 'altassembly=s', 'bindir=s', @@ -106,6 +110,10 @@ $support->parse_extra_options( $support->allowed_params( $support->get_common_params, 'assembly', + 'althost', + 'altport', + 'altuser', + 'altpass', 'altdbname', 'altassembly', 'bindir', @@ -140,7 +148,7 @@ my $R_dba = $support->get_database('ensembl'); # loop over chromosomes $support->log_stamped("Looping over chromosomes...\n"); -foreach my $chr ($support->sort_chromosomes) { +foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) { $support->log_stamped("Chromosome $chr...\n", 1); # run align_nonident_regions.pl via lsf diff --git a/misc-scripts/assembly/check_mapping.pl b/misc-scripts/assembly/check_mapping.pl index de0e7aa915..45788cb9f5 100755 --- a/misc-scripts/assembly/check_mapping.pl +++ b/misc-scripts/assembly/check_mapping.pl @@ -148,7 +148,7 @@ my $A_sa = $A_dba->get_SliceAdaptor; $support->log("Looping over chromosomes...\n\n"); -foreach my $chr ($support->sort_chromosomes) { +foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) { $support->log_stamped("Chromosome $chr...\n", 1); my $R_slice = $R_sa->fetch_by_region('chromosome', $chr); @@ -158,6 +158,7 @@ foreach my $chr ($support->sort_chromosomes) { my @segments = @{ $R_slice->project('chromosome', $support->param('altassembly')) }; my $i; + my $k; foreach my $seg (@segments) { # reference sequence @@ -180,7 +181,7 @@ foreach my $chr ($support->sort_chromosomes) { } else { # not identical -> something is wrong - $support->log_warning("Sequence mismatch at ".$R_sub_slice->name."\n", 2); + $support->log("Sequence mismatch at ".$R_sub_slice->name."\n", 2); my $R_sub_seq; my $A_sub_seq; @@ -194,16 +195,18 @@ foreach my $chr ($support->sort_chromosomes) { } $support->log("Ref: $R_sub_seq\n", 3); - $support->log("Alt: $A_sub_seq\n", 3); + $support->log("Alt: $A_sub_seq\n\n", 3); $i++; } + + $k++; } if ($i) { - $support->log("Total: $i problematic alignments.\n", 2); + $support->log("Total: $i (of $k) alignments contain sequence mismatches.\n", 2); } else { - $support->log("All alignments ok.\n", 2); + $support->log("All $k alignments ok.\n", 2); } $support->log_stamped("Done.\n\n", 1); diff --git a/misc-scripts/assembly/compare_assemblies.pl b/misc-scripts/assembly/compare_assemblies.pl index b240d13125..ee29d9e9d5 100755 --- a/misc-scripts/assembly/compare_assemblies.pl +++ b/misc-scripts/assembly/compare_assemblies.pl @@ -151,7 +151,7 @@ my %stats_total; $support->log_stamped("Looping over chromosomes...\n"); -foreach my $R_chr ($support->sort_chromosomes) { +foreach my $R_chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) { $support->log_stamped("\nChromosome $R_chr...\n", 1); # fetch chromosome slice and project to clones diff --git a/misc-scripts/assembly/fix_overlaps.pl b/misc-scripts/assembly/fix_overlaps.pl index 310b91f226..1d29e947d2 100755 --- a/misc-scripts/assembly/fix_overlaps.pl +++ b/misc-scripts/assembly/fix_overlaps.pl @@ -142,7 +142,7 @@ my $sth = $dbh->prepare($sql); my $fmt1 = "%10s %10s %10s %10s %3s\n"; my @rows = (); -foreach my $chr ($support->sort_chromosomes) { +foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) { $support->log_stamped("\nChromosome $chr...\n"); $sth->execute($chr); diff --git a/misc-scripts/assembly/mapping_stats.pl b/misc-scripts/assembly/mapping_stats.pl index 3cbba69563..29fa040535 100755 --- a/misc-scripts/assembly/mapping_stats.pl +++ b/misc-scripts/assembly/mapping_stats.pl @@ -151,7 +151,7 @@ my $fmt3 = "%-44s%8.0f (%2.0f%%)\n"; $support->log("Looping over chromosomes...\n\n"); -foreach my $chr ($support->sort_chromosomes) { +foreach my $chr ($support->sort_chromosomes($support->get_chrlength(undef, undef, 'chromosome'))) { $support->log_stamped("Chromosome $chr...\n", 1); # determine non-N sequence length of alternative chromosome @@ -176,6 +176,7 @@ foreach my $chr ($support->sort_chromosomes) { # determine total length of mapping to alternative assembly my $mapping_length = 0; my %blocks; + my %blocklength; # chromosome length order of magnitude my $oom = length($A_length); @@ -197,7 +198,10 @@ foreach my $chr ($support->sort_chromosomes) { my $c_oom = $oom; while ($c_oom) { - if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) { $blocks{10**$c_oom}++; } + if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) { + $blocks{10**$c_oom}++; + $blocklength{10**$c_oom} += $l; + } $c_oom--; } $blocks{10}++ if ($l == 1); @@ -227,7 +231,7 @@ foreach my $chr ($support->sort_chromosomes) { for (my $i = 0; $i < $oom; $i++) { my $from = 10**$i; my $to = 10**($i+1); - $support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocks{$to}/$alignments*100), 2); + $support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocklength{$to}/$R_slice->length*100), 2); } } -- GitLab