diff --git a/misc-scripts/assembly/fix_component_gaps.pl b/misc-scripts/assembly/fix_component_gaps.pl deleted file mode 100755 index 2cd468b418d4b6ea40df4536b2159d78d1e9f5de..0000000000000000000000000000000000000000 --- a/misc-scripts/assembly/fix_component_gaps.pl +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/local/ensembl/bin/perl - -=head1 NAME - -fix_overlaps.pl - remove overlapping mappings between two closely related -assemblies - -=head1 SYNOPSIS - -fix_overlaps.pl [arguments] - -Required arguments: - - --dbname, db_name=NAME database name NAME - --host, --dbhost, --db_host=HOST database host HOST - --port, --dbport, --db_port=PORT database port PORT - --user, --dbuser, --db_user=USER database username USER - --pass, --dbpass, --db_pass=PASS database passwort PASS - --assembly=ASSEMBLY assembly version ASSEMBLY - - --altassembly=ASSEMBLY alternative assembly version ASSEMBLY - -Optional arguments: - - --chromosomes, --chr=LIST only process LIST toplevel seq_regions - - --conffile, --conf=FILE read parameters from FILE - (default: conf/Conversion.ini) - - --logfile, --log=FILE log to FILE (default: *STDOUT) - --logpath=PATH write logfile to PATH (default: .) - --logappend, --log_append append to logfile (default: truncate) - - -v, --verbose=0|1 verbose logging (default: false) - -i, --interactive=0|1 run script interactively (default: true) - -n, --dry_run, --dry=0|1 don't write results to database - -h, --help, -? print help (this message) - -=head1 DESCRIPTION - -This script removes overlapping mappings that were generated by the code in -align_nonident_regions.pl. Mappings are trimmed so that no overlaps are present -in the assembly table, because such overlaps may break the AssemblyMapper when -projecting between the two assemblies. - -It also merges adjacent assembly segments which can result from alternating -alignments from clone identity and blastz alignment. - -=head1 RELATED FILES - -The whole process of creating a whole genome alignment between two assemblies -is done by a series of scripts. Please see - - ensembl/misc-scripts/assembly/README - -for a high-level description of this process, and POD in the individual scripts -for the details. - -=head1 LICENCE - -This code is distributed under an Apache style licence: -Please see http://www.ensembl.org/code_licence.html for details - -=head1 AUTHOR - -Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team - -=head1 CONTACT - -Please post comments/questions to the Ensembl development list -<dev@ensembl.org> - -=cut - -use strict; -use warnings; -no warnings 'uninitialized'; - -use FindBin qw($Bin); -use Getopt::Long; -use Pod::Usage; -use Bio::EnsEMBL::Utils::ConversionSupport; - -$| = 1; - -my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../.."); - -# parse options -$support->parse_common_options(@_); -$support->parse_extra_options( - 'assembly=s', - 'altassembly=s', - 'chromosomes|chr=s@', -); -$support->allowed_params( - $support->get_common_params, - 'assembly', - 'altassembly', - 'chromosomes', -); - -if ($support->param('help') or $support->error) { - warn $support->error if $support->error; - pod2usage(1); -} - -$support->comma_to_list('chromosomes'); - -# ask user to confirm parameters to proceed -$support->confirm_params; - -# get log filehandle and print heading and parameters to logfile -$support->init_log; - -$support->check_required_params( - 'assembly', - 'altassembly' -); - -# database connection -my $dba = $support->get_database('ensembl'); -my $dbh = $dba->dbc->db_handle; - -my $assembly = $support->param('assembly'); -my $altassembly = $support->param('altassembly'); - -my $sql = qq( - SELECT a.* - FROM assembly a, seq_region sr1, seq_region sr2, - coord_system cs1, coord_system cs2 - WHERE a.asm_seq_region_id = sr1.seq_region_id - AND a.cmp_seq_region_id = sr2.seq_region_id - AND sr1.coord_system_id = cs1.coord_system_id - AND sr2.coord_system_id = cs2.coord_system_id - AND cs1.version = '$assembly' - AND cs2.version = '$altassembly' - AND sr2.name = ? - ORDER BY a.cmp_start -); - -my $sth = $dbh->prepare($sql); - -my $fmt1 = "%10s %10s %10s %10s %3s\n"; - - -foreach my $chr ($support->param('chromosomes')) { - - - $support->log_stamped("\nToplevel seq_region $chr...\n"); - - $sth->execute($chr); - - my @rows = (); - - # do an initial fetch - my $last = $sth->fetchrow_hashref; - - # skip seq_regions for which we don't have data - unless ($last) { - $support->log("No mappings found. Skipping.\n", 1); - next; - } - - push @rows, $last; - - my $i = 0; - my $j = 0; - my $k = 0; - - while ($last and (my $r = $sth->fetchrow_hashref)) { - # merge adjacent assembly segments (these can result from alternating - # alignments from clone identity and blastz alignment) - if ($last->{'asm_end'} == ($r->{'asm_start'} - 1) and - $last->{'cmp_end'} == ($r->{'cmp_start'} - 1)) { - - $j++; - - # debug warnings - $support->log_verbose('merging - last: '.sprintf($fmt1, - map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1); - $support->log_verbose('this: '.sprintf($fmt1, map { $r->{$_} } - qw(asm_start asm_end cmp_start cmp_end ori)), 1); - - # remove last row - pop(@rows); - - # merge segments and add new row - $last->{'asm_end'} = $r->{'asm_end'}; - $last->{'cmp_end'} = $r->{'cmp_end'}; - push @rows, $last; - - next; - } - - # bridge small gaps (again, these can result from alternating alignments - # from clone identity and blastz alignment). A maximum gap size of 10bp is - # allowed - my $asm_gap = $r->{'asm_start'} - $last->{'asm_end'} - 1; - my $cmp_gap = $r->{'cmp_start'} - $last->{'cmp_end'} - 1; - - if ($asm_gap == $cmp_gap and $asm_gap <= 10 and $asm_gap > 0) { - - $k++; - - # debug warnings - $support->log_verbose('bridging - last: '.sprintf($fmt1, - map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1); - $support->log_verbose('this: '.sprintf($fmt1, map { $r->{$_} } - qw(asm_start asm_end cmp_start cmp_end ori)), 1); - - # remove last row - pop(@rows); - - # merge segments and add new row - $last->{'asm_end'} = $r->{'asm_end'}; - $last->{'cmp_end'} = $r->{'cmp_end'}; - push @rows, $last; - - next; - } - - push @rows, $r; - $last = $r; - } - - $support->log("Merged $j mappings.\n", 1); - $support->log("Bridged $k gaps.\n", 1); - - if ((!$support->param('dry_run')) && ($j + $k > 0) ) { - - # delete all current mappings from the db and insert the corrected ones - my $c = $dbh->do(qq( - DELETE a - FROM assembly a, seq_region sr1, seq_region sr2, - coord_system cs1, coord_system cs2 - WHERE a.asm_seq_region_id = sr1.seq_region_id - AND a.cmp_seq_region_id = sr2.seq_region_id - AND sr1.coord_system_id = cs1.coord_system_id - AND sr2.coord_system_id = cs2.coord_system_id - AND cs1.version = '$assembly' - AND cs2.version = '$altassembly' - AND sr2.name = '$chr' - )); - - $support->log("\nDeleted $c entries from the assembly table.\n"); - - # now insert the fixed entries - $sql = qq(INSERT IGNORE INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?)); - my $sth1 = $dbh->prepare($sql); - - foreach my $r (@rows) { - $sth1->execute(map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori)); - } - - $support->log("Added ".scalar(@rows)." fixed entries to the assembly table.\n"); - - } - - -} - -$sth->finish; - - -# finish logfile -$support->finish_log; -