From 23f207665982c51442581d31995e492a72431dbf Mon Sep 17 00:00:00 2001 From: Monika Komorowska <mk8@sanger.ac.uk> Date: Thu, 1 Mar 2012 15:24:31 +0000 Subject: [PATCH] new script to fix mapping coordinates in asm_start, asm_end columns; don't merge or bridge gaps as this is done ordering by cmp_start in fix_cmp_overlaps.pl --- misc-scripts/assembly/fix_asm_overlaps.pl | 229 ++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100755 misc-scripts/assembly/fix_asm_overlaps.pl diff --git a/misc-scripts/assembly/fix_asm_overlaps.pl b/misc-scripts/assembly/fix_asm_overlaps.pl new file mode 100755 index 0000000000..2d6a2abf3b --- /dev/null +++ b/misc-scripts/assembly/fix_asm_overlaps.pl @@ -0,0 +1,229 @@ +#!/usr/local/ensembl/bin/perl + +=head1 NAME + +fix_asm_overlaps.pl - remove overlapping mappings between assemblies from the assembly coordinates (asm_start -asm_end) + + +=head1 SYNOPSIS + +fix_overlaps.pl [arguments] + +Required arguments: + + --dbname, db_name=NAME database name NAME + --host, --dbhost, --db_host=HOST database host HOST + --port, --dbport, --db_port=PORT database port PORT + --user, --dbuser, --db_user=USER database username USER + --pass, --dbpass, --db_pass=PASS database passwort PASS + --assembly=ASSEMBLY assembly version ASSEMBLY + + --altassembly=ASSEMBLY alternative assembly version ASSEMBLY + --chromosomes, --chr=LIST only process LIST seq_regions + +Optional arguments: + + --conffile, --conf=FILE read parameters from FILE + (default: conf/Conversion.ini) + + --logfile, --log=FILE log to FILE (default: *STDOUT) + --logpath=PATH write logfile to PATH (default: .) + --logappend, --log_append append to logfile (default: truncate) + + -v, --verbose=0|1 verbose logging (default: false) + -i, --interactive=0|1 run script interactively (default: true) + -n, --dry_run, --dry=0|1 don't write results to database + -h, --help, -? print help (this message) + +=head1 DESCRIPTION + +This script removes overlapping mappings from the assembly coordinates. +Mappings are trimmed so they don't break the AssemblyMapper when +projecting between the reference to the non-reference assembly. + +=head1 LICENCE + +This code is distributed under an Apache style licence: +Please see http://www.ensembl.org/code_licence.html for details + +=head1 AUTHOR + +Monika Komorowska <monika@ebi.ac.uk>, Ensembl core API team + +=head1 CONTACT + +Please post comments/questions to the Ensembl development list +<dev@ensembl.org> + +=cut + +use strict; +use warnings; +no warnings 'uninitialized'; + +use FindBin qw($Bin); +use Getopt::Long; +use Pod::Usage; +use Bio::EnsEMBL::Utils::ConversionSupport; + +$| = 1; + +my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../.."); + +# parse options +$support->parse_common_options(@_); +$support->parse_extra_options( + 'assembly=s', + 'altassembly=s', + 'chromosomes|chr=s@', +); +$support->allowed_params( + $support->get_common_params, + 'assembly', + 'altassembly', + 'chromosomes', +); + +if ($support->param('help') or $support->error) { + warn $support->error if $support->error; + pod2usage(1); +} + +$support->comma_to_list('chromosomes'); + +# ask user to confirm parameters to proceed +$support->confirm_params; + +# get log filehandle and print heading and parameters to logfile +$support->init_log; + +$support->check_required_params( + 'assembly', + 'altassembly', + 'chromosomes' +); + +# database connection +my $dba = $support->get_database('ensembl'); +my $dbh = $dba->dbc->db_handle; + +my $assembly = $support->param('assembly'); +my $altassembly = $support->param('altassembly'); + +my $sql = qq( + SELECT a.* + FROM assembly a, seq_region sr1, seq_region sr2, + coord_system cs1, coord_system cs2 + WHERE a.asm_seq_region_id = sr1.seq_region_id + AND a.cmp_seq_region_id = sr2.seq_region_id + AND sr1.coord_system_id = cs1.coord_system_id + AND sr2.coord_system_id = cs2.coord_system_id + AND cs1.version = '$assembly' + AND cs2.version = '$altassembly' + AND sr1.name = ? + ORDER BY a.asm_start +); + +my $sth = $dbh->prepare($sql); + +my $fmt1 = "%10s %10s %10s %10s %3s\n"; + + +foreach my $chr ($support->param('chromosomes')) { + $support->log_stamped("\nToplevel seq_region $chr...\n"); + + $sth->execute($chr); + + my @rows = (); + + # do an initial fetch + my $last = $sth->fetchrow_hashref; + + # skip seq_regions for which we don't have data + unless ($last) { + $support->log("No mappings found. Skipping.\n", 1); + next; + } + + push @rows, $last; + + my $i = 0; + my $j = 0; + my $k = 0; + + while ($last and (my $r = $sth->fetchrow_hashref)) { + + # look for overlaps with last segment + if ($last->{'asm_end'} >= $r->{'asm_start'}) { + + $i++; + + # debug warnings + $support->log_verbose('last: '.sprintf($fmt1, map { $last->{$_} } + qw(asm_start asm_end cmp_start cmp_end ori)), 1); + $support->log_verbose('before: '.sprintf($fmt1, map { $r->{$_} } + qw(asm_start asm_end cmp_start cmp_end ori)), 1); + + # skip if this segment ends before the last one + if ($r->{'asm_end'} <= $last->{'asm_end'}) { + $support->log_verbose("skipped\n\n", 1); + next; + } + + my $overlap = $last->{'asm_end'} - $r->{'asm_start'} + 1; + + $r->{'asm_start'} += $overlap; + + if ($r->{'ori'} == -1) { + $r->{'cmp_end'} -= $overlap; + } else { + $r->{'cmp_start'} += $overlap; + } + + $support->log_verbose('after: '.sprintf($fmt1, map { $r->{$_} } + qw(asm_start asm_end cmp_start cmp_end ori))."\n", 1); + } + + push @rows, $r; + $last = $r; + } + + $support->log("Fixed $i mappings.\n", 1); + + if (!$support->param('dry_run') && ($i > 0)) { + + # delete all current mappings from the db and insert the corrected ones + my $c = $dbh->do(qq( + DELETE a + FROM assembly a, seq_region sr1, seq_region sr2, + coord_system cs1, coord_system cs2 + WHERE a.asm_seq_region_id = sr1.seq_region_id + AND a.cmp_seq_region_id = sr2.seq_region_id + AND sr1.coord_system_id = cs1.coord_system_id + AND sr2.coord_system_id = cs2.coord_system_id + AND cs1.version = '$assembly' + AND cs2.version = '$altassembly' + AND sr1.name = '$chr' + )); + + $support->log("\nDeleted $c entries from the assembly table.\n"); + + # now insert the fixed entries + $sql = qq(INSERT INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?)); + my $sth1 = $dbh->prepare($sql); + + foreach my $r (@rows) { + $sth1->execute(map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori)); + } + + $support->log("Added ".scalar(@rows)." fixed entries to the assembly table.\n"); + + } + +} + +$sth->finish; + +# finish logfile +$support->finish_log; + -- GitLab