Skip to content
Snippets Groups Projects
Commit 23f20766 authored by Monika Komorowska's avatar Monika Komorowska
Browse files

new script to fix mapping coordinates in asm_start, asm_end columns; don't...

new script to fix mapping coordinates in asm_start, asm_end columns; don't merge or bridge gaps as this is done ordering by cmp_start in fix_cmp_overlaps.pl
parent ecfd3d73
No related branches found
No related tags found
No related merge requests found
#!/usr/local/ensembl/bin/perl
=head1 NAME
fix_asm_overlaps.pl - remove overlapping mappings between assemblies from the assembly coordinates (asm_start -asm_end)
=head1 SYNOPSIS
fix_overlaps.pl [arguments]
Required arguments:
--dbname, db_name=NAME database name NAME
--host, --dbhost, --db_host=HOST database host HOST
--port, --dbport, --db_port=PORT database port PORT
--user, --dbuser, --db_user=USER database username USER
--pass, --dbpass, --db_pass=PASS database passwort PASS
--assembly=ASSEMBLY assembly version ASSEMBLY
--altassembly=ASSEMBLY alternative assembly version ASSEMBLY
--chromosomes, --chr=LIST only process LIST seq_regions
Optional arguments:
--conffile, --conf=FILE read parameters from FILE
(default: conf/Conversion.ini)
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--logappend, --log_append append to logfile (default: truncate)
-v, --verbose=0|1 verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
=head1 DESCRIPTION
This script removes overlapping mappings from the assembly coordinates.
Mappings are trimmed so they don't break the AssemblyMapper when
projecting between the reference to the non-reference assembly.
=head1 LICENCE
This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details
=head1 AUTHOR
Monika Komorowska <monika@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<dev@ensembl.org>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
$| = 1;
my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options(
'assembly=s',
'altassembly=s',
'chromosomes|chr=s@',
);
$support->allowed_params(
$support->get_common_params,
'assembly',
'altassembly',
'chromosomes',
);
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
pod2usage(1);
}
$support->comma_to_list('chromosomes');
# ask user to confirm parameters to proceed
$support->confirm_params;
# get log filehandle and print heading and parameters to logfile
$support->init_log;
$support->check_required_params(
'assembly',
'altassembly',
'chromosomes'
);
# database connection
my $dba = $support->get_database('ensembl');
my $dbh = $dba->dbc->db_handle;
my $assembly = $support->param('assembly');
my $altassembly = $support->param('altassembly');
my $sql = qq(
SELECT a.*
FROM assembly a, seq_region sr1, seq_region sr2,
coord_system cs1, coord_system cs2
WHERE a.asm_seq_region_id = sr1.seq_region_id
AND a.cmp_seq_region_id = sr2.seq_region_id
AND sr1.coord_system_id = cs1.coord_system_id
AND sr2.coord_system_id = cs2.coord_system_id
AND cs1.version = '$assembly'
AND cs2.version = '$altassembly'
AND sr1.name = ?
ORDER BY a.asm_start
);
my $sth = $dbh->prepare($sql);
my $fmt1 = "%10s %10s %10s %10s %3s\n";
foreach my $chr ($support->param('chromosomes')) {
$support->log_stamped("\nToplevel seq_region $chr...\n");
$sth->execute($chr);
my @rows = ();
# do an initial fetch
my $last = $sth->fetchrow_hashref;
# skip seq_regions for which we don't have data
unless ($last) {
$support->log("No mappings found. Skipping.\n", 1);
next;
}
push @rows, $last;
my $i = 0;
my $j = 0;
my $k = 0;
while ($last and (my $r = $sth->fetchrow_hashref)) {
# look for overlaps with last segment
if ($last->{'asm_end'} >= $r->{'asm_start'}) {
$i++;
# debug warnings
$support->log_verbose('last: '.sprintf($fmt1, map { $last->{$_} }
qw(asm_start asm_end cmp_start cmp_end ori)), 1);
$support->log_verbose('before: '.sprintf($fmt1, map { $r->{$_} }
qw(asm_start asm_end cmp_start cmp_end ori)), 1);
# skip if this segment ends before the last one
if ($r->{'asm_end'} <= $last->{'asm_end'}) {
$support->log_verbose("skipped\n\n", 1);
next;
}
my $overlap = $last->{'asm_end'} - $r->{'asm_start'} + 1;
$r->{'asm_start'} += $overlap;
if ($r->{'ori'} == -1) {
$r->{'cmp_end'} -= $overlap;
} else {
$r->{'cmp_start'} += $overlap;
}
$support->log_verbose('after: '.sprintf($fmt1, map { $r->{$_} }
qw(asm_start asm_end cmp_start cmp_end ori))."\n", 1);
}
push @rows, $r;
$last = $r;
}
$support->log("Fixed $i mappings.\n", 1);
if (!$support->param('dry_run') && ($i > 0)) {
# delete all current mappings from the db and insert the corrected ones
my $c = $dbh->do(qq(
DELETE a
FROM assembly a, seq_region sr1, seq_region sr2,
coord_system cs1, coord_system cs2
WHERE a.asm_seq_region_id = sr1.seq_region_id
AND a.cmp_seq_region_id = sr2.seq_region_id
AND sr1.coord_system_id = cs1.coord_system_id
AND sr2.coord_system_id = cs2.coord_system_id
AND cs1.version = '$assembly'
AND cs2.version = '$altassembly'
AND sr1.name = '$chr'
));
$support->log("\nDeleted $c entries from the assembly table.\n");
# now insert the fixed entries
$sql = qq(INSERT INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?));
my $sth1 = $dbh->prepare($sql);
foreach my $r (@rows) {
$sth1->execute(map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori));
}
$support->log("Added ".scalar(@rows)." fixed entries to the assembly table.\n");
}
}
$sth->finish;
# finish logfile
$support->finish_log;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment