Commit 161af036 authored by Patrick Meidl's avatar Patrick Meidl
Browse files

scripts to create whole genome alignment between two assemblies

parent 3c3c249c
This diff is collapsed.
This diff is collapsed.
#!/usr/local/bin/perl
=head1 NAME
align_nonident_regions.pl - create whole genome alignment between two closely
related assemblies for non-identical regions
=head1 SYNOPSIS
align_nonident_regions.pl [arguments]
Required arguments:
--dbname, db_name=NAME database name NAME
--host, --dbhost, --db_host=HOST database host HOST
--port, --dbport, --db_port=PORT database port PORT
--user, --dbuser, --db_user=USER database username USER
--pass, --dbpass, --db_pass=PASS database passwort PASS
--assembly=ASSEMBLY assembly version ASSEMBLY
--altdbname=NAME alternative database NAME
--altassembly=ASSEMBLY alternative assembly version ASSEMBLY
Optional arguments:
--bindir=DIR look for program binaries in DIR
--conffile, --conf=FILE read parameters from FILE
(default: conf/Conversion.ini)
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--logappend, --log_append append to logfile (default: truncate)
-v, --verbose=0|1 verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
=head1 DESCRIPTION
This script is part of a series of scripts to create a mapping between two
assemblies. It assembles the chromosome coordinate systems of two different
assemblies of a genome by creating a whole genome alignment between the two.
The process assumes that the two assemblies are reasonably similar, i.e. there
are no major rearrangements or clones moved from one chromosome to another.
See "Related scripts" below for an overview of the whole process.
This particular script creates a whole genome alignment between two closely
related assemblies for non-identical regions. These regions are identified by
another script (align_by_clone_identity.pl) and stored in a temporary database
table (tmp_align).
Alignments are calculated by this algorithm:
1. fetch region from tmp_align
2. write soft-masked sequences to temporary files
3. align using blastz
4. filter best hits (for query sequences, i.e. Ensembl regions) using
axtBest
5. parse blastz output to create blocks of exact matches only
6. remove overlapping target (Vega) alignments
7. write alignments to assembly table
=head1 RELATED SCRIPTS
The whole process of creating a whole genome alignment is done by these
scripts:
ensembl/misc-scripts/assembly/load_alternative_assembly.pl
ensembl/misc-scripts/assembly/align_by_clone_identity.pl
ensembl/misc-scripts/assembly/align_nonident_regions.pl
See documention in the respective script for more information.
=head1 LICENCE
This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use vars qw($SERVERROOT);
BEGIN {
$SERVERROOT = "$Bin/../../..";
unshift(@INC, ".");
unshift(@INC, "$SERVERROOT/ensembl/modules");
unshift(@INC, "$SERVERROOT/bioperl-live");
}
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
use AssemblyMapper::BlastzAligner;
$| = 1;
my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options(
'assembly=s',
'altdbname=s',
'altassembly=s',
'bindir=s',
);
$support->allowed_params(
$support->get_common_params,
'assembly',
'altdbname',
'altassembly',
'bindir',
);
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
pod2usage(1);
}
# ask user to confirm parameters to proceed
$support->confirm_params;
# get log filehandle and print heading and parameters to logfile
$support->init_log;
$support->check_required_params(
'assembly',
'altdbname',
'altassembly'
);
#####
# connect to database and get adaptors
#
my ($dba, $dbh, $sql, $sth);
# first set connection parameters for alternative db
# both databases have to be on the same host, so we don't need to configure
# them separately
map { $support->param("alt$_", $support->param($_)) } qw(host port user pass);
# reference database
my $R_dba = $support->get_database('ensembl');
my $R_dbh = $R_dba->dbc->db_handle;
my $R_sa = $R_dba->get_SliceAdaptor;
# database containing the alternative assembly
my $A_dba = $support->get_database('core', 'alt');
my $A_sa = $A_dba->get_SliceAdaptor;
# create BlastzAligner object
my $aligner = AssemblyMapper::BlastzAligner->new(-SUPPORT => $support);
# create tmpdir to store input and output
$aligner->create_tempdir;
# loop over non-aligned regions in tmp_align table
$support->log_stamped("Looping over non-aligned blocks...\n");
$sth = $R_dbh->prepare(qq(SELECT * FROM tmp_align));
$sth->execute;
while (my $row = $sth->fetchrow_hashref) {
my $id = $row->{'tmp_align_id'};
$aligner->id($id);
$aligner->seq_region_name($row->{'ref_seq_region_name'});
$support->log_stamped("Block with tmp_align_id = $id\n", 1);
my $A_slice = $A_sa->fetch_by_region(
'chromosome',
$row->{'alt_seq_region_name'},
$row->{'alt_start'},
$row->{'alt_end'},
1,
$support->param('altassembly'),
);
my $R_slice = $R_sa->fetch_by_region(
'chromosome',
$row->{'ref_seq_region_name'},
$row->{'ref_start'},
$row->{'ref_end'},
1,
$support->param('assembly'),
);
# write sequences to file, and convert sequence files from fasta to nib
# format (needed for lavToAxt)
my $A_basename = "alt_seq.$id";
my $R_basename = "ref_seq.$id";
$support->log("Writing sequences to fasta and nib files...\n", 2);
$aligner->write_sequence(
$A_slice,
$support->param('altassembly'),
$A_basename
);
$aligner->write_sequence(
$R_slice,
$support->param('assembly'),
$R_basename
);
$support->log("Done.\n", 2);
# align using blastz
$support->log("Running blastz...\n", 2);
$aligner->run_blastz($A_basename, $R_basename);
$support->log("Done.\n", 2);
# convert blastz output from lav to axt format
$support->log("Converting blastz output from lav to axt format...\n", 2);
$aligner->lav_to_axt;
$support->log("Done.\n", 2);
# find best alignment with axtBest
$support->log("Finding best alignment with axtBest...\n", 2);
$aligner->find_best_alignment;
$support->log("Done.\n", 2);
# parse blastz output, and convert relative alignment coordinates to
# chromosomal coords
$support->log("Parsing blastz output...\n", 2);
$aligner->parse_blastz_output;
$aligner->adjust_coords(
$row->{'alt_start'},
$row->{'alt_end'},
{ $id => [ $row->{'ref_start'}, $row->{'ref_end'} ] }
);
$support->log("Done.\n", 2);
# log alignment stats
$aligner->log_block_stats(2);
$support->log_stamped("Done with block $id.\n", 1);
}
$support->log_stamped("Done.\n");
# filter overlapping Vega alignment regions
$support->log_stamped("Filtering overlapping reference alignment regions...\n");
$aligner->filter_overlaps;
$support->log_stamped("Done.\n");
# write alignments to assembly table
unless ($support->param('dry_run')) {
$aligner->write_assembly($R_dba);
}
# cleanup
$support->log_stamped("\nRemoving tmpdir...\n");
$aligner->remove_tempdir;
$support->log_stamped("Done.\n");
# drop tmp_align
unless ($support->param('dry_run')) {
if ($support->user_proceed("Would you like to drop the tmp_align table?")) {
$support->log_stamped("Dropping tmp_align table...\n");
$R_dbh->do(qq(DROP TABLE tmp_align));
$support->log_stamped("Done.\n");
}
}
# overall stats
$aligner->log_overall_stats;
# finish logfile
$support->finish_log;
A C G T
100 -300 -150 -300
-300 100 -300 -150
-150 -300 100 -300
-300 -150 -300 100
#!/usr/local/bin/perl
=head1 NAME
load_alternative_assembly.pl - create a db for transfering annotation to the
Ensembl assembly
=head1 SYNOPSIS
load_alternative_assembly.pl [arguments]
Required arguments:
--dbname, db_name=NAME database name NAME
--host, --dbhost, --db_host=HOST database host HOST
--port, --dbport, --db_port=PORT database port PORT
--user, --dbuser, --db_user=USER database username USER
--pass, --dbpass, --db_pass=PASS database passwort PASS
--assembly=ASSEMBLY assembly version ASSEMBLY
--altdbname=NAME alternative database NAME
--altassembly=ASSEMBLY alternative assembly version ASSEMBLY
Optional arguments:
--conffile, --conf=FILE read parameters from FILE
(default: conf/Conversion.ini)
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--logappend, --log_append append to logfile (default: truncate)
-v, --verbose=0|1 verbose logging (default: false)
-i, --interactive=0|1 run script interactively (default: true)
-n, --dry_run, --dry=0|1 don't write results to database
-h, --help, -? print help (this message)
=head1 DESCRIPTION
This script is part of a series of scripts to create a mapping between two
assemblies. It assembles the chromosome coordinate systems of two different
assemblies of a genome by creating a whole genome alignment between the two.
The process assumes that the two assemblies are reasonably similar, i.e. there
are no major rearrangements or clones moved from one chromosome to another.
See "Related scripts" below for an overview of the whole process.
This particular script loads the alternative chromosomes into the Ensembl
database for further processing.
=head1 RELATED SCRIPTS
The whole process of creating a whole genome alignment is done by these
scripts:
ensembl/misc-scripts/assembly/load_alternative_assembly.pl
ensembl/misc-scripts/assembly/align_by_clone_identity.pl
ensembl/misc-scripts/assembly/align_nonident_regions.pl
See documention in the respective script for more information.
=head1 LICENCE
This code is distributed under an Apache style licence:
Please see http://www.ensembl.org/code_licence.html for details
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use strict;
use warnings;
no warnings 'uninitialized';
use FindBin qw($Bin);
use vars qw($SERVERROOT);
BEGIN {
$SERVERROOT = "$Bin/../../..";
unshift(@INC, "$SERVERROOT/ensembl/modules");
unshift(@INC, "$SERVERROOT/bioperl-live");
}
use Getopt::Long;
use Pod::Usage;
use Bio::EnsEMBL::Utils::ConversionSupport;
$| = 1;
my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
# parse options
$support->parse_common_options(@_);
$support->parse_extra_options(
'assembly=s',
'altdbname=s',
'altassembly=s',
);
$support->allowed_params(
$support->get_common_params,
'assembly',
'altdbname',
'altassembly',
);
if ($support->param('help') or $support->error) {
warn $support->error if $support->error;
pod2usage(1);
}
# ask user to confirm parameters to proceed
$support->confirm_params;
# get log filehandle and print heading and parameters to logfile
$support->init_log;
$support->check_required_params(
'assembly',
'altdbname',
'altassembly'
);
if ($support->param('dry_run')) {
$support->log("Nothing to do for a dry run. Exiting.\n\n");
$support->finish_log;
exit;
}
#####
# connect to database and get adaptors
#
my ($dba, $dbh, $sql, $sth);
# first set connection parameters for alternative db
# both databases have to be on the same host, so we don't need to configure
# them separately
map { $support->param("alt$_", $support->param($_)) } qw(host port user pass);
# reference database
$dba->{'ref'} = $support->get_database('ensembl');
$dbh->{'ref'} = $dba->{'ref'}->dbc->db_handle;
# database containing the alternative assembly
$dba->{'alt'} = $support->get_database('core', 'alt');
$dbh->{'alt'} = $dba->{'alt'}->dbc->db_handle;
#####
# load seq_regions from alternative assembly db
#
$support->log_stamped("Load seq_regions from alternative db...\n");
# determine max(seq_region_id) and max(coord_system_id) in Ensembl
$sql = qq(SELECT MAX(seq_region_id) FROM seq_region);
$sth = $dbh->{'ref'}->prepare($sql);
$sth->execute;
my ($max_sri) = $sth->fetchrow_array;
my $sri_adjust = 10**(length($max_sri));
$sql = qq(SELECT MAX(coord_system_id) FROM coord_system);
$sth = $dbh->{'ref'}->prepare($sql);
$sth->execute;
my ($max_csi) = $sth->fetchrow_array;
my $csi_adjust = 10**(length($max_csi));
my $ref_db = $support->param('dbname');
my $alt_assembly = $support->param('altassembly');
# fetch and insert alternative seq_regions with adjusted seq_region_id and
# coord_system_id
$sql = qq(
INSERT INTO $ref_db.seq_region
SELECT
sr.seq_region_id+$sri_adjust,
sr.name,
sr.coord_system_id+$csi_adjust,
sr.length
FROM seq_region sr, coord_system cs
WHERE sr.coord_system_id = cs.coord_system_id
AND cs.name = 'chromosome'
AND cs.version = '$alt_assembly'
);
my $c = $dbh->{'alt'}->do($sql);
$support->log_stamped("Done loading $c seq_regions.\n\n");
#####
# add appropriate entries to coord_system
#
$support->log_stamped("Adding coord_system entries...\n");
$sql = qq(
INSERT INTO $ref_db.coord_system
SELECT coord_system_id+$csi_adjust, name, version, rank+100, ''
FROM coord_system
WHERE name = 'chromosome'
AND version = '$alt_assembly'
);
$c = $dbh->{'alt'}->do($sql);
$support->log_stamped("Done adding $c coord_system entries.\n\n");
#####
# add assembly.mapping to meta table
#
$support->log_stamped("Adding assembly.mapping entry to meta table...\n");
my $mappingstring = 'chromosome:'.$support->param('assembly').
'#chromosome:'.$support->param('altassembly');
$sql = qq(
INSERT INTO meta (meta_key, meta_value)
VALUES ('assembly.mapping', '$mappingstring')
);
$c = $dbh->{'ref'}->do($sql);
$support->log_stamped("Done inserting $c meta entries.\n\n");
# finish logfile
$support->finish_log;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment