From 8621c25793d71e7197ed9554aa56c5e37a2e75f8 Mon Sep 17 00:00:00 2001 From: Monika Komorowska <mk8@sanger.ac.uk> Date: Tue, 10 Jan 2012 17:13:08 +0000 Subject: [PATCH] load feature location in old db, new db and feature mapping between two assemblies --- .../assembly/load_feature_mappings.pl | 571 ++++++++++++++++++ 1 file changed, 571 insertions(+) create mode 100644 misc-scripts/assembly/load_feature_mappings.pl diff --git a/misc-scripts/assembly/load_feature_mappings.pl b/misc-scripts/assembly/load_feature_mappings.pl new file mode 100644 index 0000000000..8773d12e77 --- /dev/null +++ b/misc-scripts/assembly/load_feature_mappings.pl @@ -0,0 +1,571 @@ +#!/usr/local/ensembl/bin/perl + +=head1 NAME + +load_feature_mappings.pl - load feature (genes and SNPs) location in the old +database (database with alternative assembly), mapping from old to new assembly +and feature location in the new database (database with reference assembly) +to a test database +Run script test_assembly_mapping.pl to produce statistics on the quality of the +mappings. +Sample config file load_feature_mappings.ini.example. + +=head1 SYNOPSIS + +load_feature_mappings.pl [arguments] + +Required arguments: + + --host=HOST new core db host HOST + --port=PORT new core db port PORT + --user=USER new core db username USER + --pass=PASS new core db passwort PASS + --dbname=NAME new core db name NAME + --altdbname=NAME old core db name NAME + --chromosomes, --chr=LIST 'all' or comma separated list of chromosomes + --assembly=NAME new assembly NAME + --altassembly=NAME old assembly NAME + --features, --ft=LIST features to map:genes,SNPs + + --vardbname new varation db NAME (if mapping SNPs) + --varaltdbname old variation db NAME (if mapping SNPs) + + if not dry run: + --testdbname=NAME test database name NAME (test database will be dropped and recreated) + +Optional arguments: + + --conffile=filename read parameters from FILE + (default: conf/Conversion.ini) + + (if different from --host, --port, --user, --pass): + --althost=hOST old core db host HOST + --altport=PORT old core db port PORT + --altuser=USER old core db username USER + --altpass=PASS old core db passwort PASS + + --testhost=HOST test db host HOST + --testport=PORT test db port PORT + --testuser=USER test db username USER + --testpass=PASS test db passwort PASS + + --varhost=hOST new variation db host HOST + --varport=PORT new variation db port PORT + --varuser=USER new variation db username USER + --varpass=PASS new variation db passwort PASS + + --varalthost=hOST old variation db host HOST + --varaltport=PORT old variation db port PORT + --varaltuser=USER old variation db username USER + --varaltpass=PASS old variation db passwort PASS + + --logfile, --log=FILE log to FILE (default: *STDOUT) + --logpath=PATH write logfile to PATH (default: .) + --logappend, --log_append append to logfile (default: truncate) + + -v, --verbose=0|1 verbose logging (default: false) + -i, --interactive=0|1 run script interactively (default: true) + -n, --dry_run, --dry=0|1 dont write results to database + -h, --help, -? print help (this message) + +=head1 DESCRIPTION + +This script will map genes for a given chromosome list from the old to the new assembly and +find gene locations in the new database using stable_ids. +It will load the results into the new database. + +=cut + +use strict; +use warnings; +no warnings 'uninitialized'; + +use FindBin qw($Bin); +use Getopt::Long; +use Pod::Usage; +use Bio::EnsEMBL::Utils::ConversionSupport; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor; + +use vars qw($SERVERROOT); + +sub map_slice; +sub get_old_new_feature; + +BEGIN { + $SERVERROOT = "$Bin/../"; +} + + +$| = 1; + +my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT); + +# parse options +$support->parse_common_options(@_); +$support->parse_extra_options( + 'althost=s', + 'altport=n', + 'altuser=s', + 'altpass=s', + 'altdbname=s', + 'assembly=s', + 'altassembly=s', + 'testhost=s', + 'testport=n', + 'testuser=s', + 'testpass=s', + 'testdbname=s', + 'chromosomes|chr=s@', + 'features|ft=s@', + 'varhost=s', + 'varport=n', + 'varuser=s', + 'varpass=s', + 'vardbname=s', + 'varalthost=s', + 'varaltport=n', + 'varaltuser=s', + 'varaltpass=s', + 'varaltdbname=s', +); +$support->allowed_params( + $support->get_common_params, + 'althost', + 'altport', + 'altuser', + 'altdbname', + 'assembly', + 'altassembly', + 'testhost', + 'testport', + 'testuser', + 'testpass', + 'testdbname', + 'chromosomes', + 'features', + 'varhost', + 'varport', + 'varuser', + 'varpass', + 'vardbname', + 'varalthost', + 'varaltport', + 'varaltuser', + 'varaltpass', + 'varaltdbname', +); + +if ($support->param('help') or $support->error) { + warn $support->error if $support->error; + pod2usage(1); +} + +# ask user to confirm parameters to proceed +$support->confirm_params; + +# get log filehandle and print heading and parameters to logfile +$support->init_log; + +$support->check_required_params( + 'altdbname', + 'assembly', + 'altassembly', + 'chromosomes', + 'features', + 'dbname', + 'host', + 'user' +); + +if ($support->param('dry_run')) { + $support->log_stamped("Dry run. Test database will not be updated. Writing results to logfile instead.\n\n"); + $support->param('verbose',1); +} else { + if ( !$support->param('testdbname') ) { + $support->log_error("testdbname is required if not a dry run\n",1); + } +} + +my %valid_features = ('genes' => 1, +'SNPs' => 1 ); + +$support->comma_to_list('features'); +my @feature_types = $support->param('features'); +my %feature_types = map { $_ => 1} @feature_types; + +foreach my $feature_type (@feature_types ) { + if (!exists($valid_features{$feature_type}) ) { + delete $feature_types{$feature_type}; + print STDERR "invalid feature type: $feature_type\n"; + } +} + +if (scalar(keys %feature_types) == 0) { + $support->log_error("listed feature(s) not valid: use genes or SNPs only\n",1); +} + +if (exists($feature_types{'SNPs'}) ){ + if( !$support->param('vardbname') or !$support->param('varaltdbname') ) { + $support->log_error("vardbname and varaltdbname are required if mapping SNPs\n",1); + } + +} + + +##### +# connect to database and get adaptors +# +my ($dba, $dbh, $sth); + +if ( !defined($support->param('pass')) ) { + $support->param('pass', ''); +} + + +# first set connection parameters for alternative db and test db +if ( !defined($support->param('althost')) ) { $support->param('althost',$support->param('host')); } +if ( !defined($support->param('altport')) ) { $support->param('altport',$support->param('port')); } +if ( !defined($support->param('altuser')) ) { $support->param('altuser',$support->param('user')); } +if ( !defined($support->param('altpass')) ) { $support->param('altpass',$support->param('pass')); } + +if ( !defined($support->param('testhost')) ) { $support->param('testhost',$support->param('host')); } +if ( !defined($support->param('testport')) ) { $support->param('testport',$support->param('port')); } +if ( !defined($support->param('testuser')) ) { $support->param('testuser',$support->param('user')); } +if ( !defined($support->param('testpass')) ) { $support->param('testpass',$support->param('pass')); } + +# reference database +$dba->{'ref'} = new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $support->param('host'), + -user => $support->param('user'), + -pass => $support->param('pass'), + -port => $support->param('port'), + -dbname => $support->param('dbname') ); + +$dbh->{'ref'} = $dba->{'ref'}->dbc->db_handle; + +# database containing the alternative assembly +$dba->{'alt'} = new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $support->param('althost'), + -user => $support->param('altuser'), + -pass => $support->param('altpass'), + -port => $support->param('altport'), + -dbname => $support->param('altdbname') ); + +$dbh->{'alt'} = $dba->{'alt'}->dbc->db_handle; + +#test database +my $test_dbname = $support->param('testdbname'); +$support->param('testdbname',''); +$dbh->{'test'} = $support->get_dbconnection('test'); + +$support->comma_to_list('chromosomes'); +my @chrs = $support->param('chromosomes'); + +#get chrmosomes + +my %db_chrs; + +my $chr_name; +$sth = $dbh->{'ref'}->prepare("select s.name from seq_region s join coord_system c using(coord_system_id) where rank=1 and length(s.name < 3) order by s.name+0"); +$sth->execute; +$sth->bind_columns(\$chr_name); +while ($sth->fetch){ + $db_chrs{$chr_name} = 1; +} +$sth->finish; + +if ($chrs[0] eq 'all') { + @chrs = sort keys %db_chrs; +} else { + my %temp_chrs; + foreach my $chr (@chrs) { + if ( exists($db_chrs{$chr}) ) { + $temp_chrs{$chr} = 1; + } else { + print STDERR "unknown chromosome $chr\n"; + } + } + @chrs = sort keys %temp_chrs; +} + +if ( scalar(@chrs) == 0) { + $support->log_error("listed chromosome(s) not found in the database\n",1); +} + + +if (!$support->param('dry_run')) { + #create the test database + + eval{ + $dbh->{'test'}->do("drop database if exists $test_dbname"); + $dbh->{'test'}->do("create database $test_dbname"); + $dbh->{'test'}->do("use $test_dbname"); + $dbh->{'test'}->do("CREATE TABLE mapping ( + feature_id int(10) unsigned NOT NULL AUTO_INCREMENT, + feature_type char(30) DEFAULT NULL, + stable_id varchar(128) DEFAULT NULL, + old_assembly varchar(30) DEFAULT NULL, + old_chr char(3) DEFAULT NULL, + old_start int(20) DEFAULT NULL, + old_end int(20) DEFAULT NULL, + old_length int(20) DEFAULT NULL, + old_strand tinyint(2) DEFAULT NULL, + feature_found_in_new_db tinyint(1) unsigned DEFAULT NULL, + new_chr char(3) DEFAULT NULL, + new_start int(20) DEFAULT NULL, + new_end int(20) DEFAULT NULL, + new_length int(20) DEFAULT NULL, + new_strand tinyint(2) DEFAULT NULL, + new_assembly varchar(30) DEFAULT NULL, + mapping_quality tinyint(1) unsigned DEFAULT NULL, + mapping_start int(20) DEFAULT NULL, + mapping_end int(20) DEFAULT NULL, + mapping_length int(20) DEFAULT NULL, + mapping_strands varchar(5) DEFAULT NULL, + mapping_chrs varchar(10) DEFAULT NULL, + PRIMARY KEY (feature_id))"); + + }; + if ( $@ ) { + $support->log_error("errors encountered when creating the test database\n",1); + } +} + + +if ( exists($feature_types{'SNPs'}) ) { + #connect to the variation db + + if ( !defined($support->param('varhost')) ) { $support->param('varhost',$support->param('host')); } + if ( !defined($support->param('varport')) ) { $support->param('varport',$support->param('port')); } + if ( !defined($support->param('varuser')) ) { $support->param('varuser',$support->param('user')); } + if ( !defined($support->param('varpass')) ) { $support->param('varpass',$support->param('pass')); } + + if ( !defined($support->param('varalthost')) ) { $support->param('varalthost',$support->param('host')); } + if ( !defined($support->param('varaltport')) ) { $support->param('varaltport',$support->param('port')); } + if ( !defined($support->param('varaltuser')) ) { $support->param('varaltuser',$support->param('user')); } + if ( !defined($support->param('varaltpass')) ) { $support->param('varaltpass',$support->param('pass')); } + + + + $dba->{'var'} = new Bio::EnsEMBL::Variation::DBSQL::DBAdaptor( + -host => $support->param('varhost'), + -port => $support->param('varport'), + -user => $support->param('varuser'), + -pass => $support->param('varpass'), + -dbname => $support->param('vardbname')); + + $dba->{'varalt'} = new Bio::EnsEMBL::Variation::DBSQL::DBAdaptor( + -host => $support->param('varalthost'), + -port => $support->param('varaltport'), + -user => $support->param('varaltuser'), + -pass => $support->param('varaltpass'), + -dbname => $support->param('varaltdbname')); + +} + + +#connect to the test mapping db to store results +if ( !$support->param('dry_run') ) { + $sth = $dbh->{'test'}->prepare("INSERT INTO mapping(feature_type, stable_id, old_assembly,old_chr,old_start,old_end,old_length,old_strand,feature_found_in_new_db,new_chr,new_start,new_end,new_length,new_strand,new_assembly,mapping_quality,mapping_start,mapping_end,mapping_length,mapping_strands,mapping_chrs ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"); + +} + +my $new_sa = $dba->{'ref'}->get_adaptor("slice"); +my $old_sa = $dba->{'alt'}->get_adaptor("slice"); + +if ( exists($feature_types{'genes'}) ) { + + + my $new_ga = $dba->{'ref'}->get_adaptor("gene"); + my $old_ga = $dba->{'alt'}->get_adaptor("gene"); + + foreach my $chr (@chrs) { + my $feature_type = 'gene'; + get_old_new_feature($old_ga,$new_ga,$feature_type,$chr, undef); + + } + +} + +if ( exists($feature_types{'SNPs'}) ) { + + my $new_vfa = $dba->{'var'}->get_adaptor('variationfeature'); + my $new_va = $dba->{'var'}->get_adaptor('variation'); + my $old_vfa = $dba->{'varalt'}->get_adaptor('variationfeature'); + + foreach my $chr (@chrs) { + my $feature_type = 'SNP'; + get_old_new_feature($old_vfa,$new_vfa,$feature_type,$chr, $new_va); + } +} + +if ( !$support->param('dry_run') ){ + $sth->finish(); +} + +# finish logfile +$support->finish_log; + + +sub get_old_new_feature { + + my $old_adaptor = shift; + my $new_adaptor = shift; + my $feature_type = shift; + my $chr = shift; + my $new_va = shift; + + $support->log_stamped("fetching $feature_type mappings for chromosome $chr\n",1); + + #get all genes on the chromosome from the old database + my $old_slice = $old_sa->fetch_by_region('chromosome', $chr, undef, undef, + undef, $support->param('altassembly')); + + # get a slice on the old assembly from the new database + my $newdb_oldasm_chr = $new_sa->fetch_by_region('chromosome', $chr, undef, undef, + undef, $support->param('altassembly')); + + my @old_features = @{$old_adaptor->fetch_all_by_Slice($old_slice)}; + foreach my $old_feature (@old_features) { + my $feature_name; + if ($feature_type eq 'gene') { + $feature_name = $old_feature->stable_id; + } elsif ($feature_type eq 'SNP') { + $feature_name = $old_feature->variation_name; + } + $support->log_verbose("$feature_type $feature_name location in old db: " . $old_feature->start . ":" . $old_feature->end . ":" . $old_feature->strand . " chromosome $chr\n",1); + + + if ($old_feature->length == 0) { + next; + } + + my $newdb_old_asm_slice = Bio::EnsEMBL::Slice->new(-coord_system => $newdb_oldasm_chr->coord_system, + -start => $old_feature->start, + -end => $old_feature->end, + -strand => $old_feature->strand, + -seq_region_name => $chr, + -seq_region_length => $old_feature->length, + -adaptor => $new_sa); + + my $mapping_start = $newdb_oldasm_chr->end; + my ($mapping_end, $mapping_length, $mapping_strands, $mapping_chrs, $mapping); + ($mapping_start, $mapping_end, $mapping_length, $mapping_strands,$mapping_chrs, $mapping) = map_slice($newdb_old_asm_slice, $mapping_start); + + #get the feature from the new db + my $new_feature; + if ($feature_type eq 'gene') { + $new_feature = $new_adaptor->fetch_by_stable_id($feature_name); + } elsif ($feature_type eq 'SNP') { + + my $variation = $new_va->fetch_by_name($feature_name); + if ($variation) { + my @new_features = @{$new_adaptor->fetch_all_by_Variation($variation)}; + foreach my $vf (@new_features) { + if ($vf->allele_string eq $old_feature->allele_string) { + $new_feature = $vf; + } + } + } + } + + + my $new_feature_found = 0; + my ($new_chr,$new_start,$new_end,$new_length,$new_strand); + if (defined($new_feature) ) { + $new_chr = $new_feature->slice->seq_region_name; + $new_start = $new_feature->start; + $new_end = $new_feature->end; + $new_strand = $new_feature->strand; + $new_length = $new_feature->end - $new_feature->start + 1; + $new_feature_found = 1; + $support->log_verbose("$feature_type $feature_name found in the new db; location:$new_start:$new_end:$new_strand chromosome $new_chr\n",1); + + } else { + $support->log_verbose("$feature_type $feature_name not found in the new db; can't compare $feature_type location to mapped location\n",1); + } + + #store mapping in the db + if ( !$support->param('dry_run') ) { + $sth->execute($feature_type,$feature_name,$support->param('altassembly'),$chr,$old_feature->start, $old_feature->end, $old_feature->end - $old_feature->start + 1, $old_feature->strand, $new_feature_found, $new_chr, $new_start, $new_end,$new_length, $new_strand, $support->param('assembly'), $mapping, $mapping_start, $mapping_end, $mapping_length, $mapping_strands, $mapping_chrs ); + } + + } + +} + + +sub map_slice { + + my $newdb_old_asm_slice = shift; + my $new_asm_start = shift; + + $support->log_verbose("projection to new assembly: \n",1); + #project to new assembly + my @segments = @{ $newdb_old_asm_slice->project('chromosome', $support->param('assembly')) }; + + my $new_asm_end = 0; + my %mapping_strands; + my %mapping_chrs; + my $first_strand; + my $first_chr; + my $mapping_length = 0; + + foreach my $segment (@segments) { + my $p_slice = $segment->to_Slice; + if (! defined($first_strand)) { + $first_strand = $p_slice->strand; + } + if (! defined($first_chr) ){ + $first_chr = $p_slice->seq_region_name; + } + if ($p_slice->start < $new_asm_start && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr) { + $new_asm_start = $p_slice->start; + } + if ($p_slice->end > $new_asm_end && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr ) { + $new_asm_end = $p_slice->end; + } + if ($new_asm_end > 0 && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr ) { + + $mapping_length += $p_slice->length; + } + $mapping_strands{$p_slice->strand} = 1; + $mapping_chrs{$p_slice->seq_region_name} = 1; + $support->log_verbose($p_slice->start() . ":" . $p_slice->end() . ":" . $p_slice->strand . " chromosome ".$p_slice->seq_region_name ."\n",1); + } + + my $mapping; #0-no mapping, 1-gaps exist, 2-no gaps + + if (scalar(@segments) == 0) { + $support->log_verbose("no mapping available\n",1); + $mapping = 0; + undef $new_asm_start; + undef $new_asm_end; + undef $mapping_length; + } else { + $support->log_verbose("maping start $new_asm_start mapping end $new_asm_end\n",1); + + if (scalar(@segments) > 1) { + $support->log_verbose("gaps in new assembly sequence mapping\n",1); + $mapping = 1; + } else { + $support->log_verbose("no gaps in new assembly sequence mapping\n",1); + $mapping = 2; + } + } + + my $mapping_strands; + if (scalar(keys %mapping_strands) > 1) { + $mapping_strands = 'both'; + } + if (scalar(keys %mapping_strands) == 1) { + my @strands = keys %mapping_strands; + $mapping_strands = $strands[0]; + } + my $mapping_chrs; + if (scalar(keys %mapping_chrs) >= 1) { + my @chrs = keys %mapping_chrs; + $mapping_chrs = join(',',@chrs); + } + + return($new_asm_start, $new_asm_end, $mapping_length, $mapping_strands,$mapping_chrs, $mapping); +} -- GitLab