diff --git a/misc-scripts/assembly/exon_conservation_check.pl b/misc-scripts/assembly/exon_conservation_check.pl new file mode 100644 index 0000000000000000000000000000000000000000..80ebc3952007a1c6fc1494f1feda760ed306b4ab --- /dev/null +++ b/misc-scripts/assembly/exon_conservation_check.pl @@ -0,0 +1,135 @@ +#!/usr/bin/env perl + +=head1 SUMMARY + +This script is intended to highlight issues with an assembly mapping, by inspecting +the equivalent sequence for each exon. The resulting log is grep-suitable and keyed +for severity. + +=head1 SYNOPSIS + +perl exon_conservation_check.pl <many arguments> + + --dbname, db_name=NAME database name NAME + --host, --dbhost, --db_host=HOST database host HOST + --port, --dbport, --db_port=PORT database port PORT + --user, --dbuser, --db_user=USER database username USER + --pass, --dbpass, --db_pass=PASS database passwort PASS + --assembly=ASSEMBLY assembly version ASSEMBLY + --altdbname=NAME alternative database NAME + --altassembly=ASSEMBLY alternative assembly version ASSEMBLY + +Optional options + --logfile, --log=FILE log to FILE (default: *STDOUT) + --logpath=PATH write logfile to PATH (default: .) +=cut + +use strict; +use warnings; + +use FindBin qw($Bin); +use lib "$Bin"; + +use AssemblyMapper::Support; +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Pod::Usage; +use Bio::EnsEMBL::DBSQL::OntologyDBAdaptor; +use Bio::EnsEMBL::Utils::BiotypeMapper; + +my $support = AssemblyMapper::Support->new( + +); +unless ($support->parse_arguments(@_)) { + warn $support->error if $support->error; + pod2usage(1); +} +$support->connect_dbs; + + +my $onto_db_adaptor = Bio::EnsEMBL::DBSQL::OntologyDBAdaptor->new( + -DBNAME => $support->ref_dba->dbc->dbname, + -DBCONN => $support->ref_dba->dbc, +); +my $biotype_mapper = new Bio::EnsEMBL::Utils::BiotypeMapper($onto_db_adaptor); + +$support->log_stamped("Beginning analysis.\n"); +$support->log("!! = Very bad, %% = Somewhat bad, ?? = No mapping, might be bad\n"); +my $ok = $support->iterate_chromosomes( + prev_stage => '40-fix_overlaps', + this_stage => '41-exon-conservation', + worker => \&compare_exons, +); + +$support->log_stamped("Finished.\n"); + +sub compare_exons { + my ($asp) = @_; + + my $R_chr = $asp->ref_chr; + my $A_chr = $asp->alt_chr; + + my $R_slice = $asp->ref_slice; + my $A_slice = $asp->alt_slice; + + my $old_exons = $A_slice->get_all_Exons; + while (my $old_exon = shift @$old_exons) { + # Establish equivalent locations on old and new DBs + my $coord_system = $old_exon->slice->coord_system; + + my $new_slice = $R_slice->adaptor->fetch_by_region( + $coord_system->name, + $old_exon->seq_region_name, + $old_exon->start, + $old_exon->end, + $old_exon->strand, + $coord_system->version + ); + + # make a shadow exon for the new database + my $shadow_exon = new Bio::EnsEMBL::Feature ( + -start => $old_exon->seq_region_start, + -end => $old_exon->seq_region_end, + -strand => $old_exon->strand, + -slice => $new_slice, + ); + # project new exon to new assembly + my $projected_exon = $shadow_exon->transform($A_slice->coord_system->name,$A_slice->coord_system->version,$R_slice); + + # Note that Anacode database naming patterns interfere with normal Registry adaptor fetching, + # hence we must go around the houses somewhat when looking for the appropriate source gene. + #warn "!! fetching gene adaptor ".$old_exon->adaptor->species.":".$old_exon->adaptor->dbc->dbname."Gene"; + my $old_gene_adaptor = $old_exon->adaptor->db->get_GeneAdaptor(); + my $gene_list = $old_gene_adaptor->fetch_nearest_Gene_by_Feature($old_exon, undef, undef); + if (scalar(@$gene_list) >1) {warn "Encountered many possible genes for the exon."} + my $parent_gene = $gene_list->[0]; + # compare sequences if a projection exists + if ($projected_exon && $projected_exon->seq ne $old_exon->seq) { + # Now we have a problem - the feature's sequence was not conserved between assemblies. + # Determine severity of the problem + + my $group_list = $biotype_mapper->belongs_to_groups($parent_gene->biotype); + my $warned = 0; + foreach my $group (@$group_list) { + if ($group eq 'protein_coding') { + # Maximum badness. + $support->log("!! ".$parent_gene->stable_id." - ".$old_exon->stable_id. + " projected - ".$projected_exon->start.":".$projected_exon->end."\n" + ); + $warned = 1; + } + } + unless ($warned) { + # Middle badness. + $support->log("%% ".$parent_gene->stable_id." - ".$old_exon->stable_id."\n"); + } + } + + if (! $projected_exon) { + # No projection possibly bad news + $support->log("?? ".$parent_gene->stable_id." - ".$old_exon->stable_id."\n"); + } + + } + +} +