Newer
Older

Kieron Taylor
committed
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
#Bring in pipeline as well for their helpers
use lib "$Bin/../../../ensembl-pipeline/scripts/Finished/assembly";
use lib "$Bin/../../../ensembl-analysis/modules";
#runtime include normally
require AssemblyMapper::Support;

Kieron Taylor
committed
use Bio::EnsEMBL::Utils::Exception qw( throw );
use Pod::Usage;
use Bio::EnsEMBL::DBSQL::OntologyDBAdaptor;
use Bio::EnsEMBL::Utils::BiotypeMapper;
#Genebuilder utils
require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils;
Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils->import(qw/clone_Transcript/);
require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils;
Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils->import(qw/clone_Exon/);

Kieron Taylor
committed
my $support = AssemblyMapper::Support->new(
extra_options => [
qw/
check_transcripts!
check_exons!
/
]

Kieron Taylor
committed
);
unless ($support->parse_arguments(@_)) {
warn $support->error if $support->error;
pod2usage(1);

Kieron Taylor
committed
}
$support->connect_dbs;
my $onto_db_adaptor = Bio::EnsEMBL::DBSQL::OntologyDBAdaptor->new(
-DBNAME => $support->ref_dba->dbc->dbname,
-DBCONN => $support->ref_dba->dbc,

Kieron Taylor
committed
);
my $biotype_mapper = new Bio::EnsEMBL::Utils::BiotypeMapper($onto_db_adaptor);
$support->log_stamped("Beginning analysis.\n");
$support->log("EXON KEY : !! = Very bad (pc mismatch), %% = Somewhat bad (mismatch), ?? = No mapping, might be bad\n");
$support->log("TRANSCRIPT KEY : @@ = Very bad (pc translation mismatch), = Very bad (pc transcript mismatch), ** = Somewhat bad (mismatch), XX = No mapping, might be bad\n");
$support->iterate_chromosomes(
prev_stage => '40-fix_overlaps',
this_stage => '41-conservation',
worker => \&compare,

Kieron Taylor
committed
);
$support->log_stamped("Finished.\n");
sub compare {
my ($asp) = @_;
if ($support->param('check_exons')) {
compare_exons($asp);
}
if ($support->param('check_transcripts')) {
compare_transcripts($asp);
}
return;
}

Kieron Taylor
committed
sub compare_exons {
my ($asp) = @_;
my $R_chr = $asp->ref_chr;
my $A_chr = $asp->alt_chr;
my $R_slice = $asp->ref_slice;
my $A_slice = $asp->alt_slice;
my $new_slice_adaptor = $R_slice->adaptor();
my $old_exons = $A_slice->get_all_Exons;
$support->log(sprintf("Total exons %d\n", scalar(@{$old_exons})));
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
while (my $old_exon = shift @$old_exons) {
# Establish equivalent locations on old and new DBs
my $new_A_slice = new_Slice($old_exon, $new_slice_adaptor);
# make a shadow exon for the new database
my $shadow_exon = clone_Exon($old_exon);
$shadow_exon->slice($new_A_slice);
# project new exon to new assembly
my $projected_exon = $shadow_exon->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);
# Note that Anacode database naming patterns interfere with normal Registry adaptor fetching,
# hence we must go around the houses somewhat when looking for the appropriate source gene.
#warn "!! fetching gene adaptor ".$old_exon->adaptor->species.":".$old_exon->adaptor->dbc->dbname."Gene";
my $old_gene_adaptor = $old_exon->adaptor->db->get_GeneAdaptor();
my $gene_list = $old_gene_adaptor->fetch_nearest_Gene_by_Feature($old_exon, undef, undef);
if (scalar(@$gene_list) > 1) { warn "Encountered many possible genes for the exon." }
my $parent_gene = $gene_list->[0];
my $state;
my $location;
my $difference;
my $total_length;
# compare sequences if a projection exists
if ($projected_exon) {
my $old_seq = $old_exon->seq()->seq();
my $projected_seq = $projected_exon->seq()->seq();
$total_length = length($old_seq);
if ($projected_seq ne $old_seq) {
# Now we have a problem - the feature's sequence was not conserved between assemblies.
# Determine severity of the problem
$difference = diff(\$old_seq, \$projected_seq);
my $group_list = $biotype_mapper->belongs_to_groups($parent_gene->biotype);
foreach my $group (@$group_list) {
if ($group eq 'protein_coding') {
$state = '!!';
last;
}
}
if (!$state) {
# Middle badness.
$state = '%%';
}
}
$location = sprintf('%d : %d', $projected_exon->start(), $projected_exon->end());
}

Kieron Taylor
committed
if (!$projected_exon) {
$state = '??';
$location = 'None';
$total_length = 0;
$difference = 0;
}

Kieron Taylor
committed
if($state) {
$support->log(sprintf('%s | ID : %s | Gene ID: %s | Location : %s | Differences : %d | Length : %d', $state, $old_exon->stable_id(), $parent_gene->stable_id(), $location, $difference, $total_length) . "\n");
}
}
}
sub compare_transcripts {
my ($asp) = @_;
my $R_slice = $asp->ref_slice;
my $A_slice = $asp->alt_slice;
my $new_slice_adaptor = $R_slice->adaptor();
my $old_transcripts = $A_slice->get_all_Transcripts();
$support->log(sprintf("Total transcripts %d\n", scalar(@{$old_transcripts})));
while (my $old_transcript = shift @$old_transcripts) {
my $new_A_slice = new_Slice($old_transcript, $new_slice_adaptor);
my $shadow_transcript = clone_Transcript($old_transcript);
foreach my $e (@{$shadow_transcript->get_all_Exons()}) {
$e->slice($new_A_slice);
}
foreach my $se (@{$shadow_transcript->get_all_supporting_features()}) {
$se->slice($new_A_slice);
}
$shadow_transcript->slice($new_A_slice);
my $projected_transcript = $shadow_transcript->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);
my $state;
my $location;
my $total_length;
my $difference;
if ($projected_transcript) {
#Check if it was protein coding
my $group_list = $biotype_mapper->belongs_to_groups($projected_transcript->biotype);
my $is_pc = 0;
foreach my $group (@$group_list) {
if ($group eq 'protein_coding') {
$is_pc = 1;
last;

Kieron Taylor
committed
}
}
#Now check for protein sequence mis-match
if ($is_pc) {
my $old_seq = $old_transcript->translate()->seq();
my $new_seq = $projected_transcript->translate()->seq();
$total_length = length($old_seq);
if ($old_seq ne $new_seq) {
$state = '@@';
$difference = diff(\$old_seq, \$new_seq);
}
}
if (!$state) {
my $old_seq = $old_transcript->spliced_seq();
my $new_seq = $projected_transcript->spliced_seq();
$total_length = length($old_seq);
if ($old_seq ne $new_seq) {
$state = ($is_pc) ? '' : '**';
$difference = diff(\$old_seq, \$new_seq);

Kieron Taylor
committed
}
}
$location = sprintf('%d : %d', $projected_transcript->start(), $projected_transcript->end());
}
else {
$state = '**';
$location = 'None';
$total_length = 0;
$difference = 0;
}
if ($state) {
$support->log(sprintf('%s | ID : %s | Location : %s | Differences : %d | Length : %d', $state, $old_transcript->stable_id(), $location, $difference, $total_length) . "\n");

Kieron Taylor
committed
}
}
return;
}
sub new_Slice {
my ($feature, $new_slice_adaptor) = @_;
my $old_slice = $feature->slice();
my $coord_system = $old_slice->coord_system;
my $new_slice = $new_slice_adaptor->fetch_by_region($coord_system->name, $old_slice->seq_region_name, $old_slice->start, $old_slice->end, $old_slice->strand, $coord_system->version);
return $new_slice;
}
#Count number of diffs between strings
sub diff {
my ($s1, $s2) = @_;
my $diff = ${$s1} ^ ${$s2};
my $total = 0;
$total += ($+[1] - $-[1]) while $diff =~ m{ ([^\x00]+) }xmsg;
return $total;

Kieron Taylor
committed
}

Kieron Taylor
committed
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
exon_conservation_check.pl
=head1 SUMMARY
This script is intended to highlight issues with an assembly mapping, by inspecting
the equivalent sequence for each exon. The resulting log is grep-suitable and keyed
for severity.
=head1 SYNOPSIS
perl exon_conservation_check.pl <many arguments>
--dbname, db_name=NAME database name NAME
--host, --dbhost, --db_host=HOST database host HOST
--port, --dbport, --db_port=PORT database port PORT
--user, --dbuser, --db_user=USER database username USER
--pass, --dbpass, --db_pass=PASS database passwort PASS
--assembly=ASSEMBLY assembly version ASSEMBLY
--altdbname=NAME alternative database NAME
--altassembly=ASSEMBLY alternative assembly version ASSEMBLY
Optional options
--logfile, --log=FILE log to FILE (default: *STDOUT)
--logpath=PATH write logfile to PATH (default: .)
--check_exons Test exons for their robustness
between assembly mappings
--check_transcripts Expand the test to check if we still
get full length transcript encoding
=cut