Skip to content
Snippets Groups Projects
exon_conservation_check.pl 4.82 KiB
Newer Older
#!/usr/bin/env perl

=head1 SUMMARY

This script is intended to highlight issues with an assembly mapping, by inspecting
the equivalent sequence for each exon. The resulting log is grep-suitable and keyed
for severity.

=head1 SYNOPSIS

perl exon_conservation_check.pl <many arguments>

    --dbname, db_name=NAME              database name NAME
    --host, --dbhost, --db_host=HOST    database host HOST
    --port, --dbport, --db_port=PORT    database port PORT
    --user, --dbuser, --db_user=USER    database username USER
    --pass, --dbpass, --db_pass=PASS    database passwort PASS
    --assembly=ASSEMBLY                 assembly version ASSEMBLY
    --altdbname=NAME                    alternative database NAME
    --altassembly=ASSEMBLY              alternative assembly version ASSEMBLY

Optional options
    --logfile, --log=FILE               log to FILE (default: *STDOUT)
    --logpath=PATH                      write logfile to PATH (default: .)
=cut

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";

use AssemblyMapper::Support;
use Bio::EnsEMBL::Utils::Exception qw( throw );
use Pod::Usage;
use Bio::EnsEMBL::DBSQL::OntologyDBAdaptor;
use Bio::EnsEMBL::Utils::BiotypeMapper;

my $support = AssemblyMapper::Support->new(
   
);
unless ($support->parse_arguments(@_)) {
    warn $support->error if $support->error;
    pod2usage(1);
}
$support->connect_dbs;


my $onto_db_adaptor = Bio::EnsEMBL::DBSQL::OntologyDBAdaptor->new( 
    -DBNAME => $support->ref_dba->dbc->dbname,
    -DBCONN => $support->ref_dba->dbc,
);
my $biotype_mapper = new Bio::EnsEMBL::Utils::BiotypeMapper($onto_db_adaptor);

$support->log_stamped("Beginning analysis.\n");
$support->log("!! = Very bad, %% = Somewhat bad, ?? = No mapping, might be bad\n");
my $ok = $support->iterate_chromosomes(
    prev_stage => '40-fix_overlaps',
    this_stage =>     '41-exon-conservation',
    worker =>         \&compare_exons,
);

$support->log_stamped("Finished.\n");

sub compare_exons {
    my ($asp) = @_;
    
    my $R_chr   = $asp->ref_chr;
    my $A_chr   = $asp->alt_chr;

    my $R_slice = $asp->ref_slice;
    my $A_slice = $asp->alt_slice;
    
    my $old_exons = $A_slice->get_all_Exons;
    while (my $old_exon = shift @$old_exons) {
        # Establish equivalent locations on old and new DBs
        my $coord_system = $old_exon->slice->coord_system;

        my $new_slice = $R_slice->adaptor->fetch_by_region(
            $coord_system->name,
            $old_exon->seq_region_name,
            $old_exon->start,
            $old_exon->end,
            $old_exon->strand,
            $coord_system->version
        );
        
        # make a shadow exon for the new database
        my $shadow_exon = new Bio::EnsEMBL::Feature (
            -start => $old_exon->seq_region_start,
            -end => $old_exon->seq_region_end,
            -strand => $old_exon->strand,
            -slice => $new_slice,
        );
        # project new exon to new assembly
        my $projected_exon = $shadow_exon->transform($A_slice->coord_system->name,$A_slice->coord_system->version,$R_slice);

        # Note that Anacode database naming patterns interfere with normal Registry adaptor fetching,
        # hence we must go around the houses somewhat when looking for the appropriate source gene.
        #warn "!! fetching gene adaptor ".$old_exon->adaptor->species.":".$old_exon->adaptor->dbc->dbname."Gene";
        my $old_gene_adaptor = $old_exon->adaptor->db->get_GeneAdaptor();
        my $gene_list = $old_gene_adaptor->fetch_nearest_Gene_by_Feature($old_exon, undef, undef);
        if (scalar(@$gene_list) >1) {warn "Encountered many possible genes for the exon."}
        my $parent_gene = $gene_list->[0];
        # compare sequences if a projection exists
        if ($projected_exon && $projected_exon->seq ne $old_exon->seq) {
            # Now we have a problem - the feature's sequence was not conserved between assemblies.
            # Determine severity of the problem
            
            my $group_list = $biotype_mapper->belongs_to_groups($parent_gene->biotype);
            my $warned = 0;
            foreach my $group (@$group_list) {
                if ($group eq 'protein_coding') {
                    # Maximum badness.
                    $support->log("!! ".$parent_gene->stable_id." - ".$old_exon->stable_id.
                        " projected - ".$projected_exon->start.":".$projected_exon->end."\n"
                    );
                    $warned = 1;
                }
            }
            unless ($warned) {
                # Middle badness. 
                $support->log("%% ".$parent_gene->stable_id." - ".$old_exon->stable_id."\n");
            }
        }
        
        if (! $projected_exon) {
            # No projection possibly bad news
            $support->log("?? ".$parent_gene->stable_id." - ".$old_exon->stable_id."\n");
        }
        
    }
       
}