exon_conservation_check.pl

#!/usr/bin/env perl

=head1 SUMMARY

This script is intended to highlight issues with an assembly mapping, by inspecting
the equivalent sequence for each exon. The resulting log is grep-suitable and keyed
for severity.

=head1 SYNOPSIS

perl exon_conservation_check.pl <many arguments>

    --dbname, db_name=NAME              database name NAME
    --host, --dbhost, --db_host=HOST    database host HOST
    --port, --dbport, --db_port=PORT    database port PORT
    --user, --dbuser, --db_user=USER    database username USER
    --pass, --dbpass, --db_pass=PASS    database passwort PASS
    --assembly=ASSEMBLY                 assembly version ASSEMBLY
    --altdbname=NAME                    alternative database NAME
    --altassembly=ASSEMBLY              alternative assembly version ASSEMBLY

Optional options
    --logfile, --log=FILE               log to FILE (default: *STDOUT)
    --logpath=PATH                      write logfile to PATH (default: .)
    
    --check_exons                       Test exons for their robustness 
                                        between assembly mappings
                                        
    --check_transcripts                 Expand the test to check if we still 
                                        get full length transcript encoding
=cut

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";

#Bring in pipeline as well for their helpers
use lib "$Bin/../../../ensembl-pipeline/scripts/Finished/assembly";
use lib "$Bin/../../../ensembl-analysis/modules";

#runtime include normally
require AssemblyMapper::Support;
use Bio::EnsEMBL::Utils::Exception qw( throw );
use Pod::Usage;
use Bio::EnsEMBL::DBSQL::OntologyDBAdaptor;
use Bio::EnsEMBL::Utils::BiotypeMapper;

#Genebuilder utils
require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils;
Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils->import(qw/clone_Transcript/);
require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils;
Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils->import(qw/clone_Exon/);

my $support = AssemblyMapper::Support->new(
  extra_options => [
    qw/
      check_transcripts!
      check_exons!
      /
  ]
);
unless ($support->parse_arguments(@_)) {
  warn $support->error if $support->error;
  pod2usage(1);
}
$support->connect_dbs;

my $onto_db_adaptor = Bio::EnsEMBL::DBSQL::OntologyDBAdaptor->new(
  -DBNAME => $support->ref_dba->dbc->dbname,
  -DBCONN => $support->ref_dba->dbc,
);
my $biotype_mapper = new Bio::EnsEMBL::Utils::BiotypeMapper($onto_db_adaptor);

$support->log_stamped("Beginning analysis.\n");
$support->log("EXON KEY       : !! = Very bad (pc mismatch), %% = Somewhat bad (mismatch), ?? = No mapping, might be bad\n");
$support->log("TRANSCRIPT KEY : @@ = Very bad (pc translation mismatch),  = Very bad (pc transcript mismatch), ** = Somewhat bad (mismatch), XX = No mapping, might be bad\n");


$support->iterate_chromosomes(
  prev_stage => '40-fix_overlaps',
  this_stage => '41-conservation',
  worker     => \&compare,
);

$support->log_stamped("Finished.\n");

sub compare {
  my ($asp) = @_;
  if ($support->param('check_exons')) {
    compare_exons($asp);
  }
  if ($support->param('check_transcripts')) {
    compare_transcripts($asp);
  }
  return;
}

sub compare_exons {
  my ($asp) = @_;

  my $R_chr = $asp->ref_chr;
  my $A_chr = $asp->alt_chr;

  my $R_slice = $asp->ref_slice;
  my $A_slice = $asp->alt_slice;

  my $new_slice_adaptor = $R_slice->adaptor();

  my $old_exons = $A_slice->get_all_Exons;
  while (my $old_exon = shift @$old_exons) {

    # Establish equivalent locations on old and new DBs
    my $new_A_slice = new_Slice($old_exon, $new_slice_adaptor);

    # make a shadow exon for the new database
    my $shadow_exon = clone_Exon($old_exon);
    $shadow_exon->slice($new_A_slice);

    # project new exon to new assembly
    my $projected_exon = $shadow_exon->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);

    # Note that Anacode database naming patterns interfere with normal Registry adaptor fetching,
    # hence we must go around the houses somewhat when looking for the appropriate source gene.
    #warn "!! fetching gene adaptor ".$old_exon->adaptor->species.":".$old_exon->adaptor->dbc->dbname."Gene";
    my $old_gene_adaptor = $old_exon->adaptor->db->get_GeneAdaptor();
    my $gene_list = $old_gene_adaptor->fetch_nearest_Gene_by_Feature($old_exon, undef, undef);
    if (scalar(@$gene_list) > 1) { warn "Encountered many possible genes for the exon." }
    my $parent_gene = $gene_list->[0];

    my $state;
    my $location;
    my $difference;
    my $total_length;

    # compare sequences if a projection exists
    if ($projected_exon) {
      my $old_seq       = $old_exon->seq()->seq();
      my $projected_seq = $projected_exon->seq()->seq();
      $total_length = length($old_seq);

      if ($projected_seq ne $old_seq) {

        # Now we have a problem - the feature's sequence was not conserved between assemblies.
        # Determine severity of the problem
        $difference = diff(\$old_seq, \$projected_seq);

        my $group_list = $biotype_mapper->belongs_to_groups($parent_gene->biotype);
        foreach my $group (@$group_list) {
          if ($group eq 'protein_coding') {
            $state = '!!';
            last;
          }
        }
        if (!$state) {

          # Middle badness.
          $state = '%%';
        }
      }

      $location = sprintf('%d : %d', $projected_exon->start(), $projected_exon->end());
    }

    if (!$projected_exon) {
      $state        = '??';
      $location     = 'None';
      $total_length = 0;
      $difference   = 0;
    }
    
    if($state) {
      $support->log(sprintf('%s | ID : %s | Gene ID: %s | Location : %s | Differences : %d | Length : %d', $state, $old_exon->stable_id(), $parent_gene->stable_id(), $location, $difference, $total_length) . "\n");
    }
  }

}

sub compare_transcripts {
  my ($asp)             = @_;
  my $R_slice           = $asp->ref_slice;
  my $A_slice           = $asp->alt_slice;
  my $new_slice_adaptor = $R_slice->adaptor();

  my $old_transcripts = $A_slice->get_all_Transcripts();
  while (my $old_transcript = shift @$old_transcripts) {
    my $new_A_slice = new_Slice($old_transcript, $new_slice_adaptor);
    my $shadow_transcript = clone_Transcript($old_transcript);
    foreach my $e (@{$shadow_transcript->get_all_Exons()}) {
      $e->slice($new_A_slice);
    }
    foreach my $se (@{$shadow_transcript->get_all_supporting_features()}) {
      $se->slice($new_A_slice);
    }
    $shadow_transcript->slice($new_A_slice);
    my $projected_transcript = $shadow_transcript->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);

    my $state;
    my $location;
    my $total_length;
    my $difference;
    if ($projected_transcript) {
      #Check if it was protein coding
      my $group_list = $biotype_mapper->belongs_to_groups($projected_transcript->biotype);
      my $is_pc      = 0;
      foreach my $group (@$group_list) {
        if ($group eq 'protein_coding') {
          $is_pc = 1;
          last;
        }
      }

      #Now check for protein sequence mis-match
      if ($is_pc) {
        my $old_seq = $old_transcript->translate()->seq();
        my $new_seq = $projected_transcript->translate()->seq();
        $total_length = length($old_seq);
        if ($old_seq ne $new_seq) {
          $state = '@@';
          $difference = diff(\$old_seq, \$new_seq);
        }
      }

      if (!$state) {
        my $old_seq = $old_transcript->spliced_seq();
        my $new_seq = $projected_transcript->spliced_seq();
        $total_length = length($old_seq);
        if ($old_seq ne $new_seq) {
          $state = ($is_pc) ? '' : '**';
          $difference = diff(\$old_seq, \$new_seq);
        }
      }

      $location = sprintf('%d : %d', $projected_transcript->start(), $projected_transcript->end());
    }
    else {
      $state        = '**';
      $location     = 'None';
      $total_length = 0;
      $difference   = 0;
    }

    if ($state) {
      $support->log(sprintf('%s | ID : %s | Location : %s | Differences : %d | Length : %d', $state, $old_transcript->stable_id(), $location, $difference, $total_length) . "\n");
    }
  }

  return;
}

sub new_Slice {
  my ($feature, $new_slice_adaptor) = @_;
  my $old_slice    = $feature->slice();
  my $coord_system = $old_slice->coord_system;
  my $new_slice    = $new_slice_adaptor->fetch_by_region($coord_system->name, $old_slice->seq_region_name, $old_slice->start, $old_slice->end, $old_slice->strand, $coord_system->version);
  return $new_slice;
}

#Count number of diffs between strings
sub diff {
  my ($s1, $s2) = @_;
  my $diff  = ${$s1} ^ ${$s2};
  my $total = 0;
  $total += ($+[1] - $-[1]) while $diff =~ m{ ([^\x00]+) }xmsg;
  return $total;
}