Skip to content
Snippets Groups Projects
Commit c9f32319 authored by Graham McVicker's avatar Graham McVicker
Browse files

*** empty log message ***

parent f6753cbf
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,8 @@ use warnings;
use Getopt::Long;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Transcript;
use Bio::EnsEMBL::Exon;
my $MAX_CODING_INDEL = 12; #max allowed coding indel in exon
......@@ -75,7 +77,7 @@ foreach my $slice (@$slices) {
my $genes = $gene_adaptor->fetch_all_by_Slice($slice);
foreach my $gene (@$genes) {
foreach my $gene (reverse @$genes) {
my $transcripts = $gene->get_all_Transcripts();
foreach my $transcript (@$transcripts) {
......@@ -105,11 +107,24 @@ sub transfer_transcript {
my $slice_adaptor = $chimp_db->get_SliceAdaptor();
my $exons = $transcript->get_all_Exons();
my $translation = $transcript->translation();
my @chimp_exons;
my @chimp_transcripts;
my $chimp_transcript = Bio::EnsEMBL::Transcript->new();
my $chimp_translation;
if($translation) { #watch out for pseudogenes
$chimp_translation = Bio::EnsEMBL::Translation->new();
$chimp_translation->start($translation->start());
$chimp_translation->end($translation->end());
}
my $chimp_exon = undef;
EXON:
foreach my $exon (@$exons) {
my @coords = $mapper->map($exon->seq_region_name, $exon->seq_region_start,
$exon->seq_region_end, $exon->seq_region_strand,
$human_cs);
......@@ -127,13 +142,13 @@ sub transfer_transcript {
$cds_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) {
debug('entire utr exon deletion - ok');
# TBD
# we can simply throw out this exon, it was all UTR
next EXON;
} else {
### TBD: can do more sensible checking and maybe split transcript
debug('entire coding exon deletion - discarding transcript');
return ();
return @chimp_transcripts;
}
} else {
#exon mapped completely
......@@ -146,22 +161,50 @@ sub transfer_transcript {
undef,undef,undef,
$chimp_cs->version);
push @chimp_exons, Bio::EnsEMBL::Exon->new
(-START => $c->start,
-END => $c->end,
-PHASE => $exon->phase,
-END_PHASE => $exon->end_phase,
-ANALYSIS => $exon->analysis,
-SLICE => $slice);
### TBD handle problems with seq_region/strand jumping exons...
$chimp_transcript->add_Exon( Bio::EnsEMBL::Exon->new
( -START => $c->start,
-END => $c->end,
-STRAND => $c->strand,
-PHASE => $exon->phase,
-END_PHASE => $exon->end_phase,
-ANALYSIS => $exon->analysis,
-SLICE => $slice) );
}
} else {
my $num = scalar(@coords);
my $exon_position = 0;
my $chimp_exon = Bio::EnsEMBL::Exon->new();
my $min_start = undef;
my $max_end = undef;
my $seq_region = undef;
my $is_start_exon = 0;
my $is_end_exon = 0;
# Fix translation start/end if it needs adjusting here
if($translation) {
if($translation->start_Exon()->hashkey() eq $exon->hashkey()) {
$is_start_exon = 1;
$chimp_translation->start_Exon($chimp_exon);
}
if($translation->end_Exon()->hashkey eq $exon->hashkey()) {
$is_end_exon = 1;
$chimp_translation->end_Exon($chimp_exon);
}
}
COORD:
for(my $i=0; $i < $num; $i++) {
my $c = $coords[$i];
$exon_position += $c->length();
if($c->isa('Bio::EnsEMBL::Mapper::Gap')) {
......@@ -169,81 +212,250 @@ sub transfer_transcript {
# this is a DELETION
#
# convert to CDS coords
# convert to CDS coords and categorize them
# gaps are UTR deletions, coords are CDS deletions
#
my @cds_coords = $trans_mapper->genomic2cds($c->start(),$c->end(),
$exon->strand());
@cds_coords =
grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @cds_coords;
my $cds_delete;
my @utr_deletes;
foreach my $cds_coord (@cds_coords) {
if($cds_coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
push @utr_deletes, $cds_coord;
} else {
#sanity check:
if($cds_delete) {
throw("Unexpected: multiple cds deletes for single exon.");
}
$cds_delete = $cds_coord;
}
}
if(@utr_deletes > 2) {
throw("Unexpected: >2 UTR deletes for single exon.");
}
if(@cds_coords == 0) {
#this deletion was entirely in the UTR, accept it
foreach my $utr_delete (@utr_deletes) {
# the UTR deletions may require adjustments to the translation
debug("utr deletion - ok");
# TBD fix translation start/end if it needs adjusting here
# Fix translation start/end if it needs adjusting here
if($is_start_exon) {
next COORD;
}
#need to adjust the start of translation if deletion was
#in 5prime UTR
if($exon_position < $translation->start()) {
$chimp_translation->start($chimp_translation->start() -
$utr_delete->length());
#also need to adjust end translation if this is also the
#end exon
if($is_end_exon) {
$chimp_translation->end($chimp_translation->end() -
$utr_delete->length());
if(@cds_coords != 1) {
# it should not be possible to get more than one cds coord
# back since we are dealing with a single exon
throw("Unexpected, got multiple cds coords for single exon");
}
}
}
}
my $cds_coord = $cds_coords[0];
if($cds_delete) {
# if this is a long deletion of coding sequence, then we should
# throw out this transcript
# if this is a long deletion of coding sequence, then we should
# throw out this transcript
my $del_len = $cds_delete->length();
my $cds_len = get_cds_len($transcript);
my $del_len = $cds_coord->length();
if($cds_len == $del_len) {
debug("entire cds deletion - discarding transcript");
return @chimp_transcripts;
}
if($del_len > $MAX_CODING_INDEL) {
debug("long cds deletion ($del_len) - discarding transcript");
if($del_len > $MAX_CODING_INDEL) {
debug("long cds deletion ($del_len) - discarding transcript");
# TBD split transcript in two parts and keep both halves
# TBD split transcript in two parts and keep both halves
return ();
}
return @chimp_transcripts;
}
# if this deletion is short and %3 == 0, then we do not need to
# do much, but may have to adjust coding start/end of translation
if($del_len % 3 == 0) {
debug("short ($del_len) in frame cds deletion - keeping exon");
#adjust the translation end accordingly
if($is_end_exon) {
#adjust translation end accordingly
$chimp_translation->end($chimp_translation->end - $del_len);
}
#TBD
my $frame_shift = $del_len %3;
next COORD;
}
# if this deletion is short and %3 == 0, then we do not need to
# do much, but may have to adjust coding end of translation
if($frame_shift == 0) {
debug("short ($del_len) in frame cds deletion - ok");
if($del_len > $MAX_FRAMESHIFT_INDEL) {
debug("medium ($del_len) frameshifting cds deletion - " .
"discarding transcript");
if($is_end_exon) {
$chimp_translation->end($chimp_translation->end() - $del_len);
}
#TBD split transcript in two parts and keep both halves
next COORD;
}
return ();
}
if($del_len > $MAX_FRAMESHIFT_INDEL) {
debug("medium ($del_len) frameshifting cds deletion - " .
"discarding transcript");
#TBD split transcript in two parts and keep both halves
return @chimp_transcripts;
}
# this is a short frameshifting deletion
# check if deletion is at beginning of CDS
if($cds_delete->start() == 1) {
# we need to adjust the start of the exon now to
# put everything back into frame
#sanity check:
if(!$is_start_exon) {
throw("Unexpected: start of cds is not in start exon");
}
# this is a short frameshifting deletion
# create a 'frameshift intron' and move on
debug("short ($del_len) frameshifting cds deletion - " .
"creating fake intron");
my $ex_start = $chimp_exon->start();
# if this exon the first coding exon, then we need to adjust the
# coding start of the translation, and possibly the exon start
# as well
#if there is already UTR, just shift it
if($chimp_translation->start > 1) {
debug("frameshift deletion at start of cds - extending utr");
$chimp_translation->start($chimp_translation->start +
3 - $frame_shift);
} else {
$ex_start = $chimp_exon->start();
# if this exon is the last coding exon, then we need to adjust the
# coding end
#there was no UTR so adjust exon start, leave w/ no UTR
if(!defined($ex_start)) {
#sanity check:
if(@coords < $i+2) {
throw("Unexpected: no exon start defined and " .
"no more coords available");
}
$ex_start = $coords[$i+1]->start(); #peek at next coord
}
debug("frameshift deletion at start of cds - " .
"moving exon start");
$chimp_exon->start($ex_start + 3 - $frame_shift);
}
next COORD;
}
# check if deletion is at end of CDS
if($cds_coord->end() == get_cds_len($transcript)) {
# sanity check:
if(!$is_end_exon) {
throw("Unexpected: end of cds is not in end exon");
}
# if there was already a UTR just shift it
if($translation->end() < $exon->length()) {
$chimp_translation->end($chimp_translation->end()
- 3 + $frame_shift);
} else {
# no utr and at end of CDS, so must be end of exon
# sanity checks:
if(!defined($chimp_exon->end())) {
throw("Unexpected: at end of exon and no end defined");
}
debug("frameshift deletion at end of cds - " .
"moving exon end and translation end");
$chimp_exon->end($chimp_exon->end() - 3 + $frame_shift);
$chimp_translation->end($chimp_exon->length());
}
next COORD;
}
# the deletion was not at either end of the cds, but could still
# be at the beginning or end of the exon
# check if this is a deletion at the exon start
if($i == 0) {
# sanity check:
if($is_start_exon) {
throw("Unexpected: delete at start of start exon is " .
"not at cds start");
}
debug("frameshift deletion at start of exon - " .
"adjusting start of exon");
$chimp_exon->start($chimp_exon->start + 3 - $frameshift);
if($is_end_exon) {
debug("adjusting translation end");
$chimp_exon->translation_end($chimp_exon_translation->end -
3 + $frameshift);
}
next COORD;
}
#check if this is a deletion at the exon end
if(@coords == $i+1) {
# sanity check:
if($is_end_exon) {
throw("Unexpected delete at end of end exon is " .
"not at cds start");
}
debug("frameshift deletion at end of exon - " .
"adjusting end of exon");
$chimp_exon->end($chimp_exon->end - 3 + $frameshift);
next COORD;
}
# now we know that this is a deletion in the middle of the
# cds
# for this to be possible there should already be an exon start/end
# because we should have already seen some exon coords
# sanity check
if(!defined($chimp_exon->start())) {
throw("Unexpected: mid cds delete and no exon start defined");
}
if(!defined($chimp_exon->end())) {
throw("Unexpected: mid cds delete and no exon end defined");
}
# split this exon in two parts and make a 'frameshift' intron
# in the middle
debug("short ($del_len) frameshifting mid cds deletion - " .
"creating fake intron");
my $ex_len = $chimp_exon->length();
$chimp_exon->end($chimp_exon->end() - 3 + $frameshift)
push @chimp_exons, $chimp_exon;
$chimp_exon = Bio::EnsEMBL::Exon->new
(-START => $chimp_exon->end() + 4 - $frame_shift,
-STRAND => $chimp_exon->strand());
}
} else {
#if first is coord no need to do anything
next COORD if($i == 0);
my $prev_c = $coords[$i];
my $prev_c = $coords[$i-1];
next if($prev_c->isa('Bio::EnsEMBL::Mapper::Gap'));
......@@ -256,7 +468,7 @@ sub transfer_transcript {
#TBD it may not be necessary to discard transcript if this is UTR
return ();
return @chimp_transcripts;
}
if($c->strand() != $prev_c->strand()) {
......@@ -264,7 +476,7 @@ sub transfer_transcript {
#TBD it may not be necessary to discard transcript if this is UTR
return ();
return @chimp_transcripts;
}
......@@ -294,6 +506,14 @@ sub transfer_transcript {
$trans_mapper->genomic2cds($left_coord, $right_coord, -1);
}
if($insert_len < 1) {
debug("inverted exon - discarding transcript");
#TBD - is there something sensible that can be done here?
return @chimp_transcripts;
}
if(@cds_coords == 1 &&
$cds_coords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
......@@ -305,7 +525,7 @@ sub transfer_transcript {
# TBD may be ok to keep transcript and split in two?
return ();
return @chimp_transcripts;
}
if($insert_len % 3 == 0) {
......@@ -323,7 +543,7 @@ sub transfer_transcript {
#TBD may be ok to keep transcript and split in two
return ();
return @chimp_transcripts;
}
debug("short ($insert_len) frameshifting cds insert " .
......@@ -358,11 +578,24 @@ sub transfer_transcript {
# * exons on different scaffolds
# * exons on different strands
# * exons in which the splice order changed
# * transcripts which lost all exons
# maybe phases should be set here
}
sub get_cds_len {
my $transcript = shift;
my $len = 0;
foreach my $exon (@{$transcript->get_all_translateable_Exons()}) {
$len += $exon->length();
}
return $len;
}
sub debug {
my $msg = shift;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment