Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
c9f32319
Commit
c9f32319
authored
21 years ago
by
Graham McVicker
Browse files
Options
Downloads
Patches
Plain Diff
*** empty log message ***
parent
f6753cbf
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
misc-scripts/chimp/human2chimp.pl
+292
-59
292 additions, 59 deletions
misc-scripts/chimp/human2chimp.pl
with
292 additions
and
59 deletions
misc-scripts/chimp/human2chimp.pl
+
292
−
59
View file @
c9f32319
...
...
@@ -4,6 +4,8 @@ use warnings;
use
Getopt::
Long
;
use
Bio::EnsEMBL::DBSQL::
DBAdaptor
;
use
Bio::EnsEMBL::
Transcript
;
use
Bio::EnsEMBL::
Exon
;
my
$MAX_CODING_INDEL
=
12
;
#max allowed coding indel in exon
...
...
@@ -75,7 +77,7 @@ foreach my $slice (@$slices) {
my
$genes
=
$gene_adaptor
->
fetch_all_by_Slice
(
$slice
);
foreach
my
$gene
(
@$genes
)
{
foreach
my
$gene
(
reverse
@$genes
)
{
my
$transcripts
=
$gene
->
get_all_Transcripts
();
foreach
my
$transcript
(
@$transcripts
)
{
...
...
@@ -105,11 +107,24 @@ sub transfer_transcript {
my
$slice_adaptor
=
$chimp_db
->
get_SliceAdaptor
();
my
$exons
=
$transcript
->
get_all_Exons
();
my
$translation
=
$transcript
->
translation
();
my
@chimp_exons
;
my
@chimp_transcripts
;
my
$chimp_transcript
=
Bio::EnsEMBL::
Transcript
->
new
();
my
$chimp_translation
;
if
(
$translation
)
{
#watch out for pseudogenes
$chimp_translation
=
Bio::EnsEMBL::
Translation
->
new
();
$chimp_translation
->
start
(
$translation
->
start
());
$chimp_translation
->
end
(
$translation
->
end
());
}
my
$chimp_exon
=
undef
;
EXON:
foreach
my
$exon
(
@$exons
)
{
my
@coords
=
$mapper
->
map
(
$exon
->
seq_region_name
,
$exon
->
seq_region_start
,
$exon
->
seq_region_end
,
$exon
->
seq_region_strand
,
$human_cs
);
...
...
@@ -127,13 +142,13 @@ sub transfer_transcript {
$cds_coords
[
0
]
->
isa
('
Bio::EnsEMBL::Mapper::Gap
'))
{
debug
('
entire utr exon deletion - ok
');
#
TBD
#
we can simply throw out this exon, it was all UTR
next
EXON
;
}
else
{
### TBD: can do more sensible checking and maybe split transcript
debug
('
entire coding exon deletion - discarding transcript
');
return
()
;
return
@chimp_transcripts
;
}
}
else
{
#exon mapped completely
...
...
@@ -146,22 +161,50 @@ sub transfer_transcript {
undef
,
undef
,
undef
,
$chimp_cs
->
version
);
push
@chimp_exons
,
Bio::EnsEMBL::
Exon
->
new
(
-
START
=>
$c
->
start
,
-
END
=>
$c
->
end
,
-
PHASE
=>
$exon
->
phase
,
-
END_PHASE
=>
$exon
->
end_phase
,
-
ANALYSIS
=>
$exon
->
analysis
,
-
SLICE
=>
$slice
);
### TBD handle problems with seq_region/strand jumping exons...
$chimp_transcript
->
add_Exon
(
Bio::EnsEMBL::
Exon
->
new
(
-
START
=>
$c
->
start
,
-
END
=>
$c
->
end
,
-
STRAND
=>
$c
->
strand
,
-
PHASE
=>
$exon
->
phase
,
-
END_PHASE
=>
$exon
->
end_phase
,
-
ANALYSIS
=>
$exon
->
analysis
,
-
SLICE
=>
$slice
)
);
}
}
else
{
my
$num
=
scalar
(
@coords
);
my
$exon_position
=
0
;
my
$chimp_exon
=
Bio::EnsEMBL::
Exon
->
new
();
my
$min_start
=
undef
;
my
$max_end
=
undef
;
my
$seq_region
=
undef
;
my
$is_start_exon
=
0
;
my
$is_end_exon
=
0
;
# Fix translation start/end if it needs adjusting here
if
(
$translation
)
{
if
(
$translation
->
start_Exon
()
->
hashkey
()
eq
$exon
->
hashkey
())
{
$is_start_exon
=
1
;
$chimp_translation
->
start_Exon
(
$chimp_exon
);
}
if
(
$translation
->
end_Exon
()
->
hashkey
eq
$exon
->
hashkey
())
{
$is_end_exon
=
1
;
$chimp_translation
->
end_Exon
(
$chimp_exon
);
}
}
COORD:
for
(
my
$i
=
0
;
$i
<
$num
;
$i
++
)
{
my
$c
=
$coords
[
$i
];
$exon_position
+=
$c
->
length
();
if
(
$c
->
isa
('
Bio::EnsEMBL::Mapper::Gap
'))
{
...
...
@@ -169,81 +212,250 @@ sub transfer_transcript {
# this is a DELETION
#
# convert to CDS coords
# convert to CDS coords and categorize them
# gaps are UTR deletions, coords are CDS deletions
#
my
@cds_coords
=
$trans_mapper
->
genomic2cds
(
$c
->
start
(),
$c
->
end
(),
$exon
->
strand
());
@cds_coords
=
grep
{
!
$_
->
isa
('
Bio::EnsEMBL::Mapper::Gap
')}
@cds_coords
;
my
$cds_delete
;
my
@utr_deletes
;
foreach
my
$cds_coord
(
@cds_coords
)
{
if
(
$cds_coord
->
isa
('
Bio::EnsEMBL::Mapper::Gap
'))
{
push
@utr_deletes
,
$cds_coord
;
}
else
{
#sanity check:
if
(
$cds_delete
)
{
throw
("
Unexpected: multiple cds deletes for single exon.
");
}
$cds_delete
=
$cds_coord
;
}
}
if
(
@utr_deletes
>
2
)
{
throw
("
Unexpected: >2 UTR deletes for single exon.
");
}
if
(
@cds_coords
==
0
)
{
#this deletion was entirely in the UTR, accept it
foreach
my
$utr_delete
(
@utr_deletes
)
{
# the UTR deletions may require adjustments to the translation
debug
("
utr deletion - ok
");
# TBD fix translation start/end if it needs adjusting here
# Fix translation start/end if it needs adjusting here
if
(
$is_start_exon
)
{
next
COORD
;
}
#need to adjust the start of translation if deletion was
#in 5prime UTR
if
(
$exon_position
<
$translation
->
start
())
{
$chimp_translation
->
start
(
$chimp_translation
->
start
()
-
$utr_delete
->
length
());
#also need to adjust end translation if this is also the
#end exon
if
(
$is_end_exon
)
{
$chimp_translation
->
end
(
$chimp_translation
->
end
()
-
$utr_delete
->
length
());
if
(
@cds_coords
!=
1
)
{
# it should not be possible to get more than one cds coord
# back since we are dealing with a single exon
throw
("
Unexpected, got multiple cds coords for single exon
");
}
}
}
}
my
$cds_coord
=
$cds_coords
[
0
];
if
(
$cds_delete
)
{
# if this is a long deletion of coding sequence, then we should
# throw out this transcript
# if this is a long deletion of coding sequence, then we should
# throw out this
transcript
my
$del_len
=
$cds_delete
->
length
();
my
$cds_len
=
get_cds_len
(
$
transcript
);
my
$del_len
=
$cds_coord
->
length
();
if
(
$cds_len
==
$del_len
)
{
debug
("
entire cds deletion - discarding transcript
");
return
@chimp_transcripts
;
}
if
(
$del_len
>
$MAX_CODING_INDEL
)
{
debug
("
long cds deletion (
$del_len
) - discarding transcript
");
if
(
$del_len
>
$MAX_CODING_INDEL
)
{
debug
("
long cds deletion (
$del_len
) - discarding transcript
");
# TBD split transcript in two parts and keep both halves
# TBD split transcript in two parts and keep both halves
return
()
;
}
return
@chimp_transcripts
;
}
# if this deletion is short and %3 == 0, then we do not need to
# do much, but may have to adjust coding start/end of translation
if
(
$del_len
%
3
==
0
)
{
debug
("
short (
$del_len
) in frame cds deletion - keeping exon
");
#adjust the translation end accordingly
if
(
$is_end_exon
)
{
#adjust translation end accordingly
$chimp_translation
->
end
(
$chimp_translation
->
end
-
$del_len
);
}
#TBD
my
$frame_shift
=
$del_len
%
3
;
next
COORD
;
}
# if this deletion is short and %3 == 0, then we do not need to
# do much, but may have to adjust coding end of translation
if
(
$frame_shift
==
0
)
{
debug
("
short (
$del_len
) in frame cds deletion - ok
");
if
(
$del_len
>
$MAX_FRAMESHIFT_INDEL
)
{
debug
("
medium (
$del_len
) frameshifting cds deletion -
"
.
"
discarding transcript
");
if
(
$is_end_exon
)
{
$chimp_translation
->
end
(
$chimp_translation
->
end
()
-
$del_len
);
}
#TBD split transcript in two parts and keep both halves
next
COORD
;
}
return
();
}
if
(
$del_len
>
$MAX_FRAMESHIFT_INDEL
)
{
debug
("
medium (
$del_len
) frameshifting cds deletion -
"
.
"
discarding transcript
");
#TBD split transcript in two parts and keep both halves
return
@chimp_transcripts
;
}
# this is a short frameshifting deletion
# check if deletion is at beginning of CDS
if
(
$cds_delete
->
start
()
==
1
)
{
# we need to adjust the start of the exon now to
# put everything back into frame
#sanity check:
if
(
!
$is_start_exon
)
{
throw
("
Unexpected: start of cds is not in start exon
");
}
# this is a short frameshifting deletion
# create a 'frameshift intron' and move on
debug
("
short (
$del_len
) frameshifting cds deletion -
"
.
"
creating fake intron
");
my
$ex_start
=
$chimp_exon
->
start
();
# if this exon the first coding exon, then we need to adjust the
# coding start of the translation, and possibly the exon start
# as well
#if there is already UTR, just shift it
if
(
$chimp_translation
->
start
>
1
)
{
debug
("
frameshift deletion at start of cds - extending utr
");
$chimp_translation
->
start
(
$chimp_translation
->
start
+
3
-
$frame_shift
);
}
else
{
$ex_start
=
$chimp_exon
->
start
();
# if this exon is the last coding exon, then we need to adjust the
# coding end
#there was no UTR so adjust exon start, leave w/ no UTR
if
(
!
defined
(
$ex_start
))
{
#sanity check:
if
(
@coords
<
$i
+
2
)
{
throw
("
Unexpected: no exon start defined and
"
.
"
no more coords available
");
}
$ex_start
=
$coords
[
$i
+
1
]
->
start
();
#peek at next coord
}
debug
("
frameshift deletion at start of cds -
"
.
"
moving exon start
");
$chimp_exon
->
start
(
$ex_start
+
3
-
$frame_shift
);
}
next
COORD
;
}
# check if deletion is at end of CDS
if
(
$cds_coord
->
end
()
==
get_cds_len
(
$transcript
))
{
# sanity check:
if
(
!
$is_end_exon
)
{
throw
("
Unexpected: end of cds is not in end exon
");
}
# if there was already a UTR just shift it
if
(
$translation
->
end
()
<
$exon
->
length
())
{
$chimp_translation
->
end
(
$chimp_translation
->
end
()
-
3
+
$frame_shift
);
}
else
{
# no utr and at end of CDS, so must be end of exon
# sanity checks:
if
(
!
defined
(
$chimp_exon
->
end
()))
{
throw
("
Unexpected: at end of exon and no end defined
");
}
debug
("
frameshift deletion at end of cds -
"
.
"
moving exon end and translation end
");
$chimp_exon
->
end
(
$chimp_exon
->
end
()
-
3
+
$frame_shift
);
$chimp_translation
->
end
(
$chimp_exon
->
length
());
}
next
COORD
;
}
# the deletion was not at either end of the cds, but could still
# be at the beginning or end of the exon
# check if this is a deletion at the exon start
if
(
$i
==
0
)
{
# sanity check:
if
(
$is_start_exon
)
{
throw
("
Unexpected: delete at start of start exon is
"
.
"
not at cds start
");
}
debug
("
frameshift deletion at start of exon -
"
.
"
adjusting start of exon
");
$chimp_exon
->
start
(
$chimp_exon
->
start
+
3
-
$frameshift
);
if
(
$is_end_exon
)
{
debug
("
adjusting translation end
");
$chimp_exon
->
translation_end
(
$chimp_exon_translation
->
end
-
3
+
$frameshift
);
}
next
COORD
;
}
#check if this is a deletion at the exon end
if
(
@coords
==
$i
+
1
)
{
# sanity check:
if
(
$is_end_exon
)
{
throw
("
Unexpected delete at end of end exon is
"
.
"
not at cds start
");
}
debug
("
frameshift deletion at end of exon -
"
.
"
adjusting end of exon
");
$chimp_exon
->
end
(
$chimp_exon
->
end
-
3
+
$frameshift
);
next
COORD
;
}
# now we know that this is a deletion in the middle of the
# cds
# for this to be possible there should already be an exon start/end
# because we should have already seen some exon coords
# sanity check
if
(
!
defined
(
$chimp_exon
->
start
()))
{
throw
("
Unexpected: mid cds delete and no exon start defined
");
}
if
(
!
defined
(
$chimp_exon
->
end
()))
{
throw
("
Unexpected: mid cds delete and no exon end defined
");
}
# split this exon in two parts and make a 'frameshift' intron
# in the middle
debug
("
short (
$del_len
) frameshifting mid cds deletion -
"
.
"
creating fake intron
");
my
$ex_len
=
$chimp_exon
->
length
();
$chimp_exon
->
end
(
$chimp_exon
->
end
()
-
3
+
$frameshift
)
push
@chimp_exons
,
$chimp_exon
;
$chimp_exon
=
Bio::EnsEMBL::
Exon
->
new
(
-
START
=>
$chimp_exon
->
end
()
+
4
-
$frame_shift
,
-
STRAND
=>
$chimp_exon
->
strand
());
}
}
else
{
#if first is coord no need to do anything
next
COORD
if
(
$i
==
0
);
my
$prev_c
=
$coords
[
$i
];
my
$prev_c
=
$coords
[
$i
-
1
];
next
if
(
$prev_c
->
isa
('
Bio::EnsEMBL::Mapper::Gap
'));
...
...
@@ -256,7 +468,7 @@ sub transfer_transcript {
#TBD it may not be necessary to discard transcript if this is UTR
return
()
;
return
@chimp_transcripts
;
}
if
(
$c
->
strand
()
!=
$prev_c
->
strand
())
{
...
...
@@ -264,7 +476,7 @@ sub transfer_transcript {
#TBD it may not be necessary to discard transcript if this is UTR
return
()
;
return
@chimp_transcripts
;
}
...
...
@@ -294,6 +506,14 @@ sub transfer_transcript {
$trans_mapper
->
genomic2cds
(
$left_coord
,
$right_coord
,
-
1
);
}
if
(
$insert_len
<
1
)
{
debug
("
inverted exon - discarding transcript
");
#TBD - is there something sensible that can be done here?
return
@chimp_transcripts
;
}
if
(
@cds_coords
==
1
&&
$cds_coords
[
0
]
->
isa
('
Bio::EnsEMBL::Mapper::Coordinate
'))
{
...
...
@@ -305,7 +525,7 @@ sub transfer_transcript {
# TBD may be ok to keep transcript and split in two?
return
()
;
return
@chimp_transcripts
;
}
if
(
$insert_len
%
3
==
0
)
{
...
...
@@ -323,7 +543,7 @@ sub transfer_transcript {
#TBD may be ok to keep transcript and split in two
return
()
;
return
@chimp_transcripts
;
}
debug
("
short (
$insert_len
) frameshifting cds insert
"
.
...
...
@@ -358,11 +578,24 @@ sub transfer_transcript {
# * exons on different scaffolds
# * exons on different strands
# * exons in which the splice order changed
# * transcripts which lost all exons
# maybe phases should be set here
}
sub
get_cds_len
{
my
$transcript
=
shift
;
my
$len
=
0
;
foreach
my
$exon
(
@
{
$transcript
->
get_all_translateable_Exons
()})
{
$len
+=
$exon
->
length
();
}
return
$len
;
}
sub
debug
{
my
$msg
=
shift
;
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment