Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
a5c5d208
Commit
a5c5d208
authored
21 years ago
by
Eduardo Eyras
Browse files
Options
Downloads
Patches
Plain Diff
module with some methods to convert gff into ensembl exons
parent
fdfd4dd6
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
modules/Bio/EnsEMBL/Utils/GFF2Exon.pm
+368
-0
368 additions, 0 deletions
modules/Bio/EnsEMBL/Utils/GFF2Exon.pm
with
368 additions
and
0 deletions
modules/Bio/EnsEMBL/Utils/GFF2Exon.pm
0 → 100644
+
368
−
0
View file @
a5c5d208
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
GFF2Exon
=head1 SYNOPSIS
use Bio::EnsEMBL::Utils::GFF2Exon;
my $gff = GFF2Exon->new(-gff_version => 2);
my $exon = $gff->exon_from_gff( $gff_string );
=cut
# Let the code begin...
package
Bio::EnsEMBL::Utils::
GFF2Exon
;
use
vars
qw(@ISA)
;
use
strict
;
use
Bio::EnsEMBL::
Root
;
@ISA
=
qw(Bio::EnsEMBL::Root)
;
sub
new
{
my
(
$class
,
@args
)
=
@_
;
my
$self
=
$class
->
SUPER::
new
(
@args
);
my
(
$gff_version
)
=
$self
->
_rearrange
([
qw(GFF_VERSION)
],
@args
);
if
((
$gff_version
!=
1
)
&&
(
$gff_version
!=
2
))
{
$self
->
throw
("
unknown version:
$gff_version
");
}
$self
->
gff_version
(
$gff_version
);
return
$self
;
}
=head2 exon_from_gff
Usage : my $exon = $gff->exon_from_gff_string($gff_string);
Function: Sets properties of an exon object from a GFF-formatted
string. Interpretation of the string depends on the version
that has been specified at initialization.
Args : The GFF-formatted string
=cut
sub
exon_from_gff
{
my
(
$self
,
$gff_string
,
$trans_hash
)
=
@_
;
if
(
$self
->
gff_version
()
==
1
)
{
return
$self
->
_from_gff1_string
(
$gff_string
,
$trans_hash
);
}
else
{
return
$self
->
_from_gff2_string
(
$gff_string
,
$trans_hash
);
}
}
=head2 _from_gff1_string
Returns : Exon
Args : The GFF-formatted string to initialize it from
=cut
sub
_from_gff1_string
{
my
(
$self
,
$gffstring
,
$trans_hash
)
=
@_
;
#print STDERR "gff_string: $gffstring\n";
#39896 human_cdna exon 1030942 1031591 100 + 0 6604
#39897 human_cdna exon 1034227 1034285 100 + 0 6604
#39898 human_cdna exon 1035280 1035339 100 + 0 6604
#39899 human_cdna exon 1035741 1035820 100 + 0 6604
#39900 human_cdna exon 1036477 1036629 100 + 0 6604
#243299 human_cdna exon 1037161 1037229 100 + 0 40150
#243301 human_cdna exon 1043180 1043315 100 + 0 40150
#243303 human_cdna exon 1048194 1048340 100 + 0 40150
#243305 human_cdna exon 1053724 1053855 100 + 0 40150
#243307 human_cdna exon 1054346 1054451 100 + 0 40150
#243309 human_cdna exon 1055667 1055833 100 + 0 40150
#243310 human_cdna exon 1057048 1057221 100 + 0 40150
#243311 human_cdna exon 1058529 1058630 100 + 0 40150
#243313 human_cdna exon 1061487 1062080 100 + 0 40150
my
(
$seqname
,
$source
,
$primary
,
$start
,
$end
,
$score
,
$strand
,
$frame
,
@group
)
=
split
(
/\s+/
,
$gffstring
);
if
(
!
defined
$frame
)
{
$self
->
throw
("
[
$gffstring
] does not look like GFF to me
");
}
$frame
=
0
unless
(
$frame
=~
/^\d+$/
);
my
$exon
=
Bio::EnsEMBL::
Exon
->
new
();
$exon
->
seqname
(
$seqname
);
#$exon->source_tag($source);
$exon
->
start
(
$start
);
$exon
->
end
(
$end
);
my
$phase
=
(
3
-
$frame
)
%
3
;
$exon
->
phase
(
$phase
);
$exon
->
end_phase
(
(
$exon
->
phase
+
$exon
->
length
)
%
3
);
if
(
$score
){
$exon
->
score
(
$score
);
}
if
(
$strand
eq
'
-
'
)
{
$exon
->
strand
(
-
1
);
}
if
(
$strand
eq
'
+
'
)
{
$exon
->
strand
(
1
);
}
if
(
$strand
eq
'
.
'
)
{
$exon
->
strand
(
0
);
}
############################################################
# warning: it parses only the first element of the group
$trans_hash
->
{
$exon
}
=
$group
[
0
];
return
$exon
;
}
=head2 _from_gff2_string
Returns : Exon
Args : The GFF2-formatted string to initialize it from
=cut
sub
_from_gff2_string
{
my
(
$self
,
$gffstring
)
=
@_
;
chomp
(
$gffstring
);
# according to the Sanger website, GFF2 should be single-tab separated elements, and the
# free-text at the end should contain text-translated tab symbols but no "real" tabs,
# so splitting on \t is safe, and $attribs gets the
# entire attributes field to be parsed later
my
(
$seqname
,
$source
,
$primary
,
$start
,
$end
,
$score
,
$strand
,
$frame
,
@attribs
)
=
split
(
/\t+/
,
$gffstring
);
# just in case the rule against tab characters has been broken
my
$attribs
=
join
'',
@attribs
;
if
(
!
defined
$frame
)
{
$self
->
throw
("
[
$gffstring
] does not look like GFF2 to me
");
}
my
$exon
=
Bio::EnsEMBL::
Exon
->
new
();
$exon
->
seqname
(
$seqname
);
$exon
->
source_tag
(
$source
);
$exon
->
analysis
->
gff_feature
(
$primary
);
$exon
->
start
(
$start
);
$exon
->
end
(
$end
);
$exon
->
phase
(
$frame
);
$exon
->
end_phase
(
(
$exon
->
phase
+
$exon
->
length
)
%
3
);
if
(
$score
eq
'
.
'
)
{
#$exon->score(undef);
}
else
{
$exon
->
score
(
$score
);
}
if
(
$strand
eq
'
-
'
)
{
$exon
->
strand
(
-
1
);
}
if
(
$strand
eq
'
+
'
)
{
$exon
->
strand
(
1
);
}
if
(
$strand
eq
'
.
'
)
{
$exon
->
strand
(
0
);
}
# <Begin Inefficient Code from Mark Wilkinson>
# this routine is necessay to allow the presence of semicolons in quoted text
# Semicolons are the delimiting character for new tag/value attributes.
# it is more or less a "state" machine, with the "quoted" flag going up and down
# as we pass thorugh quotes to distinguish free-text semicolon and hash
# symbols from GFF control characters
# remove comments field (format: #blah blah blah... at the end of the GFF line)
#$attribs =~ s/\#(.*)$//;
#my @att = split //, $attribs; # split into individual characters
#my $num = $#att; # count them
#my $flag = 0;
#my @parsed; # this is needed to hold the characters that have been parsed
# run through each character one at a time and check it
#for (my $a = 0; $a <= $num ; $a +=1){
# if ($att[$a] eq "\""){$flag=($flag==0)?1:0} # flag up on entering quoted text, down on leaving it
# if (($att[$a] eq ";") && $flag){$att[$a] = "INSERT_SEMICOLON_HERE"} # replace semicolon with an unusual message if the quoted text flag is up
# if (($att[$a] eq "#") && !$flag){last} # an unquoted hash symbol means the beginning of the comments field - discard
# push @parsed, $att[$a] # take the parsed character and push it onto the parsed list
# }
#$attribs = join "", @parsed; # rejoin into a single string
# <End Inefficient Code> Please feel free to fix this and make it more "perlish"
# my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
# foreach my $pair ( @key_vals ) {
# $pair =~ s/INSERT_SEMICOLON_HERE/;/g; # replace semicolons that were removed from free-text above.
# my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair; # separate the key from the value
# my @values;
# while ($values =~ s/"(.*?)"//){ # free text is quoted, so match each free-text block
# if ($1){push @values, $1}; # and push it on to the list of values (tags may have more than one value...)
#}
# my @othervals = split /\s+/, $values; # and what is left over should be space-separated non-free-text values
# foreach my $othervalue(@othervals){
# if (CORE::length($othervalue) > 0){push @values, $othervalue} # get rid of any empty strings which might result from the split
# }
# foreach my $value(@values){
# $exon->add_tag_value($key, $value);
# }
# }
return
$exon
;
}
=head2 gff_string
=cut
sub
gff_string
{
my
(
$self
,
$exon
,
$transcript
)
=
@_
;
if
(
$self
->
gff_version
()
==
1
)
{
return
$self
->
_gff1_string
(
$exon
,
$transcript
);
}
else
{
return
$self
->
_gff2_string
(
$exon
,
$transcript
);
}
}
=head2 _gff1_string
=cut
sub
_gff1_string
{
my
(
$self
,
$exon
,
$transcript
)
=
@_
;
my
(
$str
,
$source
,
$primary_tag
,
$score
,
$frame
,
$name
,
$strand
);
if
(
$exon
->
can
('
score
')
)
{
$score
=
$exon
->
score
();
}
$score
=
'
.
'
unless
defined
$score
;
if
(
$exon
->
can
('
frame
')
)
{
$frame
=
$exon
->
frame
();
}
$frame
=
'
.
'
unless
defined
$frame
;
$strand
=
$exon
->
strand
();
if
(
!
$strand
)
{
$strand
=
"
.
";
}
elsif
(
$strand
==
1
)
{
$strand
=
'
+
';
}
elsif
(
$exon
->
strand
==
-
1
)
{
$strand
=
'
-
';
}
$name
=
$exon
->
seqname
();
$source
=
"
merged
";
$primary_tag
=
"
exon
";
$str
=
join
("
\t
",
$name
,
$source
,
$primary_tag
,
$exon
->
start
(),
$exon
->
end
(),
$score
,
$strand
,
$frame
);
my
$tag_str
=
$transcript
->
type
;
$str
.=
"
\t
$tag_str
";
return
$str
;
}
=head2 _gff2_string
=cut
sub
_gff2_string
{
my
(
$gff
,
$exon
)
=
@_
;
my
(
$str
,
$score
,
$frame
,
$name
,
$strand
);
if
(
$exon
->
can
('
score
')
)
{
$score
=
$exon
->
score
();
}
$score
=
'
.
'
unless
defined
$score
;
if
(
$exon
->
can
('
frame
')
)
{
$frame
=
$exon
->
frame
();
}
$frame
=
'
.
'
unless
defined
$frame
;
$strand
=
$exon
->
strand
();
if
(
!
$strand
)
{
$strand
=
"
.
";
}
elsif
(
$strand
==
1
)
{
$strand
=
'
+
';
}
elsif
(
$exon
->
strand
==
-
1
)
{
$strand
=
'
-
';
}
if
(
$exon
->
can
('
seqname
')
)
{
$name
=
$exon
->
seqname
();
$name
||=
'
SEQ
';
}
else
{
$name
=
'
SEQ
';
}
$str
=
join
("
\t
",
$name
,
$exon
->
source_tag
(),
$exon
->
analysis
->
gff_feature
(),
$exon
->
start
(),
$exon
->
end
(),
$score
,
$strand
,
$frame
);
# the routine below is the only modification I made to the original
# ->gff_string routine (above) as on November 17th, 2000, the
# Sanger webpage describing GFF2 format reads: "From version 2
# onwards, the attribute field must have a tag value structure
# following the syntax used within objects in a .ace file,
# flattened onto one line by semicolon separators. Tags must be
# standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
# must be quoted with double quotes".
# MW
#my $valuestr;
#if ($exon->all_tags){ # only play this game if it is worth playing...
# $str .= "\t"; # my interpretation of the GFF2 specification suggests the need for this additional TAB character...??
# foreach my $tag ( $exon->all_tags ) {
# my $valuestr; # a string which will hold one or more values for this tag, with quoted free text and space-separated individual values.
# foreach my $value ( $exon->each_tag_value($tag) ) {
# if ($value =~ /[^A-Za-z0-9_]/){
# $value =~ s/\t/\\t/g; # substitute tab and newline characters
# $value =~ s/\n/\\n/g; # to their UNIX equivalents
# $value = '"' . $value . '" '} # if the value contains anything other than valid tag/value characters, then quote it
# $valuestr .= $value . " "; # with a trailing space in case there are multiple values
# # for this tag (allowed in GFF2 and .ace format)
# }
# $str .= "$tag $valuestr ; "; # semicolon delimited with no '=' sign
# }
# chop $str; chop $str # remove the trailing semicolon and space
# }
return
$str
;
}
=head2 gff_version
=cut
sub
gff_version
{
my
(
$self
,
$value
)
=
@_
;
if
(
defined
$value
&&
((
$value
==
1
)
||
(
$value
==
2
)))
{
$self
->
{'
GFF_VERSION
'}
=
$value
;
}
return
$self
->
{'
GFF_VERSION
'};
}
1
;
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment