Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
ee9aabdf
Commit
ee9aabdf
authored
12 years ago
by
Monika Komorowska
Browse files
Options
Downloads
Patches
Plain Diff
support for empty lines and fasta sequence section
parent
af53381b
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm
+179
-104
179 additions, 104 deletions
modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm
with
179 additions
and
104 deletions
modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm
+
179
−
104
View file @
ee9aabdf
=pod
=pod
=head1 LICENSE
=head1 LICENSE
...
@@ -24,7 +23,6 @@ Monika Komorowska, 2012 - monika@ebi.ac.uk
...
@@ -24,7 +23,6 @@ Monika Komorowska, 2012 - monika@ebi.ac.uk
use strict;
use strict;
use Bio::EnsEMBL::Utils::IO::GFFParser;
use Bio::EnsEMBL::Utils::IO::GFFParser;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
use FileHandle;
use FileHandle;
my $file_name = "features.gff";
my $file_name = "features.gff";
...
@@ -47,21 +45,34 @@ while (defined($feature) ) {
...
@@ -47,21 +45,34 @@ while (defined($feature) ) {
#do something with the feature, e.g. print hash keys and values
#do something with the feature, e.g. print hash keys and values
foreach my $key (keys %feature) {
foreach my $key (keys %feature) {
if ($key ne 'attribute') {
if ($key ne 'attribute') {
print $key . " " . $feature{$key} ."\n";
print $key . " " . $feature{$key} ."\n";
} else {
} else {
print $key . "\n";
print $key . "\n";
my %attribs = %{$feature{$key}};
my %attribs = %{$feature{$key}};
foreach my $attrib_key (keys %attribs) {
foreach my $attrib_key (keys %attribs) {
my $values = $attribs{$attrib_key}
;
printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}))
;
printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
}
}
}
}
}
}
print "\n\n";
print "\n\n";
$feature = $parser->parse_next_feature();
$feature = $parser->parse_next_feature();
}
}
my $sequence = $parser->parse_next_sequence();
while (defined($sequence)) {
my %sequence = %{$sequence};
foreach my $key (keys %sequence) {
print $key . " " . $sequence{$key} ."\n";
}
print "\n\n";
$sequence = $parser->parse_next_sequence();
}
$parser->close();
$parser->close();
$fh->close();
$fh->close();
...
@@ -70,7 +81,7 @@ $fh->close();
...
@@ -70,7 +81,7 @@ $fh->close();
=head1 DESCRIPTION
=head1 DESCRIPTION
GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml
.
GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml
Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file.
Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file.
...
@@ -85,8 +96,10 @@ use warnings;
...
@@ -85,8 +96,10 @@ use warnings;
use
Bio::EnsEMBL::Utils::
Exception
;
use
Bio::EnsEMBL::Utils::
Exception
;
use
IO::
File
;
use
IO::
File
;
use
URI::
Escape
;
use
URI::
Escape
;
use
Bio::EnsEMBL::Utils::
Scalar
qw/wrap_array/
;
my
%strand_conversion
=
(
'
+
'
=>
'
1
',
'
?
'
=>
'
0
',
'
-
'
=>
'
-1
'
);
my
%strand_conversion
=
(
'
+
'
=>
'
1
',
'
?
'
=>
'
0
',
'
-
'
=>
'
-1
');
=head2 new
=head2 new
...
@@ -98,13 +111,15 @@ my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
...
@@ -98,13 +111,15 @@ my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
=cut
=cut
sub
new
{
sub
new
{
my
$class
=
shift
;
my
$class
=
shift
;
my
$self
=
{
filehandle
=>
shift
,
};
my
$self
=
{
bless
$self
,
$class
;
filehandle
=>
shift
,
if
(
!
defined
(
$self
->
{'
filehandle
'}
)
)
{
};
throw
("
GFFParser requires a valid filehandle to a GFF3 formatted file
");
bless
$self
,
$class
;
}
if
(
!
defined
(
$self
->
{'
filehandle
'}))
{
return
$self
;
throw
("
GFFParser requires a valid filehandle to a GFF3 formatted file
");
}
return
$self
;
}
}
...
@@ -118,31 +133,31 @@ sub new {
...
@@ -118,31 +133,31 @@ sub new {
sub
parse_header
{
sub
parse_header
{
my
$self
=
shift
;
my
$self
=
shift
;
my
$next_line
;
my
$next_line
;
my
@header_lines
;
my
@header_lines
;
while
(
(
$next_line
=
$self
->
_read_line
()
)
&&
(
$next_line
=~
/^[\#|\s]/
)
)
while
((
$next_line
=
$self
->
_read_line
())
&&
(
$next_line
=~
/^[\#|\s]/
)
)
{
{
#stop parsing features if ##FASTA directive encountered
#header lines start with ##
last
if
(
$next_line
=~
/\#\#FASTA/
);
if
(
$next_line
=~
/^[\#]{2}/
)
{
push
@header_lines
,
$next_line
;
#header lines start with ## (except for the ##FASTA directive indicating sequence section)
if
(
$next_line
=~
/gff-version\s+(\d+)/
)
{
if
(
$next_line
=~
/^[\#]{2}/
)
{
if
(
$
1
!=
3
)
{
push
@header_lines
,
$next_line
;
warning
(
if
(
$next_line
=~
/gff-version\s+(\d+)/
)
{
"
File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.
"
if
(
$
1
!=
3
)
{
);
warning
("
File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.
");
}
}
}
}
}
}
}
}
if
(
defined
(
$next_line
)
&&
(
$next_line
!~
/^[\#|\s]/
)
)
{
if
(
defined
(
$next_line
))
{
$self
->
{'
first_
feature
_line
'}
=
$next_line
;
$self
->
{'
first_
non_header
_line
'}
=
$next_line
;
}
}
return
\
@header_lines
;
return
\
@header_lines
;
}
}
...
@@ -162,93 +177,153 @@ sub parse_header {
...
@@ -162,93 +177,153 @@ sub parse_header {
attribute => hashref,
attribute => hashref,
}
}
If an attribute has more than one value, the values are held as an arrayref
rather than a scalar.
Returntype : Hashref of a GFF3 feature line
Returntype : Hashref of a GFF3 feature line
=cut
=cut
sub
parse_next_feature
{
sub
parse_next_feature
{
my
$self
=
shift
;
my
$self
=
shift
;
# my $next_line;
my
$next_line
;
my
$feature_line
;
my
$feature_line
;
while
(
my
(
$next_line
)
=
$self
->
_read_line
()
)
{
while
((
$next_line
=
$self
->
_read_line
()
)
&&
defined
(
$next_line
)
)
{
next
if
(
$next_line
=~
/^\#/
||
$next_line
=~
/^\s*$/
||
$next_line
=~
/^\/\//
);
$feature_line
=
$next_line
;
last
;
}
return
undef
unless
$feature_line
;
my
%feature
;
#stop parsing features if ##FASTA directive
my
%attribute
;
last
if
(
$next_line
=~
/\#\#FASTA/
)
;
#strip off trailing comments
$feature_line
=~
s/\#.*//
;
my
@chunks
=
split
(
/\t/
,
$feature_line
);
next
if
(
$next_line
=~
/^\#/
||
$next_line
=~
/^\s*$/
||
$next_line
=~
/^\/\//
);
%feature
=
(
$feature_line
=
$next_line
;
'
seqid
'
=>
uri_unescape
(
$chunks
[
0
]
),
last
;
'
source
'
=>
uri_unescape
(
$chunks
[
1
]
),
}
'
type
'
=>
uri_unescape
(
$chunks
[
2
]
),
'
start
'
=>
$chunks
[
3
],
'
end
'
=>
$chunks
[
4
],
'
score
'
=>
$chunks
[
5
],
'
strand
'
=>
$strand_conversion
{
$chunks
[
6
]
},
'
phase
'
=>
$chunks
[
7
]
);
if
(
$chunks
[
8
]
)
{
return
undef
unless
$feature_line
;
my
%feature
;
my
%attribute
;
#strip off trailing comments
$feature_line
=~
s/\#.*//
;
my
@chunks
=
split
(
/\t/
,
$feature_line
);
%feature
=
(
'
seqid
'
=>
uri_unescape
(
$chunks
[
0
]),
'
source
'
=>
uri_unescape
(
$chunks
[
1
]),
'
type
'
=>
uri_unescape
(
$chunks
[
2
]),
'
start
'
=>
$chunks
[
3
],
'
end
'
=>
$chunks
[
4
],
'
score
'
=>
$chunks
[
5
],
'
strand
'
=>
$strand_conversion
{
$chunks
[
6
]},
'
phase
'
=>
$chunks
[
7
]
);
if
(
$chunks
[
8
])
{
my
@attributes
=
split
(
/;/
,
$chunks
[
8
]
);
my
@attributes
=
split
(
/;/
,
$chunks
[
8
]
);
my
%attributes
;
my
%attributes
;
foreach
my
$attribute
(
@attributes
)
{
foreach
my
$attribute
(
@attributes
)
{
my
(
$name
,
$value
)
=
split
(
/=/
,
$attribute
);
my
(
$name
,
$value
)
=
split
(
/=/
,
$attribute
);
$name
=
uri_unescape
(
$name
);
$name
=
uri_unescape
(
$name
);
my
@split_values
=
map
{
uri_unescape
(
$_
)
}
split
(
/,/
,
$value
);
my
@split_values
=
map
{
uri_unescape
(
$_
)
}
split
(
/,/
,
$value
);
if
(
scalar
(
@split_values
)
>
1
)
{
if
(
scalar
(
@split_values
)
>
1
)
{
$attributes
{
$name
}
=
\
@split_values
;
$attributes
{
$name
}
=
\
@split_values
;
}
}
else
{
else
{
$attributes
{
$name
}
=
$split_values
[
0
];
$attributes
{
$name
}
=
$split_values
[
0
];
}
}
}
$feature
{'
attribute
'}
=
\
%attributes
;
}
}
$feature
{'
attribute
'}
=
\
%attributes
;
}
return
\
%feature
;
return
\
%feature
;
}
}
sub
_read_line
{
=head2 parse_next_sequence
my
$self
=
shift
;
Arg [1] : File handle
my
$fh
=
$self
->
{'
filehandle
'};
Description: Returns a hashref in the format -
{
header => scalar,
sequence => scalar,
}
Returntype : Hashref of a GFF3 sequence line
my
$line
;
=cut
sub
parse_next_sequence
{
my
$self
=
shift
;
if
(
defined
(
$self
->
{'
first_feature_line
'}
)
)
{
my
$next_line
;
$line
=
$self
->
{'
first_feature_line
'};
my
$sequence
;
$self
->
{'
first_feature_line
'}
=
undef
;
my
$header
;
}
else
{
while
((
$next_line
=
$self
->
_read_line
()
)
&&
defined
(
$next_line
)
)
{
$line
=
<
$fh
>
;
if
(
defined
(
$line
)
)
{
next
if
(
$next_line
=~
/^\#/
||
$next_line
=~
/^\s*$/
||
chomp
$line
;
$next_line
=~
/^\/\//
);
if
(
$next_line
=~
/^>/
)
{
if
(
$header
)
{
#next fasta header encountered
$self
->
{'
next_fasta_header
'}
=
$next_line
;
last
;
}
else
{
$header
=
$next_line
;
}
}
else
{
$sequence
.=
$next_line
;
}
}
}
}
return
$line
;
return
undef
unless
(
$sequence
||
$header
);
my
%sequence
=
(
header
=>
$header
,
sequence
=>
$sequence
);
return
\
%sequence
;
}
# Return the next line to parse, with its trailing newline removed, or undef
# once the filehandle is exhausted.
#
# Lines that other methods read ahead and stored on the object are served
# before the filehandle is touched, in this order:
#   first_non_header_line - stored by parse_header when it reads past the header
#   next_fasta_header     - stored by parse_next_sequence when it reads the
#                           header line of the following FASTA record
# Otherwise lines are read from the filehandle and empty ones are skipped.
sub _read_line {
    my $self = shift;

    # Serve a stored read-ahead line first; the slot is cleared once consumed.
    foreach my $slot ('first_non_header_line', 'next_fasta_header') {
        next unless defined($self->{$slot});
        my $stored = $self->{$slot};
        $self->{$slot} = undef;
        return $stored;
    }

    my $fh = $self->{'filehandle'};

    # Loop instead of recursing once per empty line: a long run of blank
    # lines would otherwise grow the call stack and trigger Perl's
    # "Deep recursion" warning.
    while (defined(my $line = <$fh>)) {
        chomp $line;
        # Skip lines that are false in Perl (the empty string, or a line
        # holding only "0"); callers test the returned line for truth.
        return $line if ($line);
    }

    # Explicit undef (not a bare return) so callers that invoke this in list
    # context still receive exactly one element, as they did before.
    return undef;
}
}
sub
close
{
sub
close
{
my
$self
=
shift
;
$self
->
{"
filehandle
"}
=
undef
;
my
$self
=
shift
;
$self
->
{"
filehandle
"}
=
undef
;
}
}
1
;
1
;
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment