Skip to content
Snippets Groups Projects
Commit ee9aabdf authored by Monika Komorowska's avatar Monika Komorowska
Browse files

support for empty lines and fasta sequence section

parent af53381b
No related branches found
No related tags found
No related merge requests found
=pod =pod
=head1 LICENSE =head1 LICENSE
...@@ -24,7 +23,6 @@ Monika Komorowska, 2012 - monika@ebi.ac.uk ...@@ -24,7 +23,6 @@ Monika Komorowska, 2012 - monika@ebi.ac.uk
use strict; use strict;
use Bio::EnsEMBL::Utils::IO::GFFParser; use Bio::EnsEMBL::Utils::IO::GFFParser;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
use FileHandle; use FileHandle;
my $file_name = "features.gff"; my $file_name = "features.gff";
...@@ -47,21 +45,34 @@ while (defined($feature) ) { ...@@ -47,21 +45,34 @@ while (defined($feature) ) {
#do something with the feature, e.g. print hash keys and values #do something with the feature, e.g. print hash keys and values
foreach my $key (keys %feature) { foreach my $key (keys %feature) {
if ($key ne 'attribute') { if ($key ne 'attribute') {
print $key . " " . $feature{$key} ."\n"; print $key . " " . $feature{$key} ."\n";
} else { } else {
print $key . "\n"; print $key . "\n";
my %attribs = %{$feature{$key}}; my %attribs = %{$feature{$key}};
foreach my $attrib_key (keys %attribs) { foreach my $attrib_key (keys %attribs) {
my $values = $attribs{$attrib_key}; printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
} }
} }
} }
print "\n\n"; print "\n\n";
$feature = $parser->parse_next_feature(); $feature = $parser->parse_next_feature();
} }
# Once the feature loop has finished, read the FASTA sequence section one
# record at a time; parse_next_sequence returns undef when no record is left.
my $sequence = $parser->parse_next_sequence();
while (defined($sequence)) {
# each record is a hashref with the keys 'header' and 'sequence'
my %sequence = %{$sequence};
foreach my $key (keys %sequence) {
print $key . " " . $sequence{$key} ."\n";
}
print "\n\n";
# fetch the following record (undef ends the loop)
$sequence = $parser->parse_next_sequence();
}
$parser->close(); $parser->close();
$fh->close(); $fh->close();
...@@ -70,7 +81,7 @@ $fh->close(); ...@@ -70,7 +81,7 @@ $fh->close();
=head1 DESCRIPTION =head1 DESCRIPTION
GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml.
Use the parse_header method to parse a GFF3 file header, parse_next_feature to parse the next feature line in the file, and parse_next_sequence to read the next record of the optional FASTA sequence section that follows the ##FASTA directive.
...@@ -85,8 +96,10 @@ use warnings; ...@@ -85,8 +96,10 @@ use warnings;
use Bio::EnsEMBL::Utils::Exception; use Bio::EnsEMBL::Utils::Exception;
use IO::File; use IO::File;
use URI::Escape; use URI::Escape;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' ); my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1');
=head2 new =head2 new
...@@ -98,13 +111,15 @@ my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' ); ...@@ -98,13 +111,15 @@ my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
=cut =cut
# Constructor.
#   Arg [1]    : class name
#   Arg [2]    : open filehandle to a GFF3 formatted file
#   Returntype : blessed hashref holding the handle under the key 'filehandle'
#   Exceptions : calls throw() when no filehandle is supplied
sub new {
    my ($class, $filehandle) = @_;

    # Reject a missing handle up front so no half-built object is created.
    if (!defined($filehandle)) {
        throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
    }

    my $self = { filehandle => $filehandle };
    return bless $self, $class;
}
...@@ -118,31 +133,31 @@ sub new { ...@@ -118,31 +133,31 @@ sub new {
# Reads the header section at the top of the file.
#   Returntype : arrayref of the directive lines (those starting with '##')
#                found before the first non-header line
# Plain '#' comments and whitespace-led lines are consumed but not returned.
# The line that ends the header (the first feature line or the ##FASTA
# directive) is stored under 'first_non_header_line' so that _read_line
# hands it out again on the next read.
sub parse_header {
    my $self = shift;

    my @header_lines;
    my $next_line;

    # Test with defined() so that a line consisting of "0" is not mistaken
    # for end-of-file.
    while (defined($next_line = $self->_read_line())) {

        # Header lines start with '#' or whitespace. The previous class
        # [\#|\s] also matched a literal '|', silently discarding such lines.
        last if $next_line !~ /^[\#\s]/;

        # The ##FASTA directive opens the sequence section and is not a header.
        last if $next_line =~ /^\s*\#\#FASTA/;

        # Only '##' directives are reported back to the caller.
        next if $next_line !~ /^\#\#/;
        push @header_lines, $next_line;

        my ($version) = $next_line =~ /gff-version\s+(\d+)/;
        if (defined($version) && $version != 3) {
            warning("File has been formatted in GFF version $version. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");
        }
    }

    if (defined($next_line)) {
        $self->{'first_non_header_line'} = $next_line;
    }

    return \@header_lines;
}
...@@ -162,93 +177,153 @@ sub parse_header { ...@@ -162,93 +177,153 @@ sub parse_header {
attribute => hashref, attribute => hashref,
} }
If the attribute value held more than one value then we hold an arrayref
not a scalar
Returntype : Hashref of a GFF3 feature line Returntype : Hashref of a GFF3 feature line
=cut =cut
# Parses the next feature line of the file.
#   Returntype : hashref with the keys seqid, source, type, start, end,
#                score, strand, phase and (when column 9 is present)
#                attribute; undef when no feature line is left or the
#                ##FASTA directive is reached
# An attribute holding several comma-separated values is stored as an
# arrayref, a single value as a scalar.
sub parse_next_feature {
    my $self = shift;

    my $feature_line;

    # defined() rather than truthiness: a line consisting of "0" must not
    # be taken for end-of-file.
    while (defined(my $next_line = $self->_read_line())) {

        # Stop at the ##FASTA directive. Anchored so that a feature line
        # merely mentioning it in a trailing comment is still parsed.
        last if $next_line =~ /^\s*\#\#FASTA/;

        # Skip comments, blank lines and '//' separator lines.
        next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
                 $next_line =~ /^\/\//);

        $feature_line = $next_line;
        last;
    }

    return undef unless defined($feature_line);

    #strip off trailing comments
    $feature_line =~ s/\#.*//;

    my @chunks = split(/\t/, $feature_line);

    my %feature = (
        'seqid'  => uri_unescape($chunks[0]),
        'source' => uri_unescape($chunks[1]),
        'type'   => uri_unescape($chunks[2]),
        'start'  => $chunks[3],
        'end'    => $chunks[4],
        'score'  => $chunks[5],
        'strand' => $strand_conversion{$chunks[6]},
        'phase'  => $chunks[7]
    );

    if ($chunks[8]) {
        my %attributes;
        foreach my $attribute (split(/;/, $chunks[8])) {

            # Ignore empty entries such as the one produced by ';;'.
            next if $attribute =~ /^\s*$/;

            # Limit of 2 keeps any further '=' inside the value.
            my ($name, $value) = split(/=/, $attribute, 2);
            $name = uri_unescape($name);

            # A tag without '=value' is kept with an undefined value.
            my @split_values;
            if (defined($value)) {
                @split_values = map { uri_unescape($_) } split(/,/, $value);
            }

            if (scalar(@split_values) > 1) {
                $attributes{$name} = \@split_values;
            }
            else {
                $attributes{$name} = $split_values[0];
            }
        }
        $feature{'attribute'} = \%attributes;
    }

    return \%feature;
}
sub _read_line { =head2 parse_next_sequence
my $self = shift; Arg [1] : File handle
my $fh = $self->{'filehandle'}; Description: Returns a hashref in the format -
{
header => scalar,
sequence => scalar,
}
Returntype : Hashref of a GFF3 sequence line
my $line; =cut
# Parses the next record of the FASTA sequence section.
#   Returntype : hashref { header => scalar, sequence => scalar }, or undef
#                when neither a header nor sequence data is left
# The method takes no argument besides the parser object. Sequence lines are
# concatenated without separators. When the header of the following record is
# met it is stored under 'next_fasta_header' so the next call starts with it.
sub parse_next_sequence {
    my $self = shift;

    my $header;
    my $sequence;

    # defined() rather than truthiness, so a line consisting of "0" does
    # not cut the record short.
    while (defined(my $next_line = $self->_read_line())) {

        # Skip comments/directives, blank lines and '//' separator lines.
        next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
                 $next_line =~ /^\/\//);

        if ($next_line !~ /^>/) {
            $sequence .= $next_line;
            next;
        }

        if (defined($header)) {
            #next fasta header encountered
            $self->{'next_fasta_header'} = $next_line;
            last;
        }

        $header = $next_line;
    }

    return undef unless (defined($header) || defined($sequence));

    my %sequence = (header => $header, sequence => $sequence);
    return \%sequence;
}
# Returns the next non-empty line with its line terminator removed, or undef
# at end of file.
# A line stored earlier under 'first_non_header_line' (by parse_header) or
# 'next_fasta_header' (by parse_next_sequence) is handed out first, in that
# order, and its slot is cleared.
sub _read_line {
    my $self = shift;

    foreach my $slot ('first_non_header_line', 'next_fasta_header') {
        next unless defined($self->{$slot});
        my $stored = $self->{$slot};
        $self->{$slot} = undef;
        return $stored;
    }

    my $fh = $self->{'filehandle'};

    # Loop instead of recursing, so a long run of blank lines cannot
    # trigger deep-recursion warnings.
    while (defined(my $line = <$fh>)) {
        chomp $line;

        # Drop the carriage return left behind by CRLF line endings.
        $line =~ s/\r\z//;

        # length() rather than truthiness: a line holding just "0" is data.
        return $line if length($line);
    }

    return undef;
}
# Drops the parser's reference to the filehandle. The handle itself is not
# closed here; the caller remains responsible for closing it.
sub close {
    my $self = shift;
    $self->{"filehandle"} = undef;
}
1; 1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment