Skip to content
Snippets Groups Projects
Commit ee9aabdf authored by Monika Komorowska's avatar Monika Komorowska
Browse files

support for empty lines and fasta sequence section

parent af53381b
No related branches found
No related tags found
No related merge requests found
=pod
=head1 LICENSE
......@@ -24,7 +23,6 @@ Monika Komorowska, 2012 - monika@ebi.ac.uk
use strict;
use Bio::EnsEMBL::Utils::IO::GFFParser;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
use FileHandle;
my $file_name = "features.gff";
......@@ -47,21 +45,34 @@ while (defined($feature) ) {
#do something with the feature, e.g. print hash keys and values
foreach my $key (keys %feature) {
if ($key ne 'attribute') {
print $key . " " . $feature{$key} ."\n";
} else {
print $key . "\n";
my %attribs = %{$feature{$key}};
foreach my $attrib_key (keys %attribs) {
my $values = $attribs{$attrib_key};
printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
}
}
if ($key ne 'attribute') {
print $key . " " . $feature{$key} ."\n";
} else {
print $key . "\n";
my %attribs = %{$feature{$key}};
foreach my $attrib_key (keys %attribs) {
printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
}
}
}
print "\n\n";
$feature = $parser->parse_next_feature();
}
my $sequence = $parser->parse_next_sequence();
while (defined($sequence)) {
my %sequence = %{$sequence};
foreach my $key (keys %sequence) {
print $key . " " . $sequence{$key} ."\n";
}
print "\n\n";
$sequence = $parser->parse_next_sequence();
}
$parser->close();
$fh->close();
......@@ -70,7 +81,7 @@ $fh->close();
=head1 DESCRIPTION
GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml.
GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml
Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file.
......@@ -85,8 +96,10 @@ use warnings;
use Bio::EnsEMBL::Utils::Exception;
use IO::File;
use URI::Escape;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1');
=head2 new
......@@ -98,13 +111,15 @@ my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
=cut
sub new {
my $class = shift;
my $self = { filehandle => shift, };
bless $self, $class;
if ( !defined( $self->{'filehandle'} ) ) {
throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
}
return $self;
my $class = shift;
my $self = {
filehandle => shift,
};
bless $self, $class;
if (!defined($self->{'filehandle'})) {
throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
}
return $self;
}
......@@ -118,31 +133,31 @@ sub new {
sub parse_header {
my $self = shift;
my $self = shift;
my $next_line;
my @header_lines;
while ( ( $next_line = $self->_read_line() ) && ( $next_line =~ /^[\#|\s]/ ) )
{
#header lines start with ##
if ( $next_line =~ /^[\#]{2}/ ) {
push @header_lines, $next_line;
if ( $next_line =~ /gff-version\s+(\d+)/ ) {
if ( $1 != 3 ) {
warning(
"File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files."
);
}
}
my $next_line;
my @header_lines;
while (($next_line = $self->_read_line()) && ($next_line =~ /^[\#|\s]/) ) {
#stop parsing features if ##FASTA directive encountered
last if ($next_line =~ /\#\#FASTA/ );
#header lines start with ## (except for the ##FASTA directive indicating sequence section)
if ($next_line =~ /^[\#]{2}/ ) {
push @header_lines, $next_line;
if ($next_line =~ /gff-version\s+(\d+)/) {
if ($1 != 3) {
warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");
}
}
}
}
}
if ( defined($next_line) && ( $next_line !~ /^[\#|\s]/ ) ) {
$self->{'first_feature_line'} = $next_line;
}
return \@header_lines;
if (defined($next_line)) {
$self->{'first_non_header_line'} = $next_line;
}
return \@header_lines;
}
......@@ -162,93 +177,153 @@ sub parse_header {
attribute => hashref,
}
If the attribute value held more than one value then we hold an arrayref
not a scalar
Returntype : Hashref of a GFF3 feature line
=cut
sub parse_next_feature {
my $self = shift;
my $self = shift;
# my $next_line;
my $feature_line;
while ( my ($next_line) = $self->_read_line() ) {
next
if ( $next_line =~ /^\#/
|| $next_line =~ /^\s*$/
|| $next_line =~ /^\/\// );
$feature_line = $next_line;
last;
}
return undef unless $feature_line;
my $next_line;
my $feature_line;
while (($next_line = $self->_read_line() ) && defined($next_line) ) {
my %feature;
my %attribute;
#stop parsing features if ##FASTA directive
last if ($next_line =~ /\#\#FASTA/);
#strip off trailing comments
$feature_line =~ s/\#.*//;
my @chunks = split( /\t/, $feature_line );
next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
$next_line =~ /^\/\//);
%feature = (
'seqid' => uri_unescape( $chunks[0] ),
'source' => uri_unescape( $chunks[1] ),
'type' => uri_unescape( $chunks[2] ),
'start' => $chunks[3],
'end' => $chunks[4],
'score' => $chunks[5],
'strand' => $strand_conversion{ $chunks[6] },
'phase' => $chunks[7]
);
$feature_line = $next_line;
last;
}
if ( $chunks[8] ) {
return undef unless $feature_line;
my %feature;
my %attribute;
#strip off trailing comments
$feature_line =~ s/\#.*//;
my @chunks = split(/\t/, $feature_line);
%feature = (
'seqid' => uri_unescape($chunks[0]),
'source' => uri_unescape($chunks[1]),
'type' => uri_unescape($chunks[2]),
'start' => $chunks[3],
'end' => $chunks[4],
'score' => $chunks[5],
'strand' => $strand_conversion{$chunks[6]},
'phase' => $chunks[7]
);
if ($chunks[8]) {
my @attributes = split( /;/, $chunks[8] );
my %attributes;
foreach my $attribute (@attributes) {
my ( $name, $value ) = split( /=/, $attribute );
$name = uri_unescape($name);
my @split_values = map { uri_unescape($_) } split(/,/, $value);
if(scalar(@split_values) > 1) {
$attributes{$name} = \@split_values;
}
else {
$attributes{$name} = $split_values[0];
my %attributes;
foreach my $attribute (@attributes) {
my ( $name, $value ) = split( /=/, $attribute );
$name = uri_unescape($name);
my @split_values = map { uri_unescape($_) } split(/,/, $value);
if(scalar(@split_values) > 1) {
$attributes{$name} = \@split_values;
}
else {
$attributes{$name} = $split_values[0];
}
}
$feature{'attribute'} = \%attributes;
}
$feature{'attribute'} = \%attributes;
}
return \%feature;
return \%feature;
}
sub _read_line {
=head2 parse_next_sequence
my $self = shift;
my $fh = $self->{'filehandle'};
Arg [1] : File handle
Description: Returns a hashref in the format -
{
header => scalar,
sequence => scalar,
}
Returntype : Hashref of a GFF3 sequence line
my $line;
=cut
sub parse_next_sequence {
my $self = shift;
if ( defined( $self->{'first_feature_line'} ) ) {
$line = $self->{'first_feature_line'};
$self->{'first_feature_line'} = undef;
}
else {
$line = <$fh>;
if ( defined($line) ) {
chomp $line;
my $next_line;
my $sequence;
my $header;
while (($next_line = $self->_read_line() ) && defined($next_line) ) {
next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
$next_line =~ /^\/\//);
if ($next_line =~ /^>/) {
if ($header) {
#next fasta header encountered
$self->{'next_fasta_header'} = $next_line;
last;
} else {
$header = $next_line;
}
} else {
$sequence .= $next_line;
}
}
}
return $line;
return undef unless ($sequence || $header);
my %sequence = (header => $header , sequence => $sequence );
return \%sequence;
}
sub _read_line {
my $self = shift;
my $fh = $self->{'filehandle'};
my $line;
if (defined($self->{'first_non_header_line'})) {
$line = $self->{'first_non_header_line'};
$self->{'first_non_header_line'} = undef;
} elsif ( defined($self->{'next_fasta_header'} )) {
$line = $self->{'next_fasta_header'};
$self->{'next_fasta_header'} = undef;
}
else {
$line = <$fh>;
if (defined($line)) {
chomp $line;
if (!$line) {
#parse next line if current line is empty
$line = $self->_read_line();
}
}
}
return $line;
}
sub close {
my $self = shift;
$self->{"filehandle"} = undef;
my $self = shift;
$self->{"filehandle"} = undef;
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment