support for empty lines and fasta sequence section

ee9aabdf · Monika Komorowska · af53381b · ee9aabdf
Commit ee9aabdf authored 12 years ago by Monika Komorowska
--- a/modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm
+++ b/modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm
 =pod
 =head1 LICENSE
@@ -24,7 +23,6 @@ Monika Komorowska, 2012 - monika@ebi.ac.uk
 use strict;
 use Bio::EnsEMBL::Utils::IO::GFFParser;
-use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
 use FileHandle;
 my $file_name = "features.gff";
@@ -47,21 +45,34 @@ while (defined($feature) ) {
    #do something with the feature, e.g. print hash keys and values 
    foreach my $key (keys %feature) {
-        if ($key ne 'attribute') {
+	if ($key ne 'attribute') {
-            print $key . " " . $feature{$key} ."\n";
+	    print $key . " " . $feature{$key} ."\n";
-        } else {
+	} else {
-        print $key . "\n";
+	    print $key . "\n";
-        my %attribs =  %{$feature{$key}};
+	    my %attribs =  %{$feature{$key}};
-        foreach my $attrib_key (keys %attribs) {
+	    foreach my $attrib_key (keys %attribs) {
-            my $values = $attribs{$attrib_key};
+		printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
-            printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
-        }
+	    }
-      }
+	}
    }
    print "\n\n";
    $feature = $parser->parse_next_feature();
 }
+my $sequence = $parser->parse_next_sequence();
+while (defined($sequence)) {
+    my %sequence = %{$sequence};
+    foreach my $key (keys %sequence) {      
+        print $key . " " . $sequence{$key} ."\n";
+    }
+    print "\n\n";   
+    $sequence = $parser->parse_next_sequence();
+}
 $parser->close();
 $fh->close();
@@ -70,7 +81,7 @@ $fh->close();
 =head1 DESCRIPTION
-GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml.
+GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml
 Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file.
@@ -85,8 +96,10 @@ use warnings;
 use Bio::EnsEMBL::Utils::Exception;
 use IO::File;
 use URI::Escape;
+use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
-my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
+my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1');
 =head2 new
@@ -98,13 +111,15 @@ my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
 =cut
 sub new {
-  my $class = shift;
+    my $class = shift;
-  my $self = { filehandle => shift, };
+    my $self = {
-  bless $self, $class;
+        filehandle => shift,
-  if ( !defined( $self->{'filehandle'} ) ) {
+    };
-    throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
+    bless $self, $class;
-  }
+    if (!defined($self->{'filehandle'})) {
-  return $self;
+        throw("GFFParser requires a valid filehandle to a GFF3 formatted file"); 
+    }
+    return $self;
 }
@@ -118,31 +133,31 @@ sub new {
 sub parse_header {
-  my $self = shift;
+    my $self = shift;
-  my $next_line;
+    my $next_line;
-  my @header_lines;
+    my @header_lines;
-  while ( ( $next_line = $self->_read_line() ) && ( $next_line =~ /^[\#|\s]/ ) )
+    while (($next_line = $self->_read_line()) && ($next_line =~ /^[\#|\s]/) )  {
-  {
+	#stop parsing features if ##FASTA directive encountered
-    #header lines start with ##
+	last if ($next_line =~ /\#\#FASTA/ );
-    if ( $next_line =~ /^[\#]{2}/ ) {
-      push @header_lines, $next_line;
+	#header lines start with ## (except for the ##FASTA directive indicating sequence section)
-      if ( $next_line =~ /gff-version\s+(\d+)/ ) {
+	if ($next_line =~ /^[\#]{2}/ ) {
-        if ( $1 != 3 ) {
+	    push @header_lines, $next_line;
-          warning(
+	    if ($next_line =~ /gff-version\s+(\d+)/) {
-"File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files."
+		if ($1 != 3) {
-          );
+		    warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");  
-        }
+		}
-      }
+	    }
+	}
    }
-  }
-  if ( defined($next_line) && ( $next_line !~ /^[\#|\s]/ ) ) {
+    if (defined($next_line)) {
-    $self->{'first_feature_line'} = $next_line;
+	$self->{'first_non_header_line'} = $next_line;
-  }
+    }
-  return \@header_lines;
+    return \@header_lines;
 }
@@ -162,93 +177,153 @@ sub parse_header {
                   attribute => hashref, 
 		 }
-		If the attribute value held more than one value then we hold an arrayref
-		not a scalar
    Returntype : Hashref of a GFF3 feature line
 =cut
 sub parse_next_feature {
-  my $self = shift;
+    my $self = shift;
-  #    my $next_line;
+    my $next_line;
-  my $feature_line;
+    my $feature_line;
-  while ( my ($next_line) = $self->_read_line() ) {
+    while (($next_line = $self->_read_line() ) && defined($next_line) ) {
-    next
-      if ( $next_line =~ /^\#/
-      || $next_line =~ /^\s*$/
-      || $next_line =~ /^\/\// );
-    $feature_line = $next_line;
-    last;
-  }
-  return undef unless $feature_line;
-  my %feature;
+	#stop parsing features if ##FASTA directive
-  my %attribute;
+	last if ($next_line =~ /\#\#FASTA/);
-  #strip off trailing comments
-  $feature_line =~ s/\#.*//;
-  my @chunks = split( /\t/, $feature_line );
+	next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
+		$next_line =~ /^\/\//);
-  %feature = (
+	$feature_line = $next_line;
-    'seqid'  => uri_unescape( $chunks[0] ),
+	last;
-    'source' => uri_unescape( $chunks[1] ),
+    }
-    'type'   => uri_unescape( $chunks[2] ),
-    'start'  => $chunks[3],
-    'end'    => $chunks[4],
-    'score'  => $chunks[5],
-    'strand' => $strand_conversion{ $chunks[6] },
-    'phase'  => $chunks[7]
-  );
-  if ( $chunks[8] ) {
+    return undef unless $feature_line;
+    my %feature;
+    my %attribute;
+    #strip off trailing comments
+    $feature_line =~ s/\#.*//;
+    my @chunks = split(/\t/, $feature_line);
+    %feature = (
+	    'seqid' => uri_unescape($chunks[0]),
+            'source' => uri_unescape($chunks[1]),
+            'type' => uri_unescape($chunks[2]),
+            'start' => $chunks[3],
+            'end' => $chunks[4],
+            'score' => $chunks[5],
+            'strand' => $strand_conversion{$chunks[6]},
+            'phase' => $chunks[7] 
+    );
+    if ($chunks[8]) {
    my @attributes = split( /;/, $chunks[8] );
-    my %attributes;
+      my %attributes;
-    foreach my $attribute (@attributes) {
+      foreach my $attribute (@attributes) {
-      my ( $name, $value ) = split( /=/, $attribute );
+        my ( $name, $value ) = split( /=/, $attribute );
-      $name = uri_unescape($name);
+        $name = uri_unescape($name);
-      my @split_values = map { uri_unescape($_) } split(/,/, $value);
+        my @split_values = map { uri_unescape($_) } split(/,/, $value);
-      if(scalar(@split_values) > 1) {
+        if(scalar(@split_values) > 1) {
-        $attributes{$name} = \@split_values;
+          $attributes{$name} = \@split_values;
-      }
+        }
-      else {
+        else {
-        $attributes{$name} = $split_values[0];
+          $attributes{$name} = $split_values[0];
+        }
      }
+      $feature{'attribute'} = \%attributes;
    }
-    $feature{'attribute'} = \%attributes;
-  }
-  return \%feature;
+    return \%feature;    
 }
-sub _read_line {
+=head2 parse_next_sequence
-  my $self = shift;
+    Arg [1]    : File handle
-  my $fh   = $self->{'filehandle'};
+    Description: Returns a hashref in the format -
+                 {
+                   header => scalar,
+                   sequence => scalar,
+		 }
+    Returntype : Hashref of a GFF3 sequence line
-  my $line;
+=cut
+sub parse_next_sequence {
+    my $self = shift;
-  if ( defined( $self->{'first_feature_line'} ) ) {
+    my $next_line;
-    $line = $self->{'first_feature_line'};
+    my $sequence;
-    $self->{'first_feature_line'} = undef;
+    my $header;
-  }
-  else {
+    while (($next_line = $self->_read_line() ) && defined($next_line) ) {
-    $line = <$fh>;
-    if ( defined($line) ) {
+	next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
-      chomp $line;
+		$next_line =~ /^\/\//);
+	if ($next_line =~ /^>/) {
+	    if ($header) {
+		#next fasta header encountered
+		$self->{'next_fasta_header'} = $next_line; 
+		last;
+	    } else {
+		$header = $next_line;
+	    }
+	} else {
+	    $sequence .= $next_line;
+	}
    }
-  }
-  return $line;
+    return undef unless ($sequence || $header);
+    my %sequence = (header => $header , sequence => $sequence );
+    return \%sequence;    
+}
+sub _read_line {
+    my $self = shift;
+    my $fh = $self->{'filehandle'};
+    my $line;
+    if (defined($self->{'first_non_header_line'})) {
+	$line = $self->{'first_non_header_line'};
+	$self->{'first_non_header_line'} = undef;
+    } elsif ( defined($self->{'next_fasta_header'} )) {
+	$line = $self->{'next_fasta_header'};
+	$self->{'next_fasta_header'} = undef;
+    }
+    else {
+	$line = <$fh>;
+	if (defined($line)) {
+	    chomp $line;
+	    if (!$line) {
+		#parse next line if current line is empty
+		$line = $self->_read_line();
+	    }
+	}
+    }
+    return $line;
 }
 sub close {
-  my $self = shift;
-  $self->{"filehandle"} = undef;
+    my $self = shift;
+    $self->{"filehandle"} = undef;
 }
 1;