From 99aabc67700ee82d88ca32b722dd5ead5fdcb067 Mon Sep 17 00:00:00 2001
From: Monika Komorowska <mk8@sanger.ac.uk>
Date: Tue, 6 Mar 2012 16:37:21 +0000
Subject: [PATCH] parser for GFF3 formatted files

---
 modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm | 240 ++++++++++++++++++++++
 1 file changed, 240 insertions(+)
 create mode 100644 modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm

diff --git a/modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm b/modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm
new file mode 100644
index 0000000000..d87a4a0453
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Utils/IO/GFFParser.pm
@@ -0,0 +1,240 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 NAME
+
+GFFParser - simple gff3 parser.
+
+
+=head1 AUTHOR
+
+Monika Komorowska, 2012 - monika@ebi.ac.uk
+
+=head1 SYNOPSIS
+
+use strict;
+use Bio::EnsEMBL::Utils::IO::GFFParser;
+use FileHandle;
+
+my $file_name = "features.gff";
+my $fh = FileHandle->new;
+$fh->open("< $file_name");
+my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);
+
+my @header_lines = @{$parser->parse_header()};
+#do something with the header lines array, e.g. print array elements
+
+foreach my $header_line (@header_lines) {
+    print $header_line . "\n";
+}
+print "\n\n";
+my $feature = $parser->parse_next_feature();
+
+while (defined($feature) ) {
+
+    my %feature = %{$feature};
+
+    #do something with the feature, e.g. print hash keys and values 
+    foreach my $key (keys %feature) {
+	if ($key ne 'attribute') {
+	    print $key . " " . $feature{$key} ."\n";
+	} else {
+	    print $key . "\n";
+	    my %attribs =  %{$feature{$key}};
+	    foreach my $attrib_key (keys %attribs) {
+		print "\t" . $attrib_key . " " .$attribs{$attrib_key}."\n";
+
+	    }
+	}
+    }
+    print "\n\n";
+    $feature = $parser->parse_next_feature();
+}
+
+$parser->close();
+
+$fh->close();
+
+
+
+=head1 DESCRIPTION
+
+GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml.
+
+Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file.
+
+This class can be extended to convert a feature hash into a feature object reversing
+the processing done by GFFSerializer.
+
+=cut
+
+package Bio::EnsEMBL::Utils::IO::GFFParser;
+use strict;
+use warnings;
+use Bio::EnsEMBL::Utils::Exception;
+use IO::File;
+use URI::Escape;
+
+my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1');
+
+=head2 new
+
+    Constructor
+    Arg [1]    : File handle
+    
+    Returntype : Bio::EnsEMBL::Utils::IO::GFFParser
+
+=cut
+
+sub new {
+    my $class = shift;
+    my $self = {
+        filehandle => shift,
+    };
+    bless $self, $class;
+    if (!defined($self->{'filehandle'})) {
+        throw("GFFParser requires a valid filehandle to a GFF3 formatted file"); 
+    }
+    return $self;
+
+}
+
+=head2 parse_header
+
+    Arg [1]    : File handle 
+    Description: Returns a arrayref with each header line stored in array element
+    Returntype : Arrayref of GFF3 file header lines
+
+=cut
+
+sub parse_header {
+
+    my $self = shift;
+
+    my $next_line;
+    my @header_lines;
+    
+    while (($next_line = $self->_read_line()) && ($next_line =~ /^[\#|\s]/) )  {
+	#header lines start with ##
+	if ($next_line =~ /^[\#]{2}/) {
+	    push @header_lines, $next_line;
+	    if ($next_line =~ /gff-version\s+(\d+)/) {
+		if ($1 != 3) {
+		    warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");  
+		}
+	    }
+	}
+    }
+
+    if (defined($next_line) && ($next_line !~ /^[\#|\s]/)) {
+	$self->{'first_feature_line'} = $next_line;
+    }
+    return \@header_lines;
+
+}
+
+=head2 parse_next_feature
+
+    Arg [1]    : File handle
+    Description: Returns a hashref in the format -
+                 {
+                   seqid => scalar,
+                   source => scalar,
+                   type => scalar,
+                   start => scalar,
+                   end => scalar,
+                   score => scalar,
+                   strand => scalar,
+                   phase => scalar,
+                   attribute => hashref, 
+                   
+		 }
+    Returntype : Hashref of a GFF3 feature line
+
+=cut
+
+sub parse_next_feature {
+
+    my $self = shift;
+
+    my $next_line;
+    my $feature_line;
+    
+    while (($next_line = $self->_read_line() ) && defined($next_line) ) {
+	next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
+		$next_line =~ /^\/\//);
+	$feature_line = $next_line;
+	last;
+    }
+
+    return undef unless $feature_line;
+
+    my %feature;
+    my %attribute;
+
+
+    #strip off trailing comments
+    $feature_line =~ s/\#.*//;
+	
+    my @chunks = split(/\t/, $feature_line);
+
+    %feature = (
+	    'seqid' => uri_unescape($chunks[0]),
+            'source' => uri_unescape($chunks[1]),
+            'type' => uri_unescape($chunks[2]),
+            'start' => $chunks[3],
+            'end' => $chunks[4],
+            'score' => $chunks[5],
+            'strand' => $strand_conversion{$chunks[6]},
+            'phase' => $chunks[7] );
+	
+     if ($chunks[8]) {
+	    my @attributes = split(/;/,$chunks[8]);
+	    my %attributes;
+	    foreach my $attribute (@attributes) {
+		my ($name, $value) = split(/=/,$attribute);
+		$attributes{uri_unescape($name)} = uri_unescape($value);
+	    }
+	    $feature{'attribute'} = \%attributes;
+     }
+
+    return \%feature;    
+}
+
+sub _read_line {
+
+    my $self = shift;
+    my $fh = $self->{'filehandle'};
+
+    my $line;
+    
+    if (defined($self->{'first_feature_line'})) {
+	$line = $self->{'first_feature_line'};
+	$self->{'first_feature_line'} = undef;
+    } else {
+	$line = <$fh>;
+	if (defined($line)) {
+	    chomp $line;
+	}
+    }
+
+    return $line;
+}
+
+sub close {
+
+    my $self = shift;
+    $self->{"filehandle"} = undef;
+
+}
+
+1;
-- 
GitLab