Skip to content
Snippets Groups Projects
Commit 024a2eba authored by Andy Yates's avatar Andy Yates
Browse files

ENSCORESW-137. Code could not handle empty lines nor attributes with multiple...

ENSCORESW-137. Code could not handle empty lines nor attributes with multiple entries. Hope this isn't going to mess up anything else
parent 30758bbb
No related branches found
No related tags found
No related merge requests found
=pod
=head1 LICENSE
......@@ -23,6 +24,7 @@ Monika Komorowska, 2012 - monika@ebi.ac.uk
use strict;
use Bio::EnsEMBL::Utils::IO::GFFParser;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
use FileHandle;
my $file_name = "features.gff";
......@@ -51,8 +53,8 @@ while (defined($feature) ) {
print $key . "\n";
my %attribs = %{$feature{$key}};
foreach my $attrib_key (keys %attribs) {
# An attribute value is a scalar, or an arrayref when it had several entries;
# wrap_array() normalises both forms to an arrayref so they can be joined.
my $values = $attribs{$attrib_key};
printf("\t%s %s\n", $attrib_key, join(q{, }, @{ wrap_array($values) }));
}
}
}
......@@ -84,7 +86,7 @@ use Bio::EnsEMBL::Utils::Exception;
use IO::File;
use URI::Escape;
# Maps the GFF3 strand column onto the numeric strand stored in parsed
# features. Any symbol not listed here yields undef when looked up.
my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1' );
=head2 new
......@@ -96,15 +98,13 @@ my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1');
=cut
# Constructor. Takes the class name and an open filehandle onto a GFF3
# formatted file and returns a blessed hashref holding that handle under
# the 'filehandle' key. Throws if no filehandle was supplied.
sub new {
  my ( $class, $filehandle ) = @_;

  my $self = bless { filehandle => $filehandle }, $class;

  if ( !defined( $self->{'filehandle'} ) ) {
    throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
  }

  return $self;
}
......@@ -118,27 +118,31 @@ sub new {
# Consumes the leading header region of the file and returns an arrayref of
# the pragma lines (those starting with '##') in the order they were read.
#
# The header region is every line up to the first one that is non-empty and
# does not start with '#' or whitespace. That first non-header line is kept
# in $self->{'first_feature_line'} so that _read_line() hands it out again
# and no feature is lost. A warning is issued when a gff-version pragma
# names a version other than 3.
sub parse_header {
  my $self = shift;

  my @header_lines;
  my $next_line;

  # Test definedness, not truth: an empty line or a line that is just "0"
  # is false in Perl but is not end-of-file.
  while ( defined( $next_line = $self->_read_line() ) ) {

    # '|' is deliberately absent from the character class: it is a legal
    # leading character of a GFF3 seqid, so such a line is a feature.
    last unless $next_line eq q{} || $next_line =~ /^[\#\s]/;

    # Only pragma lines (##) are reported; plain comments and blank lines
    # are skipped.
    next unless $next_line =~ /^\#{2}/;
    push @header_lines, $next_line;

    if ( $next_line =~ /gff-version\s+(\d+)/ ) {
      my $version = $1;
      if ( $version != 3 ) {
        warning(
"File has been formatted in GFF version $version. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files."
        );
      }
    }
  }

  # Still defined here means the loop stopped on a non-header line rather
  # than at end-of-file; push it back for the feature parser.
  if ( defined $next_line ) {
    $self->{'first_feature_line'} = $next_line;
  }

  return \@header_lines;
}
......@@ -158,83 +162,93 @@ sub parse_header {
attribute => hashref,
}
If an attribute holds more than one comma-separated value, that attribute is
stored as an arrayref of the values rather than as a scalar
Returntype : Hashref of a GFF3 feature line
=cut
# Reads forward to the next feature line and returns it as a hashref with
# the keys seqid, source, type, start, end, score, strand, phase and, when
# column 9 is present, attribute (a hashref). Comment lines ('#'), blank
# lines and '//' lines are skipped. Returns undef once the input is
# exhausted.
#
# An attribute with a single value is stored as a scalar; an attribute with
# several comma-separated values is stored as an arrayref of those values.
sub parse_next_feature {
  my $self = shift;

  my $feature_line;

  # The loop must test definedness of a scalar assignment. A list
  # assignment here is always true (it counts one returned element), which
  # never terminates at end-of-file; a plain truth test stops at the first
  # empty line.
  while ( defined( my $next_line = $self->_read_line() ) ) {
    next
      if ( $next_line =~ /^\#/
      || $next_line =~ /^\s*$/
      || $next_line =~ /^\/\// );
    $feature_line = $next_line;
    last;
  }

  # Explicit undef (not a bare return) keeps the historical contract of
  # yielding exactly one value in list context too.
  return undef unless defined $feature_line;

  #strip off trailing comments
  $feature_line =~ s/\#.*//;

  my @chunks = split( /\t/, $feature_line );

  # A truncated line may lack the strand column; avoid an undef hash key.
  my $strand =
    defined $chunks[6] ? $strand_conversion{ $chunks[6] } : undef;

  my %feature = (
    'seqid'  => uri_unescape( $chunks[0] ),
    'source' => uri_unescape( $chunks[1] ),
    'type'   => uri_unescape( $chunks[2] ),
    'start'  => $chunks[3],
    'end'    => $chunks[4],
    'score'  => $chunks[5],
    'strand' => $strand,
    'phase'  => $chunks[7]
  );

  if ( $chunks[8] ) {
    my %attributes;
    foreach my $attribute ( split( /;/, $chunks[8] ) ) {

      # "a=1;;b=2" produces an empty entry; it carries no tag so ignore it
      next if $attribute eq q{};

      # LIMIT of 2: only the first '=' separates tag from value, so a value
      # that itself contains '=' is kept whole
      my ( $name, $value ) = split( /=/, $attribute, 2 );
      $name = uri_unescape($name);

      # Split on ',' before unescaping so an escaped comma (%2C) inside a
      # value is not mistaken for a separator
      my @split_values =
        defined $value
        ? map { uri_unescape($_) } split( /,/, $value )
        : ();

      if ( scalar(@split_values) > 1 ) {
        $attributes{$name} = \@split_values;
      }
      else {
        # undef when the tag had no value at all
        $attributes{$name} = $split_values[0];
      }
    }
    $feature{'attribute'} = \%attributes;
  }

  return \%feature;
}
# Returns the next line of input with its trailing newline removed, or
# undef at end-of-file. If parse_header() stashed a line in
# $self->{'first_feature_line'}, that line is returned first (unchanged)
# and the stash is cleared.
sub _read_line {
  my $self = shift;

  my $pending = $self->{'first_feature_line'};
  if ( defined $pending ) {
    $self->{'first_feature_line'} = undef;
    return $pending;
  }

  my $fh   = $self->{'filehandle'};
  my $line = <$fh>;
  chomp $line if defined $line;

  return $line;
}
# Drops the parser's reference to its filehandle by setting the
# 'filehandle' slot to undef. The handle itself is not explicitly closed
# here.
sub close {
  my $self = shift;
  $self->{"filehandle"} = undef;
}
1;
use strict;
use warnings;

use Test::More;
use IO::String;

use Bio::EnsEMBL::Utils::IO::GFFParser;

# Pragma lines the parser is expected to hand back, verbatim and in order.
# They are used both to build the fixture and as the expectation, so the
# two cannot drift apart.
my @HEADER_LINES = (
  '##gff-version 3',
  '##sequence-region ctg123 1 1497228',
  '##taken-from http://www.sequenceontology.org/gff3.shtml',
);

# Builds one nine-column feature line. Columns are joined with an explicit
# "\t" so the fixture does not depend on invisible literal tabs surviving
# editors or copy and paste. seqid, source, score and strand are the same
# for every row of this fixture.
sub gff_line {
  my ( $type, $start, $end, $phase, $attributes ) = @_;
  return join( "\t",
    'ctg123', '.', $type, $start, $end, '.', '+', $phase, $attributes );
}

# Builds the hashref parse_next_feature() should return for a '+' strand
# feature of this fixture with phase '.'.
sub expected_feature {
  my ( $type, $start, $end, $attribute ) = @_;
  return {
    seqid     => 'ctg123',
    source    => '.',
    type      => $type,
    start     => $start,
    end       => $end,
    score     => '.',
    strand    => 1,
    phase     => '.',
    attribute => $attribute,
  };
}

{
  # A blank line and a comment line sit between the first two features to
  # check that the parser steps over them.
  my $gff_text = join( "\n",
    @HEADER_LINES,
    gff_line( 'gene', 1000, 9000, '.', 'ID=gene00001;Name=EDEN' ),
    q{},
    '# comment between features',
    gff_line( 'TF_binding_site', 1000, 1012, '.', 'ID=tfbs00001;Parent=gene00001' ),
    gff_line( 'mRNA', 1050, 9000, '.', 'ID=mRNA00001;Parent=gene00001;Name=EDEN.1' ),
    gff_line( 'mRNA', 1050, 9000, '.', 'ID=mRNA00002;Parent=gene00001;Name=EDEN.2' ),
    gff_line( 'mRNA', 1300, 9000, '.', 'ID=mRNA00003;Parent=gene00001;Name=EDEN.3' ),
    gff_line( 'exon', 1300, 1500, '.', 'ID=exon00001;Parent=mRNA00003' ),
    gff_line( 'exon', 1050, 1500, '.', 'ID=exon00002;Parent=mRNA00001,mRNA00002' ),
    gff_line( 'exon', 3000, 3902, '.', 'ID=exon00003;Parent=mRNA00001,mRNA00003' ),
    gff_line( 'exon', 5000, 5500, '.', 'ID=exon00004;Parent=mRNA00001,mRNA00002,mRNA00003' ),
    gff_line( 'exon', 7000, 9000, '.', 'ID=exon00005;Parent=mRNA00001,mRNA00002,mRNA00003' ),
    gff_line( 'CDS', 1201, 1500, '0', 'ID=cds00001;Parent=mRNA00001;Name=edenprotein.1' ),
    gff_line( 'CDS', 3000, 3902, '0', 'ID=cds00001;Parent=mRNA00001;Name=edenprotein.1' ),
    gff_line( 'CDS', 5000, 5500, '0', 'ID=cds00001;Parent=mRNA00001;Name=edenprotein.1' ),
    gff_line( 'CDS', 7000, 7600, '0', 'ID=cds00001;Parent=mRNA00001;Name=edenprotein.1' ),
  ) . "\n";

  my $io  = IO::String->new($gff_text);
  my $gff = Bio::EnsEMBL::Utils::IO::GFFParser->new($io);

  my $header = $gff->parse_header();
  is_deeply( $header, [@HEADER_LINES], 'Checking headers all parse' );

  is_deeply(
    $gff->parse_next_feature(),
    expected_feature( 'gene', 1000, 9000, { ID => 'gene00001', Name => 'EDEN' } ),
    'Checking gene record parses'
  );

  is_deeply(
    $gff->parse_next_feature(),
    expected_feature(
      'TF_binding_site', 1000, 1012,
      { ID => 'tfbs00001', Parent => 'gene00001' }
    ),
    'Checking TF record parses after a blank line and a comment line'
  );

  #SKIP TO EXONS
  $gff->parse_next_feature() for 1 .. 3;    # three mRNA records

  #EXONS
  is_deeply(
    $gff->parse_next_feature(),
    expected_feature( 'exon', 1300, 1500,
      { ID => 'exon00001', Parent => 'mRNA00003' } ),
    'Checking Exon 1 record parses'
  );

  is_deeply(
    $gff->parse_next_feature(),
    expected_feature( 'exon', 1050, 1500,
      { ID => 'exon00002', Parent => [ 'mRNA00001', 'mRNA00002' ] } ),
    'Checking Exon 2 record parses'
  );

  $gff->parse_next_feature();               # exon 3

  is_deeply(
    $gff->parse_next_feature(),
    expected_feature(
      'exon', 5000, 5500,
      {
        ID     => 'exon00004',
        Parent => [ 'mRNA00001', 'mRNA00002', 'mRNA00003' ]
      }
    ),
    'Checking Exon 4 record parses with three parents'
  );
}

done_testing();
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment