Skip to content
Snippets Groups Projects
Commit 2354ee8b authored by Andy Yates's avatar Andy Yates
Browse files

[ENSCORESW-490]. Fixing the GFF serializer which would add an additional ; at...

[ENSCORESW-490]. Fixing the GFF serializer which would add an additional ; at times and now supports a custom source field. Gene and transcript now both emit this
parent ab9c7991
No related branches found
No related tags found
No related merge requests found
......@@ -1527,6 +1527,7 @@ sub summary_as_hash {
$summary_ref->{'biotype'} = $self->biotype;
$summary_ref->{'external_name'} = $self->external_name;
$summary_ref->{'logic_name'} = $self->analysis->logic_name();
$summary_ref->{'source'} = $self->source();
return $summary_ref;
}
......
......@@ -2849,6 +2849,7 @@ sub summary_as_hash {
$summary_ref->{'logic_name'} = $self->analysis->logic_name();
my $parent_gene = $self->get_Gene();
$summary_ref->{'Parent'} = $parent_gene->stable_id;
$summary_ref->{'source'} = $self->source() || $parent_gene->source();
return $summary_ref;
}
......
......@@ -52,6 +52,7 @@ my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-');
Constructor
Arg [1] : Ontology Adaptor
Arg [2] : Optional File handle
Arg [3] : Default source of the features. Defaults to ensembl
Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer
......@@ -62,6 +63,7 @@ sub new {
my $self = {
ontology_adaptor => shift,
filehandle => shift,
default_source => shift
};
bless $self, $class;
if ( ! check_ref($self->{'ontology_adaptor'}, "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" )) {
......@@ -74,6 +76,9 @@ sub new {
open $self->{'filehandle'}, ">&STDOUT";
$self->{'stdout'} = 1;
}
if(!defined $self->{default_source}) {
$self->{default_source} = 'ensembl';
}
return $self;
}
......@@ -102,8 +107,9 @@ sub print_feature {
if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";}
$row .= $summary{'seq_region_name'}."\t";
# Column 2 - source, complicated with Ensembl not being the originator of all data
$row .= "EnsEMBL\t";
# Column 2 - source. Ensembl is not the originator of all data, so use the source given in the feature's summary when there is one and fall back to the serializer's default source otherwise.
$row .= $summary{source} || $self->_default_source();
$row .= qq{\t};
# Column 3 - feature, the ontology term for the kind of feature this row is
my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature);
......@@ -157,6 +163,7 @@ sub print_feature {
delete $summary{'end'};
delete $summary{'strand'};
delete $summary{'score'};
delete $summary{'source'};
# Slice the hash for specific keys in GFF-friendly order
my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular);
my @ordered_values = @summary{@ordered_keys};
......@@ -168,17 +175,20 @@ sub print_feature {
delete $summary{$key};
}
# Catch the remaining keys, containing whatever else the Feature provided
my @keys = keys %summary;
my @keys = sort keys %summary;
while(my $attribute = shift @keys) {
if (ref $summary{$attribute} eq "ARRAY") {
my $data_written = 0;
if (ref $summary{$attribute} eq "ARRAY" && scalar(@{$summary{$attribute}}) > 0) {
$row .= $attribute."=".join (',',map { uri_escape($_,'\t\n\r;=%&,') } grep { defined $_ } @{$summary{$attribute}});
$data_written = 1;
}
else {
if ($summary{$attribute}) {
$row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,');
$data_written = 1;
}
}
$row .= ';' if scalar(@keys) > 0;
$row .= ';' if scalar(@keys) > 0 && $data_written;
}
# trim off any trailing commas left by the ordered keys stage above:
$text_buffer .= $row."\n";
......@@ -218,5 +228,10 @@ sub print_metadata {
print $fh "\n#".$text."\n";
}
# Private accessor for the serializer's fallback source name.
# Returns the value held under the object's 'default_source' key
# (undef when that key has never been set).
sub _default_source {
  my $self = shift;
  return $self->{default_source};
}
1;
......@@ -42,27 +42,50 @@ my $dba = $db->get_DBAdaptor('core');
my $id = 'ENSG00000131044';
my $ga = $dba->get_GeneAdaptor();
my $gene = $ga->fetch_by_stable_id($id);
{
my $ota = Test::SO->new();
my $fh = IO::String->new();
my $ser = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ota, $fh);
$ser->print_main_header([$gene->feature_Slice()]);
$ser->print_feature($gene);
my $gene = $ga->fetch_by_stable_id($id);
delete $gene->{source};
$gene->{description} = undef; #empty value means don't emit the key
my $expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30300924
OUT
#Have to do this outside of the HEREDOC thanks to tabs
$expected .= join("\t",
qw/20 ensembl feature 30274334 30300924 . + ./,
'ID=ENSG00000131044;biotype=protein_coding;external_name=C20orf125;logic_name=ensembl'
);
$expected .= "\n";
assert_gff3($gene, $expected, 'Gene with no source serialises to GFF3 as expected. Source is ensembl');
}
{
my $gene = $ga->fetch_by_stable_id($id);
$gene->source('wibble');
my $expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30300924
OUT
#Have to do this outside of the HEREDOC thanks to tabs
$expected .= join("\t",
qw/20 EnsEMBL feature 30274334 30300924 . + ./,
'ID=ENSG00000131044;logic_name=ensembl;external_name=C20orf125;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];biotype=protein_coding'
qw/20 wibble feature 30274334 30300924 . + ./,
'ID=ENSG00000131044;biotype=protein_coding;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];external_name=C20orf125;logic_name=ensembl'
);
$expected .= "\n";
is(${$fh->string_ref()}, $expected, 'Gene serialises to GFF3 as expected');
assert_gff3($gene, $expected, 'Gene with custom source serialises to GFF3 as expected. Source is wibble');
}
# Serialises one feature to GFF3 in an in-memory buffer and compares the
# output with the expected text.
#   Arg [1] : feature to serialise; must provide feature_Slice() and be
#             accepted by the serializer's print_feature()
#   Arg [2] : expected GFF3 text (main header lines followed by the feature row)
#   Arg [3] : test description handed to is()
# Returns the result of is().
sub assert_gff3 {
  my ($feature, $expected, $msg) = @_;

  # Make a failing test report the caller's file/line rather than a line
  # inside this helper.
  local $Test::Builder::Level = $Test::Builder::Level + 1;

  my $ontology_adaptor = Test::SO->new();
  my $buffer = IO::String->new();
  my $serializer = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ontology_adaptor, $buffer);
  $serializer->print_main_header([$feature->feature_Slice()]);
  $serializer->print_feature($feature);

  return is(${$buffer->string_ref()}, $expected, $msg);
}
done_testing();
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment