Skip to content
Snippets Groups Projects
Commit 2354ee8b authored by Andy Yates's avatar Andy Yates
Browse files

[ENSCORESW-490]. Fixing the GFF serializer which would add an additional ; at...

[ENSCORESW-490]. Fixing the GFF serializer, which would sometimes add an extra ';'; it now also supports a custom source field. Gene and transcript summaries now both emit their source.
parent ab9c7991
No related branches found
No related tags found
No related merge requests found
...@@ -1527,6 +1527,7 @@ sub summary_as_hash { ...@@ -1527,6 +1527,7 @@ sub summary_as_hash {
$summary_ref->{'biotype'} = $self->biotype; $summary_ref->{'biotype'} = $self->biotype;
$summary_ref->{'external_name'} = $self->external_name; $summary_ref->{'external_name'} = $self->external_name;
$summary_ref->{'logic_name'} = $self->analysis->logic_name(); $summary_ref->{'logic_name'} = $self->analysis->logic_name();
$summary_ref->{'source'} = $self->source();
return $summary_ref; return $summary_ref;
} }
......
...@@ -2849,6 +2849,7 @@ sub summary_as_hash { ...@@ -2849,6 +2849,7 @@ sub summary_as_hash {
$summary_ref->{'logic_name'} = $self->analysis->logic_name(); $summary_ref->{'logic_name'} = $self->analysis->logic_name();
my $parent_gene = $self->get_Gene(); my $parent_gene = $self->get_Gene();
$summary_ref->{'Parent'} = $parent_gene->stable_id; $summary_ref->{'Parent'} = $parent_gene->stable_id;
$summary_ref->{'source'} = $self->source() || $parent_gene->source();
return $summary_ref; return $summary_ref;
} }
......
...@@ -52,6 +52,7 @@ my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-'); ...@@ -52,6 +52,7 @@ my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-');
Constructor Constructor
Arg [1] : Ontology Adaptor Arg [1] : Ontology Adaptor
Arg [2] : Optional File handle Arg [2] : Optional File handle
Arg [3] : Default source of the features. Defaults to ensembl
Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer
...@@ -62,6 +63,7 @@ sub new { ...@@ -62,6 +63,7 @@ sub new {
my $self = { my $self = {
ontology_adaptor => shift, ontology_adaptor => shift,
filehandle => shift, filehandle => shift,
default_source => shift
}; };
bless $self, $class; bless $self, $class;
if ( ! check_ref($self->{'ontology_adaptor'}, "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" )) { if ( ! check_ref($self->{'ontology_adaptor'}, "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" )) {
...@@ -74,6 +76,9 @@ sub new { ...@@ -74,6 +76,9 @@ sub new {
open $self->{'filehandle'}, ">&STDOUT"; open $self->{'filehandle'}, ">&STDOUT";
$self->{'stdout'} = 1; $self->{'stdout'} = 1;
} }
if(!defined $self->{default_source}) {
$self->{default_source} = 'ensembl';
}
return $self; return $self;
} }
...@@ -102,8 +107,9 @@ sub print_feature { ...@@ -102,8 +107,9 @@ sub print_feature {
if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";} if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";}
$row .= $summary{'seq_region_name'}."\t"; $row .= $summary{'seq_region_name'}."\t";
# Column 2 - source, complicated with Ensembl not being the originator of all data # Column 2 - source, complicated with Ensembl not being the originator of all data but user can specify or it switches to ensembl.
$row .= "EnsEMBL\t"; $row .= $summary{source} || $self->_default_source();
$row .= qq{\t};
# Column 3 - feature, the ontology term for the kind of feature this row is # Column 3 - feature, the ontology term for the kind of feature this row is
my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature); my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature);
...@@ -157,6 +163,7 @@ sub print_feature { ...@@ -157,6 +163,7 @@ sub print_feature {
delete $summary{'end'}; delete $summary{'end'};
delete $summary{'strand'}; delete $summary{'strand'};
delete $summary{'score'}; delete $summary{'score'};
delete $summary{'source'};
# Slice the hash for specific keys in GFF-friendly order # Slice the hash for specific keys in GFF-friendly order
my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular); my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular);
my @ordered_values = @summary{@ordered_keys}; my @ordered_values = @summary{@ordered_keys};
...@@ -168,17 +175,20 @@ sub print_feature { ...@@ -168,17 +175,20 @@ sub print_feature {
delete $summary{$key}; delete $summary{$key};
} }
# Catch the remaining keys, containing whatever else the Feature provided # Catch the remaining keys, containing whatever else the Feature provided
my @keys = keys %summary; my @keys = sort keys %summary;
while(my $attribute = shift @keys) { while(my $attribute = shift @keys) {
if (ref $summary{$attribute} eq "ARRAY") { my $data_written = 0;
if (ref $summary{$attribute} eq "ARRAY" && scalar(@{$summary{$attribute}}) > 0) {
$row .= $attribute."=".join (',',map { uri_escape($_,'\t\n\r;=%&,') } grep { defined $_ } @{$summary{$attribute}}); $row .= $attribute."=".join (',',map { uri_escape($_,'\t\n\r;=%&,') } grep { defined $_ } @{$summary{$attribute}});
$data_written = 1;
} }
else { else {
if ($summary{$attribute}) { if ($summary{$attribute}) {
$row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,'); $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,');
$data_written = 1;
} }
} }
$row .= ';' if scalar(@keys) > 0; $row .= ';' if scalar(@keys) > 0 && $data_written;
} }
# trim off any trailing commas left by the ordered keys stage above: # trim off any trailing commas left by the ordered keys stage above:
$text_buffer .= $row."\n"; $text_buffer .= $row."\n";
...@@ -218,5 +228,10 @@ sub print_metadata { ...@@ -218,5 +228,10 @@ sub print_metadata {
print $fh "\n#".$text."\n"; print $fh "\n#".$text."\n";
} }
# Private accessor: returns whatever is stored under the 'default_source'
# key of the serializer's hash (undef when nothing has been stored there).
sub _default_source {
  my $self = shift;
  return $self->{'default_source'};
}
1; 1;
...@@ -42,27 +42,50 @@ my $dba = $db->get_DBAdaptor('core'); ...@@ -42,27 +42,50 @@ my $dba = $db->get_DBAdaptor('core');
my $id = 'ENSG00000131044'; my $id = 'ENSG00000131044';
my $ga = $dba->get_GeneAdaptor(); my $ga = $dba->get_GeneAdaptor();
my $gene = $ga->fetch_by_stable_id($id);
{ {
my $ota = Test::SO->new(); my $gene = $ga->fetch_by_stable_id($id);
my $fh = IO::String->new(); delete $gene->{source};
my $ser = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ota, $fh); $gene->{description} = undef; #empty value means don't emit the key
$ser->print_main_header([$gene->feature_Slice()]); my $expected = <<'OUT';
$ser->print_feature($gene); ##gff-version 3
##sequence-region 20 30274334 30300924
OUT
#Have to do this outside of the HEREDOC thanks to tabs
$expected .= join("\t",
qw/20 ensembl feature 30274334 30300924 . + ./,
'ID=ENSG00000131044;biotype=protein_coding;external_name=C20orf125;logic_name=ensembl'
);
$expected .= "\n";
assert_gff3($gene, $expected, 'Gene with no source serialises to GFF3 as expected. Source is ensembl');
}
{
my $gene = $ga->fetch_by_stable_id($id);
$gene->source('wibble');
my $expected = <<'OUT'; my $expected = <<'OUT';
##gff-version 3 ##gff-version 3
##sequence-region 20 30274334 30300924 ##sequence-region 20 30274334 30300924
OUT OUT
#Have to do this outside of the HEREDOC thanks to tabs #Have to do this outside of the HEREDOC thanks to tabs
$expected .= join("\t", $expected .= join("\t",
qw/20 EnsEMBL feature 30274334 30300924 . + ./, qw/20 wibble feature 30274334 30300924 . + ./,
'ID=ENSG00000131044;logic_name=ensembl;external_name=C20orf125;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];biotype=protein_coding' 'ID=ENSG00000131044;biotype=protein_coding;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];external_name=C20orf125;logic_name=ensembl'
); );
$expected .= "\n"; $expected .= "\n";
is(${$fh->string_ref()}, $expected, 'Gene serialises to GFF3 as expected'); assert_gff3($gene, $expected, 'Gene with custom source serialises to GFF3 as expected. Source is wibble');
}
# Test helper: serialises $feature with a freshly constructed GFFSerializer
# writing into an in-memory IO::String handle, then compares everything that
# was written (main header for the feature's slice followed by the feature
# row) against $expected using is(), reporting under $msg.
sub assert_gff3 {
  my ($feature, $expected, $msg) = @_;

  # Each call gets its own Test::SO object and output buffer.
  my $ontology_adaptor = Test::SO->new();
  my $buffer           = IO::String->new();
  my $serializer       = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ontology_adaptor, $buffer);

  # The header is printed for the feature's own slice, then the feature itself.
  my @header_slices = ($feature->feature_Slice());
  $serializer->print_main_header(\@header_slices);
  $serializer->print_feature($feature);

  my $written = ${ $buffer->string_ref() };
  is($written, $expected, $msg);
}
done_testing(); done_testing();
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment