diff --git a/modules/Bio/EnsEMBL/Gene.pm b/modules/Bio/EnsEMBL/Gene.pm index 9269744af01081d807b47b88843a2582f853a4eb..f611c4189b57e2f458b50f2efbaf2c0f1a33a3fb 100755 --- a/modules/Bio/EnsEMBL/Gene.pm +++ b/modules/Bio/EnsEMBL/Gene.pm @@ -1527,6 +1527,7 @@ sub summary_as_hash { $summary_ref->{'biotype'} = $self->biotype; $summary_ref->{'external_name'} = $self->external_name; $summary_ref->{'logic_name'} = $self->analysis->logic_name(); + $summary_ref->{'source'} = $self->source(); return $summary_ref; } diff --git a/modules/Bio/EnsEMBL/Transcript.pm b/modules/Bio/EnsEMBL/Transcript.pm index 092953da0191338c30cafad561b7923e12f6eeae..fb87877b40b418d131925d19c2cf9ba3d02173c8 100755 --- a/modules/Bio/EnsEMBL/Transcript.pm +++ b/modules/Bio/EnsEMBL/Transcript.pm @@ -2849,6 +2849,7 @@ sub summary_as_hash { $summary_ref->{'logic_name'} = $self->analysis->logic_name(); my $parent_gene = $self->get_Gene(); $summary_ref->{'Parent'} = $parent_gene->stable_id; + $summary_ref->{'source'} = $self->source() || $parent_gene->source(); return $summary_ref; } diff --git a/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm b/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm index dafe847ecc8119cb87a580a64b1c427166a82045..3274d2ba0bd332ea07fdca46fdb84036ee3532e2 100644 --- a/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm +++ b/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm @@ -52,6 +52,7 @@ my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-'); Constructor Arg [1] : Ontology Adaptor Arg [2] : Optional File handle + Arg [3] : Default source of the features. Defaults to ensembl Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer @@ -62,6 +63,7 @@ sub new { my $self = { ontology_adaptor => shift, filehandle => shift, + default_source => shift }; bless $self, $class; if ( ! check_ref($self->{'ontology_adaptor'}, "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" )) { @@ -74,6 +76,9 @@ sub new { open $self->{'filehandle'}, ">&STDOUT"; $self->{'stdout'} = 1; } + if(!defined $self->{default_source}) { + $self->{default_source} = 'ensembl'; + } return $self; } @@ -102,8 +107,9 @@ sub print_feature { if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";} $row .= $summary{'seq_region_name'}."\t"; -# Column 2 - source, complicated with Ensembl not being the originator of all data - $row .= "EnsEMBL\t"; +# Column 2 - source, complicated with Ensembl not being the originator of all data but user can specify or it switches to ensembl. + $row .= $summary{source} || $self->_default_source(); + $row .= qq{\t}; # Column 3 - feature, the ontology term for the kind of feature this row is my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature); @@ -157,6 +163,7 @@ sub print_feature { delete $summary{'end'}; delete $summary{'strand'}; delete $summary{'score'}; + delete $summary{'source'}; # Slice the hash for specific keys in GFF-friendly order my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular); my @ordered_values = @summary{@ordered_keys}; @@ -168,17 +175,20 @@ sub print_feature { delete $summary{$key}; } # Catch the remaining keys, containing whatever else the Feature provided - my @keys = keys %summary; + my @keys = sort keys %summary; while(my $attribute = shift @keys) { - if (ref $summary{$attribute} eq "ARRAY") { + my $data_written = 0; + if (ref $summary{$attribute} eq "ARRAY" && scalar(@{$summary{$attribute}}) > 0) { $row .= $attribute."=".join (',',map { uri_escape($_,'\t\n\r;=%&,') } grep { defined $_ } @{$summary{$attribute}}); + $data_written = 1; } else { if ($summary{$attribute}) { $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,'); + $data_written = 1; } } - $row .= ';' if scalar(@keys) > 0; + $row .= ';' if scalar(@keys) > 0 && $data_written; } # trim off any trailing commas left by the ordered keys stage above: $text_buffer .= $row."\n"; @@ -218,5 +228,10 @@ sub print_metadata { print $fh "\n#".$text."\n"; } +sub _default_source { + my ($self) = @_; + return $self->{default_source}; +} + 1; diff --git a/modules/t/gffSerialiser.t b/modules/t/gffSerialiser.t index ad888cfcc312567b7113830d8bdea6cf14343d99..4fa0c8e557c6b68979c0e383f9ff388e998903f8 100644 --- a/modules/t/gffSerialiser.t +++ b/modules/t/gffSerialiser.t @@ -42,27 +42,50 @@ my $dba = $db->get_DBAdaptor('core'); my $id = 'ENSG00000131044'; my $ga = $dba->get_GeneAdaptor(); -my $gene = $ga->fetch_by_stable_id($id); { - my $ota = Test::SO->new(); - my $fh = IO::String->new(); - my $ser = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ota, $fh); - $ser->print_main_header([$gene->feature_Slice()]); - $ser->print_feature($gene); - + my $gene = $ga->fetch_by_stable_id($id); + delete $gene->{source}; + $gene->{description} = undef; #empty value means don't emit the key + my $expected = <<'OUT'; +##gff-version 3 +##sequence-region 20 30274334 30300924 +OUT + #Have to do this outside of the HERETO thanks to tabs + $expected .= join("\t", + qw/20 ensembl feature 30274334 30300924 . + ./, + 'ID=ENSG00000131044;biotype=protein_coding;external_name=C20orf125;logic_name=ensembl' + ); + $expected .= "\n"; + + assert_gff3($gene, $expected, 'Gene with no source serialises to GFF3 as expected. Source is ensembl'); +} + +{ + my $gene = $ga->fetch_by_stable_id($id); + $gene->source('wibble'); my $expected = <<'OUT'; ##gff-version 3 ##sequence-region 20 30274334 30300924 OUT #Have to do this outside of the HERETO thanks to tabs $expected .= join("\t", - qw/20 EnsEMBL feature 30274334 30300924 . + ./, - 'ID=ENSG00000131044;logic_name=ensembl;external_name=C20orf125;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];biotype=protein_coding' + qw/20 wibble feature 30274334 30300924 . + ./, + 'ID=ENSG00000131044;biotype=protein_coding;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];external_name=C20orf125;logic_name=ensembl' ); $expected .= "\n"; - is(${$fh->string_ref()}, $expected, 'Gene serialises to GFF3 as expected'); + assert_gff3($gene, $expected, 'Gene with custom source serialises to GFF3 as expected. Source is wibble'); +} + +sub assert_gff3 { + my ($feature, $expected, $msg) = @_; + my $ota = Test::SO->new(); + my $fh = IO::String->new(); + my $ser = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ota, $fh); + $ser->print_main_header([$feature->feature_Slice()]); + $ser->print_feature($feature); + is(${$fh->string_ref()}, $expected, $msg); } done_testing(); \ No newline at end of file