GFFSerializer.pm 12.3 KB
Newer Older
1
2
=head1 LICENSE

3
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4
5
6
7
8
9

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0
10

11
12
13
14
15
16
17
18
19
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut

=pod
20
21
22
23
24
25
26
27
28
29
30


=head1 NAME

GFFSerializer - Feature to GFF converter

=head1 SYNOPSIS

use Bio::EnsEMBL::Utils::IO::GFFSerializer;

my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' );
31
my $serializer = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ontology_adaptor,$output_fh);
32
33
34
35
36
37
38
39
40

my $variation_feature_adaptor = $registry->get_adaptor( $config{'species'}, 'variation', 'variationfeature' );
$serializer->print_metadata("Variation Features:");
my $iterator = $variation_feature_adaptor->fetch_Iterator_by_Slice($slice,undef,60000);
$serializer->print_feature_Iterator($iterator);

=head1 DESCRIPTION

Subclass of Serializer that can turn a feature into a line for the GFF3 format. Requires
41
42
a SequenceOntologyMapper in order to translate features (biotypes in case of genes and 
transcripts) to SO terms.
43
44
45
46

=cut

package Bio::EnsEMBL::Utils::IO::GFFSerializer;
47

48
49
50
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
51
use Bio::EnsEMBL::Utils::SequenceOntologyMapper;
52
use URI::Escape;
53
use Bio::EnsEMBL::Utils::IO::FeatureSerializer;
Andy Yates's avatar
Andy Yates committed
54
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
55

56
use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer);
57

58
my %strand_conversion = ( '1' => '+', '0' => '.', '-1' => '-');
59
60
61

=head2 new

62
63
64
    Constructor
    Arg [1]    : Ontology Adaptor
    Arg [2]    : Optional File handle
65
    Arg [3]    : Default source of the features. Defaults to .
66
67
    
    Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer
68
69
70
71

=cut

sub new {
72
73
74
75
    my $class = shift;
    my $self = {
        ontology_adaptor => shift,
        filehandle => shift,
76
        default_source => shift
77
78
    };
    bless $self, $class;
Andy Yates's avatar
Andy Yates committed
79
    if ( ! check_ref($self->{'ontology_adaptor'}, "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" )) {
80
        throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function.");        
81
    }
82
    $self->{'mapper'} = Bio::EnsEMBL::Utils::SequenceOntologyMapper->new($self->{'ontology_adaptor'});
83
84
85
86
87
88
    
    if (!defined ($self->{'filehandle'})) {
        # no file handle, let the handle point to a copy of STDOUT instead
        open $self->{'filehandle'}, ">&STDOUT";
        $self->{'stdout'} = 1;
    }
89
    if(!defined $self->{default_source}) {
90
        $self->{default_source} = '.';
91
    }
92
    return $self;
93
94
95
96
}

=head2 print_feature

97
98
99
100
101
102
103
104
    Arg [1]    : Bio::EnsEMBL::Feature, subclass or related pseudo-feature
    Example    : $reporter->print_feature($feature,$slice_start_coordinate,"X")
    Description: Asks a feature for its summary, and generates a GFF3 
                 compliant entry to hand back again
                 Additional attributes are handed through to column 9 of the 
                 output using exact spelling and capitalisation of the 
                 feature-supplied hash.
    Returntype : none
105
106
107
=cut

sub print_feature {
108
109
    my $self = shift;
    my $feature = shift;
110
    my $so_mapper = $self->{'mapper'};
111

112
    my $text_buffer = "";
113
    if ($feature->can('summary_as_hash') ) {
114
115
116
117
118
        my %summary = %{$feature->summary_as_hash};
        my $row = "";
#    Column 1 - seqname, the name of the sequence/chromosome the feature is on. Landmark for start below
        if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";}
        $row .= $summary{'seq_region_name'}."\t";
119

120
#    Column 2 - source, complicated with Ensembl not being the originator of all data but user can specify or it switches to ensembl.
121
122
123
124
#     Check whether the analysis has been defined with a 'gff_source' before using the default.
        if (defined $summary{source}) {
          $row .= $summary{source};
        } else {
125
126
127
128
129
130
          if ( ref($feature)->isa('Bio::EnsEMBL::Feature') ) {
            if ( defined($feature->analysis) && $feature->analysis->gff_source() ) {
              $row .= $feature->analysis->gff_source();
            } else {
              $row .= $self->_default_source();
            }
131
132
          }
        }
133
        $row .= qq{\t};
134
135

#   Column 3 - feature, the ontology term for the kind of feature this row is
136
	my $so_term = eval { $so_mapper->to_name($feature); };
137
	$@ and throw sprintf "Unable to map feature %s to SO term.\n$@", $summary{id};
Magali Ruffier's avatar
Magali Ruffier committed
138
139
140
141
        if ($so_term eq 'protein_coding_gene') { 
# Special treatment for protein_coding_gene, as more commonly expected term is 'gene'
          $so_term = 'gene';
        }
142
        $row .= $so_term."\t";
143

144
145
#    Column 4 - start, the start coordinate of the feature, here shifted to chromosomal coordinates
#    Start and end must be in ascending order for GFF. Circular genomes require the length of 
146
#   the circuit to be added on.
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
        if ($summary{'start'} > $summary{'end'}) {
            #assumes this is not a Compara circular sequence and can treat is as a Feature
            if ($feature->slice() && $feature->slice()->is_circular() ) {
                $summary{'end'} = $summary{'end'} + $feature->seq_region_length;
            }
            # non-circular, but end still before start
            else {$summary{'end'} = $summary{'start'};}
        }
        $row .= $summary{'start'} . "\t";

#    Column 5 - end, coordinates (absolute) for the end of this feature
        $row .= $summary{'end'} . "\t";

#    Column 6 - score, for variations only.
        if (exists($summary{'score'})) {
            $row .= $summary{'score'}."\t";
        }
        else {
            $row .= ".\t";
        }

#    Column 7 - strand, up or down
        if (exists($summary{'strand'})) {
            $row .= $strand_conversion{$summary{'strand'}}."\t";
        }
        else {
173
            $row .= "?\t";
174
        }
175

176
177
178
179
180
181
182
#   Column 8 - reading phase, necessary only for Exons
        if (exists($summary{'phase'})) {
          $row .= $summary{'phase'}."\t";
        }
        else {
          $row .= ".\t";
        }
183
184
185
186
187
188
189
190

#    Column 9 - the 'other' section for all GFF and GVF compliant attributes
#    We include Stable ID and biotype where possible to supplement the information in the other columns
        delete $summary{'seq_region_start'};
        delete $summary{'seq_region_name'};
        delete $summary{'start'};
        delete $summary{'end'};
        delete $summary{'strand'};
Magali Ruffier's avatar
Magali Ruffier committed
191
        delete $summary{'phase'};
192
        delete $summary{'score'};
193
        delete $summary{'source'};
194
        delete $summary{'type'};
195
#   Slice the hash for specific keys in GFF-friendly order
196
        my @ordered_keys = grep { exists $summary{$_} } qw(id Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular);
197
198
199
        my @ordered_values = @summary{@ordered_keys};
        while (my $key = shift @ordered_keys) {
            my $value = shift @ordered_values;
200
            delete $summary{$key};
201
            if ($value && $value ne '') {
202
                if ($key =~ /id/) {
Magali Ruffier's avatar
Magali Ruffier committed
203
204
205
206
                  if ($feature->isa('Bio::EnsEMBL::Transcript')) {
                    $value = 'transcript:' . $value;
                  } elsif ($feature->isa('Bio::EnsEMBL::Gene')) {
                    $value = 'gene:' . $value;
207
208
                  } elsif ($feature->isa('Bio::EnsEMBL::Exon')) {
                    $key = 'Name';
209
210
211
212
                  } elsif ($feature->isa('Bio::EnsEMBL::CDS')) {
                    my $trans_spliced = $feature->transcript->get_all_Attributes('trans_spliced');
                    if (scalar(@$trans_spliced)) {
                      $value = $so_term . ':' . join('_', $value, $feature->seq_region_name, $feature->seq_region_strand);
213
214
                    } else {
                      $value = $so_term . ':' . $value;
215
                    }
Magali Ruffier's avatar
Magali Ruffier committed
216
217
218
219
220
221
222
                  } else {
                    $value = $so_term . ':' . $value;
                  }
                }
                if ($key eq 'Parent') {
                 if ($feature->isa('Bio::EnsEMBL::Transcript')) {
                    $value = 'gene:' . $value;
223
                  } elsif ($feature->isa('Bio::EnsEMBL::Exon') || $feature->isa('Bio::EnsEMBL::UTR') || $feature->isa('Bio::EnsEMBL::CDS')) {
Magali Ruffier's avatar
Magali Ruffier committed
224
225
226
                    $value = 'transcript:' . $value;
                  }
                }
227
                $key = uc($key) if $key eq 'id';
228
229
230
231
232
                if (ref $value eq "ARRAY" && scalar(@{$value}) > 0) {
                  $row .= $key."=".join (',',map { uri_escape($_,'\t\n\r;=%&,') } grep { defined $_ } @{$value});
                } else {
                  $row .= $key."=".uri_escape($value,'\t\n\r;=%&,');
                }
233
                $row .= ';' if scalar(@ordered_keys) > 0 || scalar(keys %summary) > 0;
234
235
            }
        }
236
#   Catch the remaining keys, containing whatever else the Feature provided
237
        my @keys = sort keys %summary;
238
        #$row =~ s/;?$// if $row =~ /;$/; # Remove trailing ';' if there is any
239
        while(my $attribute = shift @keys) {
240
241
            my $data_written = 0;
            if (ref $summary{$attribute} eq "ARRAY" && scalar(@{$summary{$attribute}}) > 0) {
242
                $row .= $attribute."=".join (',',map { uri_escape($_,'\t\n\r;=%&,') } grep { defined $_ } @{$summary{$attribute}});
243
                $data_written = 1;
244
245
            }
            else {
Magali Ruffier's avatar
Magali Ruffier committed
246
                if (defined $summary{$attribute}) { 
247
                  $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,'); 
248
                  $data_written = 1;
249
                }
250
            }
251
            $row .= ';' if scalar(@keys) > 0 && $data_written;
252
        }
253
        $row =~ s/;?$//; # Remove trailing ';' if there is any
254
# trim off any trailing commas left by the ordered keys stage above:
255
256
257
258
259
260
261
262
        $text_buffer .= $row."\n";
    }
    else {
        warning("Feature failed to self-summarise");
    }
    #filehandle is inherited
    my $fh = $self->{'filehandle'};
    print $fh $text_buffer;
263
264
265
266
267
}

=head2 print_main_header

    Arg [1]    : Arrayref of slices going into the file.
268
269
    Description: Printing the header text or metadata required for GFF,
                 using a list of slices to be written
270
271
272
273
    Returntype : None
=cut

sub print_main_header {
274
275
    my $self = shift;
    my $arrayref_of_slices = shift;
276
    my $dba = shift;
277
278
279
280
281
282
283
    my $fh = $self->{'filehandle'};
    
    print $fh "##gff-version 3\n";
    foreach my $slice (@{$arrayref_of_slices}) {
        if (not defined($slice)) { warning("Slice not defined.\n"); return;}
        print $fh "##sequence-region   ",$slice->seq_region_name," ",$slice->start," ",$slice->end,"\n";
    }
284
285
286
287
288
289
290
291
292
293
294

    if (!$dba) { 
      print "\n";
      return;
    }

    my $mc = $dba->get_MetaContainer();
    my $gc = $dba->get_GenomeContainer();
  
    # Get the build. name gives us GRCh37.p1 where as default gives us GRCh37
    my $assembly_name = $gc->get_assembly_name();
295
    my $providers = $mc->list_value_by_key('provider.name') || '';
ChuangKee Ong's avatar
ChuangKee Ong committed
296
    my $provider = join(";", @$providers);
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
    print $fh "#!genome-build $provider $assembly_name\n" if $assembly_name;
  
    # Get the build default
    my $version = $gc->get_version();
    print $fh "#!genome-version ${version}\n" if $version;
  
    # Get the date of the genome build
    my $assembly_date = $gc->get_assembly_date();
    print $fh "#!genome-date ${assembly_date}\n" if $assembly_date;
  
    # Get accession and only print if it is there
    my $accession = $gc->get_accession();
    if($accession) {
       my $accession_source = $mc->single_value_by_key('assembly.web_accession_source');
       my $string = "#!genome-build-accession ";
       $string .= "$accession_source:" if $accession_source;
       $string .= "$accession";
  
       print $fh "$string\n";
    }
  
    # Genebuild last updated
    my $genebuild_last_date = $gc->get_genebuild_last_geneset_update();
    print $fh "#!genebuild-last-updated ${genebuild_last_date}\n" if $genebuild_last_date;

    print "\n";
  
    return;
325
326
327
}

sub print_metadata {
328
329
330
331
    my $self = shift;
    my $text = shift;
    my $fh = $self->{'filehandle'};
    print $fh "\n#".$text."\n";
332
333
}

334
335
336
337
338
sub _default_source {
    my ($self) = @_;
    return $self->{default_source};
}

339
340

1;