=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

=head1 NAME

Bio::EnsEMBL::Utils::IO::FASTASerializer

=head1 SYNOPSIS

  my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle);
  $serializer->chunk_factor(1000);
  $serializer->line_width(60);
  $serializer->print_Seq($slice);

  $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,
    sub {
      my $slice = shift;
      return "Custom header";
    }
  );

=head1 DESCRIPTION

  Replacement for SeqDumper, making better use of shared code. Outputs FASTA
  format with optional custom header and formatting parameters. Set line_width
  and chunk_factor to dictate buffer size depending on application. A 60kb
  buffer is used by default with a line width of 60 characters.

  Custom headers are set by supplying an anonymous subroutine to new(). Custom
  header code must accept a Slice or Bio::PrimarySeqI compliant object as
  argument and return a string.

  The custom header method can be overridden later through set_custom_header()
  but this is not normally necessary.

=cut

package Bio::EnsEMBL::Utils::IO::FASTASerializer;

use strict;
use warnings;

# Exception helpers; this module calls throw() on validation and write errors.
use Bio::EnsEMBL::Utils::Exception;
# Reference-type helpers used to validate header callbacks and sequence objects.
use Bio::EnsEMBL::Utils::Scalar qw/assert_ref check_ref/;

# The constructor delegates filehandle setup to the generic serializer base class.
use base qw(Bio::EnsEMBL::Utils::IO::Serializer);

=head2 new

  Arg [1]    : Filehandle (optional)
  Arg [2]    : [optional] CODEREF subroutine for writing custom headers
  Arg [3]    : [optional] Chunking size (integer). Default is 1000
  Arg [4]    : [optional] Line width (integer). Default is 60
  Example    : $dumper = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,$header_function,1000,60);
  Description: Constructor
               Allows the specification of a custom function for rendering
               header lines.
  Returntype : Bio::EnsEMBL::Utils::IO::FASTASerializer
  Exceptions : Thrown if a header function is supplied that is not a
               CODE reference
  Caller     : general

=cut

sub new {
  my $caller = shift;
  my $class = ref($caller) || $caller;
  my ($filehandle, $header_function, $chunk_factor, $line_width) = @_;

  # The base class constructor receives the (optional) filehandle.
  my $self = $class->SUPER::new($filehandle);

  # Missing or false values fall back to the defaults. 1000 lines of 60
  # characters gives a 60kb buffer; increase for higher database and disk
  # efficiency.
  $self->{'line_width'}   = $line_width   || 60;
  $self->{'chunk_factor'} = $chunk_factor || 1000;

  if ( defined $header_function ) {
    # Reject anything that cannot be invoked as a subroutine before storing it.
    if ( ref($header_function) ne 'CODE' ) {
      throw("Custom header function must be an anonymous subroutine when instantiating FASTASerializer");
    }
    $self->{'header_function'} = $header_function;
  }
  else {
    # Default header: "<seq_region_name> dna:<coord_system_name> <slice_name>"
    # for Slices, or the display_id of any other sequence object.
    $self->{'header_function'} = sub {
      my $slice = shift;

      if ( check_ref($slice, 'Bio::EnsEMBL::Slice') ) {
        my $id       = $slice->seq_region_name;
        my $seqtype  = 'dna';
        my $idtype   = $slice->coord_system->name;
        my $location = $slice->name;

        return "$id $seqtype:$idtype $location";
      }

      # Not a Slice: the object must provide display_id(), as Bio::Seq
      # style objects do, otherwise this call dies.
      return $slice->display_id;
    };
  }

  return $self;
}

=head2 print_metadata

  Arg [1]    : Bio::EnsEMBL::Slice
  Description: Printing header lines into FASTA files. Usually handled
               internally to the serializer.
  Returntype : None
  Caller     : print_Seq

=cut

sub print_metadata {
  my ($self, $slice) = @_;
  my $fh = $self->{'filehandle'};

  # The header callback receives the sequence object and returns the text
  # that follows the '>' marker.
  my $function = $self->header_function();
  my $metadata = $function->($slice);

  # Concatenate rather than pass a list so $, cannot alter the header line.
  print {$fh} '>' . $metadata . "\n" or throw("Error writing to file handle: $!");
}

=head2 print_Seq

  Arg [1]    : Bio::EnsEMBL::Slice or other Bio::PrimarySeqI compliant object

  Description: Serializes the slice into FASTA format. The sequence is
               fetched and written in chunks to conserve memory.
               While other BioPerl PrimarySeqI implementations can be used,
               a custom header function will be required to accommodate them.

  Returntype : None

=cut

sub print_Seq {
  my ($self, $slice) = @_;
  my $fh    = $self->{'filehandle'};
  my $width = $self->{'line_width'};

  # Buffer size in bases. It is a multiple of the line width, so every chunk
  # except possibly the last ends exactly on a line boundary.
  my $chunk_size = $self->{'chunk_factor'} * $width;

  # A chunk size below 1 would never advance the loop below, so refuse it
  # before anything is written to the filehandle.
  if ($width < 1 || $chunk_size < 1) {
    throw("line_width and chunk_factor must both be positive numbers");
  }

  $self->print_metadata($slice);

  my $end  = $slice->length();
  my $wrap = qr/(.{1,$width})/;   # compiled once, reused for every chunk

  # Chunk the sequence to conserve memory, and print. Coordinates are 1-based
  # and inclusive, as passed to subseq().
  my $here = 1;
  while ($here <= $end) {
    my $there = $here + $chunk_size - 1;
    $there = $end if $there > $end;

    my $seq = $slice->subseq($here, $there);
    $seq =~ s/$wrap/$1\n/g;
    print {$fh} $seq or throw("Error writing to file handle: $!");

    $here = $there + 1;
  }

  # Record that at least one non-empty sequence has been written.
  if ($end > 0) { $self->{'achieved_something'} = 1; }
}

=head2 line_width

  Arg [1]    : Integer e.g. 60 or 80
  Description: Set and get FASTA format line width. Default is 60.
               A false value (undef or 0) leaves the current width unchanged.
  Returntype : Integer

=cut

sub line_width {
  my ($self, $line_width) = @_;

  # Only a true value updates the setting, so a plain getter call (or an
  # explicit 0) leaves the current width untouched.
  if ($line_width) { $self->{'line_width'} = $line_width; }

  return $self->{'line_width'};
}

=head2 chunk_factor

  Arg [1]    : Integer e.g. 1000
  Description: Set and get the multiplier used to dictate buffer size.
               Chunk factor x line width = buffer size in bases.
               A false value (undef or 0) leaves the current factor unchanged.
  Returntype : Integer

=cut

sub chunk_factor {
  my ($self, $chunk_factor) = @_;

  # Only a true value updates the setting, so a plain getter call (or an
  # explicit 0) leaves the current factor untouched.
  if ($chunk_factor) { $self->{'chunk_factor'} = $chunk_factor; }

  return $self->{'chunk_factor'};
}


=head2 set_custom_header

  Arg [1]    : CODE reference
  Description: Set the custom header function. Normally this is done at
               construction time, but can be overridden here.
  Example    : $serializer->set_custom_header( sub { return 'New header'});
  Returntype : None

=cut

sub set_custom_header {
  my ($self, $new_header_function) = @_;

  # Delegate to the header_function accessor so that storing the callback
  # happens in exactly one place.
  $self->header_function($new_header_function);

  return;
}

=head2 header_function

  Arg [1]    : CODE reference (optional)
  Description: Getter/setter for the custom header code
  Example    : $serializer->header_function( sub { return 'New header'});
  Returntype : CODE

=cut

sub header_function {
  my ($self, $header_function) = @_;

  # Acts as a setter only when a true argument is supplied; the argument is
  # checked to be a CODE reference before it replaces the stored callback.
  if ($header_function) {
    assert_ref($header_function, 'CODE', 'header_function');
    $self->{header_function} = $header_function;
  }

  return $self->{header_function};
}

# A module file must end with a true value so that use/require succeeds.
1;