Unverified Commit b9d7c492 authored by Alessandro Vullo's avatar Alessandro Vullo Committed by GitHub
Browse files

Merge pull request #79 from Ensembl/utils_io

ensembl Utils::IO into ensembl-io
parents e4ae5e92 d11a5f7f
......@@ -468,57 +468,39 @@ sub add_attr {
=head2 so_term
Description: Accessor to look up the Ontology term for an object
Description: Accessor to look up the Sequence Ontology term for an object
Args[1] : Feature to loop up term for
Returntype : String (term)
Exceptions : If the term can't be found by the Ontology adaptor
Exceptions : If the term can't be found
=cut
sub so_term {
my $self = shift;
my $object = shift;
my $so_term = eval { $self->so_mapper()->to_name($object); };
if($@) {
if ($self->no_exception) {
## Quick'n'dirty method for objects with no ontology
my @namespace = split('::', ref($object));
(my $type = $namespace[-1]) =~ s/Feature$//;
return lc $type;
}
else {
throw sprintf "Unable to map feature %s to SO term.\n$@", $object->display_id;
}
my $so_term;
# get biotype SO acc if feature can do it
if ( $object->can('get_Biotype') ) {
$so_term = $object->get_Biotype->so_term;
}
# if no biotype SO acc, get the feature one
if ( !$so_term ) {
$so_term = $object->feature_so_term;
}
# could not get it, throw exception
if ( !$so_term ) {
throw sprintf "Unable to find an SO term for feature %s.\n$@", $object->display_id;
}
if ($so_term eq 'protein_coding_gene') {
# Special treatment for protein_coding_gene, as more commonly expected term is 'gene'
if ($so_term eq 'protein_coding_gene') {
# Special treatment for protein_coding_gene, as more commonly expected term is 'gene'
$so_term = 'gene';
}
return $so_term;
}
=head2 so_mapper
Description: Accessor for the so term mapper
Returntype : sequence ontology mapper
Exceptions : If the registry isn't configured to access the database
=cut
sub so_mapper {
my $self = shift;
if( ! defined( $self->{'mapper'} ) ) {
my $oa = Bio::EnsEMBL::Registry->get_adaptor('multi', 'ontology', 'OntologyTerm');
$self->{'mapper'} = Bio::EnsEMBL::Utils::SequenceOntologyMapper->new($oa);
}
return $self->{'mapper'};
}
=head2 _default_score
Description: Return the default source type for a feature
......
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2019] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut
=head1 NAME
BEDSerializer - Feature to BED converter
=head1 SYNOPSIS
use Bio::EnsEMBL::Utils::IO::BEDSerializer;
=head1 DESCRIPTION
Subclass of Serializer that can turn a feature into a line for the BED format.
=cut
package Bio::EnsEMBL::Utils::IO::BEDSerializer;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
use URI::Escape;
use Bio::EnsEMBL::Utils::IO::FeatureSerializer;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer);
=head2 new
Constructor
Arg [1] : Optional File handle
Arg [2] : Default source of the features. Defaults to .
Arg [3] : RGB colour to emit. Defaults to black (0,0,0)
Returntype : Bio::EnsEMBL::Utils::IO::BEDSerializer
=cut
sub new {
my $class = shift;
my $self = {
filehandle => shift,
source => shift,
rgb => shift
};
bless $self, $class;
if (!defined ($self->{'filehandle'})) {
# no file handle, let the handle point to a copy of STDOUT instead
open $self->{'filehandle'}, ">&STDOUT";
$self->{'stdout'} = 1;
}
$self->{rgb} = '0,0,0' if ! defined $self->{rgb};
return $self;
}
=head2 print_feature
Arg [1] : Bio::EnsEMBL::Feature, subclass or related pseudo-feature
Example : $reporter->print_feature($feature,$slice_start_coordinate,"X")
Description: Asks a feature for its summary, and generates a BED
compliant entry to hand back again
Returntype : none
=cut
sub print_feature {
my ($self, $feature, $cache) = @_;
return unless $feature;
my $bed_array;
if($feature->isa('Bio::EnsEMBL::Transcript')) {
$bed_array = $self->_write_Transcript($feature, $cache);
}
else {
$bed_array = $self->_feature_to_bed_array($feature, $cache);
}
my $bed_line = join("\t", @{$bed_array});
my $fh = $self->{'filehandle'};
print $fh $bed_line, "\n";
return 1;
}
sub rgb {
my ($self, $rgb) = @_;
$self->{rgb} = $rgb if defined $rgb;
return $self->{rgb};
}
sub _write_Transcript {
my ($self, $transcript, $cache) = @_;
# Not liking this. If we are in this situation we need to re-fetch the transcript
# just so the thing ends up on the right Slice!
my $new_transcript = $transcript->transfer($transcript->slice()->seq_region_Slice());
$new_transcript->get_all_Exons(); # force exon loading
my $bed_array = $self->_feature_to_bed_array($transcript, $cache);
my $bed_genomic_start = $bed_array->[1]; #remember this is in 0 coords
# 12 column BED
my ($coding_start, $coding_end, $exon_starts_string, $exon_lengths_string, $exon_count) = (0,0,q{},q{},0);
my $rgb = $self->rgb();
# genePred BED extensions
my ($second_name, $cds_start_status, $cds_end_status, $exon_frames, $type, $gene_name, $second_gene_name, $gene_type) = (q{},q{none},q{none},q{},q{},q{},q{},q{});
# Set the remaining transcript attributes
$type = $transcript->biotype();
$second_name = $transcript->external_name() || q{none};
# If we have a translation then we do some maths to calc the start of
# the thick sections. Otherwise we must have a ncRNA or pseudogene
# and that thick section is just set to the transcript's end
if($new_transcript->translation()) {
my ($cdna_start, $cdna_end) = ($new_transcript->cdna_coding_start(), $new_transcript->cdna_coding_end);
if($new_transcript->strand() == -1) {
($cdna_start, $cdna_end) = ($cdna_end, $cdna_start);
}
# Rules are if it's got a coding start we will use it; if not we use the cDNA
$coding_start = $self->_cdna_to_genome($new_transcript, $cdna_start);
$coding_start--; # convert to 0 based coords
#Same again but for the end
$coding_end = $self->_cdna_to_genome($new_transcript, $cdna_end);
#Also figure out complete 5' and 3' CDS tags
my ($five_prime_nc, $three_prime_nc) = map { $_->[0] }
grep { defined $_ && @{$_} }
map { $transcript->get_all_Attributes($_) }
qw/cds_start_NF cds_end_NF/;
$cds_start_status = ($five_prime_nc) ? 'incmpl' : 'cmpl';
$cds_end_status = ($three_prime_nc) ? 'incmpl' : 'cmpl';
# reverse if we are on the negative strand
if($transcript->strand() == -1) {
($cds_start_status, $cds_end_status) = ($cds_end_status, $cds_start_status);
}
}
else {
# When we represent non-coding transcripts we set the coding start/end (thickStart/thickEnd)
# in a BED file to the start of the transcript region (lowest coordinate).
$coding_start = $bed_genomic_start;
$coding_end = $coding_start;
}
# Now for the interesting bit. Exons are given relative to the bed start
# so we need to calculate the offset. Exons are also sorted by start otherwise
# offset calcs are wrong
foreach my $exon (sort { $a->seq_region_start() <=> $b->seq_region_start() } @{$new_transcript->get_all_Exons()}) {
my $exon_start = $exon->seq_region_start();
$exon_start--; #move into 0 coords
my $offset = $exon_start - $bed_genomic_start; # just have to minus current start from the genomic start
$exon_starts_string .= $offset.',';
$exon_lengths_string .= $exon->length().',';
# We have to re-interpret a phase -1 as 0 if we are on a coding exon. Otherwise we just
# leave it (non-coding exons are always -1 it seems from UCSC's examples)
my $phase = $exon->phase();
my $exon_coding_start = $exon->coding_region_start($transcript);
if(defined $exon_coding_start) {
$phase = 0 if $phase == -1;
}
$exon_frames .= $phase.',';
$exon_count++;
}
# Get the gene and populate its fields
my $gene = $transcript->get_Gene();
$gene_name = $gene->stable_id();
$second_gene_name = $gene->external_name() || q{none};
$gene_type = $gene->biotype();
# Finally recreate 12 column BED format
push(@{$bed_array}, $coding_start, $coding_end, $rgb, $exon_count, $exon_lengths_string, $exon_starts_string);
# And then the gene pred
push(@{$bed_array}, $second_name, $cds_start_status, $cds_end_status, $exon_frames, $type, $gene_name, $second_gene_name, $gene_type);
return $bed_array;
}
sub _cdna_to_genome {
my ($self, $transcript, $coord) = @_;
my @mapped = $transcript->cdna2genomic($coord, $coord);
my $genomic_coord = $mapped[0]->start();
return $genomic_coord;
}
=head2 _feature_to_bed_array
Arg [1] : Bio::EnsEMBL::Feature
The Feature to encode
Arg [2] : HashRef
Cache of retrieved Slice names.
Description : Takes a feature and returns an extended BED record consumed up to field
6 of the format (strand). Score is left as 0
Returntype : ArrayRef of fields encoded as
[chr, start, end, display, 0, strand]
=cut
sub _feature_to_bed_array {
my ($self, $feature, $cache) = @_;
my $chr_name = $self->_feature_to_UCSC_name($feature, $cache);
my $start = $feature->seq_region_start() - 1;
my $end = $feature->seq_region_end();
my $strand = ($feature->seq_region_strand() == -1) ? '-' : '+';
my $display_id = $feature->display_id();
my $score = 1000;
return [ $chr_name, $start, $end, $display_id, $score, $strand ];
}
=head2 _feature_to_UCSC_name
Arg [1] : Bio::EnsEMBL::Feature
Feature object whose seq_region_name must be converted
Arg [2] : HashRef
Cache of retrieved names. Here because we are unaware of the
lifetime of a View in Catalyst (is it persistent between requests)
Description : Attempts to figure out what UCSC calls this Slice. First we
consult the synonyms attached to the Slice, then ask if it is a chromsome
(accounting for MT -> chrM) and adding a chr to the name if it was a reference
Slice. If it was none of these the name is just passed through
Returntype : String of the UCSC name
=cut
sub _feature_to_UCSC_name {
my ($self, $feature, $cache) = @_;
my $seq_region_name = $feature->seq_region_name();
#Return if the name was already defined (we assume we work within a single species)
return $cache->{$seq_region_name} if exists $cache->{$seq_region_name};
if(! $feature->can('slice')) {
return $cache->{$seq_region_name} = $seq_region_name;
}
my $slice = $feature->slice();
my $ucsc_name;
my $has_adaptor = ($slice->adaptor()) ? 1 : 0;
if($has_adaptor) { # if it's got an adaptor we can lookup synonyms
my $synonyms = $slice->get_all_synonyms('UCSC');
if(@{$synonyms}) {
$ucsc_name = $synonyms->[0]->name();
}
}
if(! defined $ucsc_name) {
# if it's a chromosome then we can test a few more things
if($slice->is_chromosome()) {
#MT is a special case; it's chrM
if($seq_region_name eq 'MT' ) {
$ucsc_name = 'chrM';
}
# If it was a ref region add chr onto it (only check if we have an adaptor)
elsif($has_adaptor && $slice->is_reference()) {
$ucsc_name = 'chr'.$seq_region_name;
}
}
}
#Leave it as the seq region name otherwise
$ucsc_name = $seq_region_name if ! defined $ucsc_name;
$cache->{$seq_region_name} = $ucsc_name;
return $ucsc_name;
}
1;
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2019] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
Questions may also be sent to the Ensembl help desk at
<http://www.ensembl.org/Help/Contact>.
=cut
=head1 NAME
Bio::EnsEMBL::Utils::IO::FASTASerializer
=head1 SYNOPSIS
my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle);
$serializer->chunk_factor(1000);
$serializer->line_width(60);
$serializer->print_Seq($slice);
$serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,
sub {
my $slice = shift;
return "Custom header";
}
);
=head1 DESCRIPTION
Replacement for SeqDumper, making better use of shared code. Outputs FASTA
format with optional custom header and formatting parameters. Set line_width
and chunk_factor to dictate buffer size depending on application. A 60kb
buffer is used by default with a line width of 60 characters.
Custom headers are set by supplying an anonymous subroutine to new(). Custom
header code must accept a Slice or Bio::PrimarySeqI compliant object as
argument and return a string.
The custom header method can be overridden later through set_custom_header()
but this is not normally necessary.
=cut
package Bio::EnsEMBL::Utils::IO::FASTASerializer;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Utils::Scalar qw/assert_ref assert_integer check_ref/;
use base qw(Bio::EnsEMBL::Utils::IO::Serializer);
=head2 new
Arg [1] : Filehandle (optional)
Arg [2] : CODEREF subroutine for writing custom headers
Arg [3] : [optional] Chunking size (integer)
Arg [4] : [optional] Line width (integer)
Example : $dumper = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,$header_function,1000,60);
Description: Constructor
Allows the specification of a custom function for rendering
header lines.
Set line width to 0 for no linefeeds in the sequence.
Returntype : Bio::EnsEMBL::Utils::IO::FASTASerializer;
Exceptions : none
Caller : general
=cut
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $filehandle = shift;
my $header_function = shift;
my $chunk_factor = shift;
my $line_width = shift;
my $self = $class->SUPER::new($filehandle);
$self->{'header_function'} = $header_function;
$self->line_width( (defined $line_width)? $line_width : 60 );
$self->{'chunk_factor'} = ($chunk_factor)? $chunk_factor : 1000;
# gives a 60kb buffer by default, increase for higher database and disk efficiency.
if ( defined($self->{'header_function'}) ) {
if (ref($self->{'header_function'}) ne "CODE") {
throw("Custom header function must be an anonymous subroutine when instantiating FASTASerializer");}
}
else {
$self->{'header_function'} = sub {
my $slice = shift;
if(check_ref($slice, 'Bio::EnsEMBL::Slice')) {
my $id = $slice->seq_region_name;
my $seqtype = 'dna';
my $idtype = $slice->coord_system->name;
my $location = $slice->name;
return "$id $seqtype:$idtype $location";
}
else {
# must be a Bio::Seq , or we're doomed
return $slice->display_id;
}
};
}
return $self;
}
=head2 print_metadata
Arg [1] : Bio::EnsEMBL::Slice
Description: Printing header lines into FASTA files. Usually handled
internally to the serializer.
Returntype : None
Caller : print_Seq
=cut
sub print_metadata {
my $self = shift;
my $slice = shift;
my $fh = $self->{'filehandle'};
my $function = $self->header_function();
my $metadata = $function->($slice);
print $fh '>'.$metadata."\n" or throw "Error writing to file handle: $!";
}
=head2 print_Seq
Arg [1] : Bio::EnsEMBL::Slice or other Bio::PrimarySeqI compliant object
Description: Serializes the slice into FASTA format. Buffering is used
While other Bioperl PrimarySeqI implementations can be used,
a custom header function will be required to accommodate it.
Returntype : None
=cut
sub print_Seq {
my $self = shift;
my $slice = shift;
my $fh = $self->{'filehandle'};
$self->print_metadata($slice);
my $width = $self->{line_width};
# set buffer size
my $chunk_size = $self->{'chunk_factor'} * $width;
$chunk_size = $self->{'chunk_factor'} if $width == 0;
my $start = 1;
my $end = $slice->length();
#chunk the sequence to conserve memory, and print
my $here = $start;
if ($width == 0) {
print $fh $slice->seq."\n" or throw "Error writing to file handle: $!";
} else {
while($here <= $end) {
my $there = $here + $chunk_size - 1;
$there = $end if($there > $end);
my $seq = $slice->subseq($here, $there);
my @lines = unpack ("(A$width)*", $seq);
push @lines,''; # ensure last line has a carriage return
$seq = join "\n",@lines;
# $seq =~ s/(.{1,$width})/$1\n/g; # straightforward but has cost
print $fh $seq or throw "Error writing to file handle: $!";
$here = $there + 1;
}
}
if ($slice->length > 0) {$self->{'achieved_something'} = 1;}
}
=head2 line_width
Arg [1] : Integer e.g. 60 or 80
Description: Set and get FASTA format line width. Default is 60, maximum is 2**30
Set to 0 for no line feeds in the sequence
Returntype : Integer
=cut
sub line_width {
my $self = shift;
my $line_width = shift;
if (defined $line_width) {
assert_integer($line_width,'line width');
throw "Must have a sensible line width" if $line_width < 0;
throw "Maximum line width is 2**30, consider using 0 for no line feeds instead" if $line_width > 1073741824;
$self->{'line_width'} = $line_width
}
return $self->{'line_width'}
}
=head2 chunk_factor
Arg [1] : Integer e.g. 1000
Description: Set and get the multiplier used to dictate buffer size
Chunk factor x line width = buffer size in bases.
Returntype : Integer
=cut
sub chunk_factor {
my $self = shift;
my $chunk_factor = shift;
assert_integer($chunk_factor,'chunk factor') if $chunk_factor;
if ($chunk_factor) { $self->{'chunk_factor'} = $chunk_factor};
return $self->{'chunk_factor'}
}