Skip to content
Snippets Groups Projects
Commit 442cce07 authored by Nathan Johnson's avatar Nathan Johnson
Browse files

added FileAdaptor and CollectionADaptor

parent fd62de94
No related branches found
No related tags found
No related merge requests found
=head1 LICENSE
Copyright (c) 1999-2011 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=cut
=head1 NAME
Bio::EnsEMBL::DBFile::CollectionAdaptor
=head1 SYNOPSIS
For use with a Bio::EnsEMBL::Collector e.g.
package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor;
@ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor
Bio::EnsEMBL::Funcgen::Collector::ResultFeature
Bio::EnsEMBL::DBFile::CollectionAdaptor);
#DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor
Fetch wrapper methods access file based data via read_collection_blob:
sub _fetch_from_file_by_Slice_ResultSet{
#define filepath/config
my $packed_scores = $self->read_collection_blob(
$filepath,
$efg_sr_id,
$conf->{$window_size}{'byte_offset'},
$conf->{$window_size}{'byte_length'},
);
#Do unpacking and object creation here
}
=head1 DESCRIPTION
Adaptor for direct access to collection (.col) files, which are binary compressed fixed
width format files providing window based values across the genome. Collection files
integrate an index block which contains seq_region byte offset values.
NOTE: By default all collection files are generated and packed using little endian encoding.
Due to the lack of standards for float encoding (with respect to endianness), perl packs
floats using the implicit endianness of the underlying architecture. This means that accessing
float collection files located on a big endian architecture will produce unexpected results.
#endian issues will disappear with knetfile xsubs
=head1 SEE ALSO
Bio::EnsEMBL::DBFile::FileAdaptor
=cut
package Bio::EnsEMBL::DBFile::CollectionAdaptor;
use strict;
use warnings;
use Bio::EnsEMBL::DBFile::FileAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor);
=head2 initialise_filehandle
Arg[1] : string - filepath
Example : $self->initialise_filehandle($filepath);
Description: Initialises the filehandle for use, in this case reads
the index (seq_region offsets)
Returntype : None
Exceptions : warns if read fails
Caller : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle
Status : at risk
=cut
sub initialise_filehandle{
  # Reads the index block from the cached filehandle for $filepath and caches
  # the seq_region offsets in $self->{file_cache}{$filepath}{off_sets}.
  #
  # Arg[1]  : string - filepath (key into $self->{file_cache})
  # Returns : HASHREF of sr_id => byte offset, or whatever was previously
  #           cached (normally undef) if the index could not be read.
  # Warns and sets the cached filehandle to undef if the index is unreadable
  # or malformed.
  my ($self, $filepath) = @_;
  my $fh = $self->{file_cache}{$filepath}{filehandle};

  ### INDEX FORMAT ###
  # First block is the index size in bytes (not including the size block
  # itself), packed as v (2 bytes, little endian).
  # The rest of the index is a list of sr_id (v, 2 bytes) => offset (V, 4 bytes)
  # pairs. Offsets include the length of the complete index block.
  #
  # sysread is used throughout as it counts explicitly in bytes. It must not
  # be mixed with buffered read/seek on the same handle.
  my $size_block_bytes = 2;
  my $pair_bytes       = 6; # v(2) + V(4)

  my ($index_size, $index);
  my $read_bytes = sysread($fh, $index_size, $size_block_bytes);

  if(! ((defined $read_bytes) && ($read_bytes == $size_block_bytes))){
    #! defined is error, 0 is end of file
    warn "Failed to read index size from $filepath\n$!";
    #Invalidate fh as it is useless/unsafe to retry
    undef $self->{file_cache}{$filepath}{filehandle};
  }
  else{
    ($index_size) = unpack('v', $index_size);

    if($index_size % $pair_bytes){
      # A partial key/offset pair means the index is corrupt. Unpacking it
      # would need a fractional repeat count, which is not a valid template.
      warn "Index size($index_size) is not a multiple of $pair_bytes bytes in $filepath\n";
      undef $self->{file_cache}{$filepath}{filehandle};
    }
    else{
      $read_bytes = sysread($fh, $index, $index_size); #Now read index proper

      if(! ((defined $read_bytes) && ($read_bytes == $index_size))){
        #! defined is error, 0 is end of file
        warn "Failed to read index from $filepath\n$!";
        #Invalidate fh as it is useless/unsafe to retry
        undef $self->{file_cache}{$filepath}{filehandle};
      }
      else{
        my $num_keys     = $index_size / $pair_bytes;
        my %offset_index = unpack('(vV)'.$num_keys, $index);
        $self->{file_cache}{$filepath}{off_sets} = \%offset_index;
      }
    }
  }

  return $self->{file_cache}{$filepath}{off_sets};
}
=head2 read_collection_blob
Arg[1] : string - filepath
Arg[2] : int - seq_region_id
Arg[3] : int - seq_region offset. The byte offset required to
locate the required start position
Arg[4] : int - byte length to read
Example : my $blob_substr = $self->read_collection_blob($filepath,
$sr_key,
$sr_offset,
$byte_length);
Description: Reads bytes from file given a seq_region_key, byte offset and byte length.
Sets filehandle to undef if read fails.
Returntype : string - packed binary data
Exceptions : warns if seek or read errors
Caller : general e.g. fetch_from_file_by_Slice_ResultSet
Status : at risk
=cut
# We could change this to take a Slice, hence we could check
# whether an EOF error is because the slice is out of range
# and undef only if it is in range i.e. the index/file is corrupt
# overkill?
# This is something the Slice API should warn about
# but will still cause undef'd filehandle here
# Index should also contain ends, so we can validate whether the slice is out of range???
sub read_collection_blob{
  # Reads $byte_length bytes of packed data for a seq_region from a
  # collection file.
  #
  # Arg[1]  : string - filepath
  # Arg[2]  : int    - seq_region key, looked up in the cached index
  # Arg[3]  : int    - byte offset relative to the seq_region's index offset
  # Arg[4]  : int    - number of bytes to read
  # Returns : string of packed binary data.
  #           undef if the filehandle is unavailable, the key is not in the
  #           index, the seek fails, or the read errors or is short.
  #           Empty string if End Of File is hit before any byte is read.
  # Warns on seek/read problems. Sets the cached filehandle to undef on a
  # read error or a short read.
  my($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_;
  my $blob_substr;
  my $fh = $self->get_filehandle($filepath, {-binmode => 1});

  if(defined $fh){
    #Return from query cache here?
    #cache key = "$filepath:$key:$sr_offset:$byte_length"

    # Keys missing from the index are silently ignored (undef is returned)
    if(exists $self->{file_cache}{$filepath}{off_sets}{$sr_key}){
      my $total_offset = $self->{file_cache}{$filepath}{off_sets}{$sr_key} + $sr_offset;
      # 0(whence) is SEEK_SET. sysseek returns '0 but true' for position 0,
      # so a false value always means failure.
      my $seeked = sysseek($fh, $total_offset, 0);

      if(! $seeked){
        warn("Failed to seek to byte $total_offset in $filepath");
        #Don't undef fh here as this valid Slice maybe out of range
        #and we don't want to kill a valid fh
        #i.e. Slice start/end is past end of seq_region
      }
      else{
        my $read_bytes = sysread($fh, $blob_substr, $byte_length);

        if(! ((defined $read_bytes) && ($read_bytes == $byte_length))){
          warn "Failed to read from $filepath\n$!";

          # undef is a read error, 0 is End Of File. The defined test is
          # required, otherwise undef == 0 would make a real error look like EOF.
          if((defined $read_bytes) && ($read_bytes == 0)){
            #This maybe because the slice is out of range!
            #The API gives no warning about this
            warn "End Of File encountered\n";
            warn "Total offset:\t".$self->{file_cache}{$filepath}{off_sets}{$sr_key}.
              " key($sr_key) + $sr_offset = $total_offset\n";
            #add some checks against the theoretical/true length of the file?
          }
          else{ #Invalidate fh as it is useless/unsafe to retry
            undef $self->{file_cache}{$filepath}{filehandle};
            #sysread may have left $blob_substr as an empty/partial string
            undef $blob_substr;
          }
        }
      }
    }
  }

  return $blob_substr;
}
1;
=head1 LICENSE
Copyright (c) 1999-2011 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=cut
=head1 NAME
Bio::EnsEMBL::DBFile::FileAdaptor - Base Adaptor for direct file access
=head1 SYNOPSIS
=head1 DESCRIPTION
Basic wrapper class to provide access to file based data.
This is primarily aimed at indexed Collection(.col) files which are optimised for Slice
based queries. Collections store fixed width/windowed data as BLOBS. This makes
it possible to seek to a required location given slice coordinates and read only
the required amount of data covering the slice.
Currently only works as hybrid DBAdaptor e.g. ResultFeatureAdaptor which inherits both from
here and BaseFeatureAdaptor.
=head1 SEE ALSO
=cut
package Bio::EnsEMBL::DBFile::FileAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
use strict;
use warnings;
=head2 get_filehandle
Arg[1] : string - filepath
Arg[2] : HASHREF - Optional params, see open_file
Example : my $fh = $self->get_filehandle($filepath, {-binmode => 1});
Description: Gets and caches a simple file handle.
Returntype : GLOB/undef - filehandle
Exceptions : warns if cache entry exists but is not defined
Caller : general
Status : at risk
=cut
sub get_filehandle{
  # Returns the cached filehandle for $filepath, opening and caching it on
  # first use.
  #
  # Arg[1]  : string  - filepath
  # Arg[2]  : HASHREF - optional params passed on to open_file
  #                     (-file_operator defaults to '<')
  # Returns : filehandle, or undef if the file could not be opened or the
  #           handle was previously invalidated.
  # Warns if a cache entry exists but is undefined.
  my ($self, $filepath, $params_hash) = @_;

  # Work on a shallow copy so the caller's hash is never modified
  my %params  = %{ $params_hash || {} };
  # Same defaulting rule as open_file: an undef/empty operator means read
  my $file_op = $params{-file_operator} || '<';
  $params{-file_operator} = $file_op;

  if(! exists $self->{file_cache}{$filepath}{filehandle}){
    my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, \%params);

    if(defined $fh){
      $self->{file_cache}{$filepath}{filehandle} = $fh;
      # Only handles opened for reading are initialised.
      # initialise_filehandle may itself set the cached handle to undef.
      $self->initialise_filehandle($filepath) if($file_op eq '<');
    }
  }
  elsif(! defined $self->{file_cache}{$filepath}{filehandle}){
    #This maybe one of several read/seek errors which will have already been warned
    warn "Encountered and error with file handle for $filepath\n";
  }
  #else
  # check against cache file op
  # to make sure we aren't trying to open an already open fh with a different operator

  return $self->{file_cache}{$filepath}{filehandle};
}
=head2 open_file
Arg[1] : string - filepath
Arg[2] : HASHREF - Optional params:
-binmode => 0|1, # Boolean i.e. treat file as binary
-file_operator => '>' # Default is '<'
#-perms_octal => # Requires FileHandle
Example : my $fh = $self->open_file($filepath, {-binmode => 1, -file_operator => '>'});
Description: Opens a file for reading or writing.
Returntype : GLOB/undef - filehandle
Exceptions : warns if file open fails
warns if file operator unsupported
warns if failed to set binmode
Caller : general
Status : at risk
=cut
sub open_file{
  # Opens a file for reading, writing or appending.
  #
  # Arg[1]  : string  - filepath
  # Arg[2]  : HASHREF - optional params:
  #             -binmode       => 0|1  treat file as binary
  #             -file_operator => '<' (default), '>' or '>>'
  # Returns : filehandle, or undef if the open or binmode failed.
  # Throws if the file operator is unsupported (this is a code bug).
  # Warns if the open or binmode fails.
  my ($self, $filepath, $params_hash) = @_;

  #Validate params_hash?
  #rearrange? Will not warn/throw for invalid keys?
  #perms octal, requires FileHandle? See EFGUtils::open_file
  my $file_op = $params_hash->{-file_operator} || '<';

  if(($file_op ne '<') &&
     ($file_op ne '>') &&
     ($file_op ne '>>')){
    #throw rather than warn as this is a code bug
    throw("Cannot perform open with unsupported operator:\t${file_op}${filepath}");
  }

  my $fh;

  if(! defined $filepath){
    # Three-arg open with an undef path would silently create an anonymous
    # temporary file, so treat it as a failed open instead.
    warn "Failed to open:\tfilepath is not defined\n";
  }
  elsif(! open($fh, $file_op, $filepath)){
    # Three-arg open keeps the mode separate from the path, so whitespace or
    # mode characters in $filepath are always taken literally.
    #$fh will still be a GLOB on fail
    undef $fh;
    warn "Failed to open:\t$filepath\n$!\n";
  }
  elsif($params_hash->{-binmode}){

    if(! binmode $fh){
      warn "Failed to set binmode:\t$filepath\n$!";
      undef $fh;
    }
  }

  return $fh;
}
=head2 validate_file_length
Arg[1] : string - filepath
Arg[2] : int - expected length in bytes
Example : $self->validate_file_length($filepath, $expected_length);
Description: Utility method which can be used during file creation
Returntype : None
Exceptions : warns if file open fails
throws if file is not expected length
Caller : general
Status : at risk - change to seek to accounts for 'logical characters'
=cut
sub validate_file_length{
  # Utility method to check a file is exactly the expected size, e.g. after
  # file creation.
  #
  # Arg[1]  : string  - filepath
  # Arg[2]  : int     - expected length in bytes
  # Arg[3]  : boolean - optional, open the file in binmode
  # Returns : nothing
  # Throws if the file cannot be opened or seeked, or is shorter or longer
  # than expected.
  my ($self, $filepath, $expected_length, $binmode) = @_;

  #Currently not using cache as we rarely want to
  #use the file handle afterwards

  #Force use of FileAdaptor::open_file, as an imported open_file sub
  #in a parallel inheritance path may otherwise be picked up
  my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, {-binmode => $binmode});

  if(! defined $fh){
    # open_file only warns, and sysseek on an undef handle would die with an
    # unhelpful message
    throw("Failed to open file for length validation:\t$filepath\n");
  }

  #sysseek always returns the position in bytes
  #There is no systell function, so seek to the end to get the length
  my $seeked_bytes = sysseek($fh, 0, 2);# 2 is SEEK_END

  if(! defined $seeked_bytes){
    throw("Failed to seek to end of file:\t$filepath\n$!\n");
  }

  # sysseek returns the string '0 but true' for position 0
  # numify so an empty file is reported as 0
  $seeked_bytes += 0;

  close($fh) or warn "Failed to close:\t$filepath\n$!";

  if($seeked_bytes < $expected_length){
    throw("File is shorter($seeked_bytes) than expected($expected_length):\t$filepath\n");
  }
  elsif($seeked_bytes > $expected_length){
    throw("File is longer($seeked_bytes) than expected($expected_length):\t$filepath\n");
  }

  return;
}
### STUB/TEMPLATE METHODS ###
#
# If required these should be over-ridden in the
# descendant FileAdaptor e.g. CollectionAdaptor
# Listed here for visibility (rather than
# using 'can')
sub initialise_filehandle{
  # Default no-op hook, invoked by get_filehandle after a file has been
  # opened for reading. Descendant adaptors override this when a freshly
  # opened handle needs preparing. Takes ($self, $filepath) and returns nothing.
  my ($self, $filepath) = @_;

  return;
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment