From 442cce07fa66178826f26edb622baa4aeced8747 Mon Sep 17 00:00:00 2001 From: Nathan Johnson <njohnson@ebi.ac.uk> Date: Fri, 18 Feb 2011 14:48:38 +0000 Subject: [PATCH] added FileAdaptor and CollectionADaptor --- .../Bio/EnsEMBL/DBFile/CollectionAdaptor.pm | 243 ++++++++++++++++++ modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm | 222 ++++++++++++++++ 2 files changed, 465 insertions(+) create mode 100755 modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm create mode 100755 modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm diff --git a/modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm b/modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm new file mode 100755 index 0000000000..6d01305861 --- /dev/null +++ b/modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm @@ -0,0 +1,243 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBFile::CollectionAdaptor + +=head1 SYNOPSIS + +For use with a Bio::EnsEMBL::Collector e.g. 
+ + package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor; + + @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor + Bio::EnsEMBL::Funcgen::Collector::ResultFeature + Bio::EnsEMBL::DBFile::CollectionAdaptor); + #DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor + + +Fetch wrapper methods access file based data via read_collection_blob: + + sub _fetch_from_file_by_Slice_ResultSet{ + + #define filepath/config + + my $packed_scores = $self->read_collection_blob( + $filepath, + $efg_sr_id, + $conf->{$window_size}{'byte_offset'}, + $conf->{$window_size}{'byte_length'}, + ); + + #Do unpacking and object creation here + + } + +=head1 DESCRIPTION + +Adaptor for direct collection(.col) file access, which are binary compressed fixed +width format files providing window based values across the genome. Collection files +integrate an index block which contains seq_region byte off set values. + +NOTE: By default all collection files are generated and packed using little endian encoding. +Due to the lack of standards of float encoding(wrt to endianess) perl packs using the +implicit endianess of the underlying architecture. This means that accessing float +collection files located on a big endian architecture will produce unexpected results. 
+
+#endian issues will disappear with knetfile xsubs
+
+=head1 SEE ALSO
+
+Bio::EnsEMBL::DBFile::FileAdaptor
+
+=cut
+
+
+
+package Bio::EnsEMBL::DBFile::CollectionAdaptor;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::DBFile::FileAdaptor;
+use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor);
+
+
+=head2 initialise_filehandle
+
+  Arg[1]     : string - filepath
+  Example    : $self->initialise_filehandle($filepath);
+  Description: Initialises the filehandle for use, in this case reads
+               the index (seq_region offsets) and caches it under off_sets
+  Returntype : HASHREF - cached seq_region offset index (not set if a read fails)
+  Exceptions : warns if read fails, and undefs the cached filehandle
+  Caller     : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle
+  Status     : at risk
+
+=cut
+
+sub initialise_filehandle{
+  my ($self, $filepath) = @_;
+  my $fh = $self->{file_cache}{$filepath}{filehandle};
+
+  #offsets include the length of the complete index block
+  my ($index_size, $read_bytes, $index, $num_keys, %offset_index);
+
+  ### INDEX FORMAT ###
+  #First block of the index is the index size in bytes(not inc size block).
+  #
+  #Rest of index is a hash of sr_id(v 2 bytes) key offset(V 4 bytes) value pairs
+  #V (long) is 4 bytes(via sys/read), which is actually an Config{intsize} i.e. i?
+  #long is 8 bytes according to Config{longsize}!
+
+  #read uses logical characters not necessarily in bytes
+  #altho this does seem to read bytes, maybe due to binmode?
+  #seek is in bytes
+  #Changed to sysread/read which both use bytes explicitly
+  #Can't mix sysread/seek due to I/O buffering differences
+
+
+  #Read index_size first encoded as v(2 bytes)
+  $read_bytes = sysread($fh, $index_size, 2);
+
+  if(! ((defined $read_bytes) && ($read_bytes == 2))){
+    #! defined is an error, 0 is end of file
+    warn "Failed to read index size from $filepath\n$!";
+
+    #Delete fh as it is useless/unsafe to retry
+    undef $self->{file_cache}{$filepath}{filehandle};
+  }
+  else{ #Read index
+    ($index_size) = unpack('v', $index_size);
+    $read_bytes = sysread($fh, $index, $index_size); #Now read index proper
+
+    if(! ((defined $read_bytes) && ($read_bytes == $index_size))){
+      #! defined is an error, 0 is end of file
+      warn "Failed to read index from $filepath\n$!";
+
+      #Delete fh as it is useless/unsafe to retry
+      undef $self->{file_cache}{$filepath}{filehandle};
+    }
+    else{
+      #Number of key-value pairs => $index_size /(size of key(v 2bytes) + size of offset(V 4bytes))
+      $num_keys = $index_size/6;
+      my $unpack_template = '(vV)'.$num_keys,;
+
+      %offset_index = unpack($unpack_template, $index);
+      $self->{file_cache}{$filepath}{off_sets} = \%offset_index;
+    }
+  }
+
+  return $self->{file_cache}{$filepath}{off_sets};
+}
+
+
+=head2 read_collection_blob
+
+  Arg[1]     : string - filepath
+  Arg[2]     : int - seq_region_id
+  Arg[3]     : int - seq_region offset. The byte offset required to
+               locate the required start position
+  Arg[4]     : int - byte length to read
+  Example    : my $blob_substr = $self->read_collection_blob($filepath,
+                                                             $sr_key,
+                                                             $sr_offset,
+                                                             $byte_length);
+  Description: Reads bytes from file given a seq_region_key, byte offset and byte length.
+               Sets filehandle to undef if read fails.
+  Returntype : string - packed binary data (undef if no filehandle, index entry or seek)
+  Exceptions : warns if seek or read errors
+  Caller     : general e.g. fetch_from_file_by_Slice_ResultSet
+  Status     : at risk
+
+=cut
+
+# We could change this to take a Slice, hence we could check
+# whether an EOF error is because the slice is out of range
+# and undef only if it is in range i.e. the index/file is corrupt
+# overkill?
+# This is something the Slice API should warn about
+# but will still cause undef'd filehandle here
+# Index should also contain ends, so we can validate whether the slice is out of range???
+
+
+sub read_collection_blob{
+  my($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_;
+
+  my $blob_substr;
+  my $fh = $self->get_filehandle($filepath, {-binmode => 1});
+
+  if(defined $fh){
+    #Return from query cache here?
+    #cache key = "$filepath:$key:$sr_offset:$byte_length"
+
+    #define total offset
+
+    #if(! exists $self->{file_cache}{$filepath}{off_sets}{$sr_key}){
+    #  #warn "sr_key($sr_key) is not part of index for $filepath\n";
+    #}
+    #else{
+
+    if(exists $self->{file_cache}{$filepath}{off_sets}{$sr_key}){
+
+      my $total_offset = $self->{file_cache}{$filepath}{off_sets}{$sr_key} + $sr_offset;
+      my $seeked       = sysseek($fh, $total_offset, 0);#0(whence) is SEEK_SET.
+
+      if(! $seeked){
+        warn("Failed to seek to byte $total_offset in $filepath");
+        #Don't undef fh here as this valid Slice maybe out of range
+        #and we don't want to kill a valid fh
+        #i.e. Slice start/end is past end of seq_region
+      }
+      else{
+        my $read_bytes = sysread($fh, $blob_substr, $byte_length);
+
+        if(! ((defined $read_bytes) && ($read_bytes == $byte_length))){
+          #! defined is an error, 0 is end of file, anything else is a short read
+          warn "Failed to read from $filepath\n$!";
+
+          if((defined $read_bytes) && ($read_bytes == 0)){ #defined guard: undef is an error, not EOF
+            #This maybe because the slice is out of range!
+            #The API gives no warning about this
+
+            warn "End Of File encountered\n";
+            warn "Total offset:\t".$self->{file_cache}{$filepath}{off_sets}{$sr_key}.
+              " key($sr_key) + $sr_offset = $total_offset\n";
+
+            #add some checks against the theoretical/true length of the file?
+          }
+          else{ #Read error or short read: delete fh as it is useless/unsafe to retry
+            undef $self->{file_cache}{$filepath}{filehandle};
+            #$blob_substr is now set to empty string by read
+            undef $blob_substr;
+          }
+        }
+      }
+    }
+  }
+
+  return $blob_substr;
+}
+
+
+1;
diff --git a/modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm b/modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm
new file mode 100755
index 0000000000..8f905ae485
--- /dev/null
+++ b/modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm
@@ -0,0 +1,222 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::DBFile::FileAdaptor - Base Adaptor for direct file access
+
+=head1 SYNOPSIS
+
+
+
+=head1 DESCRIPTION
+
+Basic wrapper class to provide access to file based data.
+
+This is primarily aimed at indexed Collection(.col) files which are optimised for Slice
+based queries. Collections store fixed width/windowed data as BLOBS. This makes
+it possible to seek to a required location given slice coordinates and read only
+the required amount of data covering the slice.
+
+Currently only works as hybrid DBAdaptor e.g. ResultFeatureAdaptor which inherits both from
+here and BaseFeatureAdaptor.
+
+=head1 SEE ALSO
+
+
+=cut
+
+
+
+package Bio::EnsEMBL::DBFile::FileAdaptor;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
+use strict;
+use warnings;
+
+
+=head2 get_filehandle
+
+  Arg[1]     : string - filepath
+  Arg[2]     : HASHREF - Optional params, see open_file
+  Example    : my $fh = $self->get_filehandle($filepath, {-binmode => 1});
+  Description: Gets and caches a simple file handle.
+  Returntype : GLOB/undef - filehandle
+  Exceptions : warns if cache entry exists but is not defined
+  Caller     : general
+  Status     : at risk
+
+=cut
+
+sub get_filehandle{
+  my ($self, $filepath, $params_hash) = @_;
+
+  my $file_op = '<';
+
+  if(exists $params_hash->{-file_operator}){
+    $file_op = $params_hash->{-file_operator};
+  }else{
+    $params_hash->{-file_operator} = $file_op;
+  }
+
+  if(! exists $self->{file_cache}{$filepath}{filehandle}){
+    my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, $params_hash);
+
+    if(defined $fh){
+      $self->{file_cache}{$filepath}{filehandle} = $fh;
+      #$self->initialise_filehandle($filepath) if $self->can('initialise_filehandle');
+      $self->initialise_filehandle($filepath) if($file_op eq '<');
+    }
+  }
+  elsif(! defined $self->{file_cache}{$filepath}{filehandle}){
+    #This maybe one of several read/seek errors which will have already been warned
+    warn "Encountered an error with file handle for $filepath\n";
+  }
+  #else
+  # check against cache file op
+  # to make sure we aren't trying to open an already open fh with a different operator
+
+
+  return $self->{file_cache}{$filepath}{filehandle};
+}
+
+
+=head2 open_file
+
+  Arg[1]     : string - filepath
+  Arg[2]     : HASHREF - Optional params:
+                          -binmode       => 0|1,   # Boolean i.e. treat file as binary
+                          -file_operator => '>'    # Default is '<'
+                         #-perms_octal   =>        # Requires FileHandle
+  Example    : my $fh = $self->open_file($filepath, {-binmode => 1, -file_operator => '>'});
+  Description: Opens a file for reading or writing.
+  Returntype : GLOB/undef - filehandle
+  Exceptions : warns if file open fails
+               throws if file operator unsupported
+               warns if failed to set binmode
+  Caller     : general
+  Status     : at risk
+
+=cut
+
+sub open_file{
+  my ($self, $filepath, $params_hash) = @_;
+
+  #Validate params_hash?
+  #rearrange? Will not warn/throw for invalid keys?
+  #perms octal, requires FileHandle? See EFGUtils::open_file
+
+
+
+  my $file_op = $params_hash->{-file_operator} || '<';
+
+  if(($file_op ne '<') &&
+     ($file_op ne '>') &&
+     ($file_op ne '>>')){
+    #throw rather than warn as this is a code bug
+    throw("Cannot perform open with unsupported operator:\t${file_op}${filepath}");
+  }
+
+  my $fh;
+  my $success = open($fh, $file_op, $filepath); #3-arg open: filepath is never parsed as a mode
+  #$fh will be still be GLOB on fail
+
+  if(! $success){
+    warn "Failed to open:\t$filepath\n$!\n"; #report $! before any cleanup can touch it
+    undef $fh;
+  }
+  elsif($params_hash->{-binmode}){
+    $success = binmode $fh;
+
+    if(! $success){
+      warn "Failed to set binmode:\t$filepath\n$!";
+      undef $fh;
+    }
+  }
+
+  return $fh;
+}
+
+
+=head2 validate_file_length
+
+  Arg[1]     : string - filepath
+  Arg[2]     : int - expected length in bytes
+  Example    : $self->validate_file_length($filepath, $expected_length, $binmode);
+  Description: Utility method which can be used during file creation
+  Returntype : None
+  Exceptions : throws if file cannot be opened (open_file also warns) or measured
+               throws if file is not expected length
+  Caller     : general
+  Status     : at risk - change to seek to account for 'logical characters'
+
+=cut
+
+sub validate_file_length{
+  my ($self, $filepath, $expected_length, $binmode) = @_;
+
+  #Currently not using cache as we rarely want to
+  #use the file handle afterwards
+
+
+  #THIS WAS USING EFGUtils::open_file imported in the Collector::ResultFeature!!!!
+  #which is just a sub not a class method, and is in a parallel inheritance path
+  #No warnings about redefining method :(
+  #Force use of FileAdaptor::open_file
+
+  my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, {-binmode => $binmode});
+  throw("Failed to open file for length validation:\t$filepath\n") if(! defined $fh);
+
+  #sysseek always returns length in bytes, change to seek which
+  #uses logical characters i.e. actual encoding?
+  #Does seek use bytes in binmode and chars in non-binmode?
+
+  my $seeked_bytes = sysseek($fh, 0, 2);# 2 is SEEK_END
+  #There is no systell function. Use sysseek(FH, 0, 1) for that.
+  throw("Failed to seek to end of file:\t$filepath\n$!") if(! defined $seeked_bytes);
+  if($seeked_bytes < $expected_length){
+    throw("File is shorter($seeked_bytes) than expected($expected_length):\t$filepath\n");
+  }
+  elsif($seeked_bytes > $expected_length){
+    throw("File is longer($seeked_bytes) than expected($expected_length):\t$filepath\n");
+  }
+
+  return;
+}
+
+
+
+
+
+### STUB/TEMPLATE METHODS ###
+#
+# If required these should be over-ridden in the
+# descendant FileAdaptor e.g. CollectionAdaptor
+# Listed here for visibility (rather than
+# using 'can')
+
+
+sub initialise_filehandle{
+  return;
+}
+
+
+
+1;
-- 
GitLab