From 442cce07fa66178826f26edb622baa4aeced8747 Mon Sep 17 00:00:00 2001
From: Nathan Johnson <njohnson@ebi.ac.uk>
Date: Fri, 18 Feb 2011 14:48:38 +0000
Subject: [PATCH] added FileAdaptor and CollectionADaptor

---
 .../Bio/EnsEMBL/DBFile/CollectionAdaptor.pm   | 243 ++++++++++++++++++
 modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm     | 222 ++++++++++++++++
 2 files changed, 465 insertions(+)
 create mode 100755 modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm
 create mode 100755 modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm

diff --git a/modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm b/modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm
new file mode 100755
index 0000000000..6d01305861
--- /dev/null
+++ b/modules/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm
@@ -0,0 +1,243 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::DBFile::CollectionAdaptor
+
+=head1 SYNOPSIS
+
+For use with a Bio::EnsEMBL::Collector e.g.
+
+    package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor;
+
+    @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor 
+              Bio::EnsEMBL::Funcgen::Collector::ResultFeature 
+              Bio::EnsEMBL::DBFile::CollectionAdaptor);
+    #DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor
+
+
+Fetch wrapper methods access file based data via read_collection_blob:
+
+    sub _fetch_from_file_by_Slice_ResultSet{
+
+	    #define filepath/config
+
+        my $packed_scores =  $self->read_collection_blob(
+		    										   $filepath,
+			    									   $efg_sr_id,
+				    								   $conf->{$window_size}{'byte_offset'},
+					    							   $conf->{$window_size}{'byte_length'},
+						    						  );
+
+        #Do unpacking and object creation here
+
+    }
+
+=head1 DESCRIPTION
+
+Adaptor for direct collection(.col) file access, which are binary compressed fixed 
+width format files providing window based values across the genome. Collection files
+integrate an index block which contains seq_region byte offset values.
+
+NOTE: By default all collection files are generated and packed using little endian encoding. 
+Due to the lack of a standard for float encoding (wrt endianness), perl packs floats using the 
+implicit endianness of the underlying architecture. This means that accessing float
+collection files located on a big endian architecture will produce unexpected results.
+
+#endian issues will disappear with knetfile xsubs
+
+=head1 SEE ALSO
+
+Bio::EnsEMBL::DBFile::FileAdaptor
+
+=cut
+
+
+
+package Bio::EnsEMBL::DBFile::CollectionAdaptor;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::DBFile::FileAdaptor;
+use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor);
+
+
+=head2 initialise_filehandle
+
+  Arg[1]     : string  - filepath
+  Example    : $self->initialise_filehandle($filepath);
+  Description: Initialises the filehandle for use, in this case reads 
+               the index (seq_region offsets)
+  Returntype : None
+  Exceptions : warns if read fails
+  Caller     : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle
+  Status     : at risk
+
+=cut
+
+sub initialise_filehandle{
+  my ($self, $filepath) = @_;
+  my $fh = $self->{file_cache}{$filepath}{filehandle};
+
+  # Reads the index block at the head of the file and caches the seq_region
+  # offsets under $self->{file_cache}{$filepath}{off_sets}, returning that
+  # cache entry (left untouched if the index cannot be read).
+  # On any failure this warns and undefs the cached filehandle, as it is
+  # useless/unsafe to retry.
+
+  ### INDEX FORMAT ###
+  # First block of the index is the index size in bytes (not inc. the size
+  # block itself), encoded as v (2 bytes).
+  # Rest of index is a list of sr_id (v 2 bytes) key, offset (V 4 bytes)
+  # value pairs. Offsets include the length of the complete index block.
+
+  # sysread is used as it works explicitly in bytes and bypasses buffered I/O.
+  # It can't be mixed with seek due to I/O buffering differences, so any
+  # later positioning on this handle has to use sysseek.
+
+  my $record_size = 6;  # size of key (v 2 bytes) + size of offset (V 4 bytes)
+  my ($index_size, $index);
+
+  # sysread returns undef on error and 0 at end of file
+  my $read_bytes = sysread($fh, $index_size, 2);
+
+  if(! ((defined $read_bytes) && ($read_bytes == 2))){
+    warn "Failed to read index size from $filepath\n$!";
+    undef $self->{file_cache}{$filepath}{filehandle};
+    return $self->{file_cache}{$filepath}{off_sets};
+  }
+
+  ($index_size) = unpack('v', $index_size);
+  $read_bytes   = sysread($fh, $index, $index_size);  # Now read index proper
+
+  if(! ((defined $read_bytes) && ($read_bytes == $index_size))){
+    warn "Failed to read index from $filepath\n$!";
+    undef $self->{file_cache}{$filepath}{filehandle};
+  }
+  elsif($index_size % $record_size){
+    # A partial record would put a fractional repeat count in the unpack
+    # template and yield corrupt offsets, so reject the index instead
+    warn "Index size($index_size) is not a multiple of $record_size bytes in $filepath\n";
+    undef $self->{file_cache}{$filepath}{filehandle};
+  }
+  else{
+    # Number of key-value pairs => index size / record size
+    my $num_keys     = $index_size / $record_size;
+    my %offset_index = unpack('(vV)'.$num_keys, $index);
+    $self->{file_cache}{$filepath}{off_sets} = \%offset_index;
+  }
+
+  return $self->{file_cache}{$filepath}{off_sets};
+}
+
+
+=head2 read_collection_blob
+
+  Arg[1]     : string - filepath
+  Arg[2]     : int    - seq_region_id
+  Arg[3]     : int    - seq_region offset. The byte offset required to
+                        locate the required start position
+  Arg[4]     : int    - byte length to read
+  Example    : my $blob_substr = $self->read_collection_blob($filepath,
+                                                             $sr_key,
+                                                             $sr_offset,
+                                                             $byte_length);
+  Description: Reads bytes from file given a seq_region_key, byte offset and byte length.
+               Sets filehandle to undef if read fails.
+  Returntype : string - packed binary data
+  Exceptions : warns if seek or read errors
+  Caller     : general e.g. fetch_from_file_by_Slice_ResultSet
+  Status     : at risk
+
+=cut
+
+# We could change this to take a Slice, hence we could check 
+# whether an EOF error is because the slice is out of range 
+# and undef only if it is in range i.e. the index/file is corrupt
+# overkill?
+# This is something the Slice API should warn about
+# but will still cause undef'd filehandle here
+# Index should also contain ends, so we can validate whether the slice is out of range???
+
+
+sub read_collection_blob{
+  my($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_;
+
+  # undef is returned if there is no usable filehandle, the seq_region is not
+  # in the index, the seek fails, or the read errors or comes up short.
+  # A read which immediately hits end of file returns the empty string.
+  my $blob_substr;
+  my $fh = $self->get_filehandle($filepath, {-binmode => 1});
+  return $blob_substr if ! defined $fh;
+
+  #Return from query cache here?
+  #cache key = "$filepath:$key:$sr_offset:$byte_length"
+
+  # seq_regions which are not part of the index are skipped without warning
+  if(! exists $self->{file_cache}{$filepath}{off_sets}{$sr_key}){
+    return $blob_substr;
+  }
+
+  # Index offset locates the seq_region, $sr_offset the position within it
+  my $sr_start     = $self->{file_cache}{$filepath}{off_sets}{$sr_key};
+  my $total_offset = $sr_start + $sr_offset;
+  my $seeked       = sysseek($fh, $total_offset, 0);#0(whence) is SEEK_SET.
+
+  if(! $seeked){
+    warn("Failed to seek to byte $total_offset in $filepath");
+    #Don't undef fh here as this valid Slice maybe out of range
+    #and we don't want to kill a valid fh
+    #i.e. Slice start/end is past end of seq_region
+    return $blob_substr;
+  }
+
+  my $read_bytes = sysread($fh, $blob_substr, $byte_length);
+
+  if(! ((defined $read_bytes) && ($read_bytes == $byte_length))){
+    warn "Failed to read from $filepath\n$!";
+
+    # sysread returns undef on error and 0 at end of file. These have to be
+    # told apart explicitly, as undef == 0 is true and would otherwise
+    # report a genuine read error as a harmless end of file.
+    if((defined $read_bytes) && ($read_bytes == 0)){
+      #This maybe because the slice is out of range!
+      #The API gives no warning about this
+      #Keep the fh, $blob_substr is returned as the empty string set by sysread
+      warn "End Of File encountered\n";
+      warn "Total offset:\t".$sr_start."  key($sr_key)  + $sr_offset = $total_offset\n";
+
+      #add some checks against the theoretical/true length of the file?
+    }
+    else{
+      #Read error or short read: delete fh as it is useless/unsafe to retry
+      undef $self->{file_cache}{$filepath}{filehandle};
+      undef $blob_substr;
+    }
+  }
+
+  return $blob_substr;
+}
+
+
+1;
diff --git a/modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm b/modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm
new file mode 100755
index 0000000000..8f905ae485
--- /dev/null
+++ b/modules/Bio/EnsEMBL/DBFile/FileAdaptor.pm
@@ -0,0 +1,222 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::DBFile::FileAdaptor - Base Adaptor for direct file access
+
+=head1 SYNOPSIS
+
+
+
+=head1 DESCRIPTION
+
+Basic wrapper class to provide access to file based data.
+
+This is primarily aimed at indexed Collection(.col) files which are optimised for Slice 
+based queries. Collections store fixed width/windowed data as BLOBS.  This makes 
+it possible to seek to the required location given slice coordinates and read only 
+the required amount of data covering the slice.
+
+Currently only works as hybrid DBAdaptor e.g. ResultFeatureAdaptor which inherits both from 
+here and BaseFeatureAdaptor.
+
+=head1 SEE ALSO
+
+
+=cut
+
+
+
+package Bio::EnsEMBL::DBFile::FileAdaptor;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
+use strict;
+use warnings;
+
+
+=head2 get_filehandle
+
+  Arg[1]     : string     - filepath
+  Arg[2]     : HASHREF    - Optional params, see open_file
+  Example    : my $fh     = $self->get_filehandle($filepath, {-binmode => 1});
+  Description: Gets and caches a simple file handle.
+  Returntype : GLOB/undef - filehandle
+  Exceptions : warns if cache entry exists but is not defined 
+  Caller     : general
+  Status     : at risk
+
+=cut
+
+sub get_filehandle{
+  my ($self, $filepath, $params_hash) = @_;
+
+  # Returns the cached filehandle for $filepath, opening and caching it on
+  # first use. Returns undef if the open failed or the handle was invalidated.
+  # Work on a shallow copy, so the default file operator is never written
+  # back into the caller's params hash
+  my %params  = %{$params_hash || {}};
+  my $file_op = '<';
+
+  if(exists $params{-file_operator}){
+    $file_op = $params{-file_operator};
+  }else{
+    $params{-file_operator} = $file_op;
+  }
+
+  if(! exists $self->{file_cache}{$filepath}{filehandle}){
+    my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, \%params);
+
+    if(defined $fh){
+      $self->{file_cache}{$filepath}{filehandle} = $fh;
+      $self->initialise_filehandle($filepath) if($file_op eq '<');# read handles only
+    }
+  }
+  elsif(! defined $self->{file_cache}{$filepath}{filehandle}){
+    #This maybe one of several read/seek errors which will have already been warned
+    warn "Encountered and error with file handle for $filepath\n";
+  }
+  #TODO check cached file op, so an open fh is not reused with a different operator
+  return $self->{file_cache}{$filepath}{filehandle};
+}
+
+
+=head2 open_file
+
+  Arg[1]     : string     - filepath
+  Arg[2]     : HASHREF    - Optional params:
+                                 -binmode       => 0|1,   # Boolean i.e. treat file as binary
+                                 -file_operator => '>'    # Default is '<'
+                                #-perms_octal   =>  # Requires FileHandle
+  Example    : my $fh     = $self->open_file($filepath, {-binmode => 1, -file_operator => '>'});
+  Description: Opens a file for reading or writing.
+  Returntype : GLOB/undef - filehandle
+  Exceptions : warns if file open fails
+               warns if file operator unsupported
+               warns if failed to set binmode
+  Caller     : general
+  Status     : at risk
+
+=cut
+
+sub open_file{
+  my ($self, $filepath, $params_hash) = @_;
+
+  #Validate params_hash?
+  #rearrange? Will not warn/throw for invalid keys?
+  #perms octal, requires FileHandle? See EFGUtils::open_file
+
+  my $file_op = $params_hash->{-file_operator} || '<';
+
+  if(($file_op ne '<') &&
+     ($file_op ne '>') &&
+     ($file_op ne '>>')){
+    #throw rather than warn as this is a code bug
+    throw("Cannot perform open with unsupported operator:\t${file_op}${filepath}");
+  }
+
+  # 3 arg open keeps the mode separate from the path, so whitespace or
+  # mode/pipe characters in $filepath are taken literally, not parsed by open
+  my $fh;
+  my $success = open($fh, $file_op, $filepath);
+
+  if(! $success){
+    #$fh may still hold a GLOB on fail, so make sure undef is returned
+    undef $fh;
+    warn "Failed to open:\t$filepath\n$!\n";
+  }
+  elsif($params_hash->{-binmode}){
+    $success = binmode $fh;
+
+    if(! $success){
+      warn "Failed to set binmode:\t$filepath\n$!";
+      undef $fh;
+    }
+  }
+
+  return $fh;
+}
+
+
+=head2 validate_file_length
+
+  Arg[1]     : string  - filepath
+  Arg[2]     : int     - expected length in bytes
+  Example    : $self->validate_file_length($filepath, $expected_length);
+  Description: Utility method which can be used during file creation
+  Returntype : None
+  Exceptions : warns if file open fails
+               throws if file is not expected length
+  Caller     : general
+  Status     : at risk - change to seek to account for 'logical characters'
+
+=cut
+
+sub validate_file_length{
+  my ($self, $filepath, $expected_length, $binmode) = @_;
+
+  #Currently not using cache as we rarely want to
+  #use the file handle afterwards
+
+  #THIS WAS USING EFGUtils::open_file imported in the Collector::ResultFeature!!!!
+  #which is just a sub not a class method, and is in a parallel inheritance path
+  #No warnings about redefining method :(
+  #Force use of FileAdaptor::open_file
+  my $fh = $self->Bio::EnsEMBL::DBFile::FileAdaptor::open_file($filepath, {-binmode => $binmode});
+
+  #open_file has already warned with the reason, fail clearly here rather
+  #than letting sysseek die on an undefined filehandle
+  throw("Failed to open file for length validation:\t$filepath\n") if ! defined $fh;
+
+  #sysseek always returns the position in bytes. There is no systell
+  #function, so seeking to SEEK_END gives the file length
+  my $seeked_bytes = sysseek($fh, 0, 2);# 2 is SEEK_END
+  if(! defined $seeked_bytes){
+    throw("Failed to seek to end of file:\t$filepath\n$!");
+  }
+  elsif($seeked_bytes < $expected_length){
+    throw("File is shorter($seeked_bytes) than expected($expected_length):\t$filepath\n");
+  }
+  elsif($seeked_bytes > $expected_length){
+    throw("File is longer($seeked_bytes) than expected($expected_length):\t$filepath\n");
+  }
+
+  return;
+}
+
+
+
+
+
+### STUB/TEMPLATE METHODS ###
+#
+#   If required these should be over-ridden in the 
+#   descendant FileAdaptor e.g. CollectionAdaptor
+#   Listed here for visibility (rather than 
+#   using 'can')
+
+
+sub initialise_filehandle{ # No-op default hook, called by get_filehandle for read handles
+  return ();
+}
+
+
+
+1;
-- 
GitLab