ENSCORESW-125. First commit of flatfile dumper. Being committed as a way of avoiding losing code

fcd93104 · Andy Yates · 61a732c5 · fcd93104 · fcd93104 · fcd93104
Commit fcd93104 authored 12 years ago by Andy Yates
--- a/modules/Bio/EnsEMBL/Pipeline/Base.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/Base.pm
+package Bio::EnsEMBL::Pipeline::Base;
+
+use strict;
+use warnings;
+use base qw/Bio::EnsEMBL::Hive::Process/;
+
+use Bio::EnsEMBL::Utils::Exception qw/throw/;
+use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
+use File::Find;
+use File::Spec;
+use File::Path qw/mkpath/;
+use POSIX qw/strftime/;
+
+# Restores a parameter to its default when the caller supplied an empty array.
+#
+# Takes the parameter name in $key. When both the current value of
+# $self->param($key) and the matching entry of $self->param_defaults() are
+# array references, and the current array holds no elements, the parameter is
+# overwritten with the default. Any other combination leaves it untouched.
+# Always returns nothing.
+sub reset_empty_array_param {
+  my ($self, $key) = @_;
+  my $current = $self->param($key);
+  my $replacement = $self->param_defaults()->{$key};
+  return unless check_ref($current, 'ARRAY') && check_ref($replacement, 'ARRAY');
+  if(! @{$current}) {
+    $self->fine('Resetting param %s because the given array was empty', $key);
+    $self->param($key, $replacement);
+  }
+  return;
+}
+
+# Fetches the 'toplevel' slices for the current species and returns them as
+# an array reference ordered by ascending length. $type is passed through to
+# get_DBAdaptor(). Throws when no DBAdaptor is available.
+sub get_Slices {
+  my ($self, $type) = @_;
+  my $dba = $self->get_DBAdaptor($type);
+  if(! $dba) {
+    throw "Cannot get a DB adaptor";
+  }
+  my $slice_adaptor = $dba->get_SliceAdaptor();
+  my $toplevel = $slice_adaptor->fetch_all('toplevel', undef, 1, undef, undef);
+  my @by_length = sort { $a->length() <=> $b->length() } @{$toplevel};
+  return \@by_length;
+}
+
+# Registry is loaded by Hive (see beekeeper_extra_cmdline_options() in conf)
+#
+# Asks the Registry for the DBAdaptor of the species named by the 'species'
+# parameter. $type selects the database group; a false value means 'core'.
+sub get_DBAdaptor {
+  my ($self, $type) = @_;
+  my $group = $type || 'core';
+  my $species = $self->param('species');
+  return Bio::EnsEMBL::Registry->get_DBAdaptor($species, $group);
+}
+
+# Clears the caches of the DBAdaptor selected by $type (see get_DBAdaptor())
+# and then disconnects its database connection if that connection is idle.
+sub cleanup_DBAdaptor {
+  my ($self, $type) = @_;
+  my $adaptor = $self->get_DBAdaptor($type);
+  $adaptor->clear_caches();
+  $adaptor->dbc()->disconnect_if_idle();
+  return;
+}
+
+# Joins the 'base_path' parameter with @extras into a directory path, creates
+# that directory with mkpath() and returns the path.
+sub get_dir {
+  my ($self, @extras) = @_;
+  my $root = $self->param('base_path');
+  my @segments = ($root, @extras);
+  my $target = File::Spec->catdir(@segments);
+  mkpath($target);
+  return $target;
+}
+
+# Returns the production name (see production_name()) with its first
+# character upper-cased. The meta-key lookup below is disabled for now.
+sub web_name {
+  my ($self) = @_;
+#  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
+#  my $name = $mc->single_value_by_key('species.url'); # change back
+  return ucfirst($self->production_name());
+}
+
+# Reads the scientific name from the MetaContainer of the default DBAdaptor,
+# disconnects the connection if idle, and returns the name.
+sub scientific_name {
+  my ($self) = @_;
+  my $adaptor = $self->get_DBAdaptor();
+  my $scientific = $adaptor->get_MetaContainer()->get_scientific_name();
+  $adaptor->dbc()->disconnect_if_idle();
+  return $scientific;
+}
+
+# Returns the version() of the first coordinate system handed back by the
+# CoordSystemAdaptor of the default DBAdaptor.
+sub assembly {
+  my ($self) = @_;
+  my $coord_systems = $self->get_DBAdaptor()->get_CoordSystemAdaptor()->fetch_all();
+  my $first = $coord_systems->[0];
+  return $first->version();
+}
+
+# Returns the production name stored in the MetaContainer. When $name is true
+# the core DBAdaptor registered under that name is used, otherwise the
+# adaptor from get_DBAdaptor(). The connection is disconnected if idle
+# before returning.
+sub production_name {
+  my ($self, $name) = @_;
+  my $adaptor = $name
+    ? Bio::EnsEMBL::Registry->get_DBAdaptor($name, 'core')
+    : $self->get_DBAdaptor();
+  my $production = $adaptor->get_MetaContainer()->get_production_name();
+  $adaptor->dbc()->disconnect_if_idle();
+  return $production;
+}
+
+# Closes the file handle; if nothing was written (tell() is 0) the file at
+# $path is deleted as well.
+# Returns 1 when the empty file was removed and 0 when the file was kept.
+
+sub tidy_file_handle {
+  my ($self, $fh, $path) = @_;
+  if ($fh->tell() != 0) {
+    close($fh);
+    return 0;
+  }
+  $fh->close;
+  unlink($path);
+  return 1;
+}
+
+# Prints a timestamped "INFO:" line to STDERR when debug() is above 1.
+# $msg is used as a sprintf format when @params is non-empty and verbatim
+# otherwise. Returns nothing.
+sub info {
+  my ($self, $msg, @params) = @_;
+  return unless $self->debug() > 1;
+  my $text = scalar(@params) ? sprintf($msg, @params) : $msg;
+  printf STDERR "INFO: %s %s\n", strftime('%c',localtime()), $text;
+  return;
+}
+
+# Prints a timestamped "FINE:" line to STDERR when debug() is above 2.
+# $msg is used as a sprintf format when @params is non-empty and verbatim
+# otherwise. Returns nothing.
+sub fine {
+  my ($self, $msg, @params) = @_;
+  return unless $self->debug() > 2;
+  my $text = scalar(@params) ? sprintf($msg, @params) : $msg;
+  printf STDERR "FINE: %s %s\n", strftime('%c',localtime()), $text;
+  return;
+}
+
+# Walks $dir with File::Find and collects the full path of every entry for
+# which $boolean_callback, called with the entry's $_ value, returns true.
+# Throws when $dir is not a directory. Returns an array reference of paths.
+sub find_files {
+  my ($self, $dir, $boolean_callback) = @_;
+  if(! -d $dir) {
+    $self->throw("Cannot find path $dir");
+  }
+  my @matches;
+  my $visitor = sub {
+    my $full_path = $File::Find::name;
+    push(@matches, $full_path) if $boolean_callback->($_);
+  };
+  find($visitor, $dir);
+  return \@matches;
+}
+
+1;
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/ChecksumGenerator.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/ChecksumGenerator.pm
@@ -20,7 +20,7 @@

 =head1 NAME

-Bio::EnsEMBL::Pipeline::FASTA::ChecksumGenerator
+Bio::EnsEMBL::Pipeline::ChecksumGenerator

 =head1 DESCRIPTION

@@ -40,12 +40,12 @@ Allowed parameters are:

 =cut

-package Bio::EnsEMBL::Pipeline::FASTA::ChecksumGenerator;
+package Bio::EnsEMBL::Pipeline::ChecksumGenerator;

 use strict;
 use warnings;

-use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;
+use base qw/Bio::EnsEMBL::Pipeline::Base/;

 use File::Spec;
 use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;

--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm
@@ -2,95 +2,15 @@ package Bio::EnsEMBL::Pipeline::FASTA::Base;

 use strict;
 use warnings;
-use base qw/Bio::EnsEMBL::Hive::Process/;
+use base qw/Bio::EnsEMBL::Pipeline::Base/;

-use Bio::EnsEMBL::Utils::Exception qw/throw/;
-use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
-use File::Find;
 use File::Spec;
-use File::Path qw/mkpath/;
-use POSIX qw/strftime/;
-
-# Takes in a key, checks if the current $self->param() was an empty array
-# and replaces it with the value from $self->param_defaults()
-sub reset_empty_array_param {
-  my ($self, $key) = @_;
-  my $param_defaults = $self->param_defaults();
-  my $current = $self->param($key); 
-  my $replacement = $self->param_defaults()->{$key};
-  if(check_ref($current, 'ARRAY') && check_ref($replacement, 'ARRAY')) {
-    if(! @{$current}) {
-      $self->fine('Restting param %s because the given array was empty', $key);
-      $self->param($key, $replacement);
-    }
-  }
-  return;
-}
-
-sub get_Slices {
-  my ($self, $type) = @_;
-  my $sa = $self->get_DBAdaptor($type)->get_SliceAdaptor();
-  return [ sort { $a->length() <=> $b->length() }  @{$sa->fetch_all('toplevel', undef, 1, undef, undef)} ];
-}
-
-# Registry is loaded by Hive (see beekeeper_extra_cmdline_options() in conf)
-sub get_DBAdaptor {
-  my ($self, $type) = @_;
-  my $species = $self->param('species');
-  $type ||= 'core';
-  return Bio::EnsEMBL::Registry->get_DBAdaptor($species, $type);
-}
-
-sub cleanup_DBAdaptor {
-  my ($self, $type) = @_;
-  my $dba = $self->get_DBAdaptor($type);
-  $dba->clear_caches;
-  $dba->dbc->disconnect_if_idle;
-  return;
-}
-
-sub get_dir {
-  my ($self, @extras) = @_;
-  my $base_dir = $self->param('base_path');
-  my $dir = File::Spec->catdir($base_dir, @extras);
-  mkpath($dir);
-  return $dir;
-}

 sub fasta_path {
  my ( $self, @extras ) = @_;
  return $self->get_dir('fasta', $self->param('species'), @extras);
 }

-sub web_name {
-  my ($self) = @_;
-#  my $mc = $self->get_DBAdaptor()->get_MetaContainer();
-#  my $name = $mc->single_value_by_key('species.url'); # change back
-  my $name = ucfirst($self->production_name());
-  return $name;
-}
-
-sub assembly {
-  my ($self) = @_;
-  my $dba = $self->get_DBAdaptor();
-  return $dba->get_CoordSystemAdaptor()->fetch_all()->[0]->version();
-}
-
-sub production_name {
-  my ($self, $name) = @_;
-  my $dba;
-  if($name) {
-    $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($name, 'core');
-  }
-  else {
-    $dba = $self->get_DBAdaptor();
-  }
-  my $mc = $dba->get_MetaContainer();
-  my $prod = $mc->get_production_name();
-  $dba->dbc()->disconnect_if_idle();
-  return $prod;
-}
-
 sub old_path {
  my ($self, $species) = @_;
  my $base = $self->param('ftp_dir');
@@ -99,61 +19,4 @@ sub old_path {
  my $dir = File::Spec->catdir($base, "release-$release", 'fasta', $prod, 'dna');
 }

-# Closes file handle, and deletes the file stub if it contains no data
-# Returns success type
-
-sub tidy_file_handle {
-  my ($self, $fh, $path) = @_;
-  if ($fh->tell() == 0) {
-    $fh->close;
-    unlink($path);
-    return 1;
-  }
-  close($fh);
-  return 0;
-}
-
-sub info {
-  my ($self, $msg, @params) = @_;
-  if ($self->debug() > 1) {
-    my $formatted_msg;
-    if(scalar(@params)) {
-      $formatted_msg = sprintf($msg, @params);
-    } 
-    else {
-      $formatted_msg = $msg;
-    }
-    printf STDERR "INFO: %s %s\n", strftime('%c',localtime()), $formatted_msg;
-  }
-  return
-}
-
-sub fine {
-  my ($self, $msg, @params) = @_;
-  if ($self->debug() > 2) {
-    my $formatted_msg;
-    if(scalar(@params)) {
-      $formatted_msg = sprintf($msg, @params);
-    } 
-    else {
-      $formatted_msg = $msg;
-    }
-    printf STDERR "FINE: %s %s\n", strftime('%c',localtime()), $formatted_msg;
-  }
-  return
-}
-
-sub find_files {
-  my ($self, $dir, $boolean_callback) = @_;
-  $self->throw("Cannot find path $dir") unless -d $dir;
-  my @files;
-  find(sub {
-    my $path = $File::Find::name;
-    if($boolean_callback->($_)) {
-      push(@files, $path);
-    }
-  }, $dir);
-  return \@files;
-}
-
 1;
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/FindDirs.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/FindDirs.pm
@@ -44,52 +44,16 @@ package Bio::EnsEMBL::Pipeline::FASTA::FindDirs;
 use strict;
 use warnings;

-use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base Bio::EnsEMBL::Hive::RunnableDB::JobFactory/;
+use base qw/Bio::EnsEMBL::Pipeline::FindDirs Bio::EnsEMBL::Pipeline::FASTA::Base/;

 use File::Spec;

 sub fetch_input {
  my ($self) = @_;
  $self->throw("No 'species' parameter specified") unless $self->param('species');
-  my $dirs = $self->dirs();
-  $self->param('inputlist', $dirs);
+  $self->param('path', $self->fasta_path());
+  $self->SUPER::fetch_input();
  return;
 }

-sub run {
-  my ($self) = @_;
-  Bio::EnsEMBL::Hive::RunnableDB::JobFactory::run($self);
-  return;
-}
-
-sub write_output {
-  my ($self) = @_;
-  Bio::EnsEMBL::Hive::RunnableDB::JobFactory::write_output($self);
-  return;
-}
-
-sub dirs {
-  my ($self) = @_;
-  
-  my @dirs;
-  
-  my $dir = $self->fasta_path();
-  $self->info('Searching directory %s', $dir);
-
-  opendir(my $dh, $dir) or die "Cannot open directory $dir";
-  my @files = sort { $a cmp $b } readdir($dh);
-  closedir($dh) or die "Cannot close directory $dir";
-
-  foreach my $file (@files) {
-    next if $file =~ /^\./;         #hidden file or up/current dir
-    my $path = File::Spec->catdir($dir, $file);
-    if(-d $path) {
-      $self->fine('Adding %s to the list of found dirs', $path);
-      push(@dirs, $path);
-    }
-  }
-  
-  return \@dirs;
-}
-
 1;
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/SpeciesFactory.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/SpeciesFactory.pm
@@ -67,35 +67,22 @@ package Bio::EnsEMBL::Pipeline::FASTA::SpeciesFactory;
 use strict;
 use warnings;

-use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;
+use base qw/Bio::EnsEMBL::Pipeline::SpeciesFactory/;

 use Bio::EnsEMBL::Registry;

 sub param_defaults {
  my ($self) = @_;
  return {
+    %{$self->SUPER::param_defaults()},
    sequence_type_list => [qw/dna cdna ncrna/],
-    db_types => [qw/core/],
-    species => []
  };
 }

 sub fetch_input {
  my ($self) = @_;
-  
+  $self->SUPER::fetch_input();
  $self->reset_empty_array_param('sequence_type_list');
-  $self->reset_empty_array_param('db_types');
-  
-  my $core_dbas = $self->get_DBAdaptors();
-  $self->info('Found %d core DBAdaptor(s) to process', scalar(@{$core_dbas}));
-  $self->param('dbas', $core_dbas);
-  
-  my %species_lookup = 
-    map { $_ => 1 } 
-    map { Bio::EnsEMBL::Registry->get_alias($_)  } 
-    @{$self->param('species')};
-  $self->param('species_lookup', \%species_lookup);
-  
  my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
  $self->param('sequence_types', \%sequence_types);
  
@@ -139,48 +126,6 @@ sub write_output {
  return;
 }

-sub get_DBAdaptors {
-  my ($self) = @_;
-  return Bio::EnsEMBL::Registry->get_all_DBAdaptors(-GROUP => 'core');
-}
-
-sub do_flow {
-  my ($self, $key) = @_;
-  my $targets = $self->param($key);
-  foreach my $entry (@{$targets}) {
-    my ($input_id, $flow) = @{$entry};
-    $self->fine('Flowing %s to %d for %s', $input_id->{species}, $flow, $key);
-    $self->dataflow_output_id($input_id, $flow);
-  }
-  return;
-}
-
-sub process_dba {
-  my ($self, $dba) = @_;
-  
-  #Reject if DB was ancestral sequences
-  return 0 if $dba->species() =~ /ancestral/i;
-  
-  #If species is defined then make sure we only allow those species through
-  if(@{$self->param('species')}) {
-    my $lookup = $self->param('species_lookup');
-    my $name = $dba->species();
-    my $aliases = Bio::EnsEMBL::Registry->get_all_aliases($name);
-    push(@{$aliases}, $name);
-    my $found = 0;
-    foreach my $alias (@{$aliases}) {
-      if($lookup->{$alias}) {
-        $found = 1;
-        last;
-      }
-    }
-    return $found;
-  }
-  
-  #Otherwise just accept
-  return 1;
-}
-
 # return 0 if we do not want to do any flowing otherwise return 2

 sub dna_flow {
@@ -218,9 +163,4 @@ sub input_id {
  return $input_id;
 }

-sub db_types {
-  my ($self, $dba) = @_;
-  return $self->param('db_types');
-}
-
 1;
--- a/modules/Bio/EnsEMBL/Pipeline/FindDirs.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FindDirs.pm
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::FindDirs
+
+=head1 DESCRIPTION
+
+Finds all directories under the given path.
+
+Allowed parameters are:
+
+=over 8
+
+=item path - The path to search
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::FindDirs;
+
+use strict;
+use warnings;
+
+# Bio::EnsEMBL::Pipeline::Base is inherited for the info() and fine() logging
+# methods which dirs() calls; without it this module only works when a
+# subclass happens to mix that class in.
+use base qw/Bio::EnsEMBL::Hive::RunnableDB::JobFactory Bio::EnsEMBL::Pipeline::Base/;
+
+use File::Spec;
+
+# Validates that the 'path' parameter is set, then stores the directories
+# found by dirs() in the 'inputlist' parameter.
+# Throws when 'path' is missing or false.
+sub fetch_input {
+  my ($self) = @_;
+  $self->throw("No 'path' parameter specified") unless $self->param('path');
+  my $dirs = $self->dirs();
+  $self->param('inputlist', $dirs);
+  return;
+}
+
+# Lists the immediate sub-directories of the 'path' parameter, skipping any
+# entry whose name starts with a dot. Entries are visited in string-sorted
+# order. Dies if the directory cannot be opened or closed.
+# Returns an array reference of directory paths.
+sub dirs {
+  my ($self) = @_;
+
+  my $dir = $self->param('path');
+  $self->info('Searching directory %s', $dir);
+
+  opendir(my $dh, $dir) or die "Cannot open directory $dir";
+  my @entries = sort { $a cmp $b } readdir($dh);
+  closedir($dh) or die "Cannot close directory $dir";
+
+  my @found;
+  for my $entry (@entries) {
+    next if $entry =~ /^\./;         #hidden file or up/current dir
+    my $candidate = File::Spec->catdir($dir, $entry);
+    next unless -d $candidate;
+    $self->fine('Adding %s to the list of found dirs', $candidate);
+    push(@found, $candidate);
+  }
+
+  return \@found;
+}
+
+1;
--- a/modules/Bio/EnsEMBL/Pipeline/Flatfile/Base.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/Flatfile/Base.pm
+package Bio::EnsEMBL::Pipeline::Flatfile::Base;
+
+use strict;
+use warnings;
+use base qw/Bio::EnsEMBL::Pipeline::Base/;
+
+# Returns the directory for the current 'type' and 'species' parameters, as
+# built (and created) by get_dir().
+sub data_path {
+  my ($self) = @_;
+  my @parts = ($self->param('type'), $self->param('species'));
+  return $self->get_dir(@parts);
+}
+
+1;
--- a/modules/Bio/EnsEMBL/Pipeline/Flatfile/ChecksumGenerator.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/Flatfile/ChecksumGenerator.pm
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator
+
+=head1 DESCRIPTION
+
+Creates a CHECKSUMS file in the given directory which is produced from running
+the sum command over every file in the directory. This excludes the CHECKSUMS
+file, parent directory or any hidden files.
+
+Allowed parameters are:
+
+=over 8
+
+=item species - Species to work with
+
+=item type - Type of data to work with
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator;
+
+use strict;
+use warnings;
+
+use base qw/Bio::EnsEMBL::Pipeline::ChecksumGenerator Bio::EnsEMBL::Pipeline::Flatfile::Base/;
+
+# Requires the 'species' and 'type' parameters, points the 'dir' parameter
+# at data_path() and then defers to the parent class's fetch_input().
+sub fetch_input {
+  my ($self) = @_;
+  if(! $self->param('species')) {
+    $self->throw("No 'species' parameter specified");
+  }
+  if(! $self->param('type')) {
+    $self->throw("No 'type' parameter specified");
+  }
+  $self->param('dir', $self->data_path());
+  $self->SUPER::fetch_input();
+  return;
+}
+
+1;
--- a/modules/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/Flatfile/DumpFile.pm
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::Flatfile::DumpFile
+
+=head1 DESCRIPTION
+
+The main workhorse of the Flatfile dumping pipeline.
+
+The script is responsible for creating the filenames of these target
+files, taking data from the database and the formatting of the flat files
+headers. The final files are all Gzipped at normal levels of compression.
+
+Allowed parameters are:
+
+=over 8
+
+=item species - The species to dump
+
+=item base_path - The base of the dumps
+
+=item release - The current release we are emitting
+
+=item type - The type of data we are emitting. Should be embl or genbank
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::Flatfile::DumpFile;
+
+use strict;
+use warnings;
+
+use base qw(Bio::EnsEMBL::Pipeline::Flatfile::Base);
+
+use Bio::EnsEMBL::Utils::Exception qw/throw/;
+use Bio::EnsEMBL::Utils::SeqDumper;
+use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file/;
+use File::Path qw/rmtree/;
+
+# Default parameters: 'supported_types' is a lookup of the dump formats that
+# fetch_input() accepts (embl and genbank).
+sub param_defaults {
+  my ($self) = @_;
+  my %supported = map { $_ => 1 } qw/embl genbank/;
+  return { supported_types => \%supported };
+}
+
+# Checks the job's parameters before run(): 'type' must be set and present
+# in 'supported_types', and 'species', 'release' and 'base_path' must all be
+# true. Throws on the first check that fails.
+sub fetch_input {
+  my ($self) = @_;
+
+  my $type = $self->param('type');
+  if(! $type) {
+    throw "No type specified";
+  }
+  if(! $self->param('supported_types')->{$type}) {
+    throw "Unsupported type '$type' specified";
+  }
+
+  $self->param('species') or throw "Need a species";
+  $self->param('release') or throw "Need a release";
+  $self->param('base_path') or throw "Need a base_path";
+
+  return;
+}
+
+# Dumps every toplevel slice of the species in the requested flat file format.
+#
+# The target directory from data_path() is removed first so that each run
+# starts clean. Slices for which is_chromosome() is false are written into a
+# single "nonchromosomal" file. Each remaining slice is written to a file
+# named after its coordinate system and sequence region; if that file already
+# exists the slice is appended to it.
+sub run {
+  my ($self) = @_;
+
+  # NOTE(review): data_path() goes through get_dir(), which creates the
+  # directory, so this test looks like it always succeeds - confirm.
+  my $root = $self->data_path();
+  if(-d $root) {
+    $self->info('Directory "%s" already exists; removing', $root);
+    rmtree($root);
+  }
+
+  # Name of the SeqDumper method to call, e.g. dump_embl
+  my $type = $self->param('type');
+  my $target = "dump_${type}";
+  my $seq_dumper = $self->_seq_dumper();
+
+  my (@chromosomes, @non_chromosomes);
+  foreach my $s (@{$self->get_Slices()}) {
+    if($s->is_chromosome()) {
+      push(@chromosomes, $s);
+    }
+    else {
+      push(@non_chromosomes, $s);
+    }
+  }
+
+  if(@non_chromosomes) {
+    my $path = $self->_generate_file_name('nonchromosomal');
+    $self->info('Dumping non-chromosomal data to %s', $path);
+    gz_work_with_file($path, 'w', sub {
+      my ($fh) = @_;
+      foreach my $slice (@non_chromosomes) {
+        $self->fine('Dumping non-chromosomal %s', $slice->name());
+        $seq_dumper->$target($slice, $fh);
+      }
+      return;
+    });
+  }
+  else {
+    $self->info('Did not find any non-chromosomal data');
+  }
+
+  foreach my $slice (@chromosomes) {
+    $self->fine('Dumping chromosome %s', $slice->name());
+    my $path = $self->_generate_file_name($slice->coord_system_name(), $slice->seq_region_name());
+    my $args = {};
+    # The directory was emptied above, so an existing file means an earlier
+    # slice in this loop produced the same file name.
+    if(-f $path) {
+      $self->fine('Path "%s" already exists; appending', $path);
+      $args->{Append} = 1;
+    }
+    gz_work_with_file($path, 'w', sub {
+      my ($fh) = @_;
+      $seq_dumper->$target($slice, $fh);
+      return;
+    }, $args);
+  }
+
+  return;
+}
+
+# Builds a SeqDumper with the similarity, genscan, variation and repeat
+# feature types disabled.
+sub _seq_dumper {
+  my ($self) = @_;
+  my $dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
+  foreach my $feature_type (qw/similarity genscan variation repeat/) {
+    $dumper->disable_feature_type($feature_type);
+  }
+  return $dumper;
+}
+
+# Builds the full path of a dump file inside data_path(). $section and $name
+# are optional and are only added to the file name when true.
+sub _generate_file_name {
+  my ($self, $section, $name) = @_;
+
+  # File name format looks like:
+  # <species>.<assembly>.<release>.<section.name|section>.dat.gz
+  # e.g. Homo_sapiens.GRCh37.64.chromosome.20.dat.gz
+  #      Homo_sapiens.GRCh37.64.nonchromosomal.dat.gz
+  my @name_bits = (
+    $self->web_name(),
+    $self->assembly(),
+    $self->param('release'),
+  );
+  push @name_bits, grep { $_ } ($section, $name);
+  push @name_bits, qw/dat gz/;
+
+  my $file_name = join( '.', @name_bits );
+  my $dir = $self->data_path();
+  return File::Spec->catfile($dir, $file_name);
+}
+
+# Writes a README file into data_path() describing the flat file dumps. The
+# text is filled in with the species' scientific name and the upper-cased
+# 'type' parameter.
+#
+# work_with_file() is called by its fully qualified name because this module
+# only imports gz_work_with_file from Bio::EnsEMBL::Utils::IO.
+# This method is not called from run() in this module.
+sub _create_README {
+  my ($self) = @_;
+  my $species = $self->scientific_name();
+  my $format = uc($self->param('type'));
+  
+  my $readme = <<README;
+#### README ####
+
+IMPORTANT: Please note you can download correlation data tables, 
+supported by Ensembl, via the highly customisable BioMart and 
+EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
+http://www.ebi.ac.uk/biomart/ for more information.
+
+-----------------------
+$format FLATFILE DUMPS
+-----------------------
+This directory contains $species $format flatfile dumps.  To ease 
+downloading of the files, the $format format entries are bundled 
+into groups of chromosomes and non-chromosomal regions.  
+All files are then compacted with gzip.
+
+Ensembl provides an automatic reannotation of $species genomic data.
+These data will be dumped in a number of forms - one of them being 
+$format flat files.  As the annotation of this form comes from Ensembl, 
+and not the original sequence entry, the two annotations are 
+likely to be different.
+
+$format flat file format dumping provides all the confirmed protein coding 
+genes known by Ensembl. Considerably more information is stored in Ensembl: 
+the flat file just gives a representation which is compatible with 
+existing tools.
+
+The main body of the entry gives the same information as is in the main 
+$format flat file entry.
+
+    * ID - the $format id
+    * AC - the EMBL/GenBank/DDBJ accession number (only the primary 
+           accession number used)
+    * SV - The accession.version pair which gives the exact reference to 
+           a particular sequence
+    * CC - comment lines to help you interpret the entry 
+
+Currently the following features are dumped into the feature table of 
+the Ensembl entry:
+
+    * Transcripts as CDS entries. Each transcript has the following 
+      attributes attached
+          o Transcript id - a stable id, which Ensembl will attempt to 
+            preserve as sensibly as possible during updates of the data
+          o Gene id - indication of the gene that this transcript belongs 
+            to. gene ids are stable and preserved as sensibly as possible 
+            during updates of the data
+          o Translation - the peptide translation of the transcript. 
+    * Exons as exon entries. Each exon has the following information
+          o Exon id. The exon id is stable and preserved as sensibly 
+            as possible during sequence updates
+          o start_phase. The phase of the splice site at the 5' end 
+            of the exon. Phase 0 means between two codons, phase 1 
+            means between the first and the second base of the codon 
+            (meaning that there are 2 bases until the reading frame of 
+            the exon) and phase 2 means between the second and the third 
+            base of the codon (one base until the reading frame starts).
+          o end_phase. The phase of the splice site at the 3' end of the 
+            exon: same definition as above (though of course, being end_phase, 
+            the position relative to the exon's reading frame is different 
+            for phase 1 and 2). 
+
+We are considering other information that should be made dumpable. In 
+general we would prefer people to use database access over flat file 
+access if you want to do something serious with the data. 
+
+README
+  
+  my $path = File::Spec->catfile($self->data_path(), 'README');
+  Bio::EnsEMBL::Utils::IO::work_with_file($path, 'w', sub {
+    my ($fh) = @_;
+    print $fh $readme;
+    return;
+  });
+  return;
+}
+
+
+1;
+
--- a/modules/Bio/EnsEMBL/Pipeline/Flatfile/FindDirs.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/Flatfile/FindDirs.pm
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::Flatfile::FindDirs
+
+=head1 DESCRIPTION
+
+Finds all directories under the given species directory. This is used to
+flow any further processing only dependent on the directory
+
+Allowed parameters are:
+
+=over 8
+
+=item species - The species to work with
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::Flatfile::FindDirs;
+
+use strict;
+use warnings;
+
+use base qw/Bio::EnsEMBL::Pipeline::FindDirs Bio::EnsEMBL::Pipeline::Flatfile::Base/;
+
+use File::Spec;
+
+# Requires the 'species' parameter, sets the 'path' parameter to data_path()
+# and then lets the parent class's fetch_input() search that path.
+# Throws when 'species' is missing or false.
+sub fetch_input {
+  my ($self) = @_;
+  $self->throw("No 'species' parameter specified") unless $self->param('species');
+  $self->param('path', $self->data_path());
+  $self->SUPER::fetch_input();
+  return;
+}
+
+1;
--- a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm
@@ -232,7 +232,7 @@ sub pipeline_analyses {
      
      {
        -logic_name => 'ChecksumGenerator',
-        -module     => 'Bio::EnsEMBL::Pipeline::FASTA::ChecksumGenerator',
+        -module     => 'Bio::EnsEMBL::Pipeline::ChecksumGenerator',
        -hive_capacity => 10,
        -rc_id      => 1,
      },

--- a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/Flatfile_conf.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/Flatfile_conf.pm
+package Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf;
+
+use strict;
+use warnings;
+
+use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');
+
+use Bio::EnsEMBL::ApiVersion qw/software_version/;
+
+# Pipeline options and their defaults. 'registry' and 'base_path' have no
+# default here and must be supplied by the user. 'db_types' is declared
+# because pipeline_wide_parameters() reads it through $self->o().
+sub default_options {
+    my ($self) = @_;
+    
+    return {
+      # inherit other stuff from the base class
+      %{ $self->SUPER::default_options() }, 
+      
+      ### OVERRIDE
+      
+      #'registry' => 'Reg.pm', # default option to refer to Reg.pm, should be full path
+      #'base_path' => '', #where do you want your files
+      
+      ### Optional overrides        
+      species => [],
+      
+      # Empty by default; SpeciesFactory replaces an empty list with its own
+      # default via reset_empty_array_param('db_types')
+      db_types => [],
+      
+      release => software_version(),
+
+      types => [qw/embl genbank/],
+      
+      ### Defaults 
+      
+      pipeline_name => 'flatfile_dump_'.$self->o('release'),
+      
+      email => $self->o('ENV', 'USER').'@sanger.ac.uk',
+      
+    };
+}
+
+# Returns the pipeline creation commands; nothing is added to the list
+# inherited from the parent configuration.
+sub pipeline_create_commands {
+    my ($self) = @_;
+    # inheriting database and hive tables' creation
+    my @commands = @{$self->SUPER::pipeline_create_commands};
+    return \@commands;
+}
+
+## See diagram for pipeline structure
+##
+## Returns the list of analyses making up the flat file pipeline:
+## ScheduleSpecies -> DumpTypeFactory -> DumpFlatfile / ChecksumGenerator,
+## with Notify run once ChecksumGenerator has finished.
+sub pipeline_analyses {
+    my ($self) = @_;
+    
+    return [
+    
+      # Seeded with a single empty input id. Branch 2 goes to the type
+      # factory and branch 1 to the notification step.
+      {
+        -logic_name => 'ScheduleSpecies',
+        -module     => 'Bio::EnsEMBL::Pipeline::SpeciesFactory',
+        -parameters => {
+          species => $self->o('species')
+        },
+        -input_ids  => [ {} ],
+        -flow_into  => {
+          1 => 'Notify',
+          2 => ['DumpTypeFactory'],
+        },
+      },
+      
+      ######### DUMPING DATA
+      
+      # Fans out on branch 2 over the entries of the 'types' option,
+      # exposing each as 'type' next to the incoming 'species'. The same
+      # input id is sent to both the dumper and the checksum generator.
+      {
+        -logic_name => 'DumpTypeFactory',
+        -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
+        -parameters => {
+          column_names => ['type'],
+          inputlist => $self->o('types'),
+          input_id => { species => '#species#', type => '#type#' },
+          fan_branch_code => 2
+        },
+        -flow_into  => { 2 => ['DumpFlatfile', 'ChecksumGenerator'] },
+      },
+      
+      {
+        -logic_name => 'DumpFlatfile',
+        -module     => 'Bio::EnsEMBL::Pipeline::Flatfile::DumpFile',
+        -max_retry_count  => 1,
+        -hive_capacity    => 10,
+      },
+      
+      ####### CHECKSUMMING
+      
+      # Held back until the DumpFlatfile analysis is done
+      {
+        -logic_name => 'ChecksumGenerator',
+        -module     => 'Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator',
+        -wait_for   => [qw/DumpFlatfile/],
+        -hive_capacity => 10, 
+      },
+      
+      ####### NOTIFICATION
+      
+      # Held back until the ChecksumGenerator analysis is done
+      {
+        -logic_name => 'Notify',
+        -module     => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
+        -parameters => {
+          email   => $self->o('email'),
+          subject => $self->o('pipeline_name').' has finished',
+          text    => 'Your pipeline has finished. Please consult the hive output'
+        },
+        -wait_for   => ['ChecksumGenerator'],
+      }
+    
+    ];
+}
+
+# Adds the base_path, db_types and release options to the pipeline-wide
+# parameters inherited from the parent configuration.
+sub pipeline_wide_parameters {
+    my ($self) = @_;
+
+    # inherit other stuff from the base class
+    my $inherited = $self->SUPER::pipeline_wide_parameters();
+    my %parameters = (
+        %{ $inherited },
+        base_path => $self->o('base_path'),
+        db_types => $self->o('db_types'),
+        release => $self->o('release'),
+    );
+    return \%parameters;
+}
+
+# Overrides the default so every worker is started with -reg_conf pointing
+# at the 'registry' option, forcing the registry to be loaded automatically.
+sub beekeeper_extra_cmdline_options {
+    my ($self) = @_;
+    my $registry = $self->o("registry");
+    return "-reg_conf $registry";
+}
+
+# Declares the single resource class 0 ('default') together with its LSF
+# submission options.
+sub resource_classes {
+    my ($self) = @_;
+    my %default_class = (
+      -desc => 'default',
+      'LSF' => '-q normal -M4000000 -R"select[mem>4000] rusage[mem=4000]"',
+    );
+    return { 0 => \%default_class };
+}
+
+1;
--- a/modules/Bio/EnsEMBL/Pipeline/SpeciesFactory.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/SpeciesFactory.pm
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::SpeciesFactory
+
+=head1 DESCRIPTION
+
+A module which generates dump jobs for each species it finds in the Ensembl
+Registry. The species we run the code on can be controlled by specifying
+the I<species> parameter or by reducing the number of DBAdaptors loaded into
+the registry. 
+
+Allowed parameters are:
+
+=over 8
+
+=item species - Can be an array of species to perform dumps for or a single
+                species name. If specified only jobs will be created for
+                those species. Defaults to nothing so all species are processed
+
+=item db_types - Specify the types of database to dump. Defaults to core and
+                should be an array.
+
+=back
+
+The code flows once per species to branch 2.
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::SpeciesFactory;
+
+use strict;
+use warnings;
+
+use base qw/Bio::EnsEMBL::Pipeline::Base/;
+
+use Bio::EnsEMBL::Registry;
+
+# Default parameters: 'db_types' is the single group 'core' and 'species' is
+# an empty list.
+sub param_defaults {
+  my ($self) = @_;
+  my %defaults = (
+    db_types => ['core'],
+    species  => [],
+  );
+  return \%defaults;
+}
+
+# Prepares the parameters used by run():
+#   db_types       - reset to the default when given as an empty array
+#   species        - normalised to an array reference, so that a single
+#                    species name is accepted as well as a list
+#   dbas           - the DBAdaptors returned by get_DBAdaptors()
+#   species_lookup - hash keyed by the Registry alias of each requested
+#                    species
+sub fetch_input {
+  my ($self) = @_;
+  
+  $self->reset_empty_array_param('db_types');
+  
+  # The rest of the module dereferences 'species' as an array
+  my $species = $self->param('species');
+  if(! defined $species) {
+    $species = [];
+    $self->param('species', $species);
+  }
+  elsif(ref($species) ne 'ARRAY') {
+    $species = [$species];
+    $self->param('species', $species);
+  }
+  
+  my $core_dbas = $self->get_DBAdaptors();
+  $self->info('Found %d core DBAdaptor(s) to process', scalar(@{$core_dbas}));
+  $self->param('dbas', $core_dbas);
+  
+  my %species_lookup = 
+    map { $_ => 1 } 
+    map { Bio::EnsEMBL::Registry->get_alias($_)  } 
+    @{$species};
+  $self->param('species_lookup', \%species_lookup);
+  
+  return;
+}
+  
+# Builds one [input_id, branch] pair, on branch 2, for every DBAdaptor in
+# 'dbas' that process_dba() accepts.
+#
+# The pairs are stored back into the 'species' parameter, replacing the
+# species filter, which is where write_output() reads them from.
+sub run {
+  my ($self) = @_;
+  my @species;
+  foreach my $dba (@{$self->param('dbas')}) {
+    if(!$self->process_dba($dba)) {
+      $self->fine('Skipping %s', $dba->species());
+      next;
+    }
+    my $input_id = $self->input_id($dba);
+    push(@species, [ $input_id, 2 ]);
+  }
+  $self->param('species', \@species);
+  return;
+}
+
+# Flows every entry stored under the 'species' parameter (see do_flow()).
+sub write_output {
+  my $self = shift;
+  $self->do_flow('species');
+  return;
+}
+
+# Returns all DBAdaptors the Registry holds for the 'core' group.
+sub get_DBAdaptors {
+  my ($self) = @_;
+  my %criteria = (-GROUP => 'core');
+  return Bio::EnsEMBL::Registry->get_all_DBAdaptors(%criteria);
+}
+
+# Reads the list of [input_id, branch] pairs stored under the parameter $key
+# and sends each input id down its branch with dataflow_output_id().
+sub do_flow {
+  my ($self, $key) = @_;
+  for my $entry (@{ $self->param($key) }) {
+    my $input_id = $entry->[0];
+    my $flow = $entry->[1];
+    $self->fine('Flowing %s to %d for %s', $input_id->{species}, $flow, $key);
+    $self->dataflow_output_id($input_id, $flow);
+  }
+  return;
+}
+
+# Decides whether a DBAdaptor should be processed. Returns 0 for species
+# whose name matches /ancestral/i. With an empty 'species' parameter every
+# other adaptor is accepted (1); otherwise 1 is returned only when the
+# adaptor's species name or one of its Registry aliases is a key of
+# 'species_lookup'.
+sub process_dba {
+  my ($self, $dba) = @_;
+  
+  #Reject if DB was ancestral sequences
+  return 0 if $dba->species() =~ /ancestral/i;
+  
+  #Just accept when no species were asked for
+  return 1 unless @{$self->param('species')};
+  
+  #Otherwise make sure we only allow the requested species through
+  my $lookup = $self->param('species_lookup');
+  my $name = $dba->species();
+  my $aliases = Bio::EnsEMBL::Registry->get_all_aliases($name);
+  push(@{$aliases}, $name);
+  for my $candidate (@{$aliases}) {
+    return 1 if $lookup->{$candidate};
+  }
+  return 0;
+}
+
+# Builds the input id hash for one DBAdaptor: 'db_types' from db_types() and
+# 'species' from the adaptor's production name. $type is accepted but not
+# used here.
+sub input_id {
+  my ($self, $dba, $type) = @_;
+  my $meta = $dba->get_MetaContainer();
+  my %input_id = (
+    db_types => $self->db_types($dba),
+    species => $meta->get_production_name(),
+  );
+  return \%input_id;
+}
+
+# Returns the 'db_types' parameter. The DBAdaptor argument is accepted but
+# not consulted here.
+sub db_types {
+  my $self = shift;
+  return $self->param('db_types');
+}
+
+1;