Skip to content
Snippets Groups Projects
Commit fcd93104 authored by Andy Yates's avatar Andy Yates
Browse files

ENSCORESW-125. First commit of flatfile dumper. Being committed as a way of avoiding losing code

parent 61a732c5
No related branches found
No related tags found
No related merge requests found
Showing
with 927 additions and 244 deletions
package Bio::EnsEMBL::Pipeline::Base;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Hive::Process/;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
use File::Find;
use File::Spec;
use File::Path qw/mkpath/;
use POSIX qw/strftime/;
# Restores the default value of an array parameter that was supplied empty.
#
# Arguments : $key - name of the parameter to inspect
# Returns   : nothing
#
# The replacement only happens when both the current value of
# $self->param($key) and the entry for $key in $self->param_defaults() are
# array references and the current array holds no elements.
sub reset_empty_array_param {
    my ($self, $key) = @_;
    my $current = $self->param($key);
    # Fetch the defaults once; they used to be requested twice with one copy unused
    my $replacement = $self->param_defaults()->{$key};
    if (check_ref($current, 'ARRAY') && check_ref($replacement, 'ARRAY')) {
        if (!@{$current}) {
            $self->fine('Resetting param %s because the given array was empty', $key);
            $self->param($key, $replacement);
        }
    }
    return;
}
# Fetches every toplevel slice for the requested database type, ordered
# from the shortest to the longest.
#
# Arguments : $type - optional database group, forwarded to get_DBAdaptor()
# Returns   : array reference of slices sorted by ascending length()
# Throws    : when no DB adaptor could be obtained
sub get_Slices {
    my ($self, $type) = @_;
    my $adaptor = $self->get_DBAdaptor($type);
    if (!$adaptor) {
        throw("Cannot get a DB adaptor");
    }
    my $slice_adaptor = $adaptor->get_SliceAdaptor();
    my $toplevel = $slice_adaptor->fetch_all('toplevel', undef, 1, undef, undef);
    my @by_length = sort { $a->length() <=> $b->length() } @{$toplevel};
    return \@by_length;
}
# Looks up the DBAdaptor for the species named in the 'species' parameter.
# The Registry is loaded by Hive (see beekeeper_extra_cmdline_options() in conf).
#
# Arguments : $type - database group; 'core' is used when this is false
# Returns   : the result of Bio::EnsEMBL::Registry->get_DBAdaptor()
sub get_DBAdaptor {
    my ($self, $type) = @_;
    my $group = $type || 'core';
    my $species = $self->param('species');
    return Bio::EnsEMBL::Registry->get_DBAdaptor($species, $group);
}
# Clears the caches of the DBAdaptor for the given type and drops its
# database connection if that connection is idle.
#
# Arguments : $type - optional database group, forwarded to get_DBAdaptor()
# Returns   : nothing
sub cleanup_DBAdaptor {
    my ($self, $type) = @_;
    my $adaptor = $self->get_DBAdaptor($type);
    $adaptor->clear_caches();
    my $connection = $adaptor->dbc();
    $connection->disconnect_if_idle();
    return;
}
# Builds a directory path below the 'base_path' parameter, creates it on
# disk with mkpath() and hands the path back.
#
# Arguments : @extras - path components appended to base_path
# Returns   : the directory path as a string
sub get_dir {
    my ($self, @extras) = @_;
    my @segments = (scalar($self->param('base_path')), @extras);
    my $target = File::Spec->catdir(@segments);
    mkpath($target);
    return $target;
}
# Returns the production name with its first character upper-cased.
#
# A meta-table lookup of 'species.url' is kept below, commented out and
# marked "change back" by the author.
sub web_name {
    my ($self) = @_;
    # my $mc = $self->get_DBAdaptor()->get_MetaContainer();
    # my $name = $mc->single_value_by_key('species.url'); # change back
    return ucfirst($self->production_name());
}
# Reads the scientific name from the MetaContainer of the default
# DBAdaptor, then disconnects the database connection if it is idle.
#
# Returns : the value of get_scientific_name()
sub scientific_name {
    my ($self) = @_;
    my $adaptor = $self->get_DBAdaptor();
    my $scientific = $adaptor->get_MetaContainer()->get_scientific_name();
    # Release the connection once the meta lookup is done
    $adaptor->dbc()->disconnect_if_idle();
    return $scientific;
}
# Returns the version() of the first coordinate system that the
# CoordSystemAdaptor's fetch_all() hands back for the default DBAdaptor.
# NOTE(review): presumably the first entry is the top-ranked coordinate
# system and its version is the assembly name - confirm against the API.
sub assembly {
    my ($self) = @_;
    my $coord_systems = $self->get_DBAdaptor()->get_CoordSystemAdaptor()->fetch_all();
    my $first = $coord_systems->[0];
    return $first->version();
}
# Returns the production name stored in the MetaContainer.
#
# Arguments : $name - optional species name; when true its 'core' adaptor is
#                     taken from the Registry, otherwise the adaptor of the
#                     current 'species' parameter is used
# Returns   : the value of get_production_name()
#
# The adaptor's connection is disconnected if idle before returning.
sub production_name {
    my ($self, $name) = @_;
    my $adaptor = $name
        ? Bio::EnsEMBL::Registry->get_DBAdaptor($name, 'core')
        : $self->get_DBAdaptor();
    my $production = $adaptor->get_MetaContainer()->get_production_name();
    $adaptor->dbc()->disconnect_if_idle();
    return $production;
}
# Closes $fh and, when nothing has been written through it, removes the
# file stub at $path as well.
#
# Arguments : $fh   - open file handle supporting tell()
#             $path - location of the file behind $fh
# Returns   : 1 when the handle was at offset 0 and the file was unlinked,
#             0 when the handle was only closed
sub tidy_file_handle {
    my ($self, $fh, $path) = @_;
    my $is_empty = $fh->tell() == 0;
    if (!$is_empty) {
        close($fh);
        return 0;
    }
    $fh->close;
    unlink($path);
    return 1;
}
# Writes an "INFO:" line with a timestamp to STDERR when the debug level is
# greater than 1.
#
# Arguments : $msg    - message, used as a sprintf format when @params is given
#             @params - optional values for the format
# Returns   : nothing
#
# Without @params the message is printed as-is and is not run through sprintf.
sub info {
    my ($self, $msg, @params) = @_;
    return unless $self->debug() > 1;
    my $text = @params ? sprintf($msg, @params) : $msg;
    printf STDERR "INFO: %s %s\n", strftime('%c',localtime()), $text;
    return;
}
# Writes a "FINE:" line with a timestamp to STDERR when the debug level is
# greater than 2.
#
# Arguments : $msg    - message, used as a sprintf format when @params is given
#             @params - optional values for the format
# Returns   : nothing
#
# Without @params the message is printed as-is and is not run through sprintf.
sub fine {
    my ($self, $msg, @params) = @_;
    return unless $self->debug() > 2;
    my $text = @params ? sprintf($msg, @params) : $msg;
    printf STDERR "FINE: %s %s\n", strftime('%c',localtime()), $text;
    return;
}
# Walks $dir with File::Find and collects the full path of every entry for
# which the callback returns true.
#
# Arguments : $dir              - directory to search; must exist
#             $boolean_callback - code ref called with $_ (the current entry
#                                 name as set by File::Find) as its argument
# Returns   : array reference of matching $File::Find::name values
# Throws    : via $self->throw() when $dir is not a directory
sub find_files {
    my ($self, $dir, $boolean_callback) = @_;
    if (!-d $dir) {
        $self->throw("Cannot find path $dir");
    }
    my @matches;
    my $visitor = sub {
        # Remember the full path before handing control to the callback
        my $full_path = $File::Find::name;
        return unless $boolean_callback->($_);
        push(@matches, $full_path);
        return;
    };
    find($visitor, $dir);
    return \@matches;
}
1;
......@@ -20,7 +20,7 @@
=head1 NAME
Bio::EnsEMBL::Pipeline::FASTA::ChecksumGenerator
Bio::EnsEMBL::Pipeline::ChecksumGenerator
=head1 DESCRIPTION
......@@ -40,12 +40,12 @@ Allowed parameters are:
=cut
package Bio::EnsEMBL::Pipeline::FASTA::ChecksumGenerator;
package Bio::EnsEMBL::Pipeline::ChecksumGenerator;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;
use base qw/Bio::EnsEMBL::Pipeline::Base/;
use File::Spec;
use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;
......
......@@ -2,95 +2,15 @@ package Bio::EnsEMBL::Pipeline::FASTA::Base;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Hive::Process/;
use base qw/Bio::EnsEMBL::Pipeline::Base/;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
use File::Find;
use File::Spec;
use File::Path qw/mkpath/;
use POSIX qw/strftime/;
# Takes in a key, checks if the current $self->param() was an empty array
# and replaces it with the value from $self->param_defaults()
sub reset_empty_array_param {
my ($self, $key) = @_;
my $param_defaults = $self->param_defaults();
my $current = $self->param($key);
my $replacement = $self->param_defaults()->{$key};
if(check_ref($current, 'ARRAY') && check_ref($replacement, 'ARRAY')) {
if(! @{$current}) {
$self->fine('Restting param %s because the given array was empty', $key);
$self->param($key, $replacement);
}
}
return;
}
sub get_Slices {
my ($self, $type) = @_;
my $sa = $self->get_DBAdaptor($type)->get_SliceAdaptor();
return [ sort { $a->length() <=> $b->length() } @{$sa->fetch_all('toplevel', undef, 1, undef, undef)} ];
}
# Registry is loaded by Hive (see beekeeper_extra_cmdline_options() in conf)
sub get_DBAdaptor {
my ($self, $type) = @_;
my $species = $self->param('species');
$type ||= 'core';
return Bio::EnsEMBL::Registry->get_DBAdaptor($species, $type);
}
sub cleanup_DBAdaptor {
my ($self, $type) = @_;
my $dba = $self->get_DBAdaptor($type);
$dba->clear_caches;
$dba->dbc->disconnect_if_idle;
return;
}
sub get_dir {
my ($self, @extras) = @_;
my $base_dir = $self->param('base_path');
my $dir = File::Spec->catdir($base_dir, @extras);
mkpath($dir);
return $dir;
}
sub fasta_path {
my ( $self, @extras ) = @_;
return $self->get_dir('fasta', $self->param('species'), @extras);
}
sub web_name {
my ($self) = @_;
# my $mc = $self->get_DBAdaptor()->get_MetaContainer();
# my $name = $mc->single_value_by_key('species.url'); # change back
my $name = ucfirst($self->production_name());
return $name;
}
sub assembly {
my ($self) = @_;
my $dba = $self->get_DBAdaptor();
return $dba->get_CoordSystemAdaptor()->fetch_all()->[0]->version();
}
sub production_name {
my ($self, $name) = @_;
my $dba;
if($name) {
$dba = Bio::EnsEMBL::Registry->get_DBAdaptor($name, 'core');
}
else {
$dba = $self->get_DBAdaptor();
}
my $mc = $dba->get_MetaContainer();
my $prod = $mc->get_production_name();
$dba->dbc()->disconnect_if_idle();
return $prod;
}
sub old_path {
my ($self, $species) = @_;
my $base = $self->param('ftp_dir');
......@@ -99,61 +19,4 @@ sub old_path {
my $dir = File::Spec->catdir($base, "release-$release", 'fasta', $prod, 'dna');
}
# Closes file handle, and deletes the file stub if it contains no data
# Returns success type
sub tidy_file_handle {
my ($self, $fh, $path) = @_;
if ($fh->tell() == 0) {
$fh->close;
unlink($path);
return 1;
}
close($fh);
return 0;
}
sub info {
my ($self, $msg, @params) = @_;
if ($self->debug() > 1) {
my $formatted_msg;
if(scalar(@params)) {
$formatted_msg = sprintf($msg, @params);
}
else {
$formatted_msg = $msg;
}
printf STDERR "INFO: %s %s\n", strftime('%c',localtime()), $formatted_msg;
}
return
}
sub fine {
my ($self, $msg, @params) = @_;
if ($self->debug() > 2) {
my $formatted_msg;
if(scalar(@params)) {
$formatted_msg = sprintf($msg, @params);
}
else {
$formatted_msg = $msg;
}
printf STDERR "FINE: %s %s\n", strftime('%c',localtime()), $formatted_msg;
}
return
}
sub find_files {
my ($self, $dir, $boolean_callback) = @_;
$self->throw("Cannot find path $dir") unless -d $dir;
my @files;
find(sub {
my $path = $File::Find::name;
if($boolean_callback->($_)) {
push(@files, $path);
}
}, $dir);
return \@files;
}
1;
......@@ -44,52 +44,16 @@ package Bio::EnsEMBL::Pipeline::FASTA::FindDirs;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base Bio::EnsEMBL::Hive::RunnableDB::JobFactory/;
use base qw/Bio::EnsEMBL::Pipeline::FindDirs Bio::EnsEMBL::Pipeline::FASTA::Base/;
use File::Spec;
sub fetch_input {
my ($self) = @_;
$self->throw("No 'species' parameter specified") unless $self->param('species');
my $dirs = $self->dirs();
$self->param('inputlist', $dirs);
$self->param('path', $self->fasta_path());
$self->SUPER::fetch_input();
return;
}
sub run {
my ($self) = @_;
Bio::EnsEMBL::Hive::RunnableDB::JobFactory::run($self);
return;
}
sub write_output {
my ($self) = @_;
Bio::EnsEMBL::Hive::RunnableDB::JobFactory::write_output($self);
return;
}
sub dirs {
my ($self) = @_;
my @dirs;
my $dir = $self->fasta_path();
$self->info('Searching directory %s', $dir);
opendir(my $dh, $dir) or die "Cannot open directory $dir";
my @files = sort { $a cmp $b } readdir($dh);
closedir($dh) or die "Cannot close directory $dir";
foreach my $file (@files) {
next if $file =~ /^\./; #hidden file or up/current dir
my $path = File::Spec->catdir($dir, $file);
if(-d $path) {
$self->fine('Adding %s to the list of found dirs', $path);
push(@dirs, $path);
}
}
return \@dirs;
}
1;
......@@ -67,35 +67,22 @@ package Bio::EnsEMBL::Pipeline::FASTA::SpeciesFactory;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;
use base qw/Bio::EnsEMBL::Pipeline::SpeciesFactory/;
use Bio::EnsEMBL::Registry;
sub param_defaults {
my ($self) = @_;
return {
%{$self->SUPER::param_defaults()},
sequence_type_list => [qw/dna cdna ncrna/],
db_types => [qw/core/],
species => []
};
}
sub fetch_input {
my ($self) = @_;
$self->SUPER::fetch_input();
$self->reset_empty_array_param('sequence_type_list');
$self->reset_empty_array_param('db_types');
my $core_dbas = $self->get_DBAdaptors();
$self->info('Found %d core DBAdaptor(s) to process', scalar(@{$core_dbas}));
$self->param('dbas', $core_dbas);
my %species_lookup =
map { $_ => 1 }
map { Bio::EnsEMBL::Registry->get_alias($_) }
@{$self->param('species')};
$self->param('species_lookup', \%species_lookup);
my %sequence_types = map { $_ => 1 } @{ $self->param('sequence_type_list') };
$self->param('sequence_types', \%sequence_types);
......@@ -139,48 +126,6 @@ sub write_output {
return;
}
sub get_DBAdaptors {
my ($self) = @_;
return Bio::EnsEMBL::Registry->get_all_DBAdaptors(-GROUP => 'core');
}
sub do_flow {
my ($self, $key) = @_;
my $targets = $self->param($key);
foreach my $entry (@{$targets}) {
my ($input_id, $flow) = @{$entry};
$self->fine('Flowing %s to %d for %s', $input_id->{species}, $flow, $key);
$self->dataflow_output_id($input_id, $flow);
}
return;
}
sub process_dba {
my ($self, $dba) = @_;
#Reject if DB was ancestral sequences
return 0 if $dba->species() =~ /ancestral/i;
#If species is defined then make sure we only allow those species through
if(@{$self->param('species')}) {
my $lookup = $self->param('species_lookup');
my $name = $dba->species();
my $aliases = Bio::EnsEMBL::Registry->get_all_aliases($name);
push(@{$aliases}, $name);
my $found = 0;
foreach my $alias (@{$aliases}) {
if($lookup->{$alias}) {
$found = 1;
last;
}
}
return $found;
}
#Otherwise just accept
return 1;
}
# return 0 if we do not want to do any flowing otherwise return 2
sub dna_flow {
......@@ -218,9 +163,4 @@ sub input_id {
return $input_id;
}
sub db_types {
my ($self, $dba) = @_;
return $self->param('db_types');
}
1;
=pod
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=head1 NAME
Bio::EnsEMBL::Pipeline::FindDirs
=head1 DESCRIPTION
Finds all directories under the given path.
Allowed parameters are:
=over 8
=item path - The path to search
=back
=cut
package Bio::EnsEMBL::Pipeline::FindDirs;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Hive::RunnableDB::JobFactory/;
use File::Spec;
# Validates the 'path' parameter and stores the directories found below it
# in the 'inputlist' parameter.
#
# Throws : via $self->throw() when 'path' is not set
sub fetch_input {
    my ($self) = @_;
    if (!$self->param('path')) {
        $self->throw("No 'path' parameter specified");
    }
    my $found = $self->dirs();
    $self->param('inputlist', $found);
    return;
}
# Lists the immediate sub-directories of the 'path' parameter.
#
# Returns : array reference of directory paths, sorted by entry name;
#           entries whose name starts with '.' are ignored
# Dies    : when the directory cannot be opened or closed (message includes $!)
sub dirs {
    my ($self) = @_;
    my $dir = $self->param('path');
    # info() and fine() are defined in Bio::EnsEMBL::Pipeline::Base, which is
    # not a base of this class; only log when the object can do so (for
    # example because a subclass also inherits from Pipeline::Base).
    $self->info('Searching directory %s', $dir) if $self->can('info');
    opendir(my $dh, $dir) or die "Cannot open directory $dir: $!";
    my @entries = sort { $a cmp $b } readdir($dh);
    closedir($dh) or die "Cannot close directory $dir: $!";
    my @dirs;
    foreach my $entry (@entries) {
        next if $entry =~ /^\./; #hidden file or up/current dir
        my $path = File::Spec->catdir($dir, $entry);
        next unless -d $path;
        $self->fine('Adding %s to the list of found dirs', $path) if $self->can('fine');
        push(@dirs, $path);
    }
    return \@dirs;
}
1;
package Bio::EnsEMBL::Pipeline::Flatfile::Base;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::Base/;
# Returns the directory <base_path>/<type>/<species> for the current job,
# as built (and created) by get_dir().
sub data_path {
    my ($self) = @_;
    my @segments = map { $self->param($_) } qw/type species/;
    return $self->get_dir(@segments);
}
1;
=pod
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=head1 NAME
Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator
=head1 DESCRIPTION
Creates a CHECKSUMS file in the given directory which is produced from running
the sum command over every file in the directory. This excludes the CHECKSUMS
file, parent directory or any hidden files.
Allowed parameters are:
=over 8
=item species - Species to work with
=item type - Type of data to work with
=back
=cut
package Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::ChecksumGenerator Bio::EnsEMBL::Pipeline::Flatfile::Base/;
# Checks that 'species' and 'type' are set, points the 'dir' parameter at
# this job's data directory and then runs the parent fetch_input().
#
# Throws : via $self->throw() when 'species' or 'type' is missing
sub fetch_input {
    my ($self) = @_;
    foreach my $required (qw/species type/) {
        $self->throw("No '$required' parameter specified") unless $self->param($required);
    }
    my $checksum_dir = $self->data_path();
    $self->param('dir', $checksum_dir);
    $self->SUPER::fetch_input();
    return;
}
1;
=pod
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=head1 NAME
Bio::EnsEMBL::Pipeline::Flatfile::DumpFile
=head1 DESCRIPTION
The main workhorse of the Flatfile dumping pipeline.
The script is responsible for creating the filenames of these target
files, taking data from the database and the formatting of the flat files
headers. The final files are all Gzipped at normal levels of compression.
Allowed parameters are:
=over 8
=item species - The species to dump
=item base_path - The base of the dumps
=item release - The current release we are emitting
=item type - The type of data we are emitting. Should be embl or genbank
=back
=cut
package Bio::EnsEMBL::Pipeline::Flatfile::DumpFile;
use strict;
use warnings;
use base qw(Bio::EnsEMBL::Pipeline::Flatfile::Base);
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::SeqDumper;
use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file/;
use File::Path qw/rmtree/;
# Default parameters: 'supported_types' maps each flat file format this
# module accepts (embl, genbank) to a true value.
sub param_defaults {
    my ($self) = @_;
    my %supported = map { $_ => 1 } qw/embl genbank/;
    return { supported_types => \%supported };
}
# Validates the job parameters before any dumping starts.
#
# Throws : when 'type' is missing or not listed in 'supported_types', or
#          when 'species', 'release' or 'base_path' is not set
sub fetch_input {
    my ($self) = @_;
    my $type = $self->param('type');
    if (!$type) {
        throw("No type specified");
    }
    if (!$self->param('supported_types')->{$type}) {
        throw("Unsupported type '$type' specified");
    }
    foreach my $required (qw/species release base_path/) {
        throw("Need a $required") unless $self->param($required);
    }
    return;
}
# Dumps every toplevel slice of the species into gzipped flat files.
#
# All non-chromosomal slices go into a single 'nonchromosomal' file; each
# chromosome slice goes into a file named after its coordinate system and
# sequence region name. The SeqDumper method used is "dump_<type>", where
# <type> is the 'type' parameter.
#
# Returns : nothing
sub run {
    my ($self) = @_;

    # data_path() goes through get_dir(), which creates the directory, so the
    # directory is expected to exist here; it is wiped so the dump starts from
    # a clean state and is created again by the next data_path() call made
    # from _generate_file_name().
    my $root = $self->data_path();
    if (-d $root) {
        $self->info('Directory "%s" already exists; removing', $root);
        rmtree($root);
    }

    my $type = $self->param('type');
    my $target = "dump_${type}";
    my $seq_dumper = $self->_seq_dumper();

    # Split the slices into chromosomes and everything else
    my @chromosomes;
    my @non_chromosomes;
    foreach my $s (@{$self->get_Slices()}) {
        if ($s->is_chromosome()) {
            push(@chromosomes, $s);
        }
        else {
            push(@non_chromosomes, $s);
        }
    }

    if (@non_chromosomes) {
        my $path = $self->_generate_file_name('nonchromosomal');
        $self->info('Dumping non-chromosomal data to %s', $path);
        gz_work_with_file($path, 'w', sub {
            my ($fh) = @_;
            foreach my $slice (@non_chromosomes) {
                $self->fine('Dumping non-chromosomal %s', $slice->name());
                $seq_dumper->$target($slice, $fh);
            }
            return;
        });
    }
    else {
        $self->info('Did not find any non-chromosomal data');
    }

    foreach my $slice (@chromosomes) {
        $self->fine('Dumping chromosome %s', $slice->name());
        my $path = $self->_generate_file_name($slice->coord_system_name(), $slice->seq_region_name());
        my $args = {};
        # The target can only exist if an earlier slice in this run produced
        # the same file name; in that case ask for the data to be appended.
        # NOTE(review): confirm gz_work_with_file honours Append with mode 'w'.
        if (-f $path) {
            $self->fine('Path "%s" already exists; appending', $path);
            $args->{Append} = 1;
        }
        gz_work_with_file($path, 'w', sub {
            my ($fh) = @_;
            $seq_dumper->$target($slice, $fh);
            return;
        }, $args);
    }
    return;
}
# Creates the SeqDumper used for the dump with the similarity, genscan,
# variation and repeat feature types disabled.
#
# Returns : a configured Bio::EnsEMBL::Utils::SeqDumper
sub _seq_dumper {
    my ($self) = @_;
    my $dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
    foreach my $feature_type (qw/similarity genscan variation repeat/) {
        $dumper->disable_feature_type($feature_type);
    }
    return $dumper;
}
# Builds the full path of a dump file inside the job's data directory.
#
# Arguments : $section - optional section, e.g. 'chromosome' or 'nonchromosomal'
#             $name    - optional name within the section, e.g. '20'
# Returns   : path of the form
#             <data_path>/<species>.<assembly>.<release>.<section.name|section>.dat.gz
#             e.g. Homo_sapiens.GRCh37.64.chromosome.20.dat.gz
#                  Homo_sapiens.GRCh37.64.nonchromosomal.dat.gz
sub _generate_file_name {
    my ($self, $section, $name) = @_;

    my @name_bits = ($self->web_name(), $self->assembly(), $self->param('release'));
    # Test for defined-and-non-empty rather than truth so that a section or
    # sequence region called "0" still ends up in the file name
    foreach my $optional ($section, $name) {
        push @name_bits, $optional if defined $optional && length $optional;
    }
    push @name_bits, 'dat', 'gz';

    my $file_name = join('.', @name_bits);
    my $path = $self->data_path();
    return File::Spec->catfile($path, $file_name);
}
# Writes a README describing the flat file dumps into the job's data
# directory. The text mentions the species' scientific name and the
# upper-cased dump type.
#
# Returns : nothing
sub _create_README {
    my ($self) = @_;
    my $species = $self->scientific_name();
    my $format = uc($self->param('type'));

    my $readme = <<"README";
#### README ####
IMPORTANT: Please note you can download correlation data tables,
supported by Ensembl, via the highly customisable BioMart and
EnsMart data mining tools. See http://www.ensembl.org/biomart/martview or
http://www.ebi.ac.uk/biomart/ for more information.
-----------------------
$format FLATFILE DUMPS
-----------------------
This directory contains $species $format flatfile dumps. To ease
downloading of the files, the $format format entries are bundled
into groups of chromosomes and non-chromosomal regions.
All files are then compacted with gzip.
Ensembl provides an automatic reannotation of $species genomic data.
These data will be dumped in a number of forms - one of them being
$format flat files. As the annotation of this form comes from Ensembl,
and not the original sequence entry, the two annotations are
likely to be different.
$format flat file format dumping provides all the confirmed protein coding
genes known by Ensembl. Considerably more information is stored in Ensembl:
the flat file just gives a representation which is compatible with
existing tools.
The main body of the entry gives the same information as is in the main
$format flat file entry.
* ID - the $format id
* AC - the EMBL/GenBank/DDBJ accession number (only the primary
accession number used)
* SV - The accession.version pair which gives the exact reference to
a particular sequence
* CC - comment lines to help you interpret the entry
Currently the following features are dumped into the feature table of
the Ensembl entry:
* Transcripts as CDS entries. Each transcript has the following
attributes attached
o Transcript id - a stable id, which Ensembl will attempt to
preserve as sensibly as possible during updates of the data
o Gene id - indication of the gene that this transcript belongs
to. gene ids are stable and preserved as sensibly as possible
during updates of the data
o Translation - the peptide translation of the transcript.
* Exons as exon entries. Each exon has the following information
o Exon id. The exon id is stable and preserved as sensibly
as possible during sequence updates
o start_phase. The phase of the splice site at the 5' end
of the exon. Phase 0 means between two codons, phase 1
means between the first and the second base of the codon
(meaning that there are 2 bases until the reading frame of
the exon) and phase 2 means between the second and the third
base of the codon (one base until the reading frame starts).
o end_phase. The phase of the splice site at the 3' end of the
exon: same definition as above (though of course, being end_phase,
the position relative to the exon's reading frame is different
for phase 1 and 2).
We are considering other information that should be made dumpable. In
general we would prefer people to use database access over flat file
access if you want to do something serious with the data.
README

    my $path = File::Spec->catfile($self->data_path(), 'README');
    # work_with_file is not in this package's import list (only
    # gz_work_with_file is imported), so call it by its full name
    Bio::EnsEMBL::Utils::IO::work_with_file($path, 'w', sub {
        my ($fh) = @_;
        print $fh $readme;
        return;
    });
    return;
}
1;
=pod
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=head1 NAME
Bio::EnsEMBL::Pipeline::Flatfile::FindDirs
=head1 DESCRIPTION
Finds all directories under the given species directory. This is used to
flow any further processing that depends only on the directory.
Allowed parameters are:
=over 8
=item species - The species to work with
=back
=cut
package Bio::EnsEMBL::Pipeline::Flatfile::FindDirs;
# The package name now matches the module's POD NAME and its Flatfile base
# class; it previously declared Bio::EnsEMBL::Pipeline::FASTA::FindDirs and
# so defined its methods in the FASTA module's namespace.

use strict;
use warnings;

use base qw/Bio::EnsEMBL::Pipeline::FindDirs Bio::EnsEMBL::Pipeline::Flatfile::Base/;

use File::Spec;

# Checks that 'species' is set, points the 'path' parameter at this job's
# data directory and then runs the parent fetch_input().
#
# Throws : via $self->throw() when 'species' is missing
sub fetch_input {
    my ($self) = @_;
    $self->throw("No 'species' parameter specified") unless $self->param('species');
    $self->param('path', $self->data_path());
    $self->SUPER::fetch_input();
    return;
}

1;
......@@ -232,7 +232,7 @@ sub pipeline_analyses {
{
-logic_name => 'ChecksumGenerator',
-module => 'Bio::EnsEMBL::Pipeline::FASTA::ChecksumGenerator',
-module => 'Bio::EnsEMBL::Pipeline::ChecksumGenerator',
-hive_capacity => 10,
-rc_id => 1,
},
......
package Bio::EnsEMBL::Pipeline::PipeConfig::Flatfile_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');
use Bio::EnsEMBL::ApiVersion qw/software_version/;
# Default option values for the flat file dumping pipeline, layered on top
# of the options inherited from the parent configuration.
#
# 'registry' and 'base_path' have no default and must be supplied.
sub default_options {
    my ($self) = @_;
    return {
        # inherit other stuff from the base class
        %{ $self->SUPER::default_options() },

        ### OVERRIDE

        #'registry' => 'Reg.pm', # default option to refer to Reg.pm, should be full path
        #'base_path' => '', #where do you want your files

        ### Optional overrides
        species => [],
        release => software_version(),
        types => [qw/embl genbank/],
        # pipeline_wide_parameters() reads $self->o('db_types'), so the option
        # needs a value here. An empty list lets SpeciesFactory fall back to
        # its own default through reset_empty_array_param('db_types').
        db_types => [],

        ### Defaults
        pipeline_name => 'flatfile_dump_'.$self->o('release'),
        email => $self->o('ENV', 'USER').'@sanger.ac.uk',
    };
}
# Returns the commands that create the pipeline database: a fresh copy of
# the list inherited from the parent configuration, with nothing added.
sub pipeline_create_commands {
    my ($self) = @_;
    # inheriting database and hive tables' creation
    my @commands = @{$self->SUPER::pipeline_create_commands};
    return \@commands;
}
## See diagram for pipeline structure
#
# Returns the list of analyses in this order: ScheduleSpecies,
# DumpTypeFactory, DumpFlatfile, ChecksumGenerator and Notify.
sub pipeline_analyses {
    my ($self) = @_;

    my $schedule_species = {
        -logic_name => 'ScheduleSpecies',
        -module     => 'Bio::EnsEMBL::Pipeline::SpeciesFactory',
        -parameters => {
            species => $self->o('species')
        },
        -input_ids  => [ {} ],
        -flow_into  => {
            1 => 'Notify',
            2 => ['DumpTypeFactory'],
        },
    };

    ######### DUMPING DATA

    # Fans out one job per entry of the 'types' option on branch 2
    my $dump_type_factory = {
        -logic_name => 'DumpTypeFactory',
        -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
        -parameters => {
            column_names    => ['type'],
            inputlist       => $self->o('types'),
            input_id        => { species => '#species#', type => '#type#' },
            fan_branch_code => 2
        },
        -flow_into  => { 2 => ['DumpFlatfile', 'ChecksumGenerator'] },
    };

    my $dump_flatfile = {
        -logic_name      => 'DumpFlatfile',
        -module          => 'Bio::EnsEMBL::Pipeline::Flatfile::DumpFile',
        -max_retry_count => 1,
        -hive_capacity   => 10,
    };

    ####### CHECKSUMMING

    my $checksum_generator = {
        -logic_name    => 'ChecksumGenerator',
        -module        => 'Bio::EnsEMBL::Pipeline::Flatfile::ChecksumGenerator',
        -wait_for      => [qw/DumpFlatfile/],
        -hive_capacity => 10,
    };

    ####### NOTIFICATION

    my $notify = {
        -logic_name => 'Notify',
        -module     => 'Bio::EnsEMBL::Hive::RunnableDB::NotifyByEmail',
        -parameters => {
            email   => $self->o('email'),
            subject => $self->o('pipeline_name').' has finished',
            text    => 'Your pipeline has finished. Please consult the hive output'
        },
        -wait_for   => ['ChecksumGenerator'],
    };

    return [
        $schedule_species,
        $dump_type_factory,
        $dump_flatfile,
        $checksum_generator,
        $notify,
    ];
}
# Parameters visible to every analysis: those inherited from the parent
# configuration plus the base_path, db_types and release options.
sub pipeline_wide_parameters {
    my ($self) = @_;
    # inherit other stuff from the base class
    my %inherited = %{ $self->SUPER::pipeline_wide_parameters() };
    my @forwarded = qw/base_path db_types release/;
    return {
        %inherited,
        map { ($_ => $self->o($_)) } @forwarded
    };
}
# Overrides the default method so that every worker loads the registry
# automatically: returns "-reg_conf " followed by the 'registry' option.
sub beekeeper_extra_cmdline_options {
    my ($self) = @_;
    my $registry = $self->o("registry");
    return "-reg_conf $registry";
}
# Resource classes for the pipeline: a single class 0, described as
# 'default', carrying the LSF submission options.
sub resource_classes {
    my ($self) = @_;
    my %default_class = (
        -desc => 'default',
        'LSF' => '-q normal -M4000000 -R"select[mem>4000] rusage[mem=4000]"',
    );
    return { 0 => \%default_class };
}
1;
=pod
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=head1 NAME
Bio::EnsEMBL::Pipeline::SpeciesFactory
=head1 DESCRIPTION
A module which generates dump jobs for each species it finds in the Ensembl
Registry. The species we run the code on can be controlled by specifying
the I<species> parameter or by reducing the number of DBAdaptors loaded into
the registry.
Allowed parameters are:
=over 8
=item species - Can be an array of species to perform dumps for or a single
species name. If specified only jobs will be created for
those species. Defaults to nothing so all species are processed
=item db_types - Specify the types of database to dump. Defaults to core and
should be an array.
=back
The code flows once per species to branch 2.
=cut
package Bio::EnsEMBL::Pipeline::SpeciesFactory;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::Base/;
use Bio::EnsEMBL::Registry;
# Default parameters: 'db_types' is the one-element list ['core'] and
# 'species' is an empty list.
sub param_defaults {
    my ($self) = @_;
    my %defaults = (
        species  => [],
        db_types => ['core'],
    );
    return \%defaults;
}
# Prepares the parameters used by run():
#   dbas           - the DBAdaptors returned by get_DBAdaptors()
#   species        - normalised to an array reference
#   species_lookup - hash keyed by the Registry alias of each requested species
#
# 'db_types' is reset to its default when it was given as an empty array.
sub fetch_input {
    my ($self) = @_;

    $self->reset_empty_array_param('db_types');

    my $core_dbas = $self->get_DBAdaptors();
    $self->info('Found %d core DBAdaptor(s) to process', scalar(@{$core_dbas}));
    $self->param('dbas', $core_dbas);

    # The documentation allows 'species' to be a single name as well as an
    # array; wrap a single name so that the array dereferences here and in
    # process_dba() do not fail.
    my $species = $self->param('species');
    if (ref($species) ne 'ARRAY') {
        $species = (defined $species && length $species) ? [$species] : [];
        $self->param('species', $species);
    }

    my %species_lookup =
        map { $_ => 1 }
        map { Bio::EnsEMBL::Registry->get_alias($_) }
        @{$species};
    $self->param('species_lookup', \%species_lookup);

    return;
}
# Builds one dataflow entry per accepted DBAdaptor.
#
# Every adaptor in the 'dbas' parameter that passes process_dba() yields a
# pair [ input_id, 2 ]. The list of pairs is stored in the 'species'
# parameter, replacing the species filter that was held there, and is later
# flowed by write_output().
sub run {
    my ($self) = @_;
    my @species;
    foreach my $dba (@{$self->param('dbas')}) {
        if (!$self->process_dba($dba)) {
            $self->fine('Skipping %s', $dba->species());
            next;
        }
        my $input_id = $self->input_id($dba);
        push(@species, [ $input_id, 2 ]);
    }
    $self->param('species', \@species);
    return;
}
# Flows the entries stored in the 'species' parameter via do_flow().
sub write_output {
    my $self = shift;
    $self->do_flow('species');
    return;
}
# Returns whatever the Registry reports for all DBAdaptors of group 'core'.
sub get_DBAdaptors {
    my ($self) = @_;
    my @filter = (-GROUP => 'core');
    return Bio::EnsEMBL::Registry->get_all_DBAdaptors(@filter);
}
# Sends every [ input_id, branch ] pair stored under the given parameter to
# dataflow_output_id().
#
# Arguments : $key - name of the parameter holding the array of pairs
# Returns   : nothing
sub do_flow {
    my ($self, $key) = @_;
    my $pairs = $self->param($key);
    for my $pair (@{$pairs}) {
        my $input_id = $pair->[0];
        my $branch = $pair->[1];
        $self->fine('Flowing %s to %d for %s', $input_id->{species}, $branch, $key);
        $self->dataflow_output_id($input_id, $branch);
    }
    return;
}
# Decides whether a DBAdaptor should produce a job.
#
# Arguments : $dba - the DBAdaptor under consideration
# Returns   : 0 when the species name matches /ancestral/i,
#             1 when no species filter was given,
#             otherwise 1 or 0 depending on whether the species name or one
#             of its Registry aliases appears in 'species_lookup'
sub process_dba {
    my ($self, $dba) = @_;

    #Reject if DB was ancestral sequences
    if ($dba->species() =~ /ancestral/i) {
        return 0;
    }

    #Without a species filter everything else is accepted
    return 1 unless @{$self->param('species')};

    #Otherwise only let the requested species through
    my $lookup = $self->param('species_lookup');
    my $name = $dba->species();
    my $aliases = Bio::EnsEMBL::Registry->get_all_aliases($name);
    push(@{$aliases}, $name);
    foreach my $alias (@{$aliases}) {
        return 1 if $lookup->{$alias};
    }
    return 0;
}
# Builds the input_id hash for one DBAdaptor.
#
# Arguments : $dba  - the DBAdaptor to describe
#             $type - accepted but not used here
# Returns   : hash reference with 'db_types' (from db_types()) and
#             'species' (the production name from the MetaContainer)
sub input_id {
    my ($self, $dba, $type) = @_;
    my $meta = $dba->get_MetaContainer();
    my %id = (
        db_types => $self->db_types($dba),
        species  => $meta->get_production_name(),
    );
    return \%id;
}
# Returns the 'db_types' parameter. The $dba argument is accepted but is
# not consulted.
sub db_types {
    my $self = shift;
    return $self->param('db_types');
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment