diff --git a/modules/Bio/EnsEMBL/Pipeline/Base.pm b/modules/Bio/EnsEMBL/Pipeline/Base.pm index 0d96747e7a149dd3d88347a13ec634401f9fa921..c6f36b605bc3ee3f1914607a810b7a92fd6ed005 100644 --- a/modules/Bio/EnsEMBL/Pipeline/Base.pm +++ b/modules/Bio/EnsEMBL/Pipeline/Base.pm @@ -241,6 +241,15 @@ sub assert_executable { return 1; } +# Runs the given command and returns a list of exit code and output +sub run_cmd { + my ($self, $cmd) = @_; + $self->info('About to run "%s"', $cmd); + my $output = `$cmd 2>&1`; + my $rc = $? >> 8; + $self->throw("Cannot run program '$cmd'. Return code was ${rc}. Program output was $output") if $rc; + return ($rc, $output); +} ## Production database adaptor sub get_production_DBAdaptor { diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlastIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlastIndexer.pm new file mode 100644 index 0000000000000000000000000000000000000000..d68cf8d3c744a8cce97a2392dc419fafef749358 --- /dev/null +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlastIndexer.pm @@ -0,0 +1,163 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer + +=head1 DESCRIPTION + +Creates NCBI Blast indexes of the given GZipped file. The resulting index +is created under the parameter location I<base_path> in I<blast_dir> and then in a +directory defined by the type of dump. The type of dump also changes the file +name generated. Genomic dumps have their release number replaced with the +last repeat masked date. + +Allowed parameters are: + +=over 8 + +=item file - The file to index + +=item program - The location of the xdformat program + +=item molecule - The type of molecule to index. I<dna> and I<pep> are allowed + +=item type - Type of index we are creating. I<genomic> and I<genes> are allowed + +=item base_path - The base of the dumps + +=item release - Required for correct DB naming + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/; + +use Bio::EnsEMBL::Utils::Exception qw/throw/; +use File::Copy qw/copy/; +use File::Spec; +use POSIX qw/strftime/; + +sub param_defaults { + my ($self) = @_; + return { +# program => 'xdformat', #program to use for indexing +# molecule => 'pep', #pep or dna +# type => 'genes', #genes or genomic +# blast_dir => 'blast_type_dir', # sets the type of directory used for + }; +} + +sub fetch_input { + my ($self) = @_; + my $mol = $self->param('molecule'); + throw "No molecule param given" unless defined $mol; + if($mol ne 'dna' && $mol ne 'pep') { + throw "param 'molecule' must be set to 'dna' or 'pep'. Value given was '${$mol}'"; + } + my $type = $self->param('type'); + throw "No type param given" unless defined $type; + if($type ne 'genomic' && $type ne 'genes') { + throw "param 'type' must be set to 'genomic' or 'genes'. Value given was '${$type}'"; + } + $self->assert_executable($self->param('program')); + $self->assert_executable('gunzip'); +} + +sub write_output { + my ($self) = @_; + $self->dataflow_output_id({ + species => $self->param('species'), + type => $self->param('type'), + molecule => $self->param('molecule'), + index_base => $self->param('index_base') + }, 1); + return; +} + +sub index_file { + my ($self, $file) = @_; + my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p' ; + my $silence = ($self->debug()) ? 0 : 1; + my $target_dir = $self->target_dir(); + my $target_file = $self->target_file($file); + my $db_title = $self->db_title($file); + my $date = $self->db_date(); + + my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s }, + $target_dir, $self->param('program'), $molecule_arg, $silence, $db_title, $date, $target_file, $file); + + $self->info('About to run "%s"', $cmd); + my $output = `$cmd 2>&1`; + my $rc = $? >> 8; + throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc; + unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!"; + $self->param('index_base', $target_file); + return; +} + +sub target_file { + my ($self, $file) = @_; + my $target_dir = $self->target_dir(); + my $target_filename = $self->target_filename($file); + return File::Spec->catfile($target_dir, $target_filename); +} + +# Produce a dir like /nfs/path/to/<blast_dir>/genes/XXX && /nfs/path/to/<blast_dir>/dna/XXX +sub target_dir { + my ($self) = @_; + return $self->get_dir($self->param('blast_dir'), $self->param('type')); +} + +sub db_title { + my ($self, $source_file) = @_; + my ($vol, $dir, $file) = File::Spec->splitpath($source_file); + my $release = $self->param('release'); + my $title = $file; + $title =~ s/$release\.//; + return $title; +} + +sub db_date { + my ($self) = @_; + return strftime('%d-%m-%Y', gmtime()); +} + +#Source like Homo_sapiens.GRCh37.68.dna.toplevel.fa +#Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa +sub target_filename { + my ($self, $source_file) = @_; + my ($vol, $dir, $file) = File::Spec->splitpath($source_file); + if($self->param('type') eq 'genomic') { + my @split = split(/\./, $file); + my $rm_date = $self->repeat_mask_date(); + $split[-4] = $rm_date; + return join(q{.}, @split); + } + return $file; +} + +1; diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm index 513f163e57f926a28eb6523df93b46bb2f2c4e16..ed8c33de83564b3a30845be30407df2a496a4ae3 100644 --- a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm @@ -113,10 +113,7 @@ sub index_file { my $cmd = sprintf(q{%s %s %s}, $self->param('program'), $file, $target_file); - $self->info('About to run "%s"', $cmd); - my $output = `$cmd 2>&1`; - my $rc = $? >> 8; - throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc; + $self->run_cmd($cmd); unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!"; #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastIndexer.pm new file mode 100644 index 0000000000000000000000000000000000000000..a5a3f82de540b289e7e7101ceec12daf539d7921 --- /dev/null +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastIndexer.pm @@ -0,0 +1,82 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer + +=head1 DESCRIPTION + +Creates NCBI Blast indexes of the given GZipped file. The resulting index +is created under the parameter location I<base_path> in ncbi_blast and then in a +directory defined by the type of dump. The type of dump also changes the file +name generated. Genomic dumps have their release number replaced with the +last repeat masked date. + +Allowed parameters are: + +=over 8 + +=item file - The file to index + +=item program - The location of the xdformat program + +=item molecule - The type of molecule to index. I<dna> and I<pep> are allowed + +=item type - Type of index we are creating. I<genomic> and I<genes> are allowed + +=item base_path - The base of the dumps + +=item release - Required for correct DB naming + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer/; + +sub param_defaults { + my ($self) = @_; + return { + program => 'makeblastdb', + blast_dir => 'ncbi_blast', + }; +} + +sub index_file { + my ($self, $file) = @_; + my $molecule_arg = ($self->param('molecule') eq 'dna') ? 'nucl' : 'prot' ; + my $target_dir = $self->target_dir(); + my $target_file = $self->target_file($file); + my $db_title = $self->db_title($file); + + my $cmd = sprintf(q{cd %s && %s -in %s -out %s -dbtype %s -title %s -input_type fasta}, + $target_dir, $self->param('program'), $file, $target_file, $molecule_arg, $db_title); + $self->run_cmd($cmd); + unlink $file or $self->throw("Cannot remove the file '$file' from the filesystem: $!"); + $self->param('index_base', $target_file); + return; +} + +1; diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastReIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastReIndexer.pm new file mode 100644 index 0000000000000000000000000000000000000000..37ce068e5753246cf1049d2fa50ec86a081b5277 --- /dev/null +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastReIndexer.pm @@ -0,0 +1,81 @@ +=pod + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastReIndexer + +=head1 DESCRIPTION + +Overrides the NcbiBlastIndexer to provide indexing based only on a file name generated +by the FASTA pipeline for FTP dumps. + +Allowed parameters are: + +=over 8 + +=item file - The file to index. All other parameters are derrived from this + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastReIndexer; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer/; +use File::Basename; + +sub param_defaults { + my ($self) = @_; + return $self->SUPER::param_defaults(); +} + +sub fetch_input { + my ($self) = @_; + my $file = $self->param('file'); + die "No file parameter '$file' given" unless defined $file; + die "No file at '$file' exists" unless -f $file; + my $filename = basename($self->param('file')); + + #Filenames can look like: + # Drosophila_melanogaster.BDGP5.70.dna.toplevel.fa.gz + # Drosophila_melanogaster.BDGP5.70.dna_rm.toplevel.fa.gz + # Drosophila_melanogaster.BDGP5.70.dna_sm.toplevel.fa.gz + # Drosophila_melanogaster.BDGP5.70.cdna.all.fa.gz + # Drosophila_melanogaster.BDGP5.70.cdna.abinitio.fa.gz + # Drosophila_melanogaster.BDGP5.70.ncrna.fa.gz + # Drosophila_melanogaster.BDGP5.70.pep.all.fa.gz + # Drosophila_melanogaster.BDGP5.70.pep.abinitio.fa.gz + + # Because . can appear in assembly names the regex is quite difficult + # and it's easier to split on . and then pop & shift our way through it + my ($species, $assembly, $release, $type, $sub_type); + $filename =~ s/\.fa\.gz$//; + my @split_name = split(/\./, $filename); + $species = shift (@split_name); + $sub_type = pop(@split_name); + $type = pop(@split_name); + $release = pop(@split_name); + $assembly = join(q{.}, @split_name); + + my $molecule = ( $type =~ /na/ ) ? 'dna' : 'pep'; # if it has an na in there then it's DNA + my $blast_type = ($type =~ /dna/ ) ? 'genomic' : 'genes'; # if it was DNA then it's genomic + + $self->param('species', lc($species)); + $self->param('molecule', $molecule); + $self->param('type', $blast_type); + $self->param('release', $release); + + return $self->SUPER::fetch_input(); +} + +1; diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm index f2904d8dfb01596ffbfc6158f0b8550c0ba9fc44..40f59b765d5772f2b0dc361616cd168039cbea4f 100644 --- a/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm @@ -30,23 +30,7 @@ directory defined by the type of dump. The type of dump also changes the file name generated. Genomic dumps have their release number replaced with the last repeat masked date. -Allowed parameters are: - -=over 8 - -=item file - The file to index - -=item program - The location of the xdformat program - -=item molecule - The type of molecule to index. I<dna> and I<pep> are allowed - -=item type - Type of index we are creating. I<genomic> and I<genes> are allowed - -=item base_path - The base of the dumps - -=item release - Required for correct DB naming - -=back +See Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer for the allowed parameters. =cut @@ -54,47 +38,16 @@ package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer; use strict; use warnings; -use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/; - -use Bio::EnsEMBL::Utils::Exception qw/throw/; -use File::Copy qw/copy/; -use File::Spec; -use POSIX qw/strftime/; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer/; sub param_defaults { my ($self) = @_; return { program => 'xdformat', -# molecule => 'pep', #pep or dna -# type => 'genes' #genes or genomic + blast_dir => 'blast', }; } -sub fetch_input { - my ($self) = @_; - my $mol = $self->param('molecule'); - if($mol ne 'dna' && $mol ne 'pep') { - throw "param 'molecule' must be set to 'dna' or 'pep'"; - } - my $type = $self->param('type'); - if($type ne 'genomic' && $type ne 'genes') { - throw "param 'type' must be set to 'genomic' or 'genes'"; - } - $self->assert_executable($self->param('program')); - $self->assert_executable('gunzip'); -} - -sub write_output { - my ($self) = @_; - $self->dataflow_output_id({ - species => $self->param('species'), - type => $self->param('type'), - molecule => $self->param('molecule'), - index_base => $self->param('index_base') - }, 1); - return; -} - sub index_file { my ($self, $file) = @_; my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p' ; @@ -107,54 +60,10 @@ sub index_file { my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s }, $target_dir, $self->param('program'), $molecule_arg, $silence, $db_title, $date, $target_file, $file); - $self->info('About to run "%s"', $cmd); - my $output = `$cmd 2>&1`; - my $rc = $? >> 8; - throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc; - unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!"; + $self->run_cmd($cmd); + unlink $file or $self->throw("Cannot remove the file '$file' from the filesystem: $!"); $self->param('index_base', $target_file); return; } -sub target_file { - my ($self, $file) = @_; - my $target_dir = $self->target_dir(); - my $target_filename = $self->target_filename($file); - return File::Spec->catfile($target_dir, $target_filename); -} - -# Produce a dir like /nfs/path/to/blast/genes/XXX && /nfs/path/to/blast/dna/XXX -sub target_dir { - my ($self) = @_; - return $self->get_dir('blast', $self->param('type')); -} - -sub db_title { - my ($self, $source_file) = @_; - my ($vol, $dir, $file) = File::Spec->splitpath($source_file); - my $release = $self->param('release'); - my $title = $file; - $title =~ s/$release\.//; - return $title; -} - -sub db_date { - my ($self) = @_; - return strftime('%d-%m-%Y', gmtime()); -} - -#Source like Homo_sapiens.GRCh37.68.dna.toplevel.fa -#Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa -sub target_filename { - my ($self, $source_file) = @_; - my ($vol, $dir, $file) = File::Spec->splitpath($source_file); - if($self->param('type') eq 'genomic') { - my @split = split(/\./, $file); - my $rm_date = $self->repeat_mask_date(); - $split[-4] = $rm_date; - return join(q{.}, @split); - } - return $file; -} - 1; diff --git a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/NcbiBlastReIndexer_conf.pm b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/NcbiBlastReIndexer_conf.pm new file mode 100644 index 0000000000000000000000000000000000000000..78fc86c947d2adccfe60874805cf0843d19c7afd --- /dev/null +++ b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/NcbiBlastReIndexer_conf.pm @@ -0,0 +1,88 @@ +package Bio::EnsEMBL::Pipeline::PipeConfig::NcbiBlastReIndexer_conf; + +use strict; +use warnings; + +use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); + +sub default_options { + my ($self) = @_; + + my $settings = { + # inherit other stuff from the base class + %{ $self->SUPER::default_options() }, + + # 'fa_dir' => '', # locate where the current fa files to look for are + # 'base_path' => '', # where do you want your files + + ncbiblast_exe => 'makeblastdb', + + ### Defaults + + pipeline_name => 'ncbi_blast_indexer', + + }; + $settings->{'pipeline_db'}->{'-driver'} = 'sqlite'; + return $settings; +} + +sub pipeline_create_commands { + my ($self) = @_; + return [ + # inheriting database and hive tables' creation + @{$self->SUPER::pipeline_create_commands}, + ]; +} + +sub pipeline_analyses { + my ($self) = @_; + return [ + + { + -logic_name => 'FindFiles', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory', + -meadow_type => 'LOCAL', + -parameters => { + inputcmd => 'find '.$self->o('fa_dir').q{ -type f -name '#pattern#'}, + column_names => ['file'] + }, + -input_ids => [ + { pattern => '*.dna*.toplevel.fa.gz' }, + { pattern => '*.cdna.*.fa.gz' }, + { pattern => '*.pep.*.fa.gz' }, + { pattern => '*.ncrna.*.fa.gz' }, + ], + -flow_into => { + 2 => { Index => { file => '#file#'} } + }, + }, + + { + -logic_name => 'Index', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastReIndexer', + -analysis_capacity => 10, + -rc_name => 'dump', + -parameters => { + program => $self->o('ncbiblast_exe') + } + }, + ]; +} + +sub pipeline_wide_parameters { + my ($self) = @_; + return { + %{ $self->SUPER::pipeline_wide_parameters() }, + base_path => $self->o('base_path'), + }; +} + +sub resource_classes { + my $self = shift; + return { + %{$self->SUPER::resource_classes()}, + dump => { 'LSF' => '-q normal -M4000000 -R"select[mem>4000] rusage[mem=4000]"'}, + } +} + +1; \ No newline at end of file