From 90f6df20e09225f4060f72968f23a7c53cc4e2c3 Mon Sep 17 00:00:00 2001
From: Andrew Yates <ayates@ebi.ac.uk>
Date: Tue, 19 Mar 2013 10:02:54 +0000
Subject: [PATCH] Adding support for NCBI blast DB creation. This is not rolled
 into the main distro yet but is available as a separate pipeline. Existing
 indexers have had some functionality moved down the stack to avoid massive
 code duplication

---
 modules/Bio/EnsEMBL/Pipeline/Base.pm          |   9 +
 .../EnsEMBL/Pipeline/FASTA/BlastIndexer.pm    | 163 ++++++++++++++++++
 .../Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm |   5 +-
 .../Pipeline/FASTA/NcbiBlastIndexer.pm        |  82 +++++++++
 .../Pipeline/FASTA/NcbiBlastReIndexer.pm      |  81 +++++++++
 .../EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm  | 101 +----------
 .../PipeConfig/NcbiBlastReIndexer_conf.pm     |  88 ++++++++++
 7 files changed, 429 insertions(+), 100 deletions(-)
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/FASTA/BlastIndexer.pm
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastIndexer.pm
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastReIndexer.pm
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/PipeConfig/NcbiBlastReIndexer_conf.pm

diff --git a/modules/Bio/EnsEMBL/Pipeline/Base.pm b/modules/Bio/EnsEMBL/Pipeline/Base.pm
index 0d96747e7a..c6f36b605b 100644
--- a/modules/Bio/EnsEMBL/Pipeline/Base.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/Base.pm
@@ -241,6 +241,15 @@ sub assert_executable {
   return 1;
 }
 
+# Runs the given command and returns a list of exit code and output
+sub run_cmd {
+  my ($self, $cmd) = @_;
+  $self->info('About to run "%s"', $cmd);
+  my $output = `$cmd 2>&1`;
+  my $rc = $? >> 8;
+  $self->throw("Cannot run program '$cmd'. Return code was ${rc}. Program output was $output") if $rc;
+  return ($rc, $output);
+}
 
 ## Production database adaptor
 sub get_production_DBAdaptor {
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlastIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlastIndexer.pm
new file mode 100644
index 0000000000..d68cf8d3c7
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlastIndexer.pm
@@ -0,0 +1,163 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer
+
+=head1 DESCRIPTION
+
+Creates NCBI Blast indexes of the given GZipped file. The resulting index
+is created under the parameter location I<base_path> in I<blast_dir> and then in a
+directory defined by the type of dump. The type of dump also changes the file
+name generated. Genomic dumps have their release number replaced with the
+last repeat masked date. 
+
+Allowed parameters are:
+
+=over 8
+
+=item file - The file to index
+
+=item program - The location of the xdformat program
+
+=item molecule - The type of molecule to index. I<dna> and I<pep> are allowed
+
+=item type - Type of index we are creating. I<genomic> and I<genes> are allowed
+
+=item base_path - The base of the dumps
+
+=item release - Required for correct DB naming
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer;
+
+use strict;
+use warnings;
+use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
+
+use Bio::EnsEMBL::Utils::Exception qw/throw/;
+use File::Copy qw/copy/;
+use File::Spec;
+use POSIX qw/strftime/;
+
+sub param_defaults {
+  my ($self) = @_;
+  return {
+#    program => 'xdformat', #program to use for indexing
+#    molecule => 'pep', #pep or dna
+#    type => 'genes',    #genes or genomic
+#    blast_dir => 'blast_type_dir', # sets the type of directory used for 
+  };
+}
+
+sub fetch_input {
+  my ($self) = @_;
+  my $mol = $self->param('molecule');
+  throw "No molecule param given" unless defined $mol;
+  if($mol ne 'dna' && $mol ne 'pep') {
+    throw "param 'molecule' must be set to 'dna' or 'pep'. Value given was '${$mol}'";
+  }
+  my $type = $self->param('type');
+  throw "No type param given" unless defined $type;
+  if($type ne 'genomic' && $type ne 'genes') {
+    throw "param 'type' must be set to 'genomic' or 'genes'. Value given was '${$type}'";
+  }
+  $self->assert_executable($self->param('program'));
+  $self->assert_executable('gunzip');
+}
+
+sub write_output {
+  my ($self) = @_;
+  $self->dataflow_output_id({
+    species     => $self->param('species'),
+    type        => $self->param('type'),
+    molecule    => $self->param('molecule'),
+    index_base  => $self->param('index_base')
+  }, 1);
+  return;
+}
+
+sub index_file {
+  my ($self, $file) = @_;
+  my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p' ;
+  my $silence = ($self->debug()) ? 0 : 1;
+  my $target_dir = $self->target_dir();
+  my $target_file = $self->target_file($file);
+  my $db_title = $self->db_title($file);
+  my $date = $self->db_date();
+  
+  my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s }, 
+    $target_dir, $self->param('program'), $molecule_arg, $silence, $db_title, $date, $target_file, $file);
+  
+  $self->info('About to run "%s"', $cmd);
+  my $output = `$cmd 2>&1`;
+  my $rc = $? >> 8;
+  throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc;
+  unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!";
+  $self->param('index_base', $target_file);
+  return;
+}
+
+sub target_file {
+  my ($self, $file) = @_;
+  my $target_dir = $self->target_dir();
+  my $target_filename = $self->target_filename($file);
+  return File::Spec->catfile($target_dir, $target_filename);
+}
+
+# Produce a dir like /nfs/path/to/<blast_dir>/genes/XXX && /nfs/path/to/<blast_dir>/dna/XXX
+sub target_dir {
+  my ($self) = @_;
+  return $self->get_dir($self->param('blast_dir'), $self->param('type'));
+}
+
+sub db_title {
+  my ($self, $source_file) = @_;
+  my ($vol, $dir, $file) = File::Spec->splitpath($source_file);
+  my $release = $self->param('release');
+  my $title = $file;
+  $title =~ s/$release\.//;
+  return $title;
+}
+
+sub db_date {
+  my ($self) = @_;
+  return strftime('%d-%m-%Y', gmtime());
+}
+
+#Source like Homo_sapiens.GRCh37.68.dna.toplevel.fa
+#Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa
+sub target_filename {
+  my ($self, $source_file) = @_;
+  my ($vol, $dir, $file) = File::Spec->splitpath($source_file);
+  if($self->param('type') eq 'genomic') {
+    my @split = split(/\./, $file);
+    my $rm_date = $self->repeat_mask_date();
+    $split[-4] = $rm_date;
+    return join(q{.}, @split);
+  }
+  return $file;
+}
+
+1;
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm
index 513f163e57..ed8c33de83 100644
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm
@@ -113,10 +113,7 @@ sub index_file {
   my $cmd = sprintf(q{%s %s %s}, 
     $self->param('program'), $file, $target_file);
   
-  $self->info('About to run "%s"', $cmd);
-  my $output = `$cmd 2>&1`;
-  my $rc = $? >> 8;
-  throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc;
+  $self->run_cmd($cmd);
   unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!";
   
   #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastIndexer.pm
new file mode 100644
index 0000000000..a5a3f82de5
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastIndexer.pm
@@ -0,0 +1,82 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer
+
+=head1 DESCRIPTION
+
+Creates NCBI Blast indexes of the given GZipped file. The resulting index
+is created under the parameter location I<base_path> in ncbi_blast and then in a
+directory defined by the type of dump. The type of dump also changes the file
+name generated. Genomic dumps have their release number replaced with the
+last repeat masked date. 
+
+Allowed parameters are:
+
+=over 8
+
+=item file - The file to index
+
+=item program - The location of the xdformat program
+
+=item molecule - The type of molecule to index. I<dna> and I<pep> are allowed
+
+=item type - Type of index we are creating. I<genomic> and I<genes> are allowed
+
+=item base_path - The base of the dumps
+
+=item release - Required for correct DB naming
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer;
+
+use strict;
+use warnings;
+use base qw/Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer/;
+
+sub param_defaults {
+  my ($self) = @_;
+  return {
+    program => 'makeblastdb',
+    blast_dir => 'ncbi_blast',
+  };
+}
+
+sub index_file {
+  my ($self, $file) = @_;
+  my $molecule_arg = ($self->param('molecule') eq 'dna') ? 'nucl' : 'prot' ;
+  my $target_dir = $self->target_dir();
+  my $target_file = $self->target_file($file);
+  my $db_title = $self->db_title($file);
+  
+  my $cmd = sprintf(q{cd %s && %s -in %s -out %s -dbtype %s -title %s -input_type fasta}, 
+    $target_dir, $self->param('program'), $file, $target_file, $molecule_arg, $db_title);
+  $self->run_cmd($cmd);  
+  unlink $file or $self->throw("Cannot remove the file '$file' from the filesystem: $!");
+  $self->param('index_base', $target_file);
+  return;
+}
+
+1;
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastReIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastReIndexer.pm
new file mode 100644
index 0000000000..37ce068e57
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/NcbiBlastReIndexer.pm
@@ -0,0 +1,81 @@
+=pod
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastReIndexer
+
+=head1 DESCRIPTION
+
+Overrides the NcbiBlastIndexer to provide indexing based only on a file name generated
+by the FASTA pipeline for FTP dumps.
+
+Allowed parameters are:
+
+=over 8
+
+=item file - The file to index. All other parameters are derrived from this
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastReIndexer;
+
+use strict;
+use warnings;
+use base qw/Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastIndexer/;
+use File::Basename;
+
+sub param_defaults {
+  my ($self) = @_;
+  return $self->SUPER::param_defaults();
+}
+
+sub fetch_input {
+  my ($self) = @_;
+  my $file = $self->param('file');
+  die "No file parameter '$file' given" unless defined $file;
+  die "No file at '$file' exists" unless -f $file;
+  my $filename = basename($self->param('file'));
+  
+  #Filenames can look like:
+  # Drosophila_melanogaster.BDGP5.70.dna.toplevel.fa.gz
+  # Drosophila_melanogaster.BDGP5.70.dna_rm.toplevel.fa.gz
+  # Drosophila_melanogaster.BDGP5.70.dna_sm.toplevel.fa.gz
+  # Drosophila_melanogaster.BDGP5.70.cdna.all.fa.gz
+  # Drosophila_melanogaster.BDGP5.70.cdna.abinitio.fa.gz
+  # Drosophila_melanogaster.BDGP5.70.ncrna.fa.gz
+  # Drosophila_melanogaster.BDGP5.70.pep.all.fa.gz
+  # Drosophila_melanogaster.BDGP5.70.pep.abinitio.fa.gz
+  
+  # Because . can appear in assembly names the regex is quite difficult
+  # and it's easier to split on . and then pop & shift our way through it
+  my ($species, $assembly, $release, $type, $sub_type);
+  $filename =~ s/\.fa\.gz$//;
+  my @split_name = split(/\./, $filename);
+  $species = shift (@split_name);
+  $sub_type = pop(@split_name);
+  $type = pop(@split_name);
+  $release = pop(@split_name);
+  $assembly = join(q{.}, @split_name);
+  
+  my $molecule = ( $type =~ /na/ ) ? 'dna' : 'pep'; # if it has an na in there then it's DNA
+  my $blast_type = ($type =~ /dna/ ) ? 'genomic' : 'genes'; # if it was DNA then it's genomic
+  
+  $self->param('species', lc($species));
+  $self->param('molecule', $molecule);
+  $self->param('type', $blast_type);
+  $self->param('release', $release);
+  
+  return $self->SUPER::fetch_input();
+}
+
+1;
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm
index f2904d8dfb..40f59b765d 100644
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm
@@ -30,23 +30,7 @@ directory defined by the type of dump. The type of dump also changes the file
 name generated. Genomic dumps have their release number replaced with the
 last repeat masked date. 
 
-Allowed parameters are:
-
-=over 8
-
-=item file - The file to index
-
-=item program - The location of the xdformat program
-
-=item molecule - The type of molecule to index. I<dna> and I<pep> are allowed
-
-=item type - Type of index we are creating. I<genomic> and I<genes> are allowed
-
-=item base_path - The base of the dumps
-
-=item release - Required for correct DB naming
-
-=back
+See Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer for the allowed parameters.
 
 =cut
 
@@ -54,47 +38,16 @@ package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer;
 
 use strict;
 use warnings;
-use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
-
-use Bio::EnsEMBL::Utils::Exception qw/throw/;
-use File::Copy qw/copy/;
-use File::Spec;
-use POSIX qw/strftime/;
+use base qw/Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer/;
 
 sub param_defaults {
   my ($self) = @_;
   return {
     program => 'xdformat',
-#    molecule => 'pep', #pep or dna
-#    type => 'genes'    #genes or genomic
+    blast_dir => 'blast',
   };
 }
 
-sub fetch_input {
-  my ($self) = @_;
-  my $mol = $self->param('molecule');
-  if($mol ne 'dna' && $mol ne 'pep') {
-    throw "param 'molecule' must be set to 'dna' or 'pep'";
-  }
-  my $type = $self->param('type');
-  if($type ne 'genomic' && $type ne 'genes') {
-    throw "param 'type' must be set to 'genomic' or 'genes'";
-  }
-  $self->assert_executable($self->param('program'));
-  $self->assert_executable('gunzip');
-}
-
-sub write_output {
-  my ($self) = @_;
-  $self->dataflow_output_id({
-    species     => $self->param('species'),
-    type        => $self->param('type'),
-    molecule    => $self->param('molecule'),
-    index_base  => $self->param('index_base')
-  }, 1);
-  return;
-}
-
 sub index_file {
   my ($self, $file) = @_;
   my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p' ;
@@ -107,54 +60,10 @@ sub index_file {
   my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s }, 
     $target_dir, $self->param('program'), $molecule_arg, $silence, $db_title, $date, $target_file, $file);
   
-  $self->info('About to run "%s"', $cmd);
-  my $output = `$cmd 2>&1`;
-  my $rc = $? >> 8;
-  throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc;
-  unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!";
+  $self->run_cmd($cmd);
+  unlink $file or $self->throw("Cannot remove the file '$file' from the filesystem: $!");
   $self->param('index_base', $target_file);
   return;
 }
 
-sub target_file {
-  my ($self, $file) = @_;
-  my $target_dir = $self->target_dir();
-  my $target_filename = $self->target_filename($file);
-  return File::Spec->catfile($target_dir, $target_filename);
-}
-
-# Produce a dir like /nfs/path/to/blast/genes/XXX && /nfs/path/to/blast/dna/XXX
-sub target_dir {
-  my ($self) = @_;
-  return $self->get_dir('blast', $self->param('type'));
-}
-
-sub db_title {
-  my ($self, $source_file) = @_;
-  my ($vol, $dir, $file) = File::Spec->splitpath($source_file);
-  my $release = $self->param('release');
-  my $title = $file;
-  $title =~ s/$release\.//;
-  return $title;
-}
-
-sub db_date {
-  my ($self) = @_;
-  return strftime('%d-%m-%Y', gmtime());
-}
-
-#Source like Homo_sapiens.GRCh37.68.dna.toplevel.fa
-#Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa
-sub target_filename {
-  my ($self, $source_file) = @_;
-  my ($vol, $dir, $file) = File::Spec->splitpath($source_file);
-  if($self->param('type') eq 'genomic') {
-    my @split = split(/\./, $file);
-    my $rm_date = $self->repeat_mask_date();
-    $split[-4] = $rm_date;
-    return join(q{.}, @split);
-  }
-  return $file;
-}
-
 1;
diff --git a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/NcbiBlastReIndexer_conf.pm b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/NcbiBlastReIndexer_conf.pm
new file mode 100644
index 0000000000..78fc86c947
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/NcbiBlastReIndexer_conf.pm
@@ -0,0 +1,88 @@
+package Bio::EnsEMBL::Pipeline::PipeConfig::NcbiBlastReIndexer_conf;
+
+use strict;
+use warnings;
+
+use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');
+
+sub default_options {
+    my ($self) = @_;
+    
+    my $settings = {
+      # inherit other stuff from the base class
+      %{ $self->SUPER::default_options() }, 
+      
+      # 'fa_dir'    => '',  # locate where the current fa files to look for are
+      # 'base_path' => '',  # where do you want your files
+      
+      ncbiblast_exe => 'makeblastdb',
+      
+      ### Defaults 
+      
+      pipeline_name => 'ncbi_blast_indexer',
+      
+    };
+    $settings->{'pipeline_db'}->{'-driver'} = 'sqlite';
+    return $settings;
+}
+
+sub pipeline_create_commands {
+    my ($self) = @_;
+    return [
+      # inheriting database and hive tables' creation
+      @{$self->SUPER::pipeline_create_commands}, 
+    ];
+}
+
+sub pipeline_analyses {
+  my ($self) = @_;
+  return [
+  
+    {
+      -logic_name => 'FindFiles',
+      -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
+      -meadow_type => 'LOCAL',
+      -parameters => {
+        inputcmd => 'find '.$self->o('fa_dir').q{ -type f -name '#pattern#'},
+        column_names => ['file']
+      },
+      -input_ids  => [ 
+        { pattern => '*.dna*.toplevel.fa.gz' },
+        { pattern => '*.cdna.*.fa.gz' },
+        { pattern => '*.pep.*.fa.gz' },
+        { pattern => '*.ncrna.*.fa.gz' },
+      ],
+      -flow_into  => {
+        2 =>  { Index => { file => '#file#'} }
+      },
+    },
+    
+    {
+      -logic_name => 'Index',
+      -module     => 'Bio::EnsEMBL::Pipeline::FASTA::NcbiBlastReIndexer',
+      -analysis_capacity => 10,
+      -rc_name => 'dump',
+      -parameters => {
+        program => $self->o('ncbiblast_exe')
+      }
+    },
+  ];
+}
+
+sub pipeline_wide_parameters {
+  my ($self) = @_;
+  return {
+    %{ $self->SUPER::pipeline_wide_parameters() },
+    base_path => $self->o('base_path'),
+  };
+}
+
+sub resource_classes {
+  my $self = shift;
+  return {
+    %{$self->SUPER::resource_classes()},
+    dump => { 'LSF' => '-q normal -M4000000 -R"select[mem>4000] rusage[mem=4000]"'},
+  }
+}
+
+1;
\ No newline at end of file
-- 
GitLab