From 6356b422de07172457e63b39366e9d95228f2590 Mon Sep 17 00:00:00 2001
From: Andrew Yates <ayates@ebi.ac.uk>
Date: Mon, 24 Sep 2012 13:27:13 +0000
Subject: [PATCH] [ENSCORESW-217] & [ENSCORESW-263]. Added automatic generation
 of the primary assembly file and cleaned up the readme slightly pending full
 re-design.

---
 modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm    | 56 ++++++++++++
 .../Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm | 50 +----------
 .../Pipeline/FASTA/CreatePrimaryAssembly.pm   | 85 +++++++++++++++++++
 .../Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm    |  7 +-
 .../EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm | 13 ++-
 5 files changed, 157 insertions(+), 54 deletions(-)
 create mode 100644 modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm

diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm
index 5e03acd253..95947cf3f4 100644
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm
@@ -19,4 +19,60 @@ sub old_path {
   my $dir = File::Spec->catdir($base, "release-$release", 'fasta', $prod, 'dna');
 }
 
+# Filter a FASTA dump for reference regions only and parse names from the
+# headers in the file e.g. NNN:NNN:NNN:1:80:1
+sub filter_fasta_for_nonref {
+  my ($self, $source_file, $target_fh) = @_;
+
+  my $allowed_regions = $self->allowed_regions();
+  
+  $self->info('Filtering from %s', $source_file);
+  
+  open my $src_fh, '-|', "gzip -c -d $source_file" or $self->throw("Cannot decompress $source_file: $!");
+  my $transfer = 0;
+  while(my $line = <$src_fh>) {
+    #HEADER
+    if(index($line, '>') == 0) {
+      #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
+      my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; 
+      $transfer = ($allowed_regions->{$name}) ? 1 : 0;
+      if($transfer) {
+        $self->info('%s was an allowed Slice', $name);
+      }
+      else {
+        $self->info('%s will be skipped; not a reference Slice', $name);
+      }
+    }
+    print $target_fh $line if $transfer;
+  }
+  close($src_fh);
+  
+  return;
+}
+
+sub allowed_regions {
+  my ($self) = @_;
+  my $filter_human = 1;
+  my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
+  my %hash = map { $_->name() => 1 } @slices;
+  return \%hash;
+}
+
+sub has_non_refs {
+  my ($self) = @_;
+  my $sql = <<'SQL';
+select count(*)
+from attrib_type at
+join seq_region_attrib sra using (attrib_type_id)
+join seq_region sr using (seq_region_id)
+join coord_system cs using (coord_system_id)
+where cs.species_id =?
+and at.code =?
+SQL
+  my $dba = $self->get_DBAdaptor();
+  return $dba->dbc()->sql_helper()->execute_single_result(
+    -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
+}
+
+
 1;
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm
index e4c1c9af72..513f163e57 100644
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm
@@ -147,44 +147,16 @@ sub decompress {
   my ($vol, $dir, $file) = File::Spec->splitpath($source);
   $file =~ s/.gz$//;
   my $target = File::Spec->catdir($target_dir, $file);
-  
-  my $allowed_regions = $self->allowed_regions();
-  
-  $self->info('Decompressing %s to %s', $source, $target);
-  
-  open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target";
+  $self->info('Writing to %s', $target);
   work_with_file($target, 'w', sub {
     my ($trg_fh) = @_;
-    my $transfer = 0;
-    while(my $line = <$src_fh>) {
-      #HEADER
-      if(index($line, '>') == 0) {
-        #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
-        my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; 
-        $transfer = ($allowed_regions->{$name}) ? 1 : 0;
-        if($transfer) {
-          $self->info('%s was an allowed Slice', $name);
-        }
-        else {
-          $self->info('%s will be skipped; not a reference Slice', $name);
-        }
-      }
-      print $trg_fh $line if $transfer;
-    }
+    $self->filter_fasta_for_nonref($source, $trg_fh);
+    return;
   });
-  close($src_fh);
 
   return $target;
 }
 
-sub allowed_regions {
-  my ($self) = @_;
-  my $filter_human = 1;
-  my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
-  my %hash = map { $_->name() => 1 } @slices;
-  return \%hash;
-}
-
 #Filename like 30061.Homo_sapiens.GRCh37.2bit
 sub target_filename {
   my ($self) = @_;
@@ -217,20 +189,4 @@ sub blat_port {
   return $id + $self->param('port_offset');
 }
 
-sub has_non_refs {
-  my ($self) = @_;
-  my $sql = <<'SQL';
-select count(*)
-from attrib_type at
-join seq_region_attrib sra using (attrib_type_id)
-join seq_region sr using (seq_region_id)
-join coord_system cs using (coord_system_id)
-where cs.species_id =?
-and at.code =?
-SQL
-  my $dba = $self->get_DBAdaptor();
-  return $dba->dbc()->sql_helper()->execute_single_result(
-    -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
-}
-
 1;
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm
new file mode 100644
index 0000000000..5ba8c0551a
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm
@@ -0,0 +1,85 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly
+
+=head1 DESCRIPTION
+
+Scans the given file and attempts to create a primary assembly file which
+is a file containing only those regions we believe to be part of the 
+core assembly e.g. in Human this is 1-22, X, Y & MT
+
+Allowed parameters are:
+
+=over 8
+
+=item species - Required to indicate which species we are working with
+
+=item file - The file to filter
+
+=back
+
+=cut
+
+package Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly;
+
+use strict;
+use warnings;
+use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;
+use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file/;
+
+sub fetch_input {
+  my ($self) = @_;
+  foreach my $key (qw/species file/) {
+    $self->throw("Cannot find the required parameter $key") unless $self->param($key);
+  }
+  return;
+}
+
+# Creates the file only if required i.e. has non-reference toplevel sequences
+sub run {
+  my ($self) = @_;
+  if($self->has_non_refs()) {
+    my $source = $self->param('file');
+    my $target = $self->target_file();
+    $self->info('Decompressing to %s', $target);
+    gz_work_with_file($target, 'w', sub {
+      my ($trg_fh) = @_;
+      $self->filter_fasta_for_nonref($source, $trg_fh);
+      return;      
+    });
+  }
+  $self->cleanup_DBAdaptor();
+  return;
+}
+
+sub target_file {
+  my ($self) = @_;
+  # File name format looks like:
+  # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz
+  # e.g. Homo_sapiens.GRCh37.64.dna_rm.toplevel.fa.gz -> Homo_sapiens.GRCh37.64.dna_rm.primary_assembly.fa.gz
+  my $file = $self->param('file');
+  $file =~ s/\.toplevel\./.primary_assembly./;
+  return $file;
+}
+
+1;
diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm
index 47bf44cd51..0d0e05de2c 100644
--- a/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm
@@ -752,10 +752,9 @@ EXAMPLES
 -----------------
 PRIMARY ASSEMBLY
 -----------------
-This is a subset of sequences found in the toplevel file. The subset is
-defined as anything which is not a reference sequence region i.e. haplotypes
-and patches. If you wish to perform alignments to an assembly and want to
-ignore alternative assemblies then use this file.
+Primary assembly contains all toplevel sequence regions excluding haplotypes
+and patches. This file is best used for performing sequence similarity searches
+where patch and haplotype sequences would confuse analysis.   
 
 EXAMPLES
 
diff --git a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm
index b13f0be22b..2d5350a2d9 100644
--- a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm
+++ b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm
@@ -132,10 +132,17 @@ sub pipeline_analyses {
         -can_be_empty => 1,
         -max_retry_count => 5,
         -flow_into  => {
-          1 => [qw/BlastDNAIndex BlatDNAIndex BlatSmDNAIndex/]
+          1 => [qw/BlastDNAIndex BlatDNAIndex BlatSmDNAIndex PrimaryAssembly/]
         },
       },
       
+      {
+        -logic_name       => 'PrimaryAssembly',
+        -module           => 'Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly',
+        -can_be_empty     => 1,
+        -max_retry_count  => 5,
+      },
+      
       ######## COPY DATA
       
       {
@@ -229,7 +236,7 @@ sub pipeline_analyses {
         },
         -hive_capacity => 3,
         -can_be_empty => 1,
-        -wait_for => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/]
+        -wait_for => [qw/DumpDNA DumpGenes PrimaryAssembly BlastDNAIndex BlastGeneIndex BlastPepIndex/]
       },
       
       ####### CHECKSUMMING
@@ -242,7 +249,7 @@ sub pipeline_analyses {
           input_id => { 'dir' => '#dir#' },
           fan_branch_code => 2,
         },
-        -wait_for   => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/],
+        -wait_for   => [qw/DumpDNA DumpGenes PrimaryAssembly BlastDNAIndex BlastGeneIndex BlastPepIndex/],
         -flow_into  => { 2 => ['ChecksumGenerator'] } 
       },
       
-- 
GitLab