From 6356b422de07172457e63b39366e9d95228f2590 Mon Sep 17 00:00:00 2001 From: Andrew Yates <ayates@ebi.ac.uk> Date: Mon, 24 Sep 2012 13:27:13 +0000 Subject: [PATCH] [ENSCORESW-217] & [ENSCORESW-263]. Added automatic generation of the primary assembly file and cleaned up the readme slightly pending full re-design. --- modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm | 56 ++++++++++++ .../Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm | 50 +---------- .../Pipeline/FASTA/CreatePrimaryAssembly.pm | 85 +++++++++++++++++++ .../Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm | 7 +- .../EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm | 13 ++- 5 files changed, 157 insertions(+), 54 deletions(-) create mode 100644 modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm index 5e03acd253..95947cf3f4 100644 --- a/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/Base.pm @@ -19,4 +19,60 @@ sub old_path { my $dir = File::Spec->catdir($base, "release-$release", 'fasta', $prod, 'dna'); } +# Filter a FASTA dump for reference regions only and parse names from the +# headers in the file e.g. NNN:NNN:NNN:1:80:1 +sub filter_fasta_for_nonref { + my ($self, $source_file, $target_fh) = @_; + + my $allowed_regions = $self->allowed_regions(); + + $self->info('Filtering from %s', $source_file); + + open my $src_fh, '-|', "gzip -c -d $source_file" or $self->throw("Cannot decompress $source_file: $!"); + my $transfer = 0; + while(my $line = <$src_fh>) { + #HEADER + if(index($line, '>') == 0) { + #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name + my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; + $transfer = ($allowed_regions->{$name}) ? 1 : 0; + if($transfer) { + $self->info('%s was an allowed Slice', $name); + } + else { + $self->info('%s will be skipped; not a reference Slice', $name); + } + } + print $target_fh $line if $transfer; + } + close($src_fh); + + return; +} + +sub allowed_regions { + my ($self) = @_; + my $filter_human = 1; + my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)}; + my %hash = map { $_->name() => 1 } @slices; + return \%hash; +} + +sub has_non_refs { + my ($self) = @_; + my $sql = <<'SQL'; +select count(*) +from attrib_type at +join seq_region_attrib sra using (attrib_type_id) +join seq_region sr using (seq_region_id) +join coord_system cs using (coord_system_id) +where cs.species_id =? +and at.code =? +SQL + my $dba = $self->get_DBAdaptor(); + return $dba->dbc()->sql_helper()->execute_single_result( + -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']); +} + + 1; diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm index e4c1c9af72..513f163e57 100644 --- a/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm @@ -147,44 +147,16 @@ sub decompress { my ($vol, $dir, $file) = File::Spec->splitpath($source); $file =~ s/.gz$//; my $target = File::Spec->catdir($target_dir, $file); - - my $allowed_regions = $self->allowed_regions(); - - $self->info('Decompressing %s to %s', $source, $target); - - open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target"; + $self->info('Writing to %s', $target); work_with_file($target, 'w', sub { my ($trg_fh) = @_; - my $transfer = 0; - while(my $line = <$src_fh>) { - #HEADER - if(index($line, '>') == 0) { - #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name - my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; - $transfer = ($allowed_regions->{$name}) ? 1 : 0; - if($transfer) { - $self->info('%s was an allowed Slice', $name); - } - else { - $self->info('%s will be skipped; not a reference Slice', $name); - } - } - print $trg_fh $line if $transfer; - } + $self->filter_fasta_for_nonref($source, $trg_fh); + return; }); - close($src_fh); return $target; } -sub allowed_regions { - my ($self) = @_; - my $filter_human = 1; - my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)}; - my %hash = map { $_->name() => 1 } @slices; - return \%hash; -} - #Filename like 30061.Homo_sapiens.GRCh37.2bit sub target_filename { my ($self) = @_; @@ -217,20 +189,4 @@ sub blat_port { return $id + $self->param('port_offset'); } -sub has_non_refs { - my ($self) = @_; - my $sql = <<'SQL'; -select count(*) -from attrib_type at -join seq_region_attrib sra using (attrib_type_id) -join seq_region sr using (seq_region_id) -join coord_system cs using (coord_system_id) -where cs.species_id =? -and at.code =? -SQL - my $dba = $self->get_DBAdaptor(); - return $dba->dbc()->sql_helper()->execute_single_result( - -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']); -} - 1; diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm new file mode 100644 index 0000000000..5ba8c0551a --- /dev/null +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/CreatePrimaryAssembly.pm @@ -0,0 +1,85 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly + +=head1 DESCRIPTION + +Scans the given file and attempts to create a primary assembly file which +is a file containing only those regions we believe to be part of the +core assembly e.g. in Human this is 1-22, X, Y & MT + +Allowed parameters are: + +=over 8 + +=item species - Required to indicate which species we are working with + +=item file - The file to filter + +=back + +=cut + +package Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly; + +use strict; +use warnings; +use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/; +use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file/; + +sub fetch_input { + my ($self) = @_; + foreach my $key (qw/species file/) { + $self->throw("Cannot find the required parameter $key") unless $self->param($key); + } + return; +} + +# Creates the file only if required i.e. has non-reference toplevel sequences +sub run { + my ($self) = @_; + if($self->has_non_refs()) { + my $source = $self->param('file'); + my $target = $self->target_file(); + $self->info('Decompressing to %s', $target); + gz_work_with_file($target, 'w', sub { + my ($trg_fh) = @_; + $self->filter_fasta_for_nonref($source, $trg_fh); + return; + }); + } + $self->cleanup_DBAdaptor(); + return; +} + +sub target_file { + my ($self) = @_; + # File name format looks like: + # <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz + # e.g. Homo_sapiens.GRCh37.64.dna_rm.toplevel.fa.gz -> Homo_sapiens.GRCh37.64.dna_rm.primary_assembly.fa.gz + my $file = $self->param('file'); + $file =~ s/\.toplevel\./.primary_assembly./; + return $file; +} + +1; diff --git a/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm b/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm index 47bf44cd51..0d0e05de2c 100644 --- a/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm +++ b/modules/Bio/EnsEMBL/Pipeline/FASTA/DumpFile.pm @@ -752,10 +752,9 @@ EXAMPLES ----------------- PRIMARY ASSEMBLY ----------------- -This is a subset of sequences found in the toplevel file. The subset is -defined as anything which is not a reference sequence region i.e. haplotypes -and patches. If you wish to perform alignments to an assembly and want to -ignore alternative assemblies then use this file. +Primary assembly contains all toplevel sequence regions excluding haplotypes +and patches. This file is best used for performing sequence similarity searches +where patch and haplotype sequences would confuse analysis. EXAMPLES diff --git a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm index b13f0be22b..2d5350a2d9 100644 --- a/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm +++ b/modules/Bio/EnsEMBL/Pipeline/PipeConfig/FASTA_conf.pm @@ -132,10 +132,17 @@ sub pipeline_analyses { -can_be_empty => 1, -max_retry_count => 5, -flow_into => { - 1 => [qw/BlastDNAIndex BlatDNAIndex BlatSmDNAIndex/] + 1 => [qw/BlastDNAIndex BlatDNAIndex BlatSmDNAIndex PrimaryAssembly/] }, }, + { + -logic_name => 'PrimaryAssembly', + -module => 'Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly', + -can_be_empty => 1, + -max_retry_count => 5, + }, + ######## COPY DATA { @@ -229,7 +236,7 @@ sub pipeline_analyses { }, -hive_capacity => 3, -can_be_empty => 1, - -wait_for => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/] + -wait_for => [qw/DumpDNA DumpGenes PrimaryAssembly BlastDNAIndex BlastGeneIndex BlastPepIndex/] }, ####### CHECKSUMMING @@ -242,7 +249,7 @@ sub pipeline_analyses { input_id => { 'dir' => '#dir#' }, fan_branch_code => 2, }, - -wait_for => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/], + -wait_for => [qw/DumpDNA DumpGenes PrimaryAssembly BlastDNAIndex BlastGeneIndex BlastPepIndex/], -flow_into => { 2 => ['ChecksumGenerator'] } }, -- GitLab