Commit 6356b422 authored by Andy Yates's avatar Andy Yates
Browse files

[ENSCORESW-217] & [ENSCORESW-263]. Added automatic generation of the primary...

[ENSCORESW-217] & [ENSCORESW-263]. Added automatic generation of the primary assembly file and cleaned up the readme slightly pending full re-design.
parent 1cf0018c
......@@ -19,4 +19,60 @@ sub old_path {
my $dir = File::Spec->catdir($base, "release-$release", 'fasta', $prod, 'dna');
}
# Filter a FASTA dump for reference regions only and parse names from the
# headers in the file e.g. NNN:NNN:NNN:1:80:1
sub filter_fasta_for_nonref {
my ($self, $source_file, $target_fh) = @_;
my $allowed_regions = $self->allowed_regions();
$self->info('Filtering from %s', $source_file);
open my $src_fh, '-|', "gzip -c -d $source_file" or $self->throw("Cannot decompress $source_file: $!");
my $transfer = 0;
while(my $line = <$src_fh>) {
#HEADER
if(index($line, '>') == 0) {
#regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/;
$transfer = ($allowed_regions->{$name}) ? 1 : 0;
if($transfer) {
$self->info('%s was an allowed Slice', $name);
}
else {
$self->info('%s will be skipped; not a reference Slice', $name);
}
}
print $target_fh $line if $transfer;
}
close($src_fh);
return;
}
sub allowed_regions {
my ($self) = @_;
my $filter_human = 1;
my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
my %hash = map { $_->name() => 1 } @slices;
return \%hash;
}
sub has_non_refs {
my ($self) = @_;
my $sql = <<'SQL';
select count(*)
from attrib_type at
join seq_region_attrib sra using (attrib_type_id)
join seq_region sr using (seq_region_id)
join coord_system cs using (coord_system_id)
where cs.species_id =?
and at.code =?
SQL
my $dba = $self->get_DBAdaptor();
return $dba->dbc()->sql_helper()->execute_single_result(
-SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
}
1;
......@@ -147,44 +147,16 @@ sub decompress {
my ($vol, $dir, $file) = File::Spec->splitpath($source);
$file =~ s/.gz$//;
my $target = File::Spec->catdir($target_dir, $file);
my $allowed_regions = $self->allowed_regions();
$self->info('Decompressing %s to %s', $source, $target);
open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target";
$self->info('Writing to %s', $target);
work_with_file($target, 'w', sub {
my ($trg_fh) = @_;
my $transfer = 0;
while(my $line = <$src_fh>) {
#HEADER
if(index($line, '>') == 0) {
#regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/;
$transfer = ($allowed_regions->{$name}) ? 1 : 0;
if($transfer) {
$self->info('%s was an allowed Slice', $name);
}
else {
$self->info('%s will be skipped; not a reference Slice', $name);
}
}
print $trg_fh $line if $transfer;
}
$self->filter_fasta_for_nonref($source, $trg_fh);
return;
});
close($src_fh);
return $target;
}
sub allowed_regions {
my ($self) = @_;
my $filter_human = 1;
my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
my %hash = map { $_->name() => 1 } @slices;
return \%hash;
}
#Filename like 30061.Homo_sapiens.GRCh37.2bit
sub target_filename {
my ($self) = @_;
......@@ -217,20 +189,4 @@ sub blat_port {
return $id + $self->param('port_offset');
}
sub has_non_refs {
my ($self) = @_;
my $sql = <<'SQL';
select count(*)
from attrib_type at
join seq_region_attrib sra using (attrib_type_id)
join seq_region sr using (seq_region_id)
join coord_system cs using (coord_system_id)
where cs.species_id =?
and at.code =?
SQL
my $dba = $self->get_DBAdaptor();
return $dba->dbc()->sql_helper()->execute_single_result(
-SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
}
1;
=pod
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=head1 NAME
Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly
=head1 DESCRIPTION
Scans the given file and attempts to create a primary assembly file which
is a file containing only those regions we believe to be part of the
core assembly e.g. in Human this is 1-22, X, Y & MT
Allowed parameters are:
=over 8
=item species - Required to indicate which species we are working with
=item file - The file to filter
=back
=cut
package Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;
use Bio::EnsEMBL::Utils::IO qw/gz_work_with_file/;
sub fetch_input {
my ($self) = @_;
foreach my $key (qw/species file/) {
$self->throw("Cannot find the required parameter $key") unless $self->param($key);
}
return;
}
# Creates the file only if required i.e. has non-reference toplevel sequences
sub run {
my ($self) = @_;
if($self->has_non_refs()) {
my $source = $self->param('file');
my $target = $self->target_file();
$self->info('Decompressing to %s', $target);
gz_work_with_file($target, 'w', sub {
my ($trg_fh) = @_;
$self->filter_fasta_for_nonref($source, $trg_fh);
return;
});
}
$self->cleanup_DBAdaptor();
return;
}
sub target_file {
my ($self) = @_;
# File name format looks like:
# <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz
# e.g. Homo_sapiens.GRCh37.64.dna_rm.toplevel.fa.gz -> Homo_sapiens.GRCh37.64.dna_rm.primary_assembly.fa.gz
my $file = $self->param('file');
$file =~ s/\.toplevel\./.primary_assembly./;
return $file;
}
1;
......@@ -752,10 +752,9 @@ EXAMPLES
-----------------
PRIMARY ASSEMBLY
-----------------
This is a subset of sequences found in the toplevel file. The subset is
defined as anything which is not a reference sequence region i.e. haplotypes
and patches. If you wish to perform alignments to an assembly and want to
ignore alternative assemblies then use this file.
Primary assembly contains all toplevel sequence regions excluding haplotypes
and patches. This file is best used for performing sequence similarity searches
where patch and haplotype sequences would confuse analysis.
EXAMPLES
......
......@@ -132,10 +132,17 @@ sub pipeline_analyses {
-can_be_empty => 1,
-max_retry_count => 5,
-flow_into => {
1 => [qw/BlastDNAIndex BlatDNAIndex BlatSmDNAIndex/]
1 => [qw/BlastDNAIndex BlatDNAIndex BlatSmDNAIndex PrimaryAssembly/]
},
},
{
-logic_name => 'PrimaryAssembly',
-module => 'Bio::EnsEMBL::Pipeline::FASTA::CreatePrimaryAssembly',
-can_be_empty => 1,
-max_retry_count => 5,
},
######## COPY DATA
{
......@@ -229,7 +236,7 @@ sub pipeline_analyses {
},
-hive_capacity => 3,
-can_be_empty => 1,
-wait_for => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/]
-wait_for => [qw/DumpDNA DumpGenes PrimaryAssembly BlastDNAIndex BlastGeneIndex BlastPepIndex/]
},
####### CHECKSUMMING
......@@ -242,7 +249,7 @@ sub pipeline_analyses {
input_id => { 'dir' => '#dir#' },
fan_branch_code => 2,
},
-wait_for => [qw/DumpDNA DumpGenes BlastDNAIndex BlastGeneIndex BlastPepIndex/],
-wait_for => [qw/DumpDNA DumpGenes PrimaryAssembly BlastDNAIndex BlastGeneIndex BlastPepIndex/],
-flow_into => { 2 => ['ChecksumGenerator'] }
},
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment