Skip to content
Snippets Groups Projects
Commit 09c9830f authored by Andy Yates's avatar Andy Yates
Browse files

Two modules used to check flat file dumps with parsers

parent daa03a96
No related branches found
No related tags found
No related merge requests found
=pod
=head1 LICENSE
Copyright (c) 1999-2012 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=head1 NAME
Bio::EnsEMBL::Pipeline::Flatfile::CheckFlatfile
=head1 DESCRIPTION
Takes in a file and passes it through BioPerl's SeqIO parser code. This
is just a smoke test to ensure the files are well formatted.
Allowed parameters are:
=over 8
=item file - The file to parse
=item format - Passed into SeqIO; the format to parse
=back
=cut
package Bio::EnsEMBL::Pipeline::Flatfile::CheckFlatfile;
use strict;
use warnings;
use Bio::SeqIO;
use base qw/Bio::EnsEMBL::Pipeline::Flatfile::Base/;
# Validates the required job parameters. Both 'file' and 'format' must hold
# a true value; a missing one is reported through throw().
sub fetch_input {
  my ($self) = @_;

  if (! $self->param('file')) {
    $self->throw("No 'file' parameter specified");
  }

  if (! $self->param('format')) {
    $self->throw("No 'format' parameter specified");
  }

  return;
}
# Smoke-tests the input file by pulling every record through Bio::SeqIO.
#
# The handle comes from get_fh() and the parser format from the 'format'
# parameter. Each record's accession is logged via fine() and the total
# number of records via info().
#
# Throws if the handle cannot be closed cleanly.
sub run {
  my ($self) = @_;
  my $fh = $self->get_fh();
  my $format = $self->param('format');
  my $stream = Bio::SeqIO->new(-FH => $fh, -FORMAT => $format);
  my $count = 0;
  while ( (my $seq = $stream->next_seq()) ) {
    $self->fine("Found the record %s", $seq->accession());
    $count++;
  }
  $self->info("Processed %d record(s)", $count);

  # When the handle is the gzip pipe, close() returns false if the child
  # exited with a non-zero status (e.g. a corrupt or truncated archive).
  # Ignoring that would let a broken file pass the check.
  if (! close $fh) {
    $self->throw(sprintf(
      q{Failed to close '%s' (child status %d): %s},
      $self->param('file'), $?, $!
    ));
  }
  return;
}
# Opens the file named by the 'file' parameter for reading and returns the
# filehandle.
#
# Files whose name ends in .gz are read through a 'gzip -c -d' child
# process; anything else is opened directly.
#
# Throws if the path is not a plain file; dies if the open itself fails.
sub get_fh {
  my ($self) = @_;
  my $file = $self->param('file');
  $self->throw("Cannot find file $file") unless -f $file;
  my $fh;
  if($file =~ /\.gz$/) {
    # List-form pipe open: the file name is passed to gzip as a single
    # argument without going through a shell, so spaces or shell
    # metacharacters in the path can neither break nor alter the command.
    # '--' stops a leading '-' in the name being read as an option.
    open $fh, '-|', 'gzip', '-c', '-d', '--', $file
      or die "Cannot open $file for gunzip: $!";
  }
  else {
    open $fh, '<', $file or die "Cannot open file $file: $!";
  }
  return $fh;
}
1;
\ No newline at end of file
package Bio::EnsEMBL::Pipeline::PipeConfig::FlatfileChecker_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');
# Returns the option defaults for this pipeline as a hash reference: the
# parent class defaults plus a pipeline_name built from the 'format' option.
sub default_options {
  my ($self) = @_;

  my %options = (
    # start from whatever the base class provides
    %{ $self->SUPER::default_options() },

    # 'base_path' => '', # where do you want your files
    # 'format' => '',

    ### Defaults
    pipeline_name => 'flatfile_dump_check_' . $self->o('format'),
  );

  return \%options;
}
# Returns the pipeline creation commands as an array reference. Nothing is
# added here: the list is a copy of the parent class's commands.
sub pipeline_create_commands {
  my ($self) = @_;

  # inherited database and hive table creation commands
  my @commands = @{ $self->SUPER::pipeline_create_commands };

  return \@commands;
}
# Returns the two analyses of this pipeline as an array reference:
#   FindFiles     - a JobFactory that runs 'find' under the 'base_path'
#                   option for *.dat.gz files and flows each hit, as a
#                   'file' input id, into CheckFlatfile on branch 2.
#   CheckFlatfile - runs the CheckFlatfile module using the 'dump' resource
#                   class.
sub pipeline_analyses {
  my ($self) = @_;

  my $find_command = 'find ' . $self->o('base_path') . q{ -type f -name '*.dat.gz'};

  my %find_files = (
    -logic_name  => 'FindFiles',
    -module      => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
    -meadow_type => 'LOCAL',
    -parameters  => {
      inputcmd     => $find_command,
      column_names => ['file'],
      randomize    => 1,
      input_id     => '{ file => "#file#" }'
    },
    -input_ids   => [ {} ],
    -flow_into   => {
      # 1 => 'Notify',
      2 => ['CheckFlatfile'],
    },
  );

  my %check_flatfile = (
    -logic_name    => 'CheckFlatfile',
    -module        => 'Bio::EnsEMBL::Pipeline::Flatfile::CheckFlatfile',
    -hive_capacity => 15,
    -rc_name       => 'dump',
  );

  return [ \%find_files, \%check_flatfile ];
}
# Returns the pipeline-wide parameters as a hash reference: the parent
# class's parameters plus 'format', taken from the 'format' option.
sub pipeline_wide_parameters {
  my ($self) = @_;

  my %parameters = (
    %{ $self->SUPER::pipeline_wide_parameters() },
    format => $self->o('format'),
  );

  return \%parameters;
}
# Returns the resource classes as a hash reference: the parent class's
# entries plus 'dump', which carries an LSF submission string.
sub resource_classes {
  my ($self) = @_;

  my %classes = (
    %{ $self->SUPER::resource_classes() },
    dump => { 'LSF' => '-q normal -M3000000 -R"select[mem>3000] rusage[mem=3000]"' },
  );

  return \%classes;
}
1;
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment