Commit e0e4e7b6 authored by Leo Gordon

added perldoc-compatible documentation in all scripts

parent 617e5a7c
......@@ -71,7 +71,7 @@ sub main {
'host|dbhost=s' => \$self->{'db_conf'}->{'-host'},
'port|dbport=i' => \$self->{'db_conf'}->{'-port'},
'user|dbuser=s' => \$self->{'db_conf'}->{'-user'},
'password|dbpass=s' => \$self->{'db_conf'}->{'-pass'},
'password|dbpass=s' => \$self->{'db_conf'}->{'-pass'},
'database|dbname=s' => \$self->{'db_conf'}->{'-dbname'},
# loop control
......@@ -112,7 +112,7 @@ sub main {
'monitor!' => \$self->{'monitor'},
);
if ($help) { usage(); }
if ($help) { usage(0); }
parse_conf($self, $conf_file);
......@@ -139,7 +139,7 @@ sub main {
$self->{'url'} = $self->{'dba'}->dbc->url;
} else {
print "\nERROR : Connection parameters (regfile+regname, url or dbhost+dbuser+dbname) need to be specified\n\n";
usage();
usage(1);
}
my $queen = $self->{'dba'}->get_Queen;
......@@ -234,53 +234,19 @@ sub main {
#######################
sub usage {
print "beekeeper.pl [options]\n";
print " -help : print this help\n";
print "\n===============[connection parameters]==================\n";
print " -conf <path> : config file describing db connection\n";
print " -regfile <path> : path to a Registry configuration file\n";
print " -regname <string> : species/alias name for the Hive DBAdaptor\n";
print " -url <url string> : url defining where hive database is located\n";
print " -dbhost <machine> : mysql database host <machine>\n";
print " -dbport <port#> : mysql port number\n";
print " -dbuser <name> : mysql connection user <name>\n";
print " -dbpass <pass> : mysql connection password\n";
print " -dbname <name> : mysql database <name>\n";
print "\n===============[loop control]============================\n";
print " -loop : run autonomously, loops and sleeps\n";
print " -max_loops <num> : perform max this # of loops in autonomous mode\n";
print " -run : run 1 iteration of automation loop\n";
print " -run_job_id <job_id> : run 1 iteration for this job_id\n";
print " -sleep <num> : when looping, sleep <num> minutes (default 3min)\n";
print "\n===============[meadow control]==========================\n";
print " -local : run jobs on local CPU (fork)\n";
print " -local_cpus <num> : max # workers to be running locally\n";
print " -wlimit <num> : max # workers to create per loop\n";
print " -no_pend : don't adjust needed workers by pending workers\n";
print " -lsf_options <string> : passes <string> to LSF bsub command as <options>\n";
print "\n===============[worker control]==========================\n";
print " -jlimit <num> : #jobs to run before worker can die naturally\n";
print " -batch_size <num> : #jobs a worker can claim at once\n";
print " -lifespan <num> : lifespan limit for each worker\n";
print " -logic_name <string> : restrict the pipeline stat/runs to this analysis logic_name\n";
print " -maximise_concurrency 1 : try to run more different analyses at the same time\n";
print "\n===============[other commands/options]==================\n";
print " -dead : clean dead jobs for resubmission\n";
# print " -overdue <min> : worker overdue minutes checking if dead\n";
print " -alldead : all outstanding workers\n";
print " -no_analysis_stats : don't show status of each analysis\n";
print " -worker_stats : show status of each running worker\n";
print " -failed_jobs : show all failed jobs\n";
print " -reset_job_id <num> : reset a job back to READY so it can be rerun\n";
print " -reset_all_jobs_for_analysis <logic_name>\n";
print " : reset jobs back to READY so it can be rerun\n";
exit(1);
my $retvalue = shift @_;
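# if perldoc is available on the PATH, let it render this script's POD;
# otherwise fall back to printing the __DATA__ section ourselves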
if(`which perldoc`) {
system('perldoc', $0);
} else {
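# strip the leading '=directive' from POD command lines; prefix all other lines with a tab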
foreach my $line (<DATA>) {
if($line!~s/\=\w+\s?//) {
$line = "\t$line";
}
print $line;
}
}
exit($retvalue);
}
sub parse_conf {
......@@ -497,3 +463,91 @@ sub remove_analysis_id {
$self->{'dba'}->get_AnalysisAdaptor->remove($analysis);
}
__DATA__
=pod
=head1 NAME
beekeeper.pl
=head1 DESCRIPTION
beekeeper.pl is the Perl script used to initialize and control the execution of eHive pipelines
and to perform some maintenance tasks on the underlying eHive database.
=head1 USAGE EXAMPLES
# Usually run after the pipeline has been created to calculate the internal statistics necessary for eHive functioning
beekeeper.pl --host=hostname --port=3306 --user=username --password=secret --database=ehive_dbname -sync
# An alternative way of doing the same thing
beekeeper.pl -url mysql://username:secret@hostname:port/ehive_dbname -sync
# Run the pipeline in automatic mode (-loop), run all the workers locally (-local) and allow for 3 parallel workers (-local_cpus 3)
beekeeper.pl -url mysql://username:secret@hostname:port/long_mult_test -local -local_cpus 3 -loop
# Run in automatic mode, but only restrict to running the 'fast_blast' analysis
beekeeper.pl -url mysql://username:secret@hostname:port/long_mult_test -logic_name fast_blast -loop
# Restrict the normal execution to one iteration only - can be used for testing a newly set up pipeline
beekeeper.pl -url mysql://username:secret@hostname:port/long_mult_test -run
# Reset all 'buggy_analysis' jobs to 'READY' state, so that they can be run again
beekeeper.pl -url mysql://username:secret@hostname:port/long_mult_test -reset_all_jobs_for_analysis buggy_analysis
# Do a cleanup: find and bury dead workers, reclaim their jobs
beekeeper.pl -url mysql://username:secret@hostname:port/long_mult_test -dead
=head1 OPTIONS
=head2 Connection parameters
-conf <path> : config file describing db connection
-regfile <path> : path to a Registry configuration file
-regname <string> : species/alias name for the Hive DBAdaptor
-url <url string> : url defining where hive database is located
-host <machine> : mysql database host <machine>
-port <port#> : mysql port number
-user <name> : mysql connection user <name>
-password <pass> : mysql connection password <pass>
-database <name> : mysql database <name>
=head2 Looping control
-loop : run autonomously, loops and sleeps
-max_loops <num> : perform max this # of loops in autonomous mode
-run : run 1 iteration of automation loop
-run_job_id <job_id> : run 1 iteration for this job_id
-sleep <num> : when looping, sleep <num> minutes (default 2min)
=head2 Meadow control
-local : run jobs on local CPU (fork)
-local_cpus <num> : max # workers to be running locally
-wlimit <num> : max # workers to create per loop
-no_pend : don't adjust needed workers by pending workers
-lsf_options <string> : passes <string> to LSF bsub command as <options>
=head2 Worker control
-jlimit <num> : #jobs to run before worker can die naturally
-batch_size <num> : #jobs a worker can claim at once
-lifespan <num> : lifespan limit for each worker
-logic_name <string> : restrict the pipeline stat/runs to this analysis logic_name
-maximise_concurrency 1 : try to run more different analyses at the same time
=head2 Other commands/options
-help : print this help
-dead : clean dead jobs for resubmission
-alldead : register all outstanding workers as dead
-no_analysis_stats : don't show status of each analysis
-worker_stats : show status of each running worker
-failed_jobs : show all failed jobs
-reset_job_id <num> : reset a job back to READY so it can be rerun
-reset_all_jobs_for_analysis <logic_name>
: reset jobs back to READY so they can be rerun
=cut
......@@ -8,64 +8,6 @@
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
cmd_hive.pl - DESCRIPTION
=head1 SYNOPSIS
perl \
/nfs/acari/avilella/src/ensembl_main/ensembl-personal/avilella/hive/cmd_hive.pl \
-url mysql://user:password@mysqldb:port/name_of_hive_db
-logic_name example1 -input_id 'echo I.have.$suffix.$tag.and.I.am.baking.one.right.now' \
-suffix_a apple01 -suffix_b apple05 -tag pies\
cmd_hive.pl -url mysql://ensadmin:ensembl@compara2:5316/avilella_compara_homology_54
-input_id \ '{ "sequence_id" => "$suffix", "minibatch" => "$suffixn" }' \
-parameters '{ "fastadb" => "/data/blastdb/Ensembl/family_54/fasta/metazoa_54.pep", "tabfile" => "/data/blastdb/Ensembl/family_54/fasta/metazoa_54.tab" }' \
-suffix_a 1 -suffix_b 100 -step 9 -hive_capacity 200 -logic_name family_blast_54a \
-module Bio::EnsEMBL::Compara::RunnableDB::FamilyBlast
=head1 DESCRIPTION
This script is to help load a batch of jobs all belonging to the same analysis,
whose parameters have to be varied over a range of values.
It was initially intended to run jobs wrapped into a script via
Bio::EnsEMBL::Hive::RunnableDB::SystemCmd module,
but is now extended to run any RunnableDB module jobs.
There are three ways of providing the range for the mutable parameter:
- perl built-in .. range operator (by setting -suffix_a 1234 and -suffix_b 5678 values)
** you can create mini-batches by providing the -step value, which will percolate as $suffixn
- values provided in a file (by setting -inputfile filename)
- hashed mode
Always use single quotes to protect the values of -input_id and -parameters.
Be careful of using things that don't expand, like apple_01 apple_05
instead of apple01 apple05
Also don't use suffix_a and suffix_b in the reverse order apple05
to apple01 because they expand in things like:
apple54,applf04,applf54,applg04,applg54,applh04,applh54...
If using hashed, call with something like:
[-hashed_a 00:00:00]
[-hashed_b 01:61:67]
=head1 AUTHOR - Albert Vilella
=head2 CONTRIBUTOR - Leo Gordon
=cut
# Let the code begin...
use strict;
use DBI;
use Getopt::Long;
......@@ -87,34 +29,39 @@ $self->{'logic_name'} = 'cmd_hive_analysis';
$self->{'module'} = 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd';
$self->{'parameters'} = '{}';
my $conf_file;
my ($help, $host, $user, $pass, $dbname, $port, $adaptor, $url);
GetOptions('help' => \$help,
'url=s' => \$url,
'conf=s' => \$conf_file,
'dbhost=s' => \$host,
'dbport=i' => \$port,
'dbuser=s' => \$user,
'dbpass=s' => \$pass,
'dbname=s' => \$dbname,
'logic_name=s' => \$self->{'logic_name'},
'module=s' => \$self->{'module'},
'input_id=s' => \$self->{'input_id'},
'parameters=s' => \$self->{'parameters'},
'inputfile=s' => \$self->{'inputfile'},
'suffix_a=s' => \$self->{'suffix_a'},
'suffix_b=s' => \$self->{'suffix_b'},
'step=i' => \$self->{'step'},
'hashed_a=s' => \$self->{'hashed_a'},
'hashed_b=s' => \$self->{'hashed_b'},
'tag=s' => \$self->{'tag'},
'hive_capacity=s' => \$self->{'hive_capacity'},
'batch_size=s' => \$self->{'batch_size'},
'debug=s' => \$self->{'debug'},
);
if ($help) { usage(); }
GetOptions(
# connection parameters:
'url=s' => \$url,
'host|dbhost=s' => \$host,
'port|dbport=i' => \$port,
'user|dbuser=s' => \$user,
'password|dbpass=s' => \$pass,
'database|dbname=s' => \$dbname,
# analysis parameters:
'logic_name=s' => \$self->{'logic_name'},
'module=s' => \$self->{'module'},
'hive_capacity=s' => \$self->{'hive_capacity'},
'batch_size=s' => \$self->{'batch_size'},
'input_id=s' => \$self->{'input_id'},
'parameters=s' => \$self->{'parameters'},
# range parameters:
'inputfile=s' => \$self->{'inputfile'},
'suffix_a=s' => \$self->{'suffix_a'},
'suffix_b=s' => \$self->{'suffix_b'},
'step=i' => \$self->{'step'},
'hashed_a=s' => \$self->{'hashed_a'},
'hashed_b=s' => \$self->{'hashed_b'},
# other options:
'help' => \$help,
'debug=s' => \$self->{'debug'},
);
if ($help) { usage(0); }
my $DBA;
if($url) {
......@@ -131,7 +78,7 @@ if($url) {
and defined($self->{'db_conf'}->{'-dbname'}))
{
print "\nERROR : must specify host, user, and database to connect\n\n";
usage();
usage(1);
}
# connect to database specified
......@@ -149,19 +96,6 @@ exit(0);
#
#######################
sub usage {
print "cmd_hive.pl [options]\n";
print " -help : print this help\n";
print " -url <url string> : url defining where hive database is located\n";
print " -input_id <cmd string> : command to be executed (or param. hash to be passed to analysis module)\n";
print " -suffix_a <tag> : suffix from here\n";
print " -suffix_b <tag> : suffix to here\n";
print " -tag <tag> : fixed tag in the command line\n";
print " -logic_name <analysis name> : logic_name of the analysis\n";
print " -module <module name> : name of the module to be run\n";
exit(1);
}
sub job_creation {
my $self = shift;
......@@ -208,7 +142,6 @@ sub job_creation {
}
close FILE;
} elsif(defined($self->{'suffix_a'}) and defined($self->{'suffix_b'})) {
my $tag = $self->{'tag'};
my $step = $self->{'step'} || 1;
my @full_list = $self->{'suffix_a'}..$self->{'suffix_b'};
while(@full_list) {
......@@ -217,12 +150,11 @@ sub job_creation {
for($from = $to = shift @full_list; $batch_cnt<$step && @full_list; $batch_cnt++) {
$to = shift @full_list;
}
# expanding tags here (now you can substitute $suffix, $suffix2, $suffixn and if you really need it, $tag):
# expanding tags here (now you can substitute $suffix, $suffix2, $suffixn):
my $resolved_input_id = $self->{'input_id'};
$resolved_input_id =~ s/\$suffixn/$batch_cnt/g; # the order of substitutions is important!
$resolved_input_id =~ s/\$suffix2/$to/g;
$resolved_input_id =~ s/\$suffix/$from/g;
$resolved_input_id =~ s/\$tag/$tag/g;
if(++$count % 100 == 0) {
print "$resolved_input_id at ",(time()-$starttime)," secs\n";
......@@ -308,4 +240,115 @@ sub resolve_suffix {
return $hashed_input_id;
}
1;
sub usage {
my $retvalue = shift @_;
if(`which perldoc`) {
system('perldoc', $0);
} else {
foreach my $line (<DATA>) {
if($line!~s/\=\w+\s?//) {
$line = "\t$line";
}
print $line;
}
}
exit($retvalue);
}
__DATA__
=pod
=head1 NAME
cmd_hive.pl
=head1 USAGE
cmd_hive.pl -url mysql://user:password@host:port/name_of_hive_db \
-logic_name example1 -input_id 'echo I.have.$suffix.and.I.am.baking.one.right.now' \
-suffix_a apple01 -suffix_b apple05
cmd_hive.pl -url mysql://user:password@host:port/avilella_compara_homology_54 \
-input_id '{ "sequence_id" => "$suffix", "minibatch" => "$suffixn" }' \
-parameters '{ "fastadb" => "/data/blastdb/Ensembl/family_54/fasta/metazoa_54.pep", "tabfile" => "/data/blastdb/Ensembl/family_54/fasta/metazoa_54.tab" }' \
-suffix_a 1 -suffix_b 100 -step 9 -hive_capacity 200 -logic_name family_blast_54a \
-module Bio::EnsEMBL::Compara::RunnableDB::FamilyBlast
=head1 DESCRIPTION
This script helps to load a batch of jobs all belonging to the same analysis,
whose parameters are given by a range of values.
By default it will use the 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd' module
to run a script wrapped into eHive jobs, but it can run any RunnableDB module that you specify instead.
There are three ways of providing the range for the mutable parameter(s):
- values provided in a file (by setting -inputfile filename)
- perl built-in .. range operator (by setting -suffix_a 1234 and -suffix_b 5678 values)
** you can create mini-batches by providing the -step value, which will percolate as $suffixn
- hashed mode
=head1 OPTIONS
=head2 Connection parameters
-url <url string> : url defining where hive database is located
-host <machine> : mysql database host <machine>
-port <port#> : mysql port number
-user <name> : mysql connection user <name>
-password <pass> : mysql connection password <pass>
-database <name> : mysql database <name>
=head2 Analysis parameters
-logic_name <analysis_name> : logic_name of the analysis
-module <module_name> : name of the module to be run
-hive_capacity <hive_capacity> : top limit on the number of jobs of this analysis running at the same time
-batch_size <batch_size> : how many jobs can be claimed by a worker at once
-parameters <parameters_hash> : hash containing analysis-wide parameters for the module
-input_id <inputid_hash> : hash containing job-specific parameters for the module
Always use single quotes to protect the values of -input_id and -parameters.
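Single quotes stop the shell from interpolating the '$suffix'-style patterns before cmd_hive.pl sees them. A quick illustration (the command string itself is just an example):

    -input_id 'echo processing $suffix'     # correct: $suffix reaches cmd_hive.pl literally
    -input_id "echo processing $suffix"     # wrong: the shell expands $suffix itself (usually to an empty string)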
=head2 Range parameters (file mode)
-inputfile <filename> : filename to take the values from (one per line)
The contents of each line will be substituted for the '$inputfile' pattern in the input_id.
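For example, assuming a file ids.txt with one value per line (the file name and command below are made up purely for illustration):

    cmd_hive.pl -url mysql://user:password@host:port/ehive_dbname \
        -logic_name fetch_one -inputfile ids.txt \
        -input_id 'echo looking at $inputfile'

Each line of ids.txt produces one job, with that line substituted for '$inputfile'.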
=head2 Range parameters (simple range mode)
-suffix_a <tag> : bottom boundary of the range
-suffix_b <tag> : top boundary of the range
-step <step_size> : desired size of the subrange, may be smaller for last subrange (1 by default)
The result of range expansion will get chunked into subranges of <step_size> (or 1 if not specified).
Start of the subrange will be substituted for '$suffix',
end of the subrange will be substituted for '$suffix2'
and size of the subrange will be substituted for '$suffixn' pattern in the input_id.
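For instance (a made-up command, using the chunking described above): with -suffix_a 1 -suffix_b 10 -step 5 the range 1..10 is split into the subranges 1..5 and 6..10, so an input_id of 'process.pl -from $suffix -to $suffix2' would create jobs like:

    process.pl -from 1 -to 5
    process.pl -from 6 -to 10

with '$suffixn', if used, replaced by the size of each subrange.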
Be careful with values that do not expand as intended: use apple01 and apple05 rather than apple_01 and apple_05.
Also do not give suffix_a and suffix_b in reverse order (apple05 down to apple01), because they would then expand into something like:
apple54,applf04,applf54,applg04,applg54,applh04,applh54...
=head2 Range parameters (hashed mode)
-hashed_a <tag_a> : for example, -hashed_a 00:00:00
-hashed_b <tag_b> : for example, -hashed_b 01:61:67
Please ask Albert about this mode or to provide documentation for it :)
=head2 Other options
-help : print this help
=head1 AUTHORS
Albert Vilella
Leo Gordon
=cut
......@@ -67,7 +67,7 @@ GetOptions('help' => \$help,
$self->{'analysis_id'} = shift if(@_);
if ($help) { usage(); }
if ($help) { usage(0); }
parse_conf($self, $conf_file);
......@@ -91,7 +91,7 @@ else {
and defined($self->{'db_conf'}->{'-dbname'}))
{
print "\nERROR : must specify host, user, and database to connect\n\n";
usage();
usage(1);
}
# connect to database specified
......@@ -100,7 +100,7 @@ else {
unless($DBA and $DBA->isa("Bio::EnsEMBL::Hive::DBSQL::DBAdaptor")) {
print("ERROR : no database connection\n\n");
usage();
usage(1);
}
my $queen = $DBA->get_Queen();
......@@ -129,7 +129,7 @@ if($self->{'logic_name'}) {
my $analysis = $queen->db->get_AnalysisAdaptor->fetch_by_logic_name($self->{'logic_name'});
unless($analysis) {
printf("logic_name:'%s' does not exist in database\n\n", $self->{'logic_name'});
usage();
usage(1);
}
$self->{'analysis_id'} = $analysis->dbID;
}
......@@ -221,37 +221,21 @@ exit(0);
#######################
sub usage {
print "runWorker.pl [options]\n";
print " -help : print this help\n";
print " -regfile <path> : path to a Registry configuration file\n";
print " -regname <string> : species/alias name for the Hive DBAdaptor\n";
print " -url <url string> : url defining where database is located\n";
print " -conf <path> : config file describing db connection\n";
print " -dbhost <machine> : mysql database host <machine>\n";
print " -dbport <port#> : mysql port number\n";
print " -dbname <name> : mysql database <name>\n";
print " -dbuser <name> : mysql connection user <name>\n";
print " -dbpass <pass> : mysql connection password\n";
print " -analysis_id <id> : analysis_id in db\n";
print " -logic_name <string> : logic_name of analysis to make this worker\n";
print " -batch_size <num> : #jobs to claim at a time\n";
print " -limit <num> : #jobs to run before worker can die naturally\n";
print " -lifespan <num> : number of minutes this worker is allowed to run\n";
print " -outdir <path> : directory where stdout/stderr is redirected\n";
print " -bk <string> : beekeeper identifier\n";
print " -pid <string> : externally set process_id descriptor (e.g. lsf job_id, array_id)\n";
print " -input_id <string> : test input_id on specified analysis (analysis_id or logic_name)\n";
print " -job_id <id> : run specific job defined by analysis_job_id\n";
print " -debug <level> : turn on debug messages at <level> \n";
print " -analysis_stats : show status of each analysis in hive\n";
print " -no_cleanup : don't perform global_cleanup when worker exits\n";
print " -no_write : don't write_output or auto_dataflow input_job\n";
print "runWorker.pl v1.6\n";
exit(1);
my $retvalue = shift @_;
if(`which perldoc`) {
system('perldoc', $0);
} else {
foreach my $line (<DATA>) {
if($line!~s/\=\w+\s?//) {
$line = "\t$line";
}
print $line;
}
}
exit($retvalue);
}
sub parse_conf {
my $self = shift;
my $conf_file = shift;
......@@ -269,3 +253,49 @@ sub parse_conf {
}
}
__DATA__
=pod
=head1 NAME
runWorker.pl
=head1 DESCRIPTION
runWorker.pl is an eHive script that does the work of a single Worker:
it specializes in one of the analyses and executes jobs of that analysis one by one or batch by batch.
=head1 USAGE
runWorker.pl [options]
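For example (hostname, credentials and analysis name below are placeholders):

    runWorker.pl -url mysql://username:secret@hostname:port/ehive_dbname -logic_name fast_blast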
=head1 OPTIONS
-help : print this help
-regfile <path> : path to a Registry configuration file
-regname <string> : species/alias name for the Hive DBAdaptor
-url <url string> : url defining where database is located
-conf <path> : config file describing db connection
-dbhost <machine> : mysql database host <machine>
-dbport <port#> : mysql port number
-dbname <name> : mysql database <name>
-dbuser <name> : mysql connection user <name>
-dbpass <pass> : mysql connection password
-analysis_id <id> : analysis_id in db
-logic_name <string> : logic_name of analysis to make this worker
-batch_size <num> : #jobs to claim at a time
-limit <num> : #jobs to run before worker can die naturally
-lifespan <num> : number of minutes this worker is allowed to run
-outdir <path> : directory where stdout/stderr is redirected
-bk <string> : beekeeper identifier
-pid <string> : externally set process_id descriptor (e.g. lsf job_id, array_id)
-input_id <string> : test input_id on specified analysis (analysis_id or logic_name)
-job_id <id> : run specific job defined by analysis_job_id
-debug <level> : turn on debug messages at <level>
-analysis_stats : show status of each analysis in hive
-no_cleanup : don't perform global_cleanup when worker exits
-no_write : don't write_output or auto_dataflow input_job
=cut