diff --git a/modules/Bio/EnsEMBL/Hive.pm b/modules/Bio/EnsEMBL/Hive.pm new file mode 100755 index 0000000000000000000000000000000000000000..d9a8accbbc6539648e09bb97ea5425e7288c7961 --- /dev/null +++ b/modules/Bio/EnsEMBL/Hive.pm @@ -0,0 +1,81 @@ +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=pod + +=head1 NAME + Bio::EnsEMBL::Hive + +=head1 DESCRIPTION + Object which encapsulates the details of how to find jobs, how to run those + jobs, and then check the rules to create the next jobs in the chain. + Essentially knows where to find data, how to process data, and where to + put it when it's done (put in next person's INBOX) so the next Worker + in the chain can find data to work on. + + Hive based processing is a concept based on a more controlled version + of an autonomous agent type system. Each worker is not told what to do + (like a centralized control system - like the current pipeline system) + but rather queries a central database for jobs (give me jobs). + + Each worker is linked to an analysis_id, registers itself on creation + into the Hive, creates a RunnableDB instance of the Analysis->module, + gets relevant configuration information from the database, does its + work, creates the next layer of analysis_job entries by interfacing to + the DataflowRuleAdaptor to determine the analyses it needs to pass its + output data to and creates jobs on the database of the next analysis. + It repeats this cycle until it has lived its lifetime or until there are no + more jobs left to process. + The lifetime limit is a safety limit to prevent these from 'infecting' + a system and sitting on a compute node for longer than is socially acceptable. + This is primarily needed on compute resources like an LSF system where jobs + are not preempted and run until they are done. + + The Queen's primary job is to create Workers to get the work done. 
+ As part of this, she is also responsible for summarizing the status of the + analyses by querying the analysis_jobs, summarizing, and updating the + analysis_stats table. From this she is also responsible for monitoring and + 'unblocking' analyses via the analysis_ctrl_rules. + The Queen is also responsible for freeing up jobs that were claimed by Workers + that died unexpectedly so that other workers can take over the work. + + The Beekeeper is in charge of interfacing between the Queen and a compute resource + or 'compute farm'. Its job is to query Queens if they need any workers and to + send the requested number of workers to open machines via the runWorker.pl script. + It is also responsible for interfacing with the Queen to identify workers which died + unexpectedly so that she can free the dead workers' unfinished jobs. + + +=head1 CONTACT + Contact Jessica Severin on EnsEMBL::Hive implementation/design detail: jessica@ebi.ac.uk + Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk + +=head1 APPENDIX + The rest of the documentation details each of the object methods. + Internal methods are usually preceded with a _ + +=cut + +use strict; +use Sys::Hostname; +use Data::UUID; +use Bio::EnsEMBL::Utils::Argument; +use Bio::EnsEMBL::Utils::Exception; + +use Bio::EnsEMBL::Analysis; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Pipeline::RunnableDB; + +use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor; +use Bio::EnsEMBL::Hive::DBSQL::AnalysisStatsAdaptor; +use Bio::EnsEMBL::Hive::DBSQL::DataflowRuleAdaptor; +use Bio::EnsEMBL::Hive::DBSQL::AnalysisCtrlRuleAdaptor; +use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor; +use Bio::EnsEMBL::Hive::Extensions; +use Bio::EnsEMBL::Hive::Queen; +use Bio::EnsEMBL::Hive::URLFactory; + +1; +