Commit a3296a09 authored by David Mendez
Browse files

Add function to determine when the job checker is assumed dead

parent 0474cb87
......@@ -91,7 +91,7 @@ STATUS_AGENT_CONFIG = RUN_CONFIG.get('status_agent', {})
# Defaults for the status agent. STATUS_AGENT_CONFIG is unpacked last, so any
# key it already defines overrides the default value listed here.
RUN_CONFIG['status_agent'] = {
'lock_validity_seconds': 1,
'sleep_time': 1,
# NOTE(review): this diff view shows both the old key name (below) and the
# renamed one; job_checker_seems_to_have_died reads 'death_assumption_seconds'.
'dead_assumption_seconds': 10,
# Age threshold, in seconds, that job_checker_seems_to_have_died applies to
# the last LSF check time.
'death_assumption_seconds': 10,
**STATUS_AGENT_CONFIG,
}
......
......@@ -209,8 +209,18 @@ class DelayedJob(DB.Model):
"""
:return: True if the job needs to be checked in lsf, false otherwise
"""
needs_to_be_checked_by_status = self.status in [JobStatuses.QUEUED, JobStatuses.RUNNING, JobStatuses.UNKNOWN]
return needs_to_be_checked_by_status
return self.status in [JobStatuses.QUEUED, JobStatuses.RUNNING, JobStatuses.UNKNOWN]
def job_checker_seems_to_have_died(self):
    """
    Decides whether the process that checks this job in LSF should be assumed dead.

    The checker is assumed dead only when both conditions hold:
    - the last recorded LSF check status is not 0, and
    - the last recorded check time is older than the configured
      'death_assumption_seconds' of the status agent.

    :return: True if the checker seems to have died, false otherwise
    """
    last_check_failed = self.last_lsf_check_status != 0

    # The allowed silence period comes from the status agent configuration.
    status_agent_config = RUN_CONFIG.get('status_agent')
    grace_period = datetime.timedelta(seconds=status_agent_config.get('death_assumption_seconds'))

    # Any check recorded before this instant counts as too old.
    oldest_acceptable_check_time = app_utils.get_utc_now() - grace_period
    last_check_is_too_old = self.last_lsf_checked_at < oldest_acceptable_check_time

    return last_check_failed and last_check_is_too_old
# ----------------------------------------------------------------------------------------------------------------------
......
......@@ -6,6 +6,8 @@ import datetime
from app import create_app
from app.models import delayed_job_models
from app.config import RUN_CONFIG
from app import utils
class TestJobLSFStatus(unittest.TestCase):
......@@ -47,7 +49,7 @@ class TestJobLSFStatus(unittest.TestCase):
job.status = status
needs_to_be_checked_got = job.needs_to_be_checked_in_lsf()
self.assertFalse(needs_to_be_checked_got,
msg=f'A job with status {status} does not need to be checked in LSF!')
msg=f'A job with status {status} does not need to be checked in LSF!')
def test_determines_when_the_job_checker_seems_to_have_died(self):
"""
......@@ -63,15 +65,33 @@ class TestJobLSFStatus(unittest.TestCase):
}
docker_image_url_must_be = 'some_url'
job = delayed_job_models.get_or_create(job_type, params, docker_image_url_must_be)
death_assumption_seconds = RUN_CONFIG.get('status_agent').get('death_assumption_seconds')
last_lsf_checked_at = utils.get_utc_now() - datetime.timedelta(seconds=death_assumption_seconds)
job.last_lsf_checked_at = last_lsf_checked_at
job.last_lsf_check_status = 1
last_lsf_checked_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=-3)
print('last_lsf_checked_at: ', last_lsf_checked_at)
job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
self.assertTrue(job_checker_seems_to_have_died_got,
msg='When the last script execution errors and the last time it reported is greater than '
'the assumed dead time it must assume that the checker died.')
# job.last_lsf_checked_at = 0
# the last checked at time must be older than the seconds at which it is considered dead
job.last_lsf_checked_at = utils.get_utc_now()
job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
self.assertFalse(job_checker_seems_to_have_died_got,
msg='When the last script execution errors but the last time it reported is less than '
'the assumed dead time it must NOT assume that the checker died.')
# for status in [delayed_job_models.JobStatuses.QUEUED, delayed_job_models.JobStatuses.RUNNING,
# delayed_job_models.JobStatuses.UNKNOWN]:
# job.status = status
# needs_to_be_checked_got = job.needs_to_be_checked_in_lsf()
# self.assertTrue(needs_to_be_checked_got, msg=f'A job with status {status} need to be checked in LSF!')
job.last_lsf_checked_at = utils.get_utc_now()
job.last_lsf_check_status = 0
job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
self.assertFalse(job_checker_seems_to_have_died_got,
msg='When the last script execution was successful and the last time it reported is less '
'than the assumed dead time it must NOT assume that the checker died.')
last_lsf_checked_at = utils.get_utc_now() - datetime.timedelta(seconds=death_assumption_seconds)
job.last_lsf_checked_at = last_lsf_checked_at
job.last_lsf_check_status = 0
job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
self.assertFalse(job_checker_seems_to_have_died_got,
msg='When the last script execution was successful but the last time it reported is '
'greater than the assumed dead time it must NOT assume that the checker died.')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment