Commit a3296a09 authored by David Mendez
Browse files

Add function to determine when the job checker is assumed dead

parent 0474cb87
...@@ -91,7 +91,7 @@ STATUS_AGENT_CONFIG = RUN_CONFIG.get('status_agent', {}) ...@@ -91,7 +91,7 @@ STATUS_AGENT_CONFIG = RUN_CONFIG.get('status_agent', {})
RUN_CONFIG['status_agent'] = { RUN_CONFIG['status_agent'] = {
'lock_validity_seconds': 1, 'lock_validity_seconds': 1,
'sleep_time': 1, 'sleep_time': 1,
'dead_assumption_seconds': 10, 'death_assumption_seconds': 10,
**STATUS_AGENT_CONFIG, **STATUS_AGENT_CONFIG,
} }
......
...@@ -209,8 +209,18 @@ class DelayedJob(DB.Model): ...@@ -209,8 +209,18 @@ class DelayedJob(DB.Model):
""" """
:return: True if the job needs to be checked in lsf, false otherwise :return: True if the job needs to be checked in lsf, false otherwise
""" """
needs_to_be_checked_by_status = self.status in [JobStatuses.QUEUED, JobStatuses.RUNNING, JobStatuses.UNKNOWN] return self.status in [JobStatuses.QUEUED, JobStatuses.RUNNING, JobStatuses.UNKNOWN]
return needs_to_be_checked_by_status
def job_checker_seems_to_have_died(self):
    """
    Decide whether the process that checks this job in LSF looks dead.

    The checker is assumed dead only when both conditions hold: the last recorded LSF check status is
    non-zero, and the last check happened earlier than 'death_assumption_seconds' (read from the
    'status_agent' section of RUN_CONFIG) before the current UTC time.

    :return: True if the checker seems to have died, False otherwise
    """
    last_check_failed = self.last_lsf_check_status != 0

    # Oldest moment at which a check still counts as recent enough.
    status_agent_config = RUN_CONFIG.get('status_agent')
    tolerance = datetime.timedelta(seconds=status_agent_config.get('death_assumption_seconds'))
    oldest_acceptable_check = app_utils.get_utc_now() - tolerance

    last_check_is_stale = self.last_lsf_checked_at < oldest_acceptable_check
    return last_check_failed and last_check_is_stale
# ---------------------------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------------------------
......
...@@ -6,6 +6,8 @@ import datetime ...@@ -6,6 +6,8 @@ import datetime
from app import create_app from app import create_app
from app.models import delayed_job_models from app.models import delayed_job_models
from app.config import RUN_CONFIG
from app import utils
class TestJobLSFStatus(unittest.TestCase): class TestJobLSFStatus(unittest.TestCase):
...@@ -47,7 +49,7 @@ class TestJobLSFStatus(unittest.TestCase): ...@@ -47,7 +49,7 @@ class TestJobLSFStatus(unittest.TestCase):
job.status = status job.status = status
needs_to_be_checked_got = job.needs_to_be_checked_in_lsf() needs_to_be_checked_got = job.needs_to_be_checked_in_lsf()
self.assertFalse(needs_to_be_checked_got, self.assertFalse(needs_to_be_checked_got,
msg=f'A job with status {status} does not need to be checked in LSF!') msg=f'A job with status {status} does not need to be checked in LSF!')
def test_determines_when_the_job_checker_seems_to_have_died(self): def test_determines_when_the_job_checker_seems_to_have_died(self):
""" """
...@@ -63,15 +65,33 @@ class TestJobLSFStatus(unittest.TestCase): ...@@ -63,15 +65,33 @@ class TestJobLSFStatus(unittest.TestCase):
} }
docker_image_url_must_be = 'some_url' docker_image_url_must_be = 'some_url'
job = delayed_job_models.get_or_create(job_type, params, docker_image_url_must_be) job = delayed_job_models.get_or_create(job_type, params, docker_image_url_must_be)
death_assumption_seconds = RUN_CONFIG.get('status_agent').get('death_assumption_seconds')
last_lsf_checked_at = utils.get_utc_now() - datetime.timedelta(seconds=death_assumption_seconds)
job.last_lsf_checked_at = last_lsf_checked_at
job.last_lsf_check_status = 1
last_lsf_checked_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=-3) job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
print('last_lsf_checked_at: ', last_lsf_checked_at) self.assertTrue(job_checker_seems_to_have_died_got,
msg='When the last script execution errors and the last time it reported is greater than '
'the assumed dead time it must assume that the checker died.')
# job.last_lsf_checked_at = 0 job.last_lsf_checked_at = utils.get_utc_now()
# the last checked at time must be older than the seconds at which it is considered dead job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
self.assertFalse(job_checker_seems_to_have_died_got,
msg='When the last script execution errors but the last time it reported is less than '
'the assumed dead time it must NOT assume that the checker died.')
# for status in [delayed_job_models.JobStatuses.QUEUED, delayed_job_models.JobStatuses.RUNNING, job.last_lsf_checked_at = utils.get_utc_now()
# delayed_job_models.JobStatuses.UNKNOWN]: job.last_lsf_check_status = 0
# job.status = status job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
# needs_to_be_checked_got = job.needs_to_be_checked_in_lsf() self.assertFalse(job_checker_seems_to_have_died_got,
# self.assertTrue(needs_to_be_checked_got, msg=f'A job with status {status} need to be checked in LSF!') msg='When the last script execution was successful and the last time it reported is less '
'than the assumed dead time it must NOT assume that the checker died.')
last_lsf_checked_at = utils.get_utc_now() - datetime.timedelta(seconds=death_assumption_seconds)
job.last_lsf_checked_at = last_lsf_checked_at
job.last_lsf_check_status = 0
job_checker_seems_to_have_died_got = job.job_checker_seems_to_have_died()
self.assertFalse(job_checker_seems_to_have_died_got,
msg='When the last script execution was successful but the last time it reported is '
'greater than the assumed dead time it must NOT assume that the checker died.')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment