Harden stuck-job retry accounting and retry config validation
This commit is contained in:
@@ -65,6 +65,8 @@ class Settings(BaseSettings):
|
|||||||
|
|
||||||
@model_validator(mode="after")
|
@model_validator(mode="after")
|
||||||
def validate_job_lease_timeout(self) -> "Settings":
|
def validate_job_lease_timeout(self) -> "Settings":
|
||||||
|
if self.max_stuck_job_retries < 0:
|
||||||
|
raise ValueError("MAX_STUCK_JOB_RETRIES must be >= 0.")
|
||||||
minimum_lease_timeout = (self.max_review_minutes * 60) + 60
|
minimum_lease_timeout = (self.max_review_minutes * 60) + 60
|
||||||
if self.job_lease_timeout_seconds < minimum_lease_timeout:
|
if self.job_lease_timeout_seconds < minimum_lease_timeout:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
@@ -172,6 +172,15 @@ def recover_stuck_jobs(session: Session, *, lease_timeout_seconds: int, action:
|
|||||||
latest_run.status = RunStatus.failed
|
latest_run.status = RunStatus.failed
|
||||||
latest_run.finished_at = now
|
latest_run.finished_at = now
|
||||||
latest_run.error_message = message
|
latest_run.error_message = message
|
||||||
|
else:
|
||||||
|
session.add(
|
||||||
|
ReviewRun(
|
||||||
|
job_id=job.id,
|
||||||
|
status=RunStatus.failed,
|
||||||
|
finished_at=now,
|
||||||
|
error_message=message,
|
||||||
|
)
|
||||||
|
)
|
||||||
job.last_error = message
|
job.last_error = message
|
||||||
if should_fail:
|
if should_fail:
|
||||||
job.status = JobStatus.failed
|
job.status = JobStatus.failed
|
||||||
|
|||||||
@@ -23,3 +23,11 @@ def test_job_lease_timeout_must_cover_max_review_runtime(monkeypatch: pytest.Mon
|
|||||||
|
|
||||||
with pytest.raises(ValidationError, match="JOB_LEASE_TIMEOUT_SECONDS must be at least"):
|
with pytest.raises(ValidationError, match="JOB_LEASE_TIMEOUT_SECONDS must be at least"):
|
||||||
get_settings()
|
get_settings()
|
||||||
|
|
||||||
|
|
||||||
|
def test_max_stuck_job_retries_must_be_non_negative(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.setenv("MAX_STUCK_JOB_RETRIES", "-1")
|
||||||
|
get_settings.cache_clear()
|
||||||
|
|
||||||
|
with pytest.raises(ValidationError, match="MAX_STUCK_JOB_RETRIES must be >= 0"):
|
||||||
|
get_settings()
|
||||||
|
|||||||
@@ -169,3 +169,49 @@ def test_requeue_allows_one_retry_then_fails_on_second_timeout() -> None:
|
|||||||
assert second[0].failed is True
|
assert second[0].failed is True
|
||||||
failed = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
|
failed = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
|
||||||
assert failed.status == JobStatus.failed
|
assert failed.status == JobStatus.failed
|
||||||
|
|
||||||
|
|
||||||
|
def test_recovery_persists_timeout_retry_even_if_latest_run_not_running() -> None:
|
||||||
|
session_factory = get_session_factory()
|
||||||
|
with session_factory() as session:
|
||||||
|
job = enqueue_job(
|
||||||
|
session,
|
||||||
|
repo="acme/repo",
|
||||||
|
pr_number=5,
|
||||||
|
head_sha="33334444",
|
||||||
|
trigger_comment_id=1005,
|
||||||
|
trigger_comment_body="@codex review",
|
||||||
|
requested_by="alice",
|
||||||
|
command=ParsedCommand(name="review", raw="@codex review"),
|
||||||
|
)
|
||||||
|
claimed = claim_next_job(session)
|
||||||
|
assert claimed is not None
|
||||||
|
assert claimed.id == job.id
|
||||||
|
|
||||||
|
with session_factory() as session:
|
||||||
|
latest_run = session.execute(select(ReviewRun).where(ReviewRun.job_id == job.id).order_by(ReviewRun.id.desc()).limit(1)).scalar_one()
|
||||||
|
latest_run.status = RunStatus.succeeded
|
||||||
|
latest_run.finished_at = datetime.now(timezone.utc)
|
||||||
|
stale = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
|
||||||
|
stale.started_at = datetime.now(timezone.utc) - timedelta(seconds=601)
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
with session_factory() as session:
|
||||||
|
first = recover_stuck_jobs(session, lease_timeout_seconds=300, action="requeue", max_retries=1)
|
||||||
|
assert len(first) == 1
|
||||||
|
assert first[0].failed is False
|
||||||
|
claimed_again = claim_next_job(session)
|
||||||
|
assert claimed_again is not None
|
||||||
|
assert claimed_again.id == job.id
|
||||||
|
|
||||||
|
with session_factory() as session:
|
||||||
|
stale_again = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
|
||||||
|
stale_again.started_at = datetime.now(timezone.utc) - timedelta(seconds=601)
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
with session_factory() as session:
|
||||||
|
second = recover_stuck_jobs(session, lease_timeout_seconds=300, action="requeue", max_retries=1)
|
||||||
|
assert len(second) == 1
|
||||||
|
assert second[0].failed is True
|
||||||
|
final = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
|
||||||
|
assert final.status == JobStatus.failed
|
||||||
|
|||||||
Reference in New Issue
Block a user