Harden stuck-job retry accounting and retry config validation
All checks were successful
ci / test (pull_request) Successful in 27s
ci / publish (pull_request) Has been skipped

This commit is contained in:
2026-05-22 21:54:56 +00:00
parent c73aadc660
commit 9028d5b56c
4 changed files with 65 additions and 0 deletions

View File

@@ -23,3 +23,11 @@ def test_job_lease_timeout_must_cover_max_review_runtime(monkeypatch: pytest.Mon
with pytest.raises(ValidationError, match="JOB_LEASE_TIMEOUT_SECONDS must be at least"):
get_settings()
def test_max_stuck_job_retries_must_be_non_negative(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("MAX_STUCK_JOB_RETRIES", "-1")
get_settings.cache_clear()
with pytest.raises(ValidationError, match="MAX_STUCK_JOB_RETRIES must be >= 0"):
get_settings()

View File

@@ -169,3 +169,49 @@ def test_requeue_allows_one_retry_then_fails_on_second_timeout() -> None:
assert second[0].failed is True
failed = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
assert failed.status == JobStatus.failed
def test_recovery_persists_timeout_retry_even_if_latest_run_not_running() -> None:
session_factory = get_session_factory()
with session_factory() as session:
job = enqueue_job(
session,
repo="acme/repo",
pr_number=5,
head_sha="33334444",
trigger_comment_id=1005,
trigger_comment_body="@codex review",
requested_by="alice",
command=ParsedCommand(name="review", raw="@codex review"),
)
claimed = claim_next_job(session)
assert claimed is not None
assert claimed.id == job.id
with session_factory() as session:
latest_run = session.execute(select(ReviewRun).where(ReviewRun.job_id == job.id).order_by(ReviewRun.id.desc()).limit(1)).scalar_one()
latest_run.status = RunStatus.succeeded
latest_run.finished_at = datetime.now(timezone.utc)
stale = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
stale.started_at = datetime.now(timezone.utc) - timedelta(seconds=601)
session.commit()
with session_factory() as session:
first = recover_stuck_jobs(session, lease_timeout_seconds=300, action="requeue", max_retries=1)
assert len(first) == 1
assert first[0].failed is False
claimed_again = claim_next_job(session)
assert claimed_again is not None
assert claimed_again.id == job.id
with session_factory() as session:
stale_again = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
stale_again.started_at = datetime.now(timezone.utc) - timedelta(seconds=601)
session.commit()
with session_factory() as session:
second = recover_stuck_jobs(session, lease_timeout_seconds=300, action="requeue", max_retries=1)
assert len(second) == 1
assert second[0].failed is True
final = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one()
assert final.status == JobStatus.failed