diff --git a/src/gitea_codex_bot/config.py b/src/gitea_codex_bot/config.py index c010ac9..0bd792e 100644 --- a/src/gitea_codex_bot/config.py +++ b/src/gitea_codex_bot/config.py @@ -65,6 +65,8 @@ class Settings(BaseSettings): @model_validator(mode="after") def validate_job_lease_timeout(self) -> "Settings": + if self.max_stuck_job_retries < 0: + raise ValueError("MAX_STUCK_JOB_RETRIES must be >= 0.") minimum_lease_timeout = (self.max_review_minutes * 60) + 60 if self.job_lease_timeout_seconds < minimum_lease_timeout: raise ValueError( diff --git a/src/gitea_codex_bot/services/jobs.py b/src/gitea_codex_bot/services/jobs.py index 03ca2ef..6964d72 100644 --- a/src/gitea_codex_bot/services/jobs.py +++ b/src/gitea_codex_bot/services/jobs.py @@ -172,6 +172,15 @@ def recover_stuck_jobs(session: Session, *, lease_timeout_seconds: int, action: latest_run.status = RunStatus.failed latest_run.finished_at = now latest_run.error_message = message + else: + session.add( + ReviewRun( + job_id=job.id, + status=RunStatus.failed, + finished_at=now, + error_message=message, + ) + ) job.last_error = message if should_fail: job.status = JobStatus.failed diff --git a/tests/test_config.py b/tests/test_config.py index 3b3a903..b879836 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -23,3 +23,11 @@ def test_job_lease_timeout_must_cover_max_review_runtime(monkeypatch: pytest.Mon with pytest.raises(ValidationError, match="JOB_LEASE_TIMEOUT_SECONDS must be at least"): get_settings() + + +def test_max_stuck_job_retries_must_be_non_negative(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("MAX_STUCK_JOB_RETRIES", "-1") + get_settings.cache_clear() + + with pytest.raises(ValidationError, match="MAX_STUCK_JOB_RETRIES must be >= 0"): + get_settings() diff --git a/tests/test_transitions.py b/tests/test_transitions.py index 8166f14..03277fb 100644 --- a/tests/test_transitions.py +++ b/tests/test_transitions.py @@ -169,3 +169,49 @@ def test_requeue_allows_one_retry_then_fails_on_second_timeout() -> None: assert second[0].failed is True failed = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one() assert failed.status == JobStatus.failed + + +def test_recovery_persists_timeout_retry_even_if_latest_run_not_running() -> None: + session_factory = get_session_factory() + with session_factory() as session: + job = enqueue_job( + session, + repo="acme/repo", + pr_number=5, + head_sha="33334444", + trigger_comment_id=1005, + trigger_comment_body="@codex review", + requested_by="alice", + command=ParsedCommand(name="review", raw="@codex review"), + ) + claimed = claim_next_job(session) + assert claimed is not None + assert claimed.id == job.id + + with session_factory() as session: + latest_run = session.execute(select(ReviewRun).where(ReviewRun.job_id == job.id).order_by(ReviewRun.id.desc()).limit(1)).scalar_one() + latest_run.status = RunStatus.succeeded + latest_run.finished_at = datetime.now(timezone.utc) + stale = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one() + stale.started_at = datetime.now(timezone.utc) - timedelta(seconds=601) + session.commit() + + with session_factory() as session: + first = recover_stuck_jobs(session, lease_timeout_seconds=300, action="requeue", max_retries=1) + assert len(first) == 1 + assert first[0].failed is False + claimed_again = claim_next_job(session) + assert claimed_again is not None + assert claimed_again.id == job.id + + with session_factory() as session: + stale_again = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one() + stale_again.started_at = datetime.now(timezone.utc) - timedelta(seconds=601) + session.commit() + + with session_factory() as session: + second = recover_stuck_jobs(session, lease_timeout_seconds=300, action="requeue", max_retries=1) + assert len(second) == 1 + assert second[0].failed is True + final = session.execute(select(ReviewJob).where(ReviewJob.id == job.id)).scalar_one() + assert final.status == JobStatus.failed