Cap stuck-job requeue retries and notify on exhaustion

This commit is contained in:
2026-05-22 21:25:53 +00:00
parent dfd67c1cce
commit d24b4f4f79
5 changed files with 128 additions and 19 deletions

View File

@@ -6,7 +6,7 @@ from gitea_codex_bot.db import get_session_factory
from datetime import datetime, timedelta, timezone
from gitea_codex_bot.models import JobStatus, ReviewJob, ReviewRun, RunStatus
from gitea_codex_bot.services.jobs import claim_next_job, enqueue_job, finish_job
from gitea_codex_bot.services.jobs import claim_next_job, enqueue_job, finish_job, recover_stuck_jobs
from gitea_codex_bot.types import ParsedCommand
@@ -25,7 +25,7 @@ def test_claim_and_transition() -> None:
)
with session_factory() as session:
claimed = claim_next_job(session, lease_timeout_seconds=300, stuck_job_recovery_action="requeue")
claimed = claim_next_job(session)
assert claimed is not None
assert claimed.id == job.id
assert claimed.status == JobStatus.running
@@ -62,7 +62,7 @@ def test_claim_recovers_stuck_running_job_by_requeue() -> None:
requested_by="alice",
command=ParsedCommand(name="review", raw="@codex review"),
)
claimed = claim_next_job(session, lease_timeout_seconds=300, stuck_job_recovery_action="requeue")
claimed = claim_next_job(session)
assert claimed is not None
assert claimed.id == first.id
@@ -72,7 +72,10 @@ def test_claim_recovers_stuck_running_job_by_requeue() -> None:
session.commit()
with session_factory() as session:
recovered = claim_next_job(session, lease_timeout_seconds=300, stuck_job_recovery_action="requeue")
outcomes = recover_stuck_jobs(session, lease_timeout_seconds=300, action="requeue", max_retries=1)
assert len(outcomes) == 1
assert outcomes[0].failed is False
recovered = claim_next_job(session)
assert recovered is not None
assert recovered.id == first.id
assert recovered.status == JobStatus.running
@@ -103,7 +106,7 @@ def test_claim_recovers_stuck_running_job_by_fail() -> None:
requested_by="alice",
command=ParsedCommand(name="review", raw="@codex review"),
)
claimed = claim_next_job(session, lease_timeout_seconds=300, stuck_job_recovery_action="requeue")
claimed = claim_next_job(session)
assert claimed is not None
assert claimed.id == stuck_job.id
@@ -113,10 +116,56 @@ def test_claim_recovers_stuck_running_job_by_fail() -> None:
session.commit()
with session_factory() as session:
no_job = claim_next_job(session, lease_timeout_seconds=300, stuck_job_recovery_action="fail")
outcomes = recover_stuck_jobs(session, lease_timeout_seconds=300, action="fail", max_retries=1)
assert len(outcomes) == 1
assert outcomes[0].failed is True
no_job = claim_next_job(session)
assert no_job is None
with session_factory() as session:
failed = session.execute(select(ReviewJob).where(ReviewJob.id == stuck_job.id)).scalar_one()
assert failed.status == JobStatus.failed
assert failed.finished_at is not None
def test_requeue_allows_one_retry_then_fails_on_second_timeout() -> None:
    """Check that a stuck job is requeued once and failed on its next timeout.

    The job is enqueued and claimed, then its ``started_at`` is backdated so
    it is older than the lease timeout. With ``action="requeue"`` and
    ``max_retries=1`` the test asserts that:

    * the first ``recover_stuck_jobs`` pass yields one outcome with
      ``failed is False`` and the job can be claimed again;
    * after backdating a second time, the next pass yields one outcome with
      ``failed is True`` and the stored row has ``JobStatus.failed``.
    """
    open_session = get_session_factory()
    recovery_kwargs = {"lease_timeout_seconds": 300, "action": "requeue", "max_retries": 1}

    with open_session() as db:
        job = enqueue_job(
            db,
            repo="acme/repo",
            pr_number=4,
            head_sha="11112222",
            trigger_comment_id=1004,
            trigger_comment_body="@codex review",
            requested_by="alice",
            command=ParsedCommand(name="review", raw="@codex review"),
        )
        initial_claim = claim_next_job(db)
        assert initial_claim is not None
        assert initial_claim.id == job.id

    def backdate_start() -> None:
        # 601 seconds in the past is beyond the 300-second lease timeout
        # handed to recover_stuck_jobs below.
        with open_session() as db:
            lookup = select(ReviewJob).where(ReviewJob.id == job.id)
            row = db.execute(lookup).scalar_one()
            row.started_at = datetime.now(timezone.utc) - timedelta(seconds=601)
            db.commit()

    # First timeout: the single allowed retry is still available.
    backdate_start()

    with open_session() as db:
        first_pass = recover_stuck_jobs(db, **recovery_kwargs)
        assert len(first_pass) == 1
        assert first_pass[0].failed is False
        retry_claim = claim_next_job(db)
        assert retry_claim is not None
        assert retry_claim.id == job.id

    # Second timeout: the retry budget is spent, so the job must be failed.
    backdate_start()

    with open_session() as db:
        second_pass = recover_stuck_jobs(db, **recovery_kwargs)
        assert len(second_pass) == 1
        assert second_pass[0].failed is True
        lookup = select(ReviewJob).where(ReviewJob.id == job.id)
        final_row = db.execute(lookup).scalar_one()
        assert final_row.status == JobStatus.failed