Add stuck running job recovery with lease timeout
Some checks failed
ci / test (pull_request) Failing after 30s
ci / publish (pull_request) Has been skipped

This commit is contained in:
2026-05-22 21:10:11 +00:00
parent b036a16d3c
commit dc48df1aab
5 changed files with 135 additions and 4 deletions

View File

@@ -38,6 +38,8 @@ class Settings(BaseSettings):
max_diff_bytes: int = Field(default=200000, alias="MAX_DIFF_BYTES")
max_review_minutes: int = Field(default=10, alias="MAX_REVIEW_MINUTES")
concurrency: int = Field(default=1, alias="CONCURRENCY")
job_lease_timeout_seconds: int = Field(default=300, alias="JOB_LEASE_TIMEOUT_SECONDS")
stuck_job_recovery_action: Literal["requeue", "fail"] = Field(default="requeue", alias="STUCK_JOB_RECOVERY_ACTION")
review_runner_image: str = Field(default="node:22-bookworm-slim", alias="REVIEW_RUNNER_IMAGE")
enable_fix_commands: bool = Field(default=False, alias="ENABLE_FIX_COMMANDS")

View File

@@ -82,7 +82,8 @@ def enqueue_job(
return job
def claim_next_job(session: Session) -> ReviewJob | None:
def claim_next_job(session: Session, *, lease_timeout_seconds: int, stuck_job_recovery_action: str) -> ReviewJob | None:
recover_stuck_jobs(session, lease_timeout_seconds=lease_timeout_seconds, action=stuck_job_recovery_action)
job = session.execute(
select(ReviewJob).where(ReviewJob.status == JobStatus.queued).order_by(ReviewJob.created_at.asc()).limit(1).with_for_update(skip_locked=True)
).scalar_one_or_none()
@@ -98,6 +99,43 @@ def claim_next_job(session: Session) -> ReviewJob | None:
return job
def recover_stuck_jobs(session: Session, *, lease_timeout_seconds: int, action: str) -> int:
    """Recover jobs that have stayed in the running state past the lease timeout.

    A job counts as stale when its status is ``JobStatus.running`` and its
    ``started_at`` is set and older than ``now - lease_timeout_seconds``.
    Stale rows are selected ``FOR UPDATE SKIP LOCKED``, oldest first, so rows
    locked by another transaction are left untouched.

    For each stale job, the newest ``ReviewRun`` (highest ``id``) is marked
    failed if it is still running, the job's ``last_error`` is set, and the
    job is either failed or put back in the queue.

    Args:
        session: Session used for the queries and the in-place updates.
        lease_timeout_seconds: Maximum age of ``started_at`` before a running
            job is considered stuck. Zero or a negative value disables
            recovery entirely.
        action: ``"fail"`` marks stale jobs failed; any other value requeues
            them.

    Returns:
        The number of stale jobs that were updated.
    """
    if lease_timeout_seconds <= 0:
        return 0

    # NOTE(review): the lease is measured from started_at only; no heartbeat
    # or lease renewal is visible here, so a timeout shorter than the longest
    # legitimate run would recover jobs that are still working — confirm
    # against MAX_REVIEW_MINUTES.
    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(seconds=lease_timeout_seconds)

    # Materialize the locked rows up front so the per-job queries and updates
    # below never interleave with a partially consumed result.
    stale_jobs = (
        session.execute(
            select(ReviewJob)
            .where(
                ReviewJob.status == JobStatus.running,
                ReviewJob.started_at.is_not(None),
                ReviewJob.started_at < cutoff,
            )
            .order_by(ReviewJob.started_at.asc())
            .with_for_update(skip_locked=True)
        )
        .scalars()
        .all()
    )

    # The message depends only on the arguments, so build it once.
    message = f"Job lease timed out after {lease_timeout_seconds}s while in running state; recovered via action={action}."

    for job in stale_jobs:
        latest_run = session.execute(
            select(ReviewRun).where(ReviewRun.job_id == job.id).order_by(ReviewRun.id.desc()).limit(1)
        ).scalar_one_or_none()
        # Only a run that is still marked running is closed out; a run that
        # already reached another status keeps its recorded outcome.
        if latest_run is not None and latest_run.status == RunStatus.running:
            latest_run.status = RunStatus.failed
            latest_run.finished_at = now
            latest_run.error_message = message

        job.last_error = message
        if action == "fail":
            job.status = JobStatus.failed
            job.finished_at = now
        else:
            # Requeue: clear both timestamps.
            job.status = JobStatus.queued
            job.started_at = None
            job.finished_at = None

    # NOTE(review): nothing is flushed or committed here; persistence depends
    # on the caller — confirm it commits even when no queued job is claimed.
    return len(stale_jobs)
def finish_job(
session: Session,
*,

View File

@@ -78,7 +78,11 @@ def _handle_non_review_command(
def process_one_job(settings: Settings) -> bool:
session_factory = get_session_factory()
with session_factory() as session:
job = claim_next_job(session)
job = claim_next_job(
session,
lease_timeout_seconds=settings.job_lease_timeout_seconds,
stuck_job_recovery_action=settings.stuck_job_recovery_action,
)
if not job:
return False