Cap stuck-job requeue retries and notify on exhaustion
This commit is contained in:
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from dataclasses import dataclass
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
@@ -12,6 +13,17 @@ from gitea_codex_bot.services.security import payload_digest
|
||||
from gitea_codex_bot.types import ParsedCommand
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
LEASE_TIMEOUT_ERROR_PREFIX = "Job lease timed out"
|
||||
|
||||
|
||||
@dataclass
class RecoveryOutcome:
    """Describe what happened to one stuck job during lease-timeout recovery.

    ``recover_stuck_jobs`` appends one instance per stale running job it
    processes, so callers can report on each job individually.
    """

    # Repository of the recovered job (copied from ``job.repo``).
    repo: str
    # Pull request number of the recovered job (copied from ``job.pr_number``).
    pr_number: int
    # Identifier of the recovered job (copied from ``job.id``).
    job_id: int
    # Lease-timeout count for this job, including the timeout just handled.
    retries_used: int
    # True when the job was marked failed instead of being put back in the queue.
    failed: bool
    # Lease-timeout message that recovery also stores in ``job.last_error``.
    message: str
|
||||
|
||||
|
||||
def persist_webhook_event(
|
||||
@@ -95,8 +107,7 @@ def enqueue_job(
|
||||
return job
|
||||
|
||||
|
||||
def claim_next_job(session: Session, *, lease_timeout_seconds: int, stuck_job_recovery_action: str) -> ReviewJob | None:
|
||||
recover_stuck_jobs(session, lease_timeout_seconds=lease_timeout_seconds, action=stuck_job_recovery_action)
|
||||
def claim_next_job(session: Session) -> ReviewJob | None:
|
||||
job = session.execute(
|
||||
select(ReviewJob).where(ReviewJob.status == JobStatus.queued).order_by(ReviewJob.created_at.asc()).limit(1).with_for_update(skip_locked=True)
|
||||
).scalar_one_or_none()
|
||||
@@ -121,9 +132,9 @@ def claim_next_job(session: Session, *, lease_timeout_seconds: int, stuck_job_re
|
||||
return job
|
||||
|
||||
|
||||
def recover_stuck_jobs(session: Session, *, lease_timeout_seconds: int, action: str) -> int:
|
||||
def recover_stuck_jobs(session: Session, *, lease_timeout_seconds: int, action: str, max_retries: int) -> list[RecoveryOutcome]:
|
||||
if lease_timeout_seconds <= 0:
|
||||
return 0
|
||||
return []
|
||||
now = datetime.now(timezone.utc)
|
||||
cutoff = now - timedelta(seconds=lease_timeout_seconds)
|
||||
stale_jobs = session.execute(
|
||||
@@ -136,10 +147,24 @@ def recover_stuck_jobs(session: Session, *, lease_timeout_seconds: int, action:
|
||||
.order_by(ReviewJob.started_at.asc())
|
||||
.with_for_update(skip_locked=True)
|
||||
).scalars()
|
||||
recovered = 0
|
||||
outcomes: list[RecoveryOutcome] = []
|
||||
for job in stale_jobs:
|
||||
recovered += 1
|
||||
message = f"Job lease timed out after {lease_timeout_seconds}s while in running state; recovered via action={action}."
|
||||
prior_retries = session.execute(
|
||||
select(ReviewRun)
|
||||
.where(
|
||||
ReviewRun.job_id == job.id,
|
||||
ReviewRun.status == RunStatus.failed,
|
||||
ReviewRun.error_message.is_not(None),
|
||||
)
|
||||
.order_by(ReviewRun.id.asc())
|
||||
).scalars()
|
||||
lease_retries_used = sum(1 for run in prior_retries if (run.error_message or "").startswith(LEASE_TIMEOUT_ERROR_PREFIX))
|
||||
retries_used_after_this_timeout = lease_retries_used + 1
|
||||
should_fail = action == "fail" or lease_retries_used >= max_retries
|
||||
message = (
|
||||
f"{LEASE_TIMEOUT_ERROR_PREFIX} after {lease_timeout_seconds}s while in running state; "
|
||||
f"retries_used={retries_used_after_this_timeout}, max_retries={max_retries}."
|
||||
)
|
||||
latest_run = (
|
||||
session.execute(select(ReviewRun).where(ReviewRun.job_id == job.id).order_by(ReviewRun.id.desc()).limit(1)).scalar_one_or_none()
|
||||
)
|
||||
@@ -148,14 +173,25 @@ def recover_stuck_jobs(session: Session, *, lease_timeout_seconds: int, action:
|
||||
latest_run.finished_at = now
|
||||
latest_run.error_message = message
|
||||
job.last_error = message
|
||||
if action == "fail":
|
||||
if should_fail:
|
||||
job.status = JobStatus.failed
|
||||
job.finished_at = now
|
||||
else:
|
||||
job.status = JobStatus.queued
|
||||
job.started_at = None
|
||||
job.finished_at = None
|
||||
return recovered
|
||||
outcomes.append(
|
||||
RecoveryOutcome(
|
||||
repo=job.repo,
|
||||
pr_number=job.pr_number,
|
||||
job_id=job.id,
|
||||
retries_used=retries_used_after_this_timeout,
|
||||
failed=should_fail,
|
||||
message=message,
|
||||
)
|
||||
)
|
||||
session.commit()
|
||||
return outcomes
|
||||
|
||||
|
||||
def finish_job(
|
||||
|
||||
Reference in New Issue
Block a user