Add stuck-job recovery for running jobs #1

Closed
luna wants to merge 4 commits from feat/stuck-job-recovery into main
3 changed files with 25 additions and 3 deletions
Showing only changes of commit c73aadc660 - Show all commits

View File

@@ -44,7 +44,7 @@ WORKDIR=/var/lib/gitea-codex/worktrees
MAX_DIFF_BYTES=200000
MAX_REVIEW_MINUTES=10
CONCURRENCY=1
JOB_LEASE_TIMEOUT_SECONDS=300
JOB_LEASE_TIMEOUT_SECONDS=660
STUCK_JOB_RECOVERY_ACTION=requeue
MAX_STUCK_JOB_RETRIES=1

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
from functools import lru_cache
from typing import Literal
from pydantic import Field, SecretStr, field_validator
from pydantic import Field, SecretStr, field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -38,7 +38,7 @@ class Settings(BaseSettings):
max_diff_bytes: int = Field(default=200000, alias="MAX_DIFF_BYTES")
max_review_minutes: int = Field(default=10, alias="MAX_REVIEW_MINUTES")
concurrency: int = Field(default=1, alias="CONCURRENCY")
job_lease_timeout_seconds: int = Field(default=300, alias="JOB_LEASE_TIMEOUT_SECONDS")
job_lease_timeout_seconds: int = Field(default=660, alias="JOB_LEASE_TIMEOUT_SECONDS")
stuck_job_recovery_action: Literal["requeue", "fail"] = Field(default="requeue", alias="STUCK_JOB_RECOVERY_ACTION")
max_stuck_job_retries: int = Field(default=1, alias="MAX_STUCK_JOB_RETRIES")
@@ -63,6 +63,16 @@ class Settings(BaseSettings):
values = [item.strip() for item in self.allowed_repos.split(",")]
return {value for value in values if value}
@model_validator(mode="after")
def validate_job_lease_timeout(self) -> "Settings":
    """Ensure the job lease outlasts the longest permitted review run.

    The lease timeout must be at least ``max_review_minutes`` converted to
    seconds plus a fixed 60-second margin.

    Returns:
        This settings instance, unchanged, when the constraint holds.

    Raises:
        ValueError: If ``job_lease_timeout_seconds`` is below the computed
            minimum.
    """
    review_budget_seconds = self.max_review_minutes * 60
    # NOTE(review): presumably the extra 60 seconds keeps a review that uses
    # its whole time budget from being treated as stuck; confirm against the
    # job recovery logic.
    minimum_lease_timeout = review_budget_seconds + 60

    if self.job_lease_timeout_seconds >= minimum_lease_timeout:
        return self

    raise ValueError(
        "JOB_LEASE_TIMEOUT_SECONDS must be at least MAX_REVIEW_MINUTES*60 + 60 "
        f"(minimum {minimum_lease_timeout}s for MAX_REVIEW_MINUTES={self.max_review_minutes})."
    )
@lru_cache(maxsize=1)
def get_settings() -> Settings:

View File

@@ -1,3 +1,6 @@
import pytest
from pydantic import ValidationError
from gitea_codex_bot.config import get_settings
@@ -11,3 +14,12 @@ def test_codex_auth_defaults_to_api_key_mode() -> None:
settings = get_settings()
assert settings.codex_auth_mode == "api_key"
assert settings.codex_auth_json_path is None
def test_job_lease_timeout_must_cover_max_review_runtime(monkeypatch: pytest.MonkeyPatch) -> None:
    """Loading settings fails when the lease is shorter than the review budget.

    With MAX_REVIEW_MINUTES=10 and JOB_LEASE_TIMEOUT_SECONDS=300, building the
    settings is expected to raise a ValidationError whose message names the
    lease-timeout constraint.
    """
    env_overrides = {
        "MAX_REVIEW_MINUTES": "10",
        "JOB_LEASE_TIMEOUT_SECONDS": "300",
    }
    for variable, value in env_overrides.items():
        monkeypatch.setenv(variable, value)

    # Drop any cached Settings so the patched environment is actually read.
    get_settings.cache_clear()

    with pytest.raises(ValidationError, match="JOB_LEASE_TIMEOUT_SECONDS must be at least"):
        get_settings()