# clickthrough/skill/agent_runner.py

from dataclasses import dataclass
from typing import Any, Dict

from .clickthrough_skill import ActionPlan, ClickthroughSkill


@dataclass
class AgentRunResult:
    """Everything collected during one ``ClickthroughAgentRunner.run_once`` pass.

    Each field stores a value returned by the skill without modification;
    the keys inside each mapping are defined by the skill, not by this class.
    """

    # Value returned by ``skill.grid_summary`` for the grid, read after the
    # action was submitted.
    summary: dict[str, Any]
    # Value returned by ``skill.plan_action`` for the executed plan.
    action: dict[str, Any]
    # Value returned by ``skill.grid_history`` for the grid, read after the
    # action was submitted.
    history: dict[str, Any]
    # Grid description returned by ``skill.describe_grid``; ``run_once`` reads
    # its ``"grid_id"`` entry.
    grid: dict[str, Any]
    # Full, unmodified response of ``skill.plan_with_planner`` (its ``"plan"``
    # entry is what the action plan was built from).
    plan_preview: dict[str, Any]


class ClickthroughAgentRunner:
    """Run a single describe -> plan -> act cycle against a ``ClickthroughSkill``."""

    def __init__(self, skill: ClickthroughSkill) -> None:
        """Keep a reference to the skill every later call is delegated to."""
        self.skill = skill

    @staticmethod
    def _build_plan(planned: dict[str, Any]) -> ActionPlan:
        """Turn the planner's ``"plan"`` payload into an ``ActionPlan``.

        ``grid_id`` and ``action`` are required entries; ``target_cell`` and
        ``text`` are optional and become ``None`` when absent.
        """
        return ActionPlan(
            grid_id=planned["grid_id"],
            target_cell=planned.get("target_cell"),
            action=planned["action"],
            text=planned.get("text"),
        )

    def run_once(
        self,
        screenshot_base64: str,
        width: int,
        height: int,
        rows: int = 4,
        columns: int = 4,
        preferred_label: str | None = None,
        action: str = "click",
        text: str | None = None,
    ) -> AgentRunResult:
        """Describe a screenshot as a grid, plan one action, and execute it.

        The skill is called in this order: ``describe_grid``,
        ``plan_with_planner``, ``plan_action``, ``grid_summary``,
        ``grid_history``.

        Args:
            screenshot_base64: Screenshot payload, passed through to
                ``describe_grid`` unchanged.
            width: Forwarded to ``describe_grid``.
            height: Forwarded to ``describe_grid``.
            rows: Forwarded to ``describe_grid`` (default 4).
            columns: Forwarded to ``describe_grid`` (default 4).
            preferred_label: Optional hint forwarded to ``plan_with_planner``.
            action: Requested action forwarded to the planner (default
                ``"click"``).
            text: Optional text forwarded to the planner.

        Returns:
            An ``AgentRunResult`` bundling the grid description, the raw
            planner response, the ``plan_action`` result, and the grid's
            summary and history as read after the action.

        Raises:
            Nothing is caught here: a missing ``"grid_id"`` in the grid, or a
            missing ``"plan"`` / ``"grid_id"`` / ``"action"`` in the planner
            response, surfaces as the mapping's own lookup error
            (``KeyError`` for a plain dict), as do any errors raised by the
            skill itself.
        """
        skill = self.skill

        # 1. Register the screenshot and obtain the grid description.
        grid = skill.describe_grid(
            screenshot_base64=screenshot_base64,
            width=width,
            height=height,
            rows=rows,
            columns=columns,
        )

        # 2. Let the planner choose what to do on that grid.
        planner_output = skill.plan_with_planner(
            grid_id=grid["grid_id"],
            preferred_label=preferred_label,
            action=action,
            text=text,
        )

        # 3. Execute the action described by the planner's "plan" entry.
        outcome = skill.plan_action(self._build_plan(planner_output["plan"]))

        # 4. Summary is read before history; keyword arguments are evaluated
        #    left to right, so the skill sees the calls in that order.
        return AgentRunResult(
            summary=skill.grid_summary(grid["grid_id"]),
            action=outcome,
            history=skill.grid_history(grid["grid_id"]),
            grid=grid,
            plan_preview=planner_output,
        )