# hokusai/bulkgen/state.py

"""Incremental build state tracking via ``.<project>.bulkgen-state.yaml``."""

from __future__ import annotations

import hashlib
from pathlib import Path

import yaml
from pydantic import BaseModel


def state_filename(project_name: str) -> str:
    """Build the name of the hidden state file belonging to a project.

    The project name is the config filename without its ``.bulkgen.yaml``
    suffix, e.g. ``cards.bulkgen.yaml`` -> project ``cards`` -> state file
    ``.cards.bulkgen-state.yaml``.
    """
    hidden_stem = f".{project_name}"
    return f"{hidden_stem}.bulkgen-state.yaml"


class TargetState(BaseModel):
    """Recorded state of a single target from its last successful build.

    These values are compared against the current inputs of a target to
    decide whether it has to be rebuilt.
    """

    # SHA-256 hex digest of each dependency file, keyed by the dependency's
    # path relative to the project directory (as a string).
    input_hashes: dict[str, str]
    # Resolved prompt text the target was built with.
    prompt: str
    # Model the target was built with.
    model: str
    # Additional build parameters (width, height, etc.) the target was built with.
    extra_params: dict[str, object] = {}


class BuildState(BaseModel):
    """Full build state persisted to disk.

    This is the root object that is dumped to, and validated from, the
    project's YAML state file.
    """

    # Per-target records keyed by target name; the target name is also the
    # output path relative to the project directory.
    targets: dict[str, TargetState] = {}


def hash_file(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the bytes stored at *path*.

    The file is consumed in 8 KiB blocks so that large files never have to
    be held in memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read signals end-of-file and terminates the loop.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()


def load_state(project_dir: Path, project_name: str) -> BuildState:
    """Load build state from disk, returning empty state if the file is missing.

    Args:
        project_dir: Directory that contains the project's state file.
        project_name: Project name used to derive the state filename.

    Returns:
        The validated build state, or an empty ``BuildState`` when the state
        file does not exist or contains no YAML document.

    Raises:
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the YAML does not match the state schema.
    """
    state_path = project_dir / state_filename(project_name)
    try:
        # Open directly instead of checking ``exists()`` first: the file may
        # disappear between the check and the open. The encoding is pinned so
        # the result does not depend on the platform's locale default.
        with state_path.open(encoding="utf-8") as f:
            raw = yaml.safe_load(f)  # pyright: ignore[reportAny]
    except (FileNotFoundError, NotADirectoryError):
        # Same cases for which ``Path.exists()`` reports "missing".
        return BuildState()
    if raw is None:
        # An empty file parses to ``None``; treat it like a missing file.
        return BuildState()
    return BuildState.model_validate(raw)


def save_state(state: BuildState, project_dir: Path, project_name: str) -> None:
    """Persist build state to disk.

    The state is first written to a temporary sibling file and then moved
    over the real state file, so a failed or interrupted write never
    truncates or corrupts the previously saved state.

    Args:
        state: Build state to serialize.
        project_dir: Directory that contains the project's state file.
        project_name: Project name used to derive the state filename.
    """
    state_path = project_dir / state_filename(project_name)
    # Keep the temporary file in the same directory so the final rename
    # stays on one filesystem.
    tmp_path = state_path.with_name(state_path.name + ".tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            yaml.dump(state.model_dump(), f, default_flow_style=False, sort_keys=False)
        tmp_path.replace(state_path)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)


def is_target_dirty(
    target_name: str,
    *,
    resolved_prompt: str,
    model: str,
    dep_files: list[Path],
    extra_params: dict[str, object],
    state: BuildState,
    project_dir: Path,
) -> bool:
    """Determine whether a target needs rebuilding.

    A target is dirty if:
    - Its output file does not exist
    - It has never been built (not recorded in state)
    - The model has changed
    - The resolved prompt text has changed
    - Extra parameters (width, height, etc.) have changed
    - The set of dependency files differs from the recorded one
      (a dependency was added or removed)
    - Any dependency file hash has changed

    Args:
        target_name: Target name; also the output path relative to
            ``project_dir``.
        resolved_prompt: Prompt text the target would be built with now.
        model: Model the target would be built with now.
        dep_files: Current dependency files; each must be located under
            ``project_dir``.
        extra_params: Additional build parameters to compare.
        state: Build state recorded by previous builds.
        project_dir: Project root directory.

    Returns:
        ``True`` if the target must be rebuilt, ``False`` if it is up to date.

    Raises:
        ValueError: If a dependency path is not located under ``project_dir``.
        OSError: If a dependency file cannot be read for hashing.
    """
    if not (project_dir / target_name).exists():
        return True

    prev = state.targets.get(target_name)
    if prev is None:
        return True

    # Cheap in-memory comparisons first; file hashing is the expensive part.
    if prev.model != model or prev.prompt != resolved_prompt:
        return True
    if prev.extra_params != extra_params:
        return True

    current_deps = {str(dep.relative_to(project_dir)): dep for dep in dep_files}

    # A dependency recorded by the last build but no longer listed (or a new
    # one) changes the inputs even if every remaining file is byte-identical.
    if current_deps.keys() != prev.input_hashes.keys():
        return True

    return any(
        hash_file(dep) != prev.input_hashes[key] for key, dep in current_deps.items()
    )


def record_target_state(
    target_name: str,
    *,
    resolved_prompt: str,
    model: str,
    dep_files: list[Path],
    extra_params: dict[str, object],
    state: BuildState,
    project_dir: Path,
) -> None:
    """Store a snapshot of a target's inputs after it was built successfully.

    Each dependency file is hashed and recorded under its path relative to
    ``project_dir``, together with the prompt, model and extra parameters.
    The snapshot replaces any earlier entry for ``target_name`` in ``state``;
    nothing is written to disk here.
    """
    snapshot = TargetState(
        input_hashes={
            str(dep.relative_to(project_dir)): hash_file(dep) for dep in dep_files
        },
        prompt=resolved_prompt,
        model=model,
        extra_params=extra_params,
    )
    state.targets[target_name] = snapshot