# hokusai/bulkgen/state.py

"""Incremental build state tracking via ``.bulkgen.state.yaml``."""

from __future__ import annotations

import hashlib
from pathlib import Path

import yaml
from pydantic import BaseModel

# Name of the YAML file holding the persisted build state; it is read from and
# written to directly inside the project directory.
STATE_FILENAME = ".bulkgen.state.yaml"


class TargetState(BaseModel):
    """Recorded state of a single target from its last successful build.

    Instances are created by ``record_target_state`` and compared against the
    current inputs by ``is_target_dirty``.
    """

    # SHA-256 hex digest of each dependency file, keyed by the dependency's
    # path relative to the project directory (as a string).
    input_hashes: dict[str, str]
    # SHA-256 hex digest of the resolved prompt text.
    prompt_hash: str
    # Model value the target was built with; compared by plain equality.
    model: str
    # Hash of the extra target parameters; the empty string means the target
    # was built without any extra parameters.
    extra_hash: str = ""


class BuildState(BaseModel):
    """Full build state persisted to disk.

    This is the root object serialized to / parsed from the state YAML file.
    """

    # Per-target records keyed by target name. The target name is also used as
    # the output path relative to the project directory.
    # NOTE(review): the mutable ``{}`` default relies on pydantic copying field
    # defaults per instance — confirm for the pydantic version in use.
    targets: dict[str, TargetState] = {}


def hash_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the bytes stored at *path*.

    The file is opened in binary mode and consumed in 8192-byte blocks, so
    the whole content is never held in memory at once.

    Args:
        path: File to hash.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read signals end of file.
        while block := stream.read(8192):
            digest.update(block)
    return digest.hexdigest()


def hash_string(value: str) -> str:
    """Return the SHA-256 hex digest of *value* encoded as UTF-8.

    Args:
        value: Text to hash.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    encoded = value.encode("utf-8")
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()


def load_state(project_dir: Path) -> BuildState:
    """Load build state from disk, returning empty state if the file is missing.

    The state file is always decoded as UTF-8 rather than with the
    platform-default encoding, so a state file reads the same on every
    machine.

    Args:
        project_dir: Directory expected to contain the state file.

    Returns:
        The parsed ``BuildState``, or an empty ``BuildState`` when the file
        does not exist or contains no YAML document.

    Raises:
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the YAML does not match the
            ``BuildState`` schema.
    """
    state_path = project_dir / STATE_FILENAME
    try:
        # Read directly instead of checking ``exists()`` first: that avoids a
        # race where the file disappears between the check and the open.
        text = state_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # NotADirectoryError covers a non-directory path component, which the
        # former ``exists()`` check also reported as "missing".
        return BuildState()
    raw = yaml.safe_load(text)  # pyright: ignore[reportAny]
    if raw is None:
        # An empty (or comment-only) file parses to None.
        return BuildState()
    return BuildState.model_validate(raw)


def save_state(state: BuildState, project_dir: Path) -> None:
    """Persist build state to disk.

    The YAML is first written to a temporary sibling file and then moved over
    the real state file, so an interrupted write cannot leave a truncated
    state file behind. The file is always encoded as UTF-8.

    Args:
        state: Build state to serialize.
        project_dir: Directory in which the state file is stored.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
    """
    state_path = project_dir / STATE_FILENAME
    tmp_path = state_path.with_name(state_path.name + ".tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            # sort_keys=False keeps the model's field order in the output.
            yaml.dump(state.model_dump(), f, default_flow_style=False, sort_keys=False)
        # Same-directory rename: replaces the previous state in one step.
        tmp_path.replace(state_path)
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)


def _extra_hash(params: dict[str, object]) -> str:
    """Hash extra target parameters (width, height, etc.) for change detection.

    Args:
        params: Extra parameters of a target.

    Returns:
        The empty string when *params* is empty; otherwise the hash of the
        string form of the key-sorted ``(key, value)`` pairs, so insertion
        order of *params* does not affect the result.
    """
    if params:
        ordered_items = sorted(params.items())
        return hash_string(str(ordered_items))
    return ""


def is_target_dirty(
    target_name: str,
    *,
    resolved_prompt: str,
    model: str,
    dep_files: list[Path],
    extra_params: dict[str, object],
    state: BuildState,
    project_dir: Path,
) -> bool:
    """Determine whether a target needs rebuilding.

    A target is dirty if:
    - Its output file does not exist
    - It has never been built (not recorded in state)
    - The model has changed
    - The resolved prompt text has changed
    - Extra parameters (width, height, etc.) have changed
    - The set of dependency files differs from the recorded one (a
      dependency was added or removed)
    - Any dependency file hash has changed

    Args:
        target_name: Name of the target; also its output path relative to
            *project_dir*.
        resolved_prompt: Prompt text to compare against the recorded hash.
        model: Model value to compare against the recorded one.
        dep_files: Dependency files of the target, located under
            *project_dir*.
        extra_params: Extra target parameters to compare by hash.
        state: Build state recorded by previous builds.
        project_dir: Root directory of the project.

    Returns:
        ``True`` if the target must be rebuilt, ``False`` if the recorded
        state matches the current inputs.

    Raises:
        ValueError: If a dependency path is not relative to *project_dir*.
        OSError: If a dependency file cannot be read for hashing.
    """
    output_path = project_dir / target_name
    if not output_path.exists():
        return True

    prev = state.targets.get(target_name)
    if prev is None:
        return True

    # Cheap in-memory comparisons come first; file hashing is done last.
    if prev.model != model:
        return True

    if prev.prompt_hash != hash_string(resolved_prompt):
        return True

    if prev.extra_hash != _extra_hash(extra_params):
        return True

    deps_by_key = {str(dep.relative_to(project_dir)): dep for dep in dep_files}

    # Comparing the key sets catches removed dependencies as well as added
    # ones; looking up only the current deps would miss a removal.
    if deps_by_key.keys() != prev.input_hashes.keys():
        return True

    # Stops hashing at the first dependency whose content changed.
    return any(
        hash_file(dep) != prev.input_hashes[key] for key, dep in deps_by_key.items()
    )


def record_target_state(
    target_name: str,
    *,
    resolved_prompt: str,
    model: str,
    dep_files: list[Path],
    extra_params: dict[str, object],
    state: BuildState,
    project_dir: Path,
) -> None:
    """Store a snapshot of a successfully built target in *state*.

    Every dependency file is hashed and keyed by its path relative to
    *project_dir*; the prompt and extra parameters are stored as hashes and
    the model is stored as given. Any earlier entry for *target_name* is
    overwritten. *state* is modified in place; nothing is written to disk.

    Args:
        target_name: Name of the target that was built.
        resolved_prompt: Prompt text used for the build.
        model: Model value used for the build.
        dep_files: Dependency files of the target, located under
            *project_dir*.
        extra_params: Extra target parameters used for the build.
        state: Build state to update.
        project_dir: Root directory of the project.
    """
    dep_hashes: dict[str, str] = {
        str(dep.relative_to(project_dir)): hash_file(dep) for dep in dep_files
    }
    snapshot = TargetState(
        input_hashes=dep_hashes,
        prompt_hash=hash_string(resolved_prompt),
        model=model,
        extra_hash=_extra_hash(extra_params),
    )
    state.targets[target_name] = snapshot