fix: wrap image bytes in BytesIO for gpt-image edit endpoint

When raw bytes are passed as the image, the OpenAI SDK takes its legacy multipart path, which only accepts dall-e-2. Wrapping each image in an io.BytesIO with a name attribute routes the request through the newer path that supports gpt-image-* models. This commit also removes output_format from the edit call, because the edits endpoint does not support it.
2026-03-05 21:21:26 +01:00 · 2026-03-05 21:21:26 +01:00 · d90db2933e
commit d90db2933e
parent 770f408dad
2 changed files with 31 additions and 14 deletions
--- a/hokusai/providers/openai_image.py
+++ b/hokusai/providers/openai_image.py
@ -3,6 +3,7 @@
 from __future__ import annotations

 import base64
+import io
 from pathlib import Path
 from typing import Literal, override

@ -128,7 +129,6 @@ class OpenAIImageProvider(Provider):
                    target_config.reference_images,
                    project_dir,
                    size,
-                    output_format,
                )
            else:
                response = await _generate_new(
@ -183,17 +183,30 @@ async def _generate_edit(
    reference_images: list[str],
    project_dir: Path,
    size: _SIZE | None,
-    output_format: str | None = None,
 ) -> ImagesResponse:
    """Generate an image using reference images via the edits endpoint.

    gpt-image-* models accept up to 16 images and return b64 by default
-    (they reject ``response_format``).  DALL-E 2 accepts only one image.
+    (they reject ``response_format`` and ``output_format``).
+    DALL-E 2 accepts only one image.
    """
-    images = [(project_dir / name).read_bytes() for name in reference_images]
-    image: bytes | list[bytes] = images[0] if len(images) == 1 else images
+    raw_images = [(project_dir / name).read_bytes() for name in reference_images]

    if model.startswith("gpt-image-"):
+        # gpt-image-* models require file-like objects with a name attribute;
+        # raw bytes trigger the legacy multipart path that only accepts dall-e-2.
+        def _to_named_buf(data: bytes, name: str) -> io.BytesIO:
+            buf = io.BytesIO(data)
+            buf.name = name
+            return buf
+
+        file_images = [
+            _to_named_buf(data, name)
+            for data, name in zip(raw_images, reference_images, strict=True)
+        ]
+        image: io.BytesIO | list[io.BytesIO] = (
+            file_images[0] if len(file_images) == 1 else file_images
+        )
        kwargs: dict[str, object] = {
            "image": image,
            "prompt": prompt,
@ -202,12 +215,13 @@ async def _generate_edit(
        }
        if size is not None:
            kwargs["size"] = size
-        if output_format is not None:
-            kwargs["output_format"] = output_format
        return await client.images.edit(**kwargs)  # pyright: ignore[reportCallIssue,reportArgumentType,reportUnknownVariableType]

+    dalle_image: bytes | list[bytes] = (
+        raw_images[0] if len(raw_images) == 1 else raw_images
+    )
    kwargs = {
-        "image": image,
+        "image": dalle_image,
        "prompt": prompt,
        "model": model,
        "n": 1,
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@ -7,6 +7,7 @@ Mock-heavy tests produce many Any-typed expressions from MagicMock.
 from __future__ import annotations

 import base64
+import io
 from pathlib import Path
 from unittest.mock import AsyncMock, MagicMock, patch

@ -480,8 +481,10 @@ class TestOpenAIImageProvider:
            )

            call_args = mock_client.images.edit.call_args
-            # Single reference image should be passed as raw bytes
-            assert call_args.kwargs["image"] == b"reference data"
+            # gpt-image-* models pass a BytesIO with a name attribute
+            img_arg = call_args.kwargs["image"]
+            assert img_arg.read() == b"reference data"
+            assert hasattr(img_arg, "name")

        output = project_dir / "out.png"
        assert output.exists()
@ -514,9 +517,9 @@ class TestOpenAIImageProvider:
            )

            call_args = mock_client.images.edit.call_args
-            # Multiple reference images should be passed as a list of bytes
-            image_arg: list[bytes] = call_args.kwargs["image"]
+            # gpt-image-* models pass a list of BytesIO with name attributes
+            image_arg: list[io.BytesIO] = call_args.kwargs["image"]
            assert isinstance(image_arg, list)
            assert len(image_arg) == 2
-            assert image_arg[0] == b"ref1 data"
-            assert image_arg[1] == b"ref2 data"
+            assert image_arg[0].read() == b"ref1 data"
+            assert image_arg[1].read() == b"ref2 data"