From 6a9d7efd5d8102ec23814c5c10823b0c2590e19c Mon Sep 17 00:00:00 2001
From: Konstantin Fickel <mail@konstantinfickel.de>
Date: Sat, 14 Feb 2026 17:45:39 +0100
Subject: [PATCH] fix: send images to Mistral as base64 vision chunks instead
 of placeholders

The text provider now includes reference_images alongside inputs when
building prompts. Image files are sent as base64 data URLs via
ImageURLChunk for actual multimodal vision support, replacing the
previous [Attached image: ...] placeholder text.
---
 bulkgen/providers/text.py | 76 +++++++++++++++++++++++++++++++--------
 tests/test_providers.py   | 49 ++++++++++++++++++++-----
 2 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/bulkgen/providers/text.py b/bulkgen/providers/text.py
index b10206e..8f29e5f 100644
--- a/bulkgen/providers/text.py
+++ b/bulkgen/providers/text.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import base64
+import mimetypes
 from pathlib import Path
 from typing import override
 
@@ -11,6 +13,13 @@ from bulkgen.config import IMAGE_EXTENSIONS, TargetConfig
 from bulkgen.providers import Provider
 
 
+def _image_to_data_url(path: Path) -> str:
+    """Return *path*'s bytes as a base64 ``data:`` URL; unknown types use image/png."""
+    mime_type, _encoding = mimetypes.guess_type(path.name)
+    payload = base64.b64encode(path.read_bytes()).decode("ascii")
+    return f"data:{mime_type or 'image/png'};base64,{payload}"
+
+
 class TextProvider(Provider):
     """Generates text via the Mistral API."""
 
@@ -30,26 +39,26 @@ class TextProvider(Provider):
     ) -> None:
         output_path = project_dir / target_name
 
-        content_parts: list[str] = [resolved_prompt]
+        all_input_names = list(target_config.inputs) + list(
+            target_config.reference_images
+        )
 
-        for input_name in target_config.inputs:
-            input_path = project_dir / input_name
-            suffix = input_path.suffix.lower()
+        has_images = any(
+            (project_dir / name).suffix.lower() in IMAGE_EXTENSIONS
+            for name in all_input_names
+        )
 
-            if suffix in IMAGE_EXTENSIONS:
-                content_parts.append(f"\n[Attached image: {input_name}]")
-            else:
-                file_content = input_path.read_text()
-                content_parts.append(
-                    f"\n--- Contents of {input_name} ---\n{file_content}"
-                )
-
-        full_prompt = "\n".join(content_parts)
+        if has_images:
+            message = _build_multimodal_message(
+                resolved_prompt, all_input_names, project_dir
+            )
+        else:
+            message = _build_text_message(resolved_prompt, all_input_names, project_dir)
 
         async with Mistral(api_key=self._api_key) as client:
             response = await client.chat.complete_async(
                 model=resolved_model,
-                messages=[models.UserMessage(content=full_prompt)],
+                messages=[message],
             )
 
         if not response.choices:
@@ -63,3 +72,42 @@ class TextProvider(Provider):
 
         text = content if isinstance(content, str) else str(content)
         _ = output_path.write_text(text)
+
+
+def _build_text_message(
+    prompt: str,
+    input_names: list[str],
+    project_dir: Path,
+) -> models.UserMessage:
+    """Return a text-only user message: the prompt, then each input file's text."""
+    sections = [
+        f"\n--- Contents of {name} ---\n{(project_dir / name).read_text()}"
+        for name in input_names
+    ]
+    return models.UserMessage(content="\n".join([prompt, *sections]))
+
+
+def _build_multimodal_message(
+    prompt: str,
+    input_names: list[str],
+    project_dir: Path,
+) -> models.UserMessage:
+    """Return a user message whose content is a list of text and image chunks.
+
+    The prompt is the first chunk. Inputs follow in order: files with an image
+    extension become base64 ``data:`` URL chunks, all others become text chunks.
+    """
+
+    def to_chunk(name: str) -> models.TextChunk | models.ImageURLChunk:
+        path = project_dir / name
+        if path.suffix.lower() not in IMAGE_EXTENSIONS:
+            text = f"\n--- Contents of {name} ---\n{path.read_text()}"
+            return models.TextChunk(text=text)
+        image_url = models.ImageURL(url=_image_to_data_url(path))
+        return models.ImageURLChunk(image_url=image_url)
+
+    chunks: list[models.TextChunk | models.ImageURLChunk] = [
+        models.TextChunk(text=prompt),
+        *map(to_chunk, input_names),
+    ]
+    return models.UserMessage(content=chunks)  # pyright: ignore[reportArgumentType]
diff --git a/tests/test_providers.py b/tests/test_providers.py
index a4109fd..38a3ca0 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -2,7 +2,7 @@
 
 Mock-heavy tests produce many Any-typed expressions from MagicMock.
 """
-# pyright: reportAny=false
+# pyright: reportAny=false, reportUnknownMemberType=false
 
 from __future__ import annotations
 
@@ -312,8 +312,10 @@ class TestTextProvider:
 
             call_args = mock_client.chat.complete_async.call_args
             messages = call_args.kwargs["messages"]
-            prompt_text = messages[0].content
-            assert "[Attached image: photo.png]" in prompt_text
+            chunks = messages[0].content
+            assert isinstance(chunks, list)
+            assert chunks[0].text == "Describe this image"
+            assert chunks[1].image_url.url.startswith("data:image/png;base64,")
 
     async def test_text_no_choices_raises(self, project_dir: Path) -> None:
         target_config = TargetConfig(prompt="x")
@@ -374,9 +376,38 @@ class TestTextProvider:
             )
 
             call_args = mock_client.chat.complete_async.call_args
-            prompt_text = call_args.kwargs["messages"][0].content
-            assert "--- Contents of a.txt ---" in prompt_text
-            assert "content A" in prompt_text
-            assert "--- Contents of b.txt ---" in prompt_text
-            assert "content B" in prompt_text
-            assert "[Attached image: c.png]" in prompt_text
+            chunks = call_args.kwargs["messages"][0].content
+            assert isinstance(chunks, list)
+            # TextChunk for prompt, TextChunk for a.txt, TextChunk for b.txt,
+            # ImageURLChunk for c.png
+            assert chunks[0].text == "Combine all"
+            assert "content A" in chunks[1].text
+            assert "content B" in chunks[2].text
+            assert chunks[3].image_url.url.startswith("data:image/png;base64,")
+
+    async def test_text_with_reference_images(self, project_dir: Path) -> None:
+        """An image listed only in ``reference_images`` is sent as a data URL chunk."""
+        _ = (project_dir / "ref.png").write_bytes(b"\x89PNG")
+        target_config = TargetConfig(
+            prompt="Describe the style", reference_images=["ref.png"]
+        )
+        response = _make_text_response("A stylized image")
+        # Patch the Mistral client class so no real API call is made.
+        with patch("bulkgen.providers.text.Mistral") as mock_cls:
+            mock_client = _make_mistral_mock(response)
+            mock_cls.return_value = mock_client
+            # Run the provider against the mocked client.
+            provider = TextProvider(api_key="test-key")
+            await provider.generate(
+                target_name="desc.txt",
+                target_config=target_config,
+                resolved_prompt="Describe the style",
+                resolved_model="mistral-large-latest",
+                project_dir=project_dir,
+            )
+            # Inspect the message sent: chunk 0 is the prompt, chunk 1 the image.
+            call_args = mock_client.chat.complete_async.call_args
+            chunks = call_args.kwargs["messages"][0].content
+            assert isinstance(chunks, list)
+            assert chunks[0].text == "Describe the style"
+            assert chunks[1].image_url.url.startswith("data:image/png;base64,")