From 6a9d7efd5d8102ec23814c5c10823b0c2590e19c Mon Sep 17 00:00:00 2001
From: Konstantin Fickel
Date: Sat, 14 Feb 2026 17:45:39 +0100
Subject: [PATCH] fix: send images to Mistral as base64 vision chunks instead of placeholders

The text provider now includes reference_images alongside inputs when
building prompts. Image files are sent as base64 data URLs via
ImageURLChunk for actual multimodal vision support, replacing the
previous [Attached image: ...] placeholder text.
---
 bulkgen/providers/text.py | 76 +++++++++++++++++++++++++++++++--------
 tests/test_providers.py   | 49 ++++++++++++++++++++-----
 2 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/bulkgen/providers/text.py b/bulkgen/providers/text.py
index b10206e..8f29e5f 100644
--- a/bulkgen/providers/text.py
+++ b/bulkgen/providers/text.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import base64
+import mimetypes
 from pathlib import Path
 from typing import override
 
@@ -11,6 +13,13 @@ from bulkgen.config import IMAGE_EXTENSIONS, TargetConfig
 from bulkgen.providers import Provider
 
 
+def _image_to_data_url(path: Path) -> str:
+    """Read an image file and return a ``data:`` URL with base64-encoded content."""
+    mime = mimetypes.guess_type(path.name)[0] or "image/png"
+    b64 = base64.b64encode(path.read_bytes()).decode("ascii")
+    return f"data:{mime};base64,{b64}"
+
+
 class TextProvider(Provider):
     """Generates text via the Mistral API."""
 
@@ -30,26 +39,26 @@ class TextProvider(Provider):
     ) -> None:
         output_path = project_dir / target_name
 
-        content_parts: list[str] = [resolved_prompt]
+        all_input_names = list(target_config.inputs) + list(
+            target_config.reference_images
+        )
 
-        for input_name in target_config.inputs:
-            input_path = project_dir / input_name
-            suffix = input_path.suffix.lower()
+        has_images = any(
+            (project_dir / name).suffix.lower() in IMAGE_EXTENSIONS
+            for name in all_input_names
+        )
 
-            if suffix in IMAGE_EXTENSIONS:
-                content_parts.append(f"\n[Attached image: {input_name}]")
-            else:
-                file_content = input_path.read_text()
-                content_parts.append(
-                    f"\n--- Contents of {input_name} ---\n{file_content}"
-                )
-
-        full_prompt = "\n".join(content_parts)
+        if has_images:
+            message = _build_multimodal_message(
+                resolved_prompt, all_input_names, project_dir
+            )
+        else:
+            message = _build_text_message(resolved_prompt, all_input_names, project_dir)
 
         async with Mistral(api_key=self._api_key) as client:
             response = await client.chat.complete_async(
                 model=resolved_model,
-                messages=[models.UserMessage(content=full_prompt)],
+                messages=[message],
             )
 
             if not response.choices:
@@ -63,3 +72,42 @@ class TextProvider(Provider):
 
         text = content if isinstance(content, str) else str(content)
         _ = output_path.write_text(text)
+
+
+def _build_text_message(
+    prompt: str,
+    input_names: list[str],
+    project_dir: Path,
+) -> models.UserMessage:
+    """Build a plain-text message (no images)."""
+    parts: list[str] = [prompt]
+    for name in input_names:
+        file_content = (project_dir / name).read_text()
+        parts.append(f"\n--- Contents of {name} ---\n{file_content}")
+    return models.UserMessage(content="\n".join(parts))
+
+
+def _build_multimodal_message(
+    prompt: str,
+    input_names: list[str],
+    project_dir: Path,
+) -> models.UserMessage:
+    """Build a multimodal message with text and image chunks."""
+    chunks: list[models.TextChunk | models.ImageURLChunk] = [
+        models.TextChunk(text=prompt),
+    ]
+
+    for name in input_names:
+        input_path = project_dir / name
+        suffix = input_path.suffix.lower()
+
+        if suffix in IMAGE_EXTENSIONS:
+            data_url = _image_to_data_url(input_path)
+            chunks.append(models.ImageURLChunk(image_url=models.ImageURL(url=data_url)))
+        else:
+            file_content = input_path.read_text()
+            chunks.append(
+                models.TextChunk(text=f"\n--- Contents of {name} ---\n{file_content}")
+            )
+
+    return models.UserMessage(content=chunks)  # pyright: ignore[reportArgumentType]
diff --git a/tests/test_providers.py b/tests/test_providers.py
index a4109fd..38a3ca0 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -2,7 +2,7 @@
 Mock-heavy tests produce many Any-typed expressions from MagicMock.
 """
 
-# pyright: reportAny=false
+# pyright: reportAny=false, reportUnknownMemberType=false
 
 from __future__ import annotations
 
@@ -312,8 +312,10 @@ class TestTextProvider:
 
         call_args = mock_client.chat.complete_async.call_args
         messages = call_args.kwargs["messages"]
-        prompt_text = messages[0].content
-        assert "[Attached image: photo.png]" in prompt_text
+        chunks = messages[0].content
+        assert isinstance(chunks, list)
+        assert chunks[0].text == "Describe this image"
+        assert chunks[1].image_url.url.startswith("data:image/png;base64,")
 
     async def test_text_no_choices_raises(self, project_dir: Path) -> None:
         target_config = TargetConfig(prompt="x")
@@ -374,9 +376,38 @@ class TestTextProvider:
             )
 
         call_args = mock_client.chat.complete_async.call_args
-        prompt_text = call_args.kwargs["messages"][0].content
-        assert "--- Contents of a.txt ---" in prompt_text
-        assert "content A" in prompt_text
-        assert "--- Contents of b.txt ---" in prompt_text
-        assert "content B" in prompt_text
-        assert "[Attached image: c.png]" in prompt_text
+        chunks = call_args.kwargs["messages"][0].content
+        assert isinstance(chunks, list)
+        # TextChunk for prompt, TextChunk for a.txt, TextChunk for b.txt,
+        # ImageURLChunk for c.png
+        assert chunks[0].text == "Combine all"
+        assert "content A" in chunks[1].text
+        assert "content B" in chunks[2].text
+        assert chunks[3].image_url.url.startswith("data:image/png;base64,")
+
+    async def test_text_with_reference_images(self, project_dir: Path) -> None:
+        _ = (project_dir / "ref.png").write_bytes(b"\x89PNG")
+
+        target_config = TargetConfig(
+            prompt="Describe the style", reference_images=["ref.png"]
+        )
+        response = _make_text_response("A stylized image")
+
+        with patch("bulkgen.providers.text.Mistral") as mock_cls:
+            mock_client = _make_mistral_mock(response)
+            mock_cls.return_value = mock_client
+
+            provider = TextProvider(api_key="test-key")
+            await provider.generate(
+                target_name="desc.txt",
+                target_config=target_config,
+                resolved_prompt="Describe the style",
+                resolved_model="mistral-large-latest",
+                project_dir=project_dir,
+            )
+
+        call_args = mock_client.chat.complete_async.call_args
+        chunks = call_args.kwargs["messages"][0].content
+        assert isinstance(chunks, list)
+        assert chunks[0].text == "Describe the style"
+        assert chunks[1].image_url.url.startswith("data:image/png;base64,")