fix: send images to Mistral as base64 vision chunks instead of placeholders
All checks were successful
Continuous Integration / Build Package (push) Successful in 34s
Continuous Integration / Lint, Check & Test (push) Successful in 53s

The text provider now includes reference_images alongside inputs when
building prompts. Image files are sent as base64 data URLs via
ImageURLChunk for actual multimodal vision support, replacing the
previous [Attached image: ...] placeholder text.
This commit is contained in:
Konstantin Fickel 2026-02-14 17:45:39 +01:00
parent d565329e16
commit 6a9d7efd5d
Signed by: kfickel
GPG key ID: A793722F9933C1A5
2 changed files with 102 additions and 23 deletions

View file

@@ -2,6 +2,8 @@
from __future__ import annotations
import base64
import mimetypes
from pathlib import Path
from typing import override
@@ -11,6 +13,13 @@ from bulkgen.config import IMAGE_EXTENSIONS, TargetConfig
from bulkgen.providers import Provider
def _image_to_data_url(path: Path) -> str:
"""Read an image file and return a ``data:`` URL with base64-encoded content."""
mime = mimetypes.guess_type(path.name)[0] or "image/png"
b64 = base64.b64encode(path.read_bytes()).decode("ascii")
return f"data:{mime};base64,{b64}"
class TextProvider(Provider):
"""Generates text via the Mistral API."""
@@ -30,26 +39,26 @@ class TextProvider(Provider):
) -> None:
output_path = project_dir / target_name
content_parts: list[str] = [resolved_prompt]
all_input_names = list(target_config.inputs) + list(
target_config.reference_images
)
for input_name in target_config.inputs:
input_path = project_dir / input_name
suffix = input_path.suffix.lower()
has_images = any(
(project_dir / name).suffix.lower() in IMAGE_EXTENSIONS
for name in all_input_names
)
if suffix in IMAGE_EXTENSIONS:
content_parts.append(f"\n[Attached image: {input_name}]")
else:
file_content = input_path.read_text()
content_parts.append(
f"\n--- Contents of {input_name} ---\n{file_content}"
)
full_prompt = "\n".join(content_parts)
if has_images:
message = _build_multimodal_message(
resolved_prompt, all_input_names, project_dir
)
else:
message = _build_text_message(resolved_prompt, all_input_names, project_dir)
async with Mistral(api_key=self._api_key) as client:
response = await client.chat.complete_async(
model=resolved_model,
messages=[models.UserMessage(content=full_prompt)],
messages=[message],
)
if not response.choices:
@@ -63,3 +72,42 @@ class TextProvider(Provider):
text = content if isinstance(content, str) else str(content)
_ = output_path.write_text(text)
def _build_text_message(
    prompt: str,
    input_names: list[str],
    project_dir: Path,
) -> models.UserMessage:
    """Build a plain-text message (no images).

    The prompt comes first, followed by one delimited section per input file,
    each containing the file's full text content.
    """
    sections = [prompt] + [
        f"\n--- Contents of {name} ---\n{(project_dir / name).read_text()}"
        for name in input_names
    ]
    return models.UserMessage(content="\n".join(sections))
def _build_multimodal_message(
    prompt: str,
    input_names: list[str],
    project_dir: Path,
) -> models.UserMessage:
    """Build a multimodal message with text and image chunks.

    Inputs whose extension is in ``IMAGE_EXTENSIONS`` become ``ImageURLChunk``s
    carrying base64 ``data:`` URLs; every other input is embedded as a
    delimited ``TextChunk`` with the file's text content.
    """
    chunks: list[models.TextChunk | models.ImageURLChunk] = [
        models.TextChunk(text=prompt),
    ]
    for name in input_names:
        path = project_dir / name
        if path.suffix.lower() in IMAGE_EXTENSIONS:
            image_url = models.ImageURL(url=_image_to_data_url(path))
            chunks.append(models.ImageURLChunk(image_url=image_url))
        else:
            body = path.read_text()
            chunks.append(
                models.TextChunk(text=f"\n--- Contents of {name} ---\n{body}")
            )
    return models.UserMessage(content=chunks)  # pyright: ignore[reportArgumentType]