fix: send images to Mistral as base64 vision chunks instead of placeholders
The text provider now includes reference_images alongside inputs when building prompts. Image files are sent as base64 data URLs via ImageURLChunk for actual multimodal vision support, replacing the previous [Attached image: ...] placeholder text.
This commit is contained in:
parent
d565329e16
commit
6a9d7efd5d
2 changed files with 102 additions and 23 deletions
|
|
@ -2,6 +2,8 @@
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import mimetypes
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import override
|
from typing import override
|
||||||
|
|
||||||
|
|
@ -11,6 +13,13 @@ from bulkgen.config import IMAGE_EXTENSIONS, TargetConfig
|
||||||
from bulkgen.providers import Provider
|
from bulkgen.providers import Provider
|
||||||
|
|
||||||
|
|
||||||
|
# Explicit MIME types for common image suffixes. Consulted only when the
# runtime's ``mimetypes`` table has no entry for the suffix, so that such files
# are not mislabelled as PNG.
_IMAGE_MIME_FALLBACKS: dict[str, str] = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".bmp": "image/bmp",
    ".avif": "image/avif",
}


def _image_to_data_url(path: Path) -> str:
    """Read an image file and return a ``data:`` URL with base64-encoded content.

    The MIME type is resolved in three steps: the ``mimetypes`` guess for the
    file name, then the explicit suffix table above, and finally ``image/png``
    as a last resort.

    Args:
        path: Location of the image file to embed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be read.
    """
    mime = (
        mimetypes.guess_type(path.name)[0]
        or _IMAGE_MIME_FALLBACKS.get(path.suffix.lower())
        or "image/png"
    )
    # base64 output is pure ASCII, so decoding as ASCII cannot fail.
    b64 = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{mime};base64,{b64}"
|
||||||
|
|
||||||
|
|
||||||
class TextProvider(Provider):
|
class TextProvider(Provider):
|
||||||
"""Generates text via the Mistral API."""
|
"""Generates text via the Mistral API."""
|
||||||
|
|
||||||
|
|
@ -30,26 +39,26 @@ class TextProvider(Provider):
|
||||||
) -> None:
|
) -> None:
|
||||||
output_path = project_dir / target_name
|
output_path = project_dir / target_name
|
||||||
|
|
||||||
content_parts: list[str] = [resolved_prompt]
|
all_input_names = list(target_config.inputs) + list(
|
||||||
|
target_config.reference_images
|
||||||
|
)
|
||||||
|
|
||||||
for input_name in target_config.inputs:
|
has_images = any(
|
||||||
input_path = project_dir / input_name
|
(project_dir / name).suffix.lower() in IMAGE_EXTENSIONS
|
||||||
suffix = input_path.suffix.lower()
|
for name in all_input_names
|
||||||
|
)
|
||||||
|
|
||||||
if suffix in IMAGE_EXTENSIONS:
|
if has_images:
|
||||||
content_parts.append(f"\n[Attached image: {input_name}]")
|
message = _build_multimodal_message(
|
||||||
else:
|
resolved_prompt, all_input_names, project_dir
|
||||||
file_content = input_path.read_text()
|
)
|
||||||
content_parts.append(
|
else:
|
||||||
f"\n--- Contents of {input_name} ---\n{file_content}"
|
message = _build_text_message(resolved_prompt, all_input_names, project_dir)
|
||||||
)
|
|
||||||
|
|
||||||
full_prompt = "\n".join(content_parts)
|
|
||||||
|
|
||||||
async with Mistral(api_key=self._api_key) as client:
|
async with Mistral(api_key=self._api_key) as client:
|
||||||
response = await client.chat.complete_async(
|
response = await client.chat.complete_async(
|
||||||
model=resolved_model,
|
model=resolved_model,
|
||||||
messages=[models.UserMessage(content=full_prompt)],
|
messages=[message],
|
||||||
)
|
)
|
||||||
|
|
||||||
if not response.choices:
|
if not response.choices:
|
||||||
|
|
@ -63,3 +72,42 @@ class TextProvider(Provider):
|
||||||
|
|
||||||
text = content if isinstance(content, str) else str(content)
|
text = content if isinstance(content, str) else str(content)
|
||||||
_ = output_path.write_text(text)
|
_ = output_path.write_text(text)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_text_message(
    prompt: str,
    input_names: list[str],
    project_dir: Path,
) -> models.UserMessage:
    """Build a plain-text message (no images).

    The prompt comes first; each input file is then appended under a
    ``--- Contents of <name> ---`` header, and all parts are joined with
    newlines into a single string.

    Args:
        prompt: The already-resolved prompt text.
        input_names: File names, relative to ``project_dir``, whose contents
            are appended to the prompt.
        project_dir: Directory the input names are resolved against.

    Returns:
        A ``UserMessage`` whose content is one combined string.

    Raises:
        OSError: If an input file cannot be read.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    parts: list[str] = [prompt]
    for name in input_names:
        # Pin the encoding: the platform default is locale-dependent and can
        # mis-decode UTF-8 files on some systems.
        file_content = (project_dir / name).read_text(encoding="utf-8")
        parts.append(f"\n--- Contents of {name} ---\n{file_content}")
    return models.UserMessage(content="\n".join(parts))
|
||||||
|
|
||||||
|
|
||||||
|
def _build_multimodal_message(
    prompt: str,
    input_names: list[str],
    project_dir: Path,
) -> models.UserMessage:
    """Build a multimodal message with text and image chunks.

    The first chunk is the prompt. Each input then contributes one chunk, in
    the order given: files whose suffix is in ``IMAGE_EXTENSIONS`` become an
    ``ImageURLChunk`` carrying a base64 ``data:`` URL, and every other file
    becomes a ``TextChunk`` with a ``--- Contents of <name> ---`` header.

    Args:
        prompt: The already-resolved prompt text.
        input_names: File names, relative to ``project_dir``, to attach.
        project_dir: Directory the input names are resolved against.

    Returns:
        A ``UserMessage`` whose content is the list of chunks.

    Raises:
        OSError: If an input file cannot be read.
        UnicodeDecodeError: If a non-image input file is not valid UTF-8.
    """
    chunks: list[models.TextChunk | models.ImageURLChunk] = [
        models.TextChunk(text=prompt),
    ]

    for name in input_names:
        input_path = project_dir / name
        suffix = input_path.suffix.lower()

        if suffix in IMAGE_EXTENSIONS:
            data_url = _image_to_data_url(input_path)
            chunks.append(models.ImageURLChunk(image_url=models.ImageURL(url=data_url)))
        else:
            # Pin the encoding: the platform default is locale-dependent and
            # can mis-decode UTF-8 files on some systems.
            file_content = input_path.read_text(encoding="utf-8")
            chunks.append(
                models.TextChunk(text=f"\n--- Contents of {name} ---\n{file_content}")
            )

    return models.UserMessage(content=chunks)  # pyright: ignore[reportArgumentType]
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
Mock-heavy tests produce many Any-typed expressions from MagicMock.
|
Mock-heavy tests produce many Any-typed expressions from MagicMock.
|
||||||
"""
|
"""
|
||||||
# pyright: reportAny=false
|
# pyright: reportAny=false, reportUnknownMemberType=false
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
@ -312,8 +312,10 @@ class TestTextProvider:
|
||||||
|
|
||||||
call_args = mock_client.chat.complete_async.call_args
|
call_args = mock_client.chat.complete_async.call_args
|
||||||
messages = call_args.kwargs["messages"]
|
messages = call_args.kwargs["messages"]
|
||||||
prompt_text = messages[0].content
|
chunks = messages[0].content
|
||||||
assert "[Attached image: photo.png]" in prompt_text
|
assert isinstance(chunks, list)
|
||||||
|
assert chunks[0].text == "Describe this image"
|
||||||
|
assert chunks[1].image_url.url.startswith("data:image/png;base64,")
|
||||||
|
|
||||||
async def test_text_no_choices_raises(self, project_dir: Path) -> None:
|
async def test_text_no_choices_raises(self, project_dir: Path) -> None:
|
||||||
target_config = TargetConfig(prompt="x")
|
target_config = TargetConfig(prompt="x")
|
||||||
|
|
@ -374,9 +376,38 @@ class TestTextProvider:
|
||||||
)
|
)
|
||||||
|
|
||||||
call_args = mock_client.chat.complete_async.call_args
|
call_args = mock_client.chat.complete_async.call_args
|
||||||
prompt_text = call_args.kwargs["messages"][0].content
|
chunks = call_args.kwargs["messages"][0].content
|
||||||
assert "--- Contents of a.txt ---" in prompt_text
|
assert isinstance(chunks, list)
|
||||||
assert "content A" in prompt_text
|
# TextChunk for prompt, TextChunk for a.txt, TextChunk for b.txt,
|
||||||
assert "--- Contents of b.txt ---" in prompt_text
|
# ImageURLChunk for c.png
|
||||||
assert "content B" in prompt_text
|
assert chunks[0].text == "Combine all"
|
||||||
assert "[Attached image: c.png]" in prompt_text
|
assert "content A" in chunks[1].text
|
||||||
|
assert "content B" in chunks[2].text
|
||||||
|
assert chunks[3].image_url.url.startswith("data:image/png;base64,")
|
||||||
|
|
||||||
|
async def test_text_with_reference_images(self, project_dir: Path) -> None:
    """A reference image is sent as a data-URL image chunk after the prompt chunk."""
    prompt = "Describe the style"
    _ = (project_dir / "ref.png").write_bytes(b"\x89PNG")

    config = TargetConfig(prompt=prompt, reference_images=["ref.png"])
    fake_response = _make_text_response("A stylized image")

    with patch("bulkgen.providers.text.Mistral") as mistral_cls:
        client = _make_mistral_mock(fake_response)
        mistral_cls.return_value = client

        text_provider = TextProvider(api_key="test-key")
        await text_provider.generate(
            target_name="desc.txt",
            target_config=config,
            resolved_prompt=prompt,
            resolved_model="mistral-large-latest",
            project_dir=project_dir,
        )

    # Inspect the single user message handed to the mocked chat endpoint.
    sent_message = client.chat.complete_async.call_args.kwargs["messages"][0]
    chunks = sent_message.content
    assert isinstance(chunks, list)
    assert chunks[0].text == "Describe the style"
    assert chunks[1].image_url.url.startswith("data:image/png;base64,")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue