From dc2a97d3b8313ea145e99da15b712770b97059d3 Mon Sep 17 00:00:00 2001
From: Konstantin Fickel <mail@konstantinfickel.de>
Date: Sat, 21 Jun 2025 16:43:40 +0200
Subject: [PATCH] refactor: split parse into multiple files

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
---
 src/streamer/parse/__init__.py     |   4 ++
 src/streamer/parse/extract_tag.py  |  45 ++++++++++++
 src/streamer/parse/list.py         |  13 ++++
 src/streamer/parse/markdown_tag.py |  20 ++++++
 src/streamer/{ => parse}/parse.py  | 108 +++++------------------------
 src/streamer/parse/shard.py        |  19 +++++
 6 files changed, 117 insertions(+), 92 deletions(-)
 create mode 100644 src/streamer/parse/__init__.py
 create mode 100644 src/streamer/parse/extract_tag.py
 create mode 100644 src/streamer/parse/list.py
 create mode 100644 src/streamer/parse/markdown_tag.py
 rename src/streamer/{ => parse}/parse.py (73%)
 create mode 100644 src/streamer/parse/shard.py

diff --git a/src/streamer/parse/__init__.py b/src/streamer/parse/__init__.py
new file mode 100644
index 0000000..9b2d2c0
--- /dev/null
+++ b/src/streamer/parse/__init__.py
@@ -0,0 +1,4 @@
+from .shard import Shard, StreamFile
+from .parse import parse_markdown_file
+
+__all__ = ["Shard", "StreamFile", "parse_markdown_file"]
diff --git a/src/streamer/parse/extract_tag.py b/src/streamer/parse/extract_tag.py
new file mode 100644
index 0000000..b4fd526
--- /dev/null
+++ b/src/streamer/parse/extract_tag.py
@@ -0,0 +1,72 @@
+import re
+from typing import Iterable
+from mistletoe.block_token import BlockToken
+from mistletoe.span_token import RawText
+from mistletoe.token import Token
+
+from .markdown_tag import Tag
+
+# Matches text that is empty or whitespace-only; compiled once because both
+# scanning helpers below test every non-tag child token against it.
+_BLANK_TEXT = re.compile(r"^[\s]*$")
+
+
+def _is_blank_text(token: Token) -> bool:
+    """Return True if *token* is a ``RawText`` holding only whitespace."""
+    return isinstance(token, RawText) and _BLANK_TEXT.match(token.content) is not None
+
+
+def extract_tags(tokens: Iterable[Token]) -> list[str]:
+    """Return the ``content`` of every ``Tag`` in *tokens*, in order.
+
+    Tokens that are not ``Tag`` instances are skipped.
+    """
+    return [token.content for token in tokens if isinstance(token, Tag)]
+
+
+def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
+    """Split the tags among a block's direct children into markers and tags.
+
+    Tags seen before the first child that is neither a ``Tag`` nor
+    whitespace-only ``RawText`` are markers; tags seen after it are plain
+    tags. Only direct children are inspected.
+
+    Returns:
+        ``(markers, tags)`` as lists of tag contents in child order;
+        both are empty when ``block_token.children`` is ``None``.
+    """
+    if block_token.children is None:
+        return [], []
+
+    markers: list[str] = []
+    tags: list[str] = []
+    in_leading_run = True
+
+    for token in block_token.children:
+        if isinstance(token, Tag):
+            (markers if in_leading_run else tags).append(token.content)
+        elif not _is_blank_text(token):
+            # Any other content ends the leading run for good.
+            in_leading_run = False
+
+    return markers, tags
+
+
+def has_markers(block_token: BlockToken) -> bool:
+    """Return True if the block's first non-blank direct child is a ``Tag``.
+
+    Returns False when ``block_token.children`` is ``None``, is empty, or
+    holds only whitespace-only ``RawText``.
+    """
+    if block_token.children is None:
+        return False
+
+    for child in block_token.children:
+        if isinstance(child, Tag):
+            return True
+        if not _is_blank_text(child):
+            return False
+    return False
+
+
+__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]
diff --git a/src/streamer/parse/list.py b/src/streamer/parse/list.py
new file mode 100644
index 0000000..730ea35
--- /dev/null
+++ b/src/streamer/parse/list.py
@@ -0,0 +1,30 @@
+from itertools import pairwise
+from typing import TypeVar
+
+A = TypeVar("A")
+
+
+def split_at(list_to_be_split: list[A], positions: list[int]) -> list[list[A]]:
+    """Cut a list into consecutive chunks at the given indices.
+
+    Each position is the index at which a new chunk starts. Positions may be
+    unsorted or repeated. Positions that do not fall strictly inside the list
+    (``<= 0`` or ``>= len(list_to_be_split)``) are ignored, so they never
+    produce empty chunks.
+
+    Returns:
+        The chunks in order; concatenated they equal the input. An empty
+        input yields an empty list.
+
+    >>> split_at([1, 2, 3, 4], [3, 1])
+    [[1], [2, 3], [4]]
+    """
+    length = len(list_to_be_split)
+    # A set literal de-duplicates the cut points; 0 and ``length`` are always
+    # present so the first and last chunks are emitted.
+    boundaries = sorted({0, length, *(p for p in positions if 0 < p < length)})
+
+    return [list_to_be_split[left:right] for left, right in pairwise(boundaries)]
+
+
+__all__ = ["split_at"]
diff --git a/src/streamer/parse/markdown_tag.py b/src/streamer/parse/markdown_tag.py
new file mode 100644
index 0000000..21f88b5
--- /dev/null
+++ b/src/streamer/parse/markdown_tag.py
@@ -0,0 +1,34 @@
+import re
+from typing import Iterable
+from mistletoe.markdown_renderer import Fragment, MarkdownRenderer
+from mistletoe.span_token import SpanToken
+
+
+class Tag(SpanToken):
+    """Inline ``@name`` token: an ``@`` followed by non-whitespace characters."""
+
+    # The matched text is not parsed further for nested span tokens.
+    parse_inner = False
+    # Group 1 captures the tag name without the leading "@".
+    pattern = re.compile(r"@([^\s]+)")
+
+
+class TagMarkdownRenderer(MarkdownRenderer):
+    """Markdown renderer that also recognises and re-emits ``Tag`` tokens."""
+
+    def __init__(self, *extras, **kwargs):
+        """Register ``Tag`` ahead of any caller-supplied extra token classes.
+
+        Positional and keyword arguments are forwarded unchanged to
+        ``MarkdownRenderer.__init__``, so calling with no arguments behaves
+        exactly as before.
+        """
+        super().__init__(Tag, *extras, **kwargs)
+
+    def render_tag(self, token: Tag) -> Iterable[Fragment]:
+        """Yield the fragments that write *token* back out as ``@<content>``."""
+        yield Fragment("@")
+        yield Fragment(token.content)
+
+
+__all__ = ["Tag", "TagMarkdownRenderer"]
diff --git a/src/streamer/parse.py b/src/streamer/parse/parse.py
similarity index 73%
rename from src/streamer/parse.py
rename to src/streamer/parse/parse.py
index 693960e..09e2541 100644
--- a/src/streamer/parse.py
+++ b/src/streamer/parse/parse.py
@@ -1,83 +1,44 @@
-from __future__ import annotations
-from typing import Iterable, Optional, TypeVar
-from pydantic import BaseModel
+from typing import Optional
 from mistletoe import Document
-from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
-from mistletoe.span_token import SpanToken, RawText
 from mistletoe.block_token import Paragraph, BlockToken, Heading
-from mistletoe.token import Token
-from itertools import pairwise
 from collections import Counter
-import re
 
-
-class Tag(SpanToken):
-    parse_inner = False
-    pattern = re.compile(r"@([^\s]+)")
-
-
-class TagMarkdownRenderer(MarkdownRenderer):
-    def __init__(self):
-        super().__init__(Tag)
-
-    def render_tag(self, token: Tag):
-        yield Fragment("@")
-        yield Fragment(token.content)
-
-
-class Shard(BaseModel):
-    markers: list[str] = []
-    tags: list[str] = []
-    start_line: int
-    end_line: int
-    children: list[Shard] = []
-
-
-class StreamFile(BaseModel):
-    filename: str
-    shard: Optional[Shard] = None
-
-
-T = TypeVar("T")
+from .markdown_tag import TagMarkdownRenderer
+from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
+from .shard import Shard, StreamFile
+from .list import split_at
 
 
 def get_line_number(block_token: BlockToken) -> int:
     return block_token.line_number  # type: ignore
 
 
-def extract_tags(tokens: Iterable[Token]) -> list[str]:
-    return [token.content for token in tokens if isinstance(token, Tag)]
-
-
-def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
-    markers, tags = [], []
-    is_marker = True
-
-    if block_token.children is None:
-        return [], []
-
-    for token in block_token.children:
-        if isinstance(token, Tag):
-            if is_marker:
-                markers.append(token)
-            else:
-                tags.append(token)
-        elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)):
-            is_marker = False
-
-    return extract_tags(markers), extract_tags(tags)
-
-
-def has_markers(block_token: BlockToken) -> bool:
-    if block_token.children is None:
-        return False
-
-    for child in block_token.children:
-        if isinstance(child, Tag):
-            return True
-        elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)):
-            return False
-    return False
+def merge_into_first_shard(
+    shards: list[Shard],
+    start_line: int,
+    end_line: int,
+    additional_tags: Optional[list[str]] = None,
+) -> Shard:
+    """Fold a run of shards into one parent built from the first shard.
+
+    Returns a copy of ``shards[0]`` that spans ``start_line``..``end_line``,
+    has ``shards[1:]`` as its children (replacing any it already had) and has
+    ``additional_tags`` appended to its own tags. The inputs are not mutated.
+
+    Raises:
+        IndexError: if ``shards`` is empty.
+    """
+    # ``None`` stands in for "no extra tags" so the signature carries no
+    # shared mutable default.
+    extra_tags = [] if additional_tags is None else additional_tags
+    return shards[0].model_copy(
+        update={
+            "start_line": start_line,
+            "end_line": end_line,
+            "children": shards[1:],
+            "tags": shards[0].tags + extra_tags,
+        }
+    )
 
 
 def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
@@ -88,15 +49,6 @@ def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
     ]
 
 
-A = TypeVar("A")
-
-
-def split_at(list_to_be_split: list[A], positions: list[int]):
-    positions = sorted(set([0, *positions, len(list_to_be_split)]))
-
-    return [list_to_be_split[left:right] for left, right in pairwise(positions)]
-
-
 def find_headings_by_level(
     block_tokens: list[BlockToken], header_level: int
 ) -> list[int]:
@@ -134,19 +86,6 @@ def calculate_heading_level_for_next_split(
     )
 
 
-def merge_into_first_shard(
-    shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
-):
-    return shards[0].model_copy(
-        update={
-            "start_line": start_line,
-            "end_line": end_line,
-            "children": shards[1:],
-            "tags": shards[0].tags + additional_tags,
-        }
-    )
-
-
 def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
     markers, tags = extract_markers_and_tags(block_token)
     return Shard(start_line=start_line, end_line=end_line, markers=markers, tags=tags)
diff --git a/src/streamer/parse/shard.py b/src/streamer/parse/shard.py
new file mode 100644
index 0000000..27616b5
--- /dev/null
+++ b/src/streamer/parse/shard.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+from typing import Optional
+from pydantic import BaseModel
+
+
+# One node in a tree of shards: a line span plus the marker and tag strings
+# found for it, and any nested shards.
+class Shard(BaseModel):
+    # Contents of the tags classified as markers for this shard.
+    markers: list[str] = []
+    # Contents of the remaining tags attached to this shard.
+    tags: list[str] = []
+    # Line span the shard covers.
+    # NOTE(review): numbering base and end inclusivity are not visible here.
+    start_line: int
+    end_line: int
+    # Nested shards. The annotation refers to the class being defined, which
+    # the postponed-annotations import at the top of the module allows.
+    children: list[Shard] = []
+
+
+# A file name paired with an optional Shard -- presumably the root of the
+# tree parsed from that file; confirm against parse_markdown_file.
+class StreamFile(BaseModel):
+    filename: str
+    # Defaults to None, i.e. no shard attached.
+    shard: Optional[Shard] = None
+
+
+__all__ = ["Shard", "StreamFile"]