refactor: split parse into multiple files
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
082c13b046
commit
dc2a97d3b8
6 changed files with 117 additions and 92 deletions
4
src/streamer/parse/__init__.py
Normal file
4
src/streamer/parse/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
from .shard import Shard, StreamFile
|
||||
from .parse import parse_markdown_file
|
||||
|
||||
__all__ = ["Shard", "StreamFile", "parse_markdown_file"]
|
||||
45
src/streamer/parse/extract_tag.py
Normal file
45
src/streamer/parse/extract_tag.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
import re
|
||||
from typing import Iterable
|
||||
from mistletoe.block_token import BlockToken
|
||||
from mistletoe.span_token import RawText
|
||||
from mistletoe.token import Token
|
||||
|
||||
from .markdown_tag import Tag
|
||||
|
||||
|
||||
def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Pull the text content out of every Tag token in *tokens*, in order."""
    return [span.content for span in tokens if isinstance(span, Tag)]
|
||||
|
||||
|
||||
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Split the Tag tokens of *block_token* into leading markers and plain tags.

    A Tag counts as a marker while only tags and whitespace-only RawText
    have been seen so far; once any other token appears, every later Tag
    is a plain tag. A childless block yields ``([], [])``.
    """
    children = block_token.children
    if children is None:
        return [], []

    leading = True
    marker_tokens = []
    tag_tokens = []
    for child in children:
        if isinstance(child, Tag):
            bucket = marker_tokens if leading else tag_tokens
            bucket.append(child)
            continue
        # Whitespace-only RawText does not end the leading marker run.
        if isinstance(child, RawText) and re.match(r"^[\s]*$", child.content):
            continue
        leading = False

    return extract_tags(marker_tokens), extract_tags(tag_tokens)
|
||||
|
||||
|
||||
def has_markers(block_token: BlockToken) -> bool:
    """Return True when the first significant child of *block_token* is a Tag.

    Whitespace-only RawText before the first significant token is ignored;
    a childless block (or one with no significant children) has no markers.
    """
    children = block_token.children
    if children is None:
        return False

    for child in children:
        # Blank text decides nothing either way — keep scanning.
        if isinstance(child, RawText) and re.match(r"^[\s]*$", child.content):
            continue
        return isinstance(child, Tag)
    return False
|
||||
|
||||
|
||||
__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]
|
||||
13
src/streamer/parse/list.py
Normal file
13
src/streamer/parse/list.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
from itertools import pairwise
|
||||
from typing import TypeVar
|
||||
|
||||
A = TypeVar("A")
|
||||
|
||||
|
||||
def split_at(list_to_be_split: list[A], positions: list[int]):
|
||||
positions = sorted(set([0, *positions, len(list_to_be_split)]))
|
||||
|
||||
return [list_to_be_split[left:right] for left, right in pairwise(positions)]
|
||||
|
||||
|
||||
__all__ = ["split_at"]
|
||||
20
src/streamer/parse/markdown_tag.py
Normal file
20
src/streamer/parse/markdown_tag.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
import re
|
||||
from mistletoe.markdown_renderer import Fragment, MarkdownRenderer
|
||||
from mistletoe.span_token import SpanToken
|
||||
|
||||
|
||||
class Tag(SpanToken):
    """Inline "@word" span token.

    NOTE(review): the pattern consumes any run of non-whitespace after
    "@", and the capture group (without the "@") presumably becomes the
    token's ``content`` — the renderer re-adds the "@" when emitting.
    Confirm against mistletoe's SpanToken contract.
    """

    parse_inner = False  # tag content is raw text; no nested span parsing
    pattern = re.compile(r"@([^\s]+)")
|
||||
|
||||
|
||||
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer extended to round-trip Tag span tokens."""

    def __init__(self):
        # Hand Tag to the base renderer as an extra token type so it is
        # recognized during parsing/rendering (mistletoe renderer API).
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        # Emit the tag in its source form: "@" followed by its content.
        yield Fragment("@")
        yield Fragment(token.content)
|
||||
|
||||
|
||||
__all__ = ["Tag", "TagMarkdownRenderer"]
|
||||
|
|
@ -1,83 +1,29 @@
|
|||
from __future__ import annotations
|
||||
from typing import Iterable, Optional, TypeVar
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from mistletoe import Document
|
||||
from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
|
||||
from mistletoe.span_token import SpanToken, RawText
|
||||
from mistletoe.block_token import Paragraph, BlockToken, Heading
|
||||
from mistletoe.token import Token
|
||||
from itertools import pairwise
|
||||
from collections import Counter
|
||||
import re
|
||||
|
||||
|
||||
class Tag(SpanToken):
    """Inline "@word" span token.

    NOTE(review): the pattern consumes any run of non-whitespace after
    "@"; the capture group (without the "@") presumably becomes the
    token's ``content`` — confirm against mistletoe's SpanToken contract.
    """

    parse_inner = False  # tag content is raw text; no nested span parsing
    pattern = re.compile(r"@([^\s]+)")
|
||||
|
||||
|
||||
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer extended to round-trip Tag span tokens."""

    def __init__(self):
        # Hand Tag to the base renderer as an extra token type so it is
        # recognized during parsing/rendering (mistletoe renderer API).
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        # Emit the tag in its source form: "@" followed by its content.
        yield Fragment("@")
        yield Fragment(token.content)
|
||||
|
||||
|
||||
class Shard(BaseModel):
    """A contiguous span of a parsed markdown file, with nested sub-shards."""

    # Pydantic deep-copies field defaults per instance, so the mutable []
    # defaults below are safe (unlike plain function defaults).
    markers: list[str] = []  # tag strings leading the shard's content
    tags: list[str] = []  # remaining tag strings attached to the shard
    start_line: int  # first source line covered by this shard
    end_line: int  # last source line covered — inclusive? TODO confirm
    children: list[Shard] = []  # shards nested under this one
|
||||
|
||||
|
||||
class StreamFile(BaseModel):
    """A markdown file identified by name plus its parsed root shard."""

    filename: str  # name/path of the source file
    # NOTE(review): None presumably means "no parsed content" — confirm.
    shard: Optional[Shard] = None
|
||||
|
||||
|
||||
T = TypeVar("T")
|
||||
from .markdown_tag import TagMarkdownRenderer
|
||||
from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
|
||||
from .shard import Shard, StreamFile
|
||||
from .list import split_at
|
||||
|
||||
|
||||
def get_line_number(block_token: BlockToken) -> int:
    """Return the source line number recorded on *block_token*.

    NOTE(review): the ignore hides that mistletoe appears to declare
    ``line_number`` as possibly unset; callers presumably only pass
    tokens from a fully parsed document where it is set — confirm.
    """
    return block_token.line_number  # type: ignore
|
||||
|
||||
|
||||
def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Return the content of every Tag token in *tokens*, in order."""
    return [token.content for token in tokens if isinstance(token, Tag)]
|
||||
|
||||
|
||||
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Split the Tag tokens of *block_token* into (markers, tags).

    Tags appearing before any other significant content are markers;
    tags after the first non-tag, non-blank token are plain tags.
    Returns ``([], [])`` for a childless block.
    """
    markers, tags = [], []
    is_marker = True  # True until the first significant non-tag token

    if block_token.children is None:
        return [], []

    for token in block_token.children:
        if isinstance(token, Tag):
            if is_marker:
                markers.append(token)
            else:
                tags.append(token)
        # Whitespace-only RawText does not end the leading marker run.
        elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)):
            is_marker = False

    return extract_tags(markers), extract_tags(tags)
|
||||
|
||||
|
||||
def has_markers(block_token: BlockToken) -> bool:
    """Return True when the first significant child of *block_token* is a Tag."""
    if block_token.children is None:
        return False

    for child in block_token.children:
        if isinstance(child, Tag):
            return True
        # Anything other than whitespace-only text ends the search.
        elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)):
            return False
    return False
|
||||
def merge_into_first_shard(
    shards: list[Shard],
    start_line: int,
    end_line: int,
    additional_tags: list[str] | None = None,
):
    """Collapse *shards* into a copy of the first shard spanning the given lines.

    The remaining shards become the merged shard's children, and
    *additional_tags* (if any) are appended to the first shard's tags.
    *shards* must be non-empty; an empty list raises IndexError.
    """
    # None instead of a mutable [] default: a shared default list is the
    # classic mutable-default-argument trap waiting to bite if mutated.
    extra_tags = [] if additional_tags is None else additional_tags
    first = shards[0]
    return first.model_copy(
        update={
            "start_line": start_line,
            "end_line": end_line,
            "children": shards[1:],
            "tags": first.tags + extra_tags,
        }
    )
|
||||
|
||||
|
||||
def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
|
||||
|
|
@ -88,15 +34,6 @@ def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
|
|||
]
|
||||
|
||||
|
||||
# Type variable for the element type of the list being split.
A = TypeVar("A")


def split_at(list_to_be_split: list[A], positions: list[int]):
    """Split the list into consecutive slices at the given cut indices.

    0 and ``len(list_to_be_split)`` are always added as boundaries;
    duplicate cut points are dropped and the rest sorted, so the slices
    cover the whole list in order.
    """
    positions = sorted(set([0, *positions, len(list_to_be_split)]))

    return [list_to_be_split[left:right] for left, right in pairwise(positions)]
|
||||
|
||||
|
||||
def find_headings_by_level(
|
||||
block_tokens: list[BlockToken], header_level: int
|
||||
) -> list[int]:
|
||||
|
|
@ -134,19 +71,6 @@ def calculate_heading_level_for_next_split(
|
|||
)
|
||||
|
||||
|
||||
def merge_into_first_shard(
    shards: list[Shard],
    start_line: int,
    end_line: int,
    additional_tags: list[str] | None = None,
):
    """Collapse *shards* into a copy of the first shard spanning the given lines.

    The remaining shards become the merged shard's children, and
    *additional_tags* (if any) are appended to the first shard's tags.
    *shards* must be non-empty; an empty list raises IndexError.
    """
    # None instead of a mutable [] default: a shared default list is the
    # classic mutable-default-argument trap waiting to bite if mutated.
    extra_tags = [] if additional_tags is None else additional_tags
    first = shards[0]
    return first.model_copy(
        update={
            "start_line": start_line,
            "end_line": end_line,
            "children": shards[1:],
            "tags": first.tags + extra_tags,
        }
    )
|
||||
|
||||
|
||||
def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
    """Build a childless Shard for one block token spanning the given lines."""
    markers, tags = extract_markers_and_tags(block_token)
    return Shard(start_line=start_line, end_line=end_line, markers=markers, tags=tags)
|
||||
19
src/streamer/parse/shard.py
Normal file
19
src/streamer/parse/shard.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class Shard(BaseModel):
    """A contiguous span of a parsed markdown file, with nested sub-shards."""

    # Pydantic deep-copies field defaults per instance, so the mutable []
    # defaults below are safe (unlike plain function defaults).
    markers: list[str] = []  # tag strings leading the shard's content
    tags: list[str] = []  # remaining tag strings attached to the shard
    start_line: int  # first source line covered by this shard
    end_line: int  # last source line covered — inclusive? TODO confirm
    children: list[Shard] = []  # shards nested under this one
|
||||
|
||||
|
||||
class StreamFile(BaseModel):
    """A markdown file identified by name plus its parsed root shard."""

    filename: str  # name/path of the source file
    # NOTE(review): None presumably means "no parsed content" — confirm.
    shard: Optional[Shard] = None
|
||||
|
||||
|
||||
__all__ = ["Shard", "StreamFile"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue