refactor: split parse into multiple files

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
2025-06-21 16:43:40 +02:00 · 2025-06-21 16:43:40 +02:00 · dc2a97d3b8
commit dc2a97d3b8
parent 082c13b046
6 changed files with 117 additions and 92 deletions
--- a/src/streamer/parse/__init__.py
+++ b/src/streamer/parse/__init__.py
@ -0,0 +1,4 @@
 from .shard import Shard, StreamFile
 from .parse import parse_markdown_file
 __all__ = ["Shard", "StreamFile", "parse_markdown_file"]
--- a/src/streamer/parse/extract_tag.py
+++ b/src/streamer/parse/extract_tag.py
@ -0,0 +1,45 @@
 import re
 from typing import Iterable
 from mistletoe.block_token import BlockToken
 from mistletoe.span_token import RawText
 from mistletoe.token import Token
 from .markdown_tag import Tag
 def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Return the ``content`` of every ``Tag`` in ``tokens``, in input order.

    Tokens that are not ``Tag`` instances are ignored.
    """
    tag_tokens = (candidate for candidate in tokens if isinstance(candidate, Tag))
    return [tag.content for tag in tag_tokens]
 def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Split the ``Tag`` children of ``block_token`` into markers and tags.

    A ``Tag`` counts as a marker as long as only tags and whitespace-only
    ``RawText`` children have been seen so far. After the first other child,
    every later ``Tag`` counts as a plain tag.

    Returns:
        ``(markers, tags)`` with the ``content`` of the respective tags in
        child order; two empty lists when ``block_token.children`` is None.
    """
    children = block_token.children
    if children is None:
        return [], []

    marker_contents: list[str] = []
    tag_contents: list[str] = []
    in_leading_run = True

    for child in children:
        if isinstance(child, Tag):
            bucket = marker_contents if in_leading_run else tag_contents
            bucket.append(child.content)
            continue
        is_blank_text = isinstance(child, RawText) and re.match(
            r"^[\s]*$", child.content
        )
        if not is_blank_text:
            # First non-tag, non-blank child ends the marker run for good.
            in_leading_run = False

    return marker_contents, tag_contents
 def has_markers(block_token: BlockToken) -> bool:
    """Tell whether ``block_token`` begins with a ``Tag``.

    Whitespace-only ``RawText`` children ahead of the first tag are skipped.
    Any other leading child, or a block whose ``children`` is None or
    exhausted without a tag, gives ``False``.
    """
    children = block_token.children
    if children is None:
        return False

    for node in children:
        if isinstance(node, Tag):
            return True
        blank_text = isinstance(node, RawText) and re.match(r"^[\s]*$", node.content)
        if not blank_text:
            return False
    return False
 __all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]
--- a/src/streamer/parse/list.py
+++ b/src/streamer/parse/list.py
@ -0,0 +1,13 @@
 from itertools import pairwise
 from typing import TypeVar
 A = TypeVar("A")
 def split_at(list_to_be_split: list[A], positions: list[int]):
    positions = sorted(set([0, *positions, len(list_to_be_split)]))
    return [list_to_be_split[left:right] for left, right in pairwise(positions)]
 __all__ = ["split_at"]
--- a/src/streamer/parse/markdown_tag.py
+++ b/src/streamer/parse/markdown_tag.py
@ -0,0 +1,20 @@
 import re
 from mistletoe.markdown_renderer import Fragment, MarkdownRenderer
 from mistletoe.span_token import SpanToken
 class Tag(SpanToken):
    """Span token for an ``@``-prefixed tag such as ``@name``."""
    # NOTE(review): per mistletoe's SpanToken contract this should disable
    # nested span parsing of the matched text — confirm against mistletoe docs.
    parse_inner = False
    # "@" followed by one or more non-whitespace characters; capture group 1
    # is the text after the "@".
    pattern = re.compile(r"@([^\s]+)")
 class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer that hands ``Tag`` to its base class and renders tags as ``@<content>``."""

    def __init__(self):
        # Tag is passed to the base renderer as an extra token type.
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        """Yield the fragments for ``token``: a literal "@" followed by its content."""
        yield from (Fragment("@"), Fragment(token.content))
 __all__ = ["Tag", "TagMarkdownRenderer"]
--- a/src/streamer/parse/parse.py
+++ b/src/streamer/parse/parse.py
@ -1,83 +1,29 @@
-from __future__ import annotations
+from typing import Optional
 from typing import Iterable, Optional, TypeVar
 from pydantic import BaseModel
 from mistletoe import Document
 from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
 from mistletoe.span_token import SpanToken, RawText
 from mistletoe.block_token import Paragraph, BlockToken, Heading
 from mistletoe.token import Token
 from itertools import pairwise
 from collections import Counter
 import re
-
+from .markdown_tag import TagMarkdownRenderer
-class Tag(SpanToken):
+from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
-    parse_inner = False
+from .shard import Shard, StreamFile
-    pattern = re.compile(r"@([^\s]+)")
+from .list import split_at
 class TagMarkdownRenderer(MarkdownRenderer):
    """Renderer that passes ``Tag`` to ``MarkdownRenderer`` and emits tags as ``@<content>``."""

    def __init__(self):
        # Forward the custom span token class to the base renderer.
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        """Yield one fragment for the "@" prefix and one for the tag content."""
        for piece in ("@", token.content):
            yield Fragment(piece)
 class Shard(BaseModel):
    """Pydantic model for a shard: markers, tags, a line range and nested child shards."""
    # Marker strings of this shard; empty by default.
    markers: list[str] = []
    # Tag strings of this shard; empty by default.
    tags: list[str] = []
    # Required line bounds (no defaults).
    # NOTE(review): indexing base and inclusiveness are not visible here.
    start_line: int
    end_line: int
    # Nested shards; empty by default.
    children: list[Shard] = []
 class StreamFile(BaseModel):
    """Pydantic model pairing a filename with an optional ``Shard``."""
    # Required file name.
    filename: str
    # Shard belonging to the file; defaults to None.
    shard: Optional[Shard] = None
 T = TypeVar("T")
 def get_line_number(block_token: BlockToken) -> int:
    """Return the ``line_number`` attribute of ``block_token``."""
    line_number: int = block_token.line_number  # type: ignore
    return line_number
-def extract_tags(tokens: Iterable[Token]) -> list[str]:
+def merge_into_first_shard(
-    return [token.content for token in tokens if isinstance(token, Tag)]
+    shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
-
+):
-
+    return shards[0].model_copy(
-def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
+        update={
-    markers, tags = [], []
+            "start_line": start_line,
-    is_marker = True
+            "end_line": end_line,
-
+            "children": shards[1:],
-    if block_token.children is None:
+            "tags": shards[0].tags + additional_tags,
-        return [], []
+        }
-
+    )
    for token in block_token.children:
        if isinstance(token, Tag):
            if is_marker:
                markers.append(token)
            else:
                tags.append(token)
        elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)):
            is_marker = False
    return extract_tags(markers), extract_tags(tags)
 def has_markers(block_token: BlockToken) -> bool:
    """Report whether the first meaningful child of ``block_token`` is a ``Tag``.

    Whitespace-only ``RawText`` children are not meaningful and are skipped.
    Returns ``False`` when ``children`` is None or holds no meaningful child.
    """
    if block_token.children is None:
        return False

    # Lazily drop blank text nodes; only the first survivor decides the result.
    significant_children = (
        child
        for child in block_token.children
        if not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content))
    )
    return isinstance(next(significant_children, None), Tag)
 def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
@ -88,15 +34,6 @@ def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
    ]
 A = TypeVar("A")
 def split_at(list_to_be_split: list[A], positions: list[int]):
    positions = sorted(set([0, *positions, len(list_to_be_split)]))
    return [list_to_be_split[left:right] for left, right in pairwise(positions)]
 def find_headings_by_level(
    block_tokens: list[BlockToken], header_level: int
 ) -> list[int]:
@ -134,19 +71,6 @@ def calculate_heading_level_for_next_split(
    )
 def merge_into_first_shard(
    shards: list[Shard],
    start_line: int,
    end_line: int,
    additional_tags: Optional[list[str]] = None,
 ) -> Shard:
    """Fold ``shards`` into a single shard headed by the first element.

    Returns ``shards[0].model_copy(...)`` with these fields overridden:

    * ``start_line`` / ``end_line`` are set to the given values,
    * ``children`` is replaced by the remaining shards (``shards[1:]``),
    * ``tags`` is the first shard's tags followed by ``additional_tags``.

    Args:
        shards: Shards to merge; the first one supplies all other fields.
        start_line: Start line for the merged shard.
        end_line: End line for the merged shard.
        additional_tags: Extra tags appended after the first shard's own
            tags. ``None`` (the default) means no extra tags.

    Raises:
        IndexError: If ``shards`` is empty.
    """
    # A ``None`` sentinel replaces the former shared ``[]`` default, so no
    # list object is reused across calls.
    extra_tags = [] if additional_tags is None else additional_tags
    head = shards[0]
    return head.model_copy(
        update={
            "start_line": start_line,
            "end_line": end_line,
            "children": shards[1:],
            "tags": head.tags + extra_tags,
        }
    )
 def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
    """Build a childless ``Shard`` for one block token.

    Markers and tags come from ``extract_markers_and_tags(block_token)``;
    the line range is taken from the arguments unchanged.
    """
    marker_names, tag_names = extract_markers_and_tags(block_token)
    return Shard(
        markers=marker_names,
        tags=tag_names,
        start_line=start_line,
        end_line=end_line,
    )
--- a/src/streamer/parse/shard.py
+++ b/src/streamer/parse/shard.py
@ -0,0 +1,19 @@
 from __future__ import annotations
 from typing import Optional
 from pydantic import BaseModel
 class Shard(BaseModel):
    """Pydantic model for a shard: markers, tags, a line range and nested child shards."""
    # Marker strings of this shard; empty by default.
    markers: list[str] = []
    # Tag strings of this shard; empty by default.
    tags: list[str] = []
    # Required line bounds (no defaults).
    # NOTE(review): indexing base and inclusiveness are not visible here.
    start_line: int
    end_line: int
    # Nested shards; empty by default.
    children: list[Shard] = []
 class StreamFile(BaseModel):
    """Pydantic model pairing a filename with an optional ``Shard``."""
    # Required file name.
    filename: str
    # Shard belonging to the file; defaults to None.
    shard: Optional[Shard] = None
 __all__ = ["Shard", "StreamFile"]