From dc2a97d3b8313ea145e99da15b712770b97059d3 Mon Sep 17 00:00:00 2001 From: Konstantin Fickel Date: Sat, 21 Jun 2025 16:43:40 +0200 Subject: [PATCH] refactor: split parse into multiple files Signed-off-by: Konstantin Fickel --- src/streamer/parse/__init__.py | 4 ++ src/streamer/parse/extract_tag.py | 45 ++++++++++++ src/streamer/parse/list.py | 13 ++++ src/streamer/parse/markdown_tag.py | 20 ++++++ src/streamer/{ => parse}/parse.py | 108 +++++------------------------ src/streamer/parse/shard.py | 19 +++++ 6 files changed, 117 insertions(+), 92 deletions(-) create mode 100644 src/streamer/parse/__init__.py create mode 100644 src/streamer/parse/extract_tag.py create mode 100644 src/streamer/parse/list.py create mode 100644 src/streamer/parse/markdown_tag.py rename src/streamer/{ => parse}/parse.py (73%) create mode 100644 src/streamer/parse/shard.py diff --git a/src/streamer/parse/__init__.py b/src/streamer/parse/__init__.py new file mode 100644 index 0000000..9b2d2c0 --- /dev/null +++ b/src/streamer/parse/__init__.py @@ -0,0 +1,4 @@ +from .shard import Shard, StreamFile +from .parse import parse_markdown_file + +__all__ = ["Shard", "StreamFile", "parse_markdown_file"] diff --git a/src/streamer/parse/extract_tag.py b/src/streamer/parse/extract_tag.py new file mode 100644 index 0000000..b4fd526 --- /dev/null +++ b/src/streamer/parse/extract_tag.py @@ -0,0 +1,45 @@ +import re +from typing import Iterable +from mistletoe.block_token import BlockToken +from mistletoe.span_token import RawText +from mistletoe.token import Token + +from .markdown_tag import Tag + + +def extract_tags(tokens: Iterable[Token]) -> list[str]: + return [token.content for token in tokens if isinstance(token, Tag)] + + +def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]: + markers, tags = [], [] + is_marker = True + + if block_token.children is None: + return [], [] + + for token in block_token.children: + if isinstance(token, Tag): + if is_marker: + markers.append(token) + else: + tags.append(token) + elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)): + is_marker = False + + return extract_tags(markers), extract_tags(tags) + + +def has_markers(block_token: BlockToken) -> bool: + if block_token.children is None: + return False + + for child in block_token.children: + if isinstance(child, Tag): + return True + elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)): + return False + return False + + +__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"] diff --git a/src/streamer/parse/list.py b/src/streamer/parse/list.py new file mode 100644 index 0000000..730ea35 --- /dev/null +++ b/src/streamer/parse/list.py @@ -0,0 +1,13 @@ +from itertools import pairwise +from typing import TypeVar + +A = TypeVar("A") + + +def split_at(list_to_be_split: list[A], positions: list[int]): + positions = sorted(set([0, *positions, len(list_to_be_split)])) + + return [list_to_be_split[left:right] for left, right in pairwise(positions)] + + +__all__ = ["split_at"] diff --git a/src/streamer/parse/markdown_tag.py b/src/streamer/parse/markdown_tag.py new file mode 100644 index 0000000..21f88b5 --- /dev/null +++ b/src/streamer/parse/markdown_tag.py @@ -0,0 +1,20 @@ +import re +from mistletoe.markdown_renderer import Fragment, MarkdownRenderer +from mistletoe.span_token import SpanToken + + +class Tag(SpanToken): + parse_inner = False + pattern = re.compile(r"@([^\s]+)") + + +class TagMarkdownRenderer(MarkdownRenderer): + def __init__(self): + super().__init__(Tag) + + def render_tag(self, token: Tag): + yield Fragment("@") + yield Fragment(token.content) + + +__all__ = ["Tag", "TagMarkdownRenderer"] diff --git a/src/streamer/parse.py b/src/streamer/parse/parse.py similarity index 73% rename from src/streamer/parse.py rename to src/streamer/parse/parse.py index 693960e..09e2541 100644 --- a/src/streamer/parse.py +++ b/src/streamer/parse/parse.py @@ -1,83 +1,29 @@ -from __future__ import annotations -from typing import Iterable, Optional, TypeVar -from pydantic import BaseModel +from typing import Optional from mistletoe import Document -from mistletoe.markdown_renderer import MarkdownRenderer, Fragment -from mistletoe.span_token import SpanToken, RawText from mistletoe.block_token import Paragraph, BlockToken, Heading -from mistletoe.token import Token -from itertools import pairwise from collections import Counter -import re - -class Tag(SpanToken): - parse_inner = False - pattern = re.compile(r"@([^\s]+)") - - -class TagMarkdownRenderer(MarkdownRenderer): - def __init__(self): - super().__init__(Tag) - - def render_tag(self, token: Tag): - yield Fragment("@") - yield Fragment(token.content) - - -class Shard(BaseModel): - markers: list[str] = [] - tags: list[str] = [] - start_line: int - end_line: int - children: list[Shard] = [] - - -class StreamFile(BaseModel): - filename: str - shard: Optional[Shard] = None - - -T = TypeVar("T") +from .markdown_tag import TagMarkdownRenderer +from .extract_tag import extract_markers_and_tags, extract_tags, has_markers +from .shard import Shard, StreamFile +from .list import split_at def get_line_number(block_token: BlockToken) -> int: return block_token.line_number # type: ignore -def extract_tags(tokens: Iterable[Token]) -> list[str]: - return [token.content for token in tokens if isinstance(token, Tag)] - - -def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]: - markers, tags = [], [] - is_marker = True - - if block_token.children is None: - return [], [] - - for token in block_token.children: - if isinstance(token, Tag): - if is_marker: - markers.append(token) - else: - tags.append(token) - elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)): - is_marker = False - - return extract_tags(markers), extract_tags(tags) - - -def has_markers(block_token: BlockToken) -> bool: - if block_token.children is None: - return False - - for child in block_token.children: - if isinstance(child, Tag): - return True - elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)): - return False - return False +def merge_into_first_shard( + shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = [] +): + return shards[0].model_copy( + update={ + "start_line": start_line, + "end_line": end_line, + "children": shards[1:], + "tags": shards[0].tags + additional_tags, + } + ) def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]: @@ -88,15 +34,6 @@ def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]: ] -A = TypeVar("A") - - -def split_at(list_to_be_split: list[A], positions: list[int]): - positions = sorted(set([0, *positions, len(list_to_be_split)])) - - return [list_to_be_split[left:right] for left, right in pairwise(positions)] - - def find_headings_by_level( block_tokens: list[BlockToken], header_level: int ) -> list[int]: @@ -134,19 +71,6 @@ def calculate_heading_level_for_next_split( ) -def merge_into_first_shard( - shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = [] -): - return shards[0].model_copy( - update={ - "start_line": start_line, - "end_line": end_line, - "children": shards[1:], - "tags": shards[0].tags + additional_tags, - } - ) - - def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int): markers, tags = extract_markers_and_tags(block_token) return Shard(start_line=start_line, end_line=end_line, markers=markers, tags=tags) diff --git a/src/streamer/parse/shard.py b/src/streamer/parse/shard.py new file mode 100644 index 0000000..27616b5 --- /dev/null +++ b/src/streamer/parse/shard.py @@ -0,0 +1,19 @@ +from __future__ import annotations +from typing import Optional +from pydantic import BaseModel + + +class Shard(BaseModel): + markers: list[str] = [] + tags: list[str] = [] + start_line: int + end_line: int + children: list[Shard] = [] + + +class StreamFile(BaseModel): + filename: str + shard: Optional[Shard] = None + + +__all__ = ["Shard", "StreamFile"]