feat: support lists in parsing

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
2025-06-22 11:36:58 +02:00 · 2025-06-22 11:36:58 +02:00 · fa85017ce3
commit fa85017ce3
parent dc2a97d3b8
2 changed files with 111 additions and 36 deletions
--- a/src/streamer/parse/parse.py
+++ b/src/streamer/parse/parse.py
@ -1,10 +1,10 @@
 from typing import Optional
 from mistletoe import Document
-from mistletoe.block_token import Paragraph, BlockToken, Heading
+from mistletoe.block_token import Paragraph, BlockToken, Heading, List, ListItem
 from collections import Counter
 from .markdown_tag import TagMarkdownRenderer
-from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
+from .extract_tag import extract_markers_and_tags, has_markers
 from .shard import Shard, StreamFile
 from .list import split_at
@ -13,6 +13,31 @@ def get_line_number(block_token: BlockToken) -> int:
    return block_token.line_number  # type: ignore
 def build_shard(
    start_line: int,
    end_line: int,
    markers: Optional[list[str]] = None,
    tags: Optional[list[str]] = None,
    children: Optional[list[Shard]] = None,
 ) -> Shard:
    """Build a shard for the given line range, collapsing redundant wrappers.

    When the would-be shard has no markers and no tags of its own and its
    only child already covers exactly ``start_line``..``end_line``, that
    child is returned unchanged instead of being wrapped in a new shard.

    Args:
        start_line: First line covered by the shard.
        end_line: Last line covered by the shard.
        markers: Markers for the shard; omitted or ``None`` means none.
        tags: Tags for the shard; omitted or ``None`` means none.
        children: Child shards; omitted or ``None`` means none.

    Returns:
        The single child when wrapping it would add nothing, otherwise a
        new ``Shard`` built from the arguments.
    """
    # Create fresh lists per call: a mutable default would be one shared
    # object handed to every shard built with default arguments.
    markers = [] if markers is None else markers
    tags = [] if tags is None else tags
    children = [] if children is None else children

    if (
        len(children) == 1
        and not tags
        and not markers
        and children[0].start_line == start_line
        and children[0].end_line == end_line
    ):
        return children[0]

    return Shard(
        markers=markers,
        tags=tags,
        children=children,
        start_line=start_line,
        end_line=end_line,
    )
 def merge_into_first_shard(
    shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
 ):
@ -71,29 +96,61 @@ def calculate_heading_level_for_next_split(
    )
-def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
+def parse_single_block_shards(
    block_token: BlockToken, start_line: int, end_line: int
 ) -> tuple[Optional[Shard], list[str]]:
    # Turn one block token into a (shard, tags) pair: a shard when the block
    # has markers or child shards, otherwise None together with the tags that
    # were found so the caller can decide where they belong.
    markers, tags, children = [], [], []
    if isinstance(block_token, List):
        # A list contributes at most one child shard per list item.
        list_items: list[ListItem] = (  # type: ignore
            list(block_token.children) if block_token.children is not None else []
        )
        for index, list_item in enumerate(list_items):
            list_item_start_line = get_line_number(list_item)
            # An item runs up to the line before the next item; the last item
            # runs to the end of the enclosing block.
            list_item_end_line = (
                get_line_number(list_items[index + 1]) - 1
                if index + 1 < len(list_items)
                else end_line
            )
            list_item_shard, list_item_tags = parse_multiple_block_shards(
                list_item.children,  # type: ignore
                list_item_start_line,
                list_item_end_line,
            )
            if list_item_shard is not None:
                children.append(list_item_shard)
            # Tags handed back by the item parser are collected on this block.
            tags.extend(list_item_tags)
    elif isinstance(block_token, (Paragraph, Heading)):
        # Paragraphs and headings supply their markers and tags directly.
        markers, tags = extract_markers_and_tags(block_token)
-    return Shard(start_line=start_line, end_line=end_line, markers=markers, tags=tags)
+
    # No markers and no children: no shard is produced, only the tags.
    if len(markers) == 0 and len(children) == 0:
        return None, tags
    # The tags are passed into the shard, so the returned tag list is empty.
    return build_shard(
        start_line, end_line, markers=markers, tags=tags, children=children
    ), []
-def parse_paragraph_shards(
+def parse_multiple_block_shards(
-    block_tokens: list[BlockToken], start_line: int, end_line: int
+    block_tokens: list[BlockToken],
-) -> Optional[Shard]:
+    start_line: int,
    end_line: int,
    enforce_shard: bool = False,
 ) -> tuple[Optional[Shard], list[str]]:
    is_first_block_heading = isinstance(block_tokens[0], Heading) and has_markers(
        block_tokens[0]
    )
    paragraph_positions = find_paragraph_shard_positions(block_tokens)
-    children = []
+    children, tags = [], []
    added_tags = []
    is_first_block_only_with_marker = False
    for i, token in enumerate(block_tokens):
        if i in paragraph_positions:
-            is_first_block_heading = i == 0
+            is_first_block_only_with_marker = i == 0
        if i in paragraph_positions or (i == 0 and is_first_block_heading):
        child_start_line = get_line_number(token)
        child_end_line = (
            get_line_number(block_tokens[i + 1]) - 1
@ -101,20 +158,21 @@ def parse_paragraph_shards(
            else end_line
        )
-            children.append(
+        child_shard, child_tags = parse_single_block_shards(
-                parse_single_block_shards(token, child_start_line, child_end_line)
+            token, child_start_line, child_end_line
        )
        elif token.children:
            added_tags.extend(extract_tags(token.children))
-    if len(children) == 0 and len(added_tags) == 0:
+        if child_shard is not None:
-        return None
+            children.append(child_shard)
        if len(child_tags) > 0:
            tags.extend(child_tags)
    if len(children) == 0 and not enforce_shard:
        return None, tags
    if is_first_block_heading or is_first_block_only_with_marker:
-        return merge_into_first_shard(children, start_line, end_line, added_tags)
+        return merge_into_first_shard(children, start_line, end_line, tags), []
    else:
-        return Shard(
+        return build_shard(start_line, end_line, tags=tags, children=children), []
            start_line=start_line, end_line=end_line, children=children, tags=added_tags
        )
 def parse_header_shards(
@ -124,12 +182,14 @@ def parse_header_shards(
    use_first_child_as_header: bool = False,
 ) -> Optional[Shard]:
    if len(block_tokens) == 0:
-        return Shard(start_line=start_line, end_line=end_line)
+        return build_shard(start_line, end_line)
    split_at_heading_level = calculate_heading_level_for_next_split(block_tokens)
    if split_at_heading_level is None:
-        return parse_paragraph_shards(block_tokens, start_line, end_line)
+        return parse_multiple_block_shards(
            block_tokens, start_line, end_line, enforce_shard=True
        )[0]
    heading_positions = find_headings_by_level(block_tokens, split_at_heading_level)
@ -154,11 +214,11 @@ def parse_header_shards(
    if use_first_child_as_header and len(children) > 0:
        return merge_into_first_shard(children, start_line, end_line)
    else:
-        return Shard(start_line=start_line, end_line=end_line, children=children)
+        return build_shard(start_line, end_line, children=children)
 def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
-    shard = Shard(start_line=1, end_line=max([len(file_content.splitlines()), 1]))
+    shard = build_shard(1, max([len(file_content.splitlines()), 1]))
    with TagMarkdownRenderer():
        ast = Document(file_content)
--- a/test/test_parse.py
+++ b/test/test_parse.py
@ -213,3 +213,18 @@ class TestParseProcess:
                ),
            ],
        )
    def test_simple_list(self):
        """A marker on a nested list item becomes the root's only child shard."""
        file_text = "* hello world\n  * @Marker i've got a marker"
        parsed_file = parse_markdown_file(self.file_name, file_text)

        # Only the nested item on line 2 carries a marker.
        marker_item = Shard(
            markers=["Marker"],
            tags=[],
            start_line=2,
            end_line=2,
            children=[],
        )
        expected_root = Shard(
            markers=[],
            tags=[],
            start_line=1,
            end_line=2,
            children=[marker_item],
        )
        assert parsed_file.shard == expected_root