feat: support lists in parsing

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
Konstantin Fickel 2025-06-22 11:36:58 +02:00
parent dc2a97d3b8
commit fa85017ce3
2 changed files with 111 additions and 36 deletions

View file

@ -1,10 +1,10 @@
from typing import Optional
from mistletoe import Document
from mistletoe.block_token import Paragraph, BlockToken, Heading
from mistletoe.block_token import Paragraph, BlockToken, Heading, List, ListItem
from collections import Counter
from .markdown_tag import TagMarkdownRenderer
from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
from .extract_tag import extract_markers_and_tags, has_markers
from .shard import Shard, StreamFile
from .list import split_at
@ -13,6 +13,31 @@ def get_line_number(block_token: BlockToken) -> int:
return block_token.line_number # type: ignore
def build_shard(
    start_line,
    end_line,
    markers: Optional[list[str]] = None,
    tags: Optional[list[str]] = None,
    children: Optional[list[Shard]] = None,
) -> Shard:
    """Construct a Shard for the given line range.

    Collapses to the single child when that child already covers exactly
    the same line span and this level would add no markers or tags —
    avoids wrapping a shard in an identical, information-free parent.

    Args:
        start_line: First line (1-based) covered by the shard.
        end_line: Last line covered by the shard.
        markers: Marker names attached at this level.
        tags: Tag names attached at this level.
        children: Child shards nested inside this one.

    Returns:
        Either the lone, span-identical child or a freshly built Shard.
    """
    # NOTE: defaults were previously mutable lists (`= []`), which are shared
    # across calls; since these lists end up stored inside the returned Shard,
    # mutating a shard built from defaults would have corrupted every later
    # default-built shard. Use None sentinels instead.
    markers = [] if markers is None else markers
    tags = [] if tags is None else tags
    children = [] if children is None else children
    if (
        len(children) == 1
        and len(tags) == 0
        and len(markers) == 0
        and children[0].start_line == start_line
        and children[0].end_line == end_line
    ):
        return children[0]
    return Shard(
        markers=markers,
        tags=tags,
        children=children,
        start_line=start_line,
        end_line=end_line,
    )
def merge_into_first_shard(
shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
):
@ -71,29 +96,61 @@ def calculate_heading_level_for_next_split(
)
def parse_single_block_shards(
    block_token: BlockToken, start_line: int, end_line: int
) -> tuple[Optional[Shard], list[str]]:
    """Parse one block token into a shard (or no shard) plus loose tags.

    For a List token, each list item is parsed recursively via
    parse_multiple_block_shards; an item's span ends one line before the
    next item starts (the last item runs to end_line). For Paragraph and
    Heading tokens, markers and tags are extracted directly.

    Args:
        block_token: The mistletoe block token to parse.
        start_line: First line (1-based) covered by this token.
        end_line: Last line covered by this token.

    Returns:
        (shard, tags): the shard is None when the token carries no markers
        and produced no child shards; in that case the extracted tags are
        returned loose so the caller can attach them to an enclosing shard.
        When a shard is built, its tags are folded in and the loose-tag
        list is empty.
    """
    # This span previously contained diff residue: the superseded old
    # signature (a `def` with no body) and an unreachable old `return`
    # statement; both are removed here.
    markers, tags, children = [], [], []
    if isinstance(block_token, List):
        list_items: list[ListItem] = (  # type: ignore
            list(block_token.children) if block_token.children is not None else []
        )
        for index, list_item in enumerate(list_items):
            list_item_start_line = get_line_number(list_item)
            # An item ends just before its successor begins; the final item
            # extends to the end of the whole list block.
            list_item_end_line = (
                get_line_number(list_items[index + 1]) - 1
                if index + 1 < len(list_items)
                else end_line
            )
            list_item_shard, list_item_tags = parse_multiple_block_shards(
                list_item.children,  # type: ignore
                list_item_start_line,
                list_item_end_line,
            )
            if list_item_shard is not None:
                children.append(list_item_shard)
            # Collected unconditionally so tags bubble up even from items
            # that yielded no shard. NOTE(review): the flattened diff makes
            # the original indentation ambiguous — confirm this was not
            # nested under the `is not None` check.
            tags.extend(list_item_tags)
    elif isinstance(block_token, (Paragraph, Heading)):
        markers, tags = extract_markers_and_tags(block_token)
    if len(markers) == 0 and len(children) == 0:
        # Nothing shard-worthy here; hand the tags back to the caller.
        return None, tags
    return build_shard(
        start_line, end_line, markers=markers, tags=tags, children=children
    ), []
def parse_paragraph_shards(
block_tokens: list[BlockToken], start_line: int, end_line: int
) -> Optional[Shard]:
def parse_multiple_block_shards(
block_tokens: list[BlockToken],
start_line: int,
end_line: int,
enforce_shard: bool = False,
) -> tuple[Optional[Shard], list[str]]:
is_first_block_heading = isinstance(block_tokens[0], Heading) and has_markers(
block_tokens[0]
)
paragraph_positions = find_paragraph_shard_positions(block_tokens)
children = []
added_tags = []
children, tags = [], []
is_first_block_only_with_marker = False
for i, token in enumerate(block_tokens):
if i in paragraph_positions:
is_first_block_heading = i == 0
is_first_block_only_with_marker = i == 0
if i in paragraph_positions or (i == 0 and is_first_block_heading):
child_start_line = get_line_number(token)
child_end_line = (
get_line_number(block_tokens[i + 1]) - 1
@ -101,20 +158,21 @@ def parse_paragraph_shards(
else end_line
)
children.append(
parse_single_block_shards(token, child_start_line, child_end_line)
child_shard, child_tags = parse_single_block_shards(
token, child_start_line, child_end_line
)
elif token.children:
added_tags.extend(extract_tags(token.children))
if len(children) == 0 and len(added_tags) == 0:
return None
if child_shard is not None:
children.append(child_shard)
if len(child_tags) > 0:
tags.extend(child_tags)
if len(children) == 0 and not enforce_shard:
return None, tags
if is_first_block_heading or is_first_block_only_with_marker:
return merge_into_first_shard(children, start_line, end_line, added_tags)
return merge_into_first_shard(children, start_line, end_line, tags), []
else:
return Shard(
start_line=start_line, end_line=end_line, children=children, tags=added_tags
)
return build_shard(start_line, end_line, tags=tags, children=children), []
def parse_header_shards(
@ -124,12 +182,14 @@ def parse_header_shards(
use_first_child_as_header: bool = False,
) -> Optional[Shard]:
if len(block_tokens) == 0:
return Shard(start_line=start_line, end_line=end_line)
return build_shard(start_line, end_line)
split_at_heading_level = calculate_heading_level_for_next_split(block_tokens)
if split_at_heading_level is None:
return parse_paragraph_shards(block_tokens, start_line, end_line)
return parse_multiple_block_shards(
block_tokens, start_line, end_line, enforce_shard=True
)[0]
heading_positions = find_headings_by_level(block_tokens, split_at_heading_level)
@ -154,11 +214,11 @@ def parse_header_shards(
if use_first_child_as_header and len(children) > 0:
return merge_into_first_shard(children, start_line, end_line)
else:
return Shard(start_line=start_line, end_line=end_line, children=children)
return build_shard(start_line, end_line, children=children)
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
shard = Shard(start_line=1, end_line=max([len(file_content.splitlines()), 1]))
shard = build_shard(1, max([len(file_content.splitlines()), 1]))
with TagMarkdownRenderer():
ast = Document(file_content)

View file

@ -213,3 +213,18 @@ class TestParseProcess:
),
],
)
def test_simple_list(self):
    """A nested marker item becomes a child shard spanning only its own line."""
    file_text = "* hello world\n * @Marker i've got a marker"
    marker_child = Shard(
        markers=["Marker"], tags=[], start_line=2, end_line=2, children=[]
    )
    expected = Shard(
        markers=[],
        tags=[],
        start_line=1,
        end_line=2,
        children=[marker_child],
    )
    result = parse_markdown_file(self.file_name, file_text)
    assert result.shard == expected