feat: also parse tags within inline formatting (emphasis, strong, strikethrough, links)

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
2025-06-22 12:28:21 +02:00 · 2025-06-22 12:28:21 +02:00 · 8f5a000c5c
commit 8f5a000c5c
parent fa85017ce3
3 changed files with 186 additions and 34 deletions
--- a/src/streamer/parse/extract_tag.py
+++ b/src/streamer/parse/extract_tag.py
@@ -1,45 +1,84 @@
 import re
 from typing import Iterable
 from mistletoe.block_token import BlockToken
-from mistletoe.span_token import RawText
+from mistletoe.span_token import Emphasis, RawText, Strikethrough, Strong, Link
 from mistletoe.token import Token

 from .markdown_tag import Tag


-def extract_tags(tokens: Iterable[Token]) -> list[str]:
-    return [token.content for token in tokens if isinstance(token, Tag)]
+def extract_markers_and_tags_from_single_token(
+    token: Token,
+    marker_boundary_encountered: bool,
+    return_at_first_marker: bool = False,
+) -> tuple[list[str], list[str], bool]:
+    result_markers, result_tags = [], []
+    result_marker_boundary_encountered = marker_boundary_encountered
+
+    if isinstance(token, Tag):
+        if marker_boundary_encountered:
+            result_tags.append(token.content)
+        else:
+            result_markers.append(token.content)
+    elif isinstance(token, (Emphasis, Strong, Strikethrough, Link)):
+        markers, tags, child_marker_boundary_encountered = (
+            extract_markers_and_tags_from_tokens(
+                token.children or [],
+                marker_boundary_encountered,
+                return_at_first_marker,
+            )
+        )
+        result_markers.extend(markers)
+        result_tags.extend(tags)
+        result_marker_boundary_encountered = (
+            marker_boundary_encountered or child_marker_boundary_encountered
+        )
+    elif isinstance(token, RawText) and re.match(r"^[\s]*$", token.content):
+        pass
+    else:
+        result_marker_boundary_encountered = True
+
+    return result_markers, result_tags, result_marker_boundary_encountered
+
+
+def extract_markers_and_tags_from_tokens(
+    tokens: Iterable[Token],
+    marker_boundary_encountered: bool,
+    return_at_first_marker: bool = False,
+) -> tuple[list[str], list[str], bool]:
+    result_markers, result_tags = [], []
+    result_marker_boundary_encountered = marker_boundary_encountered
+
+    for child in tokens:
+        markers, tags, child_marker_boundary_encountered = (
+            extract_markers_and_tags_from_single_token(
+                child, result_marker_boundary_encountered, return_at_first_marker
+            )
+        )
+        result_markers.extend(markers)
+        result_tags.extend(tags)
+        result_marker_boundary_encountered = (
+            marker_boundary_encountered or child_marker_boundary_encountered
+        )
+
+        if len(result_markers) > 0 and return_at_first_marker:
+            break
+
+    return result_markers, result_tags, result_marker_boundary_encountered


 def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
-    markers, tags = [], []
-    is_marker = True
-
-    if block_token.children is None:
-        return [], []
-
-    for token in block_token.children:
-        if isinstance(token, Tag):
-            if is_marker:
-                markers.append(token)
-            else:
-                tags.append(token)
-        elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)):
-            is_marker = False
-
-    return extract_tags(markers), extract_tags(tags)
+    markers, tags, _ = extract_markers_and_tags_from_tokens(
+        block_token.children or [], False
+    )
+    return markers, tags


 def has_markers(block_token: BlockToken) -> bool:
-    if block_token.children is None:
-        return False
-
-    for child in block_token.children:
-        if isinstance(child, Tag):
-            return True
-        elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)):
-            return False
-    return False
+    markers, _, _ = extract_markers_and_tags_from_tokens(
+        block_token.children or [], False, return_at_first_marker=True
+    )
+    return len(markers) > 0


-__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]
+__all__ = ["extract_markers_and_tags", "has_markers"]
--- a/src/streamer/parse/markdown_tag.py
+++ b/src/streamer/parse/markdown_tag.py
@@ -5,7 +5,7 @@ from mistletoe.span_token import SpanToken

 class Tag(SpanToken):
    parse_inner = False
-    pattern = re.compile(r"@([^\s]+)")
+    pattern = re.compile(r"@([^\s*\x60~\[\]]+)")


 class TagMarkdownRenderer(MarkdownRenderer):
--- a/test/test_parse.py
+++ b/test/test_parse.py
@@ -44,11 +44,11 @@ class TestParseProcess:
        )

    def test_parse_single_line_with_two_tags(self):
-        test_file = "@Tag1 @Tag2 Hello World"
+        test_file = "@Marker1 @Marker2 Hello World"
        assert parse_markdown_file(self.file_name, test_file) == StreamFile(
            filename=self.file_name,
            shard=Shard(
-                markers=["Tag1", "Tag2"],
+                markers=["Marker1", "Marker2"],
                start_line=1,
                end_line=1,
            ),
@@ -149,7 +149,7 @@ class TestParseProcess:
            ),
        )

-    def test_continue_full_parsing_before_headings_start(self):
+    def test_parse_fullly_before_headings_start(self):
        file_text = "Hello\n\n@Marker1 World!\n\n# @Marker2 I'm a heading!"

        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
@@ -171,7 +171,7 @@ class TestParseProcess:
            ],
        )

-    def test_complex_heading_structure(self):
+    def test_parse_complex_heading_structure(self):
        file_text = "Preamble @Preamble\n## @Intro\n# @Title\n## @Chapter1\n## @Chapter2\n### Section 1\n### Section 2"

        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
@@ -228,3 +228,116 @@ class TestParseProcess:
                )
            ],
        )
+
+    def test_parse_complex_list(self):
+        file_text = """* I'm the parent!
+    * @Marker1 I've got a marker\n
+    * I've got no marker!
+    * I've got a child with a marker!
+        * @Marker2 I'm the child with the marker
+"""
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=[],
+            tags=[],
+            start_line=1,
+            end_line=6,
+            children=[
+                Shard(
+                    markers=[],
+                    tags=[],
+                    start_line=2,
+                    end_line=6,
+                    children=[
+                        Shard(
+                            markers=["Marker1"],
+                            tags=[],
+                            start_line=2,
+                            end_line=3,
+                            children=[],
+                        ),
+                        Shard(
+                            markers=[],
+                            tags=[],
+                            start_line=5,
+                            end_line=6,
+                            children=[
+                                Shard(
+                                    markers=["Marker2"],
+                                    tags=[],
+                                    start_line=6,
+                                    end_line=6,
+                                    children=[],
+                                )
+                            ],
+                        ),
+                    ],
+                )
+            ],
+        )
+
+    def test_parse_ignores_tags_in_code(self):
+        file_text = "```\n@Marker\n```"
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=[],
+            tags=[],
+            start_line=1,
+            end_line=3,
+            children=[],
+        )
+
+    def test_parse_finds_tags_in_italic_text(self):
+        file_text = "*@ItalicMarker*"
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=["ItalicMarker"],
+            tags=[],
+            start_line=1,
+            end_line=1,
+            children=[],
+        )
+
+    def test_parse_finds_tags_in_bold_text(self):
+        file_text = "**@BoldMarker**"
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=["BoldMarker"],
+            tags=[],
+            start_line=1,
+            end_line=1,
+            children=[],
+        )
+
+    def test_parse_finds_tags_in_strikethrough_text(self):
+        file_text = "~~@StrikeMarker~~"
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=["StrikeMarker"],
+            tags=[],
+            start_line=1,
+            end_line=1,
+            children=[],
+        )
+
+    def test_parse_finds_tags_in_link(self):
+        file_text = "[@LinkMarker](https://konstantinfickel.de)"
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=["LinkMarker"],
+            tags=[],
+            start_line=1,
+            end_line=1,
+            children=[],
+        )
+
+    def test_parse_continues_looking_for_markers_after_first_link_marker(self):
+        file_text = "[@LinkMarker1](https://konstantinfickel.de1) [@LinkMarker2](https://konstantinfickel.de)"
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=["LinkMarker1", "LinkMarker2"],
+            tags=[],
+            start_line=1,
+            end_line=1,
+            children=[],
+        )