From 8f5a000c5cf83d930de77bb73db4d1389cbc0bc1 Mon Sep 17 00:00:00 2001 From: Konstantin Fickel Date: Sun, 22 Jun 2025 12:28:21 +0200 Subject: [PATCH] feat: also parse within formatting Signed-off-by: Konstantin Fickel --- src/streamer/parse/extract_tag.py | 97 ++++++++++++++++------- src/streamer/parse/markdown_tag.py | 2 +- test/test_parse.py | 121 ++++++++++++++++++++++++++++- 3 files changed, 186 insertions(+), 34 deletions(-) diff --git a/src/streamer/parse/extract_tag.py b/src/streamer/parse/extract_tag.py index b4fd526..ace1258 100644 --- a/src/streamer/parse/extract_tag.py +++ b/src/streamer/parse/extract_tag.py @@ -1,45 +1,84 @@ import re from typing import Iterable from mistletoe.block_token import BlockToken -from mistletoe.span_token import RawText +from mistletoe.span_token import Emphasis, RawText, Strikethrough, Strong, Link from mistletoe.token import Token from .markdown_tag import Tag -def extract_tags(tokens: Iterable[Token]) -> list[str]: - return [token.content for token in tokens if isinstance(token, Tag)] +def extract_markers_and_tags_from_single_token( + token: Token, + marker_boundary_encountered: bool, + return_at_first_marker: bool = False, +) -> tuple[list[str], list[str], bool]: + result_markers, result_tags = [], [] + result_marker_boundary_encountered = marker_boundary_encountered + + if isinstance(token, Tag): + if marker_boundary_encountered: + result_tags.append(token.content) + else: + result_markers.append(token.content) + elif isinstance(token, (Emphasis, Strong, Strikethrough, Link)): + markers, tags, child_marker_boundary_encountered = ( + extract_markers_and_tags_from_tokens( + token.children or [], + marker_boundary_encountered, + return_at_first_marker, + ) + ) + result_markers.extend(markers) + result_tags.extend(tags) + result_marker_boundary_encountered = ( + marker_boundary_encountered or child_marker_boundary_encountered + ) + elif isinstance(token, RawText) and re.match(r"^[\s]*$", token.content): + pass + else: + result_marker_boundary_encountered = True + + return result_markers, result_tags, result_marker_boundary_encountered + + +def extract_markers_and_tags_from_tokens( + tokens: Iterable[Token], + marker_boundary_encountered: bool, + return_at_first_marker: bool = False, +) -> tuple[list[str], list[str], bool]: + result_markers, result_tags = [], [] + result_marker_boundary_encountered = marker_boundary_encountered + + for child in tokens: + markers, tags, child_marker_boundary_encountered = ( + extract_markers_and_tags_from_single_token( + child, result_marker_boundary_encountered, return_at_first_marker + ) + ) + result_markers.extend(markers) + result_tags.extend(tags) + result_marker_boundary_encountered = ( + marker_boundary_encountered or child_marker_boundary_encountered + ) + + if len(result_markers) > 0 and return_at_first_marker: + break + + return result_markers, result_tags, result_marker_boundary_encountered def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]: - markers, tags = [], [] - is_marker = True - - if block_token.children is None: - return [], [] - - for token in block_token.children: - if isinstance(token, Tag): - if is_marker: - markers.append(token) - else: - tags.append(token) - elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)): - is_marker = False - - return extract_tags(markers), extract_tags(tags) + markers, tags, _ = extract_markers_and_tags_from_tokens( + block_token.children or [], False + ) + return markers, tags def has_markers(block_token: BlockToken) -> bool: - if block_token.children is None: - return False - - for child in block_token.children: - if isinstance(child, Tag): - return True - elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)): - return False - return False + markers, _, _ = extract_markers_and_tags_from_tokens( + block_token.children or [], False, return_at_first_marker=True + ) + return len(markers) > 0 -__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"] +__all__ = ["extract_markers_and_tags", "has_markers"] diff --git a/src/streamer/parse/markdown_tag.py b/src/streamer/parse/markdown_tag.py index 21f88b5..4de6d35 100644 --- a/src/streamer/parse/markdown_tag.py +++ b/src/streamer/parse/markdown_tag.py @@ -5,7 +5,7 @@ from mistletoe.span_token import SpanToken class Tag(SpanToken): parse_inner = False - pattern = re.compile(r"@([^\s]+)") + pattern = re.compile(r"@([^\s*\x60~\[\]]+)") class TagMarkdownRenderer(MarkdownRenderer): diff --git a/test/test_parse.py b/test/test_parse.py index 7a5fb6e..e9c27ba 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -44,11 +44,11 @@ class TestParseProcess: ) def test_parse_single_line_with_two_tags(self): - test_file = "@Tag1 @Tag2 Hello World" + test_file = "@Marker1 @Marker2 Hello World" assert parse_markdown_file(self.file_name, test_file) == StreamFile( filename=self.file_name, shard=Shard( - markers=["Tag1", "Tag2"], + markers=["Marker1", "Marker2"], start_line=1, end_line=1, ), @@ -149,7 +149,7 @@ class TestParseProcess: ), ) - def test_continue_full_parsing_before_headings_start(self): + def test_parse_fullly_before_headings_start(self): file_text = "Hello\n\n@Marker1 World!\n\n# @Marker2 I'm a heading!" assert parse_markdown_file(self.file_name, file_text).shard == Shard( @@ -171,7 +171,7 @@ class TestParseProcess: ], ) - def test_complex_heading_structure(self): + def test_parse_complex_heading_structure(self): file_text = "Preamble @Preamble\n## @Intro\n# @Title\n## @Chapter1\n## @Chapter2\n### Section 1\n### Section 2" assert parse_markdown_file(self.file_name, file_text).shard == Shard( @@ -228,3 +228,116 @@ class TestParseProcess: ) ], ) + + def test_parse_complex_list(self): + file_text = """* I'm the parent! + * @Marker1 I've got a marker\n + * I've got no marker! + * I've got a child with a marker! + * @Marker2 I'm the child with the marker +""" + + assert parse_markdown_file(self.file_name, file_text).shard == Shard( + markers=[], + tags=[], + start_line=1, + end_line=6, + children=[ + Shard( + markers=[], + tags=[], + start_line=2, + end_line=6, + children=[ + Shard( + markers=["Marker1"], + tags=[], + start_line=2, + end_line=3, + children=[], + ), + Shard( + markers=[], + tags=[], + start_line=5, + end_line=6, + children=[ + Shard( + markers=["Marker2"], + tags=[], + start_line=6, + end_line=6, + children=[], + ) + ], + ), + ], + ) + ], + ) + + def test_parse_ignores_tags_in_code(self): + file_text = "```\n@Marker\n```" + + assert parse_markdown_file(self.file_name, file_text).shard == Shard( + markers=[], + tags=[], + start_line=1, + end_line=3, + children=[], + ) + + def test_parse_finds_tags_in_italic_text(self): + file_text = "*@ItalicMarker*" + + assert parse_markdown_file(self.file_name, file_text).shard == Shard( + markers=["ItalicMarker"], + tags=[], + start_line=1, + end_line=1, + children=[], + ) + + def test_parse_finds_tags_in_bold_text(self): + file_text = "**@BoldMarker**" + + assert parse_markdown_file(self.file_name, file_text).shard == Shard( + markers=["BoldMarker"], + tags=[], + start_line=1, + end_line=1, + children=[], + ) + + def test_parse_finds_tags_in_strikethrough_text(self): + file_text = "~~@StrikeMarker~~" + + assert parse_markdown_file(self.file_name, file_text).shard == Shard( + markers=["StrikeMarker"], + tags=[], + start_line=1, + end_line=1, + children=[], + ) + + def test_parse_finds_tags_in_link(self): + file_text = "[@LinkMarker](https://konstantinfickel.de)" + + assert parse_markdown_file(self.file_name, file_text).shard == Shard( + markers=["LinkMarker"], + tags=[], + start_line=1, + end_line=1, + children=[], + ) + + def test_parse_continues_looking_for_markers_after_first_link_marker(self): + file_text = "[@LinkMarker1](https://konstantinfickel.de1) [@LinkMarker2](https://konstantinfickel.de)" + + assert parse_markdown_file(self.file_name, file_text).shard == Shard( + markers=["LinkMarker1", "LinkMarker2"], + tags=[], + start_line=1, + end_line=1, + children=[], + )