From 695a28e715df98b673ed550c58f0c4e616131bac Mon Sep 17 00:00:00 2001
From: Konstantin Fickel
Date: Fri, 20 Jun 2025 16:26:17 +0200
Subject: [PATCH] chore: refactor and simplify parsing

Signed-off-by: Konstantin Fickel
---
 src/streamer/parse.py | 47 ++++++++++++++++++++++++-------------------
 test/test_parse.py    |  4 ++--
 2 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/src/streamer/parse.py b/src/streamer/parse.py
index 5ba66f9..326b537 100644
--- a/src/streamer/parse.py
+++ b/src/streamer/parse.py
@@ -91,6 +91,9 @@ def split_at(list_to_be_split: list[T], positions: list[int]):
 def to_shard(
     tokens: list[Token], start_line: int, end_line: int, children: list[Shard] = []
 ) -> Shard:
+    if len(children) == 1 and len(tokens) == 0:
+        return children[0]
+
     markers, tags = extract_markers_and_tags(tokens) if len(tokens) > 0 else ([], [])
 
     return Shard(
@@ -102,6 +105,28 @@ def to_shard(
     )
 
 
+def parse_paragraph_shards(block_tokens: list[BlockToken], end_line: int) -> Shard:
+    start_line = block_tokens[0].line_number
+    shard_starts = find_shard_positions(block_tokens)
+
+    child_shards: list[Shard] = []
+    own_elements: list[BlockToken] = []
+
+    for i in range(len(block_tokens)):
+        token = block_tokens[i]
+        if i in shard_starts:
+            shard_end_line = (
+                block_tokens[i + 1].line_number - 1
+                if i + 1 < len(block_tokens)
+                else end_line
+            )
+            child_shards.append(to_shard([token], token.line_number, shard_end_line))
+        else:
+            own_elements.append(token)
+
+    return to_shard(own_elements, start_line, end_line, children=child_shards)
+
+
 def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
     shard = None
     with TagMarkdownRenderer():
@@ -109,27 +134,7 @@ def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
         line_count = len(file_content.splitlines())
 
         if block_tokens := ast.children:
-            shard_starts = find_shard_positions(block_tokens)
-
-            child_shards: list[Shard] = []
-            own_elements: list[BlockToken] = []
-
-            for i in range(len(block_tokens)):
-                token = block_tokens[i]
-                if i in shard_starts:
-                    end_line = (
-                        block_tokens[i + 1].line_number - 1
-                        if i + 1 < len(block_tokens)
-                        else line_count
-                    )
-                    child_shards.append(to_shard([token], token.line_number, end_line))
-                else:
-                    own_elements.append(token)
-
-            if len(child_shards) == 1 and len(own_elements) == 0:
-                shard = child_shards[0]
-            else:
-                shard = to_shard(own_elements, 1, line_count, children=child_shards)
+            shard = parse_paragraph_shards(block_tokens, line_count)
 
     return StreamFile(shard=shard, filename=file_name)
 
diff --git a/test/test_parse.py b/test/test_parse.py
index 2b5b4e0..b9f24bb 100644
--- a/test/test_parse.py
+++ b/test/test_parse.py
@@ -26,14 +26,14 @@ class TestParseProcess:
         )
 
     def test_parse_basic_multi_line_file(self):
-        test_file = "Hello World\nHello again!"
+        test_file = "Hello World\n\nHello again!"
         assert parse_markdown_file(self.file_name, test_file) == StreamFile(
             filename=self.file_name,
             shard=Shard(
                 markers=[],
                 tags=[],
                 start_line=1,
-                end_line=2,
+                end_line=3,
                 children=[],
             ),
         )