diff --git a/src/streamer/parse.py b/src/streamer/parse.py index 6d6d59e..949a655 100644 --- a/src/streamer/parse.py +++ b/src/streamer/parse.py @@ -26,11 +26,11 @@ class TagMarkdownRenderer(MarkdownRenderer): class Shard(BaseModel): - markers: list[str] - tags: list[str] + markers: list[str] = [] + tags: list[str] = [] start_line: int end_line: int - children: list[Shard] + children: list[Shard] = [] class StreamFile(BaseModel): @@ -101,6 +101,26 @@ def to_shard( ) +def append_children(shard: Shard, new_children: list[Shard] = []) -> Shard: + shard_children = shard.children if len(shard.children) > 0 else [] + new_shard_children = shard_children + new_children + + if ( + len(new_shard_children) == 1 + and len(shard.markers) == 0 + and len(shard.tags) == 0 + ): + return new_shard_children[0] + + return Shard( + markers=shard.markers, + tags=shard.tags, + start_line=shard.start_line, + end_line=shard.end_line, + children=new_shard_children, + ) + + def parse_paragraph_shards(block_tokens: list[BlockToken], end_line: int) -> Shard: start_line = block_tokens[0].line_number shard_starts = find_shard_positions(block_tokens) @@ -148,15 +168,21 @@ def find_heading_positions( def parse_header_shards( - block_tokens: list[BlockToken], end_line: int, ignore_first_token: bool = False + block_tokens: list[BlockToken], + start_line: int, + end_line: int, + first_token_is_header: bool = False, ) -> Shard: + if len(block_tokens) == 0: + return Shard(start_line=start_line, end_line=end_line) + max_header_level_with_marker = optional_max( map( lambda heading: heading.level, filter( lambda block_token: isinstance(block_token, Heading) and has_markers(block_token), - block_tokens[1:] if ignore_first_token else block_tokens, + block_tokens[1:] if first_token_is_header else block_tokens, ), ) ) @@ -169,12 +195,14 @@ def parse_header_shards( lambda heading: heading.level, filter( lambda block_token: isinstance(block_token, Heading), - block_tokens[1:] if ignore_first_token 
else block_tokens, + block_tokens[1:] if first_token_is_header else block_tokens, ), ) ) slice_positions = find_heading_positions(block_tokens, header_level_for_slicing) + if first_token_is_header: + slice_positions.append(1) is_first_slice_part_of_parent_shard = 0 not in slice_positions sliced_by_heading_level = split_at(block_tokens, slice_positions) @@ -190,16 +218,25 @@ def parse_header_shards( child_shards = [] for i in range(len(child_elements)): + child_start_line = child_elements[i][0].line_number child_end_line = ( child_elements[i + 1][0].line_number - 1 if i + 1 < len(child_elements) else end_line ) child_shards.append( - parse_header_shards(child_elements[i], child_end_line, True) + parse_header_shards( + child_elements[i], + child_start_line, + child_end_line, + first_token_is_header=True, + ) ) - return to_shard(own_elements, block_tokens[0].line_number, end_line, child_shards) + own_shard = parse_header_shards( + own_elements, start_line, end_line, first_token_is_header=False + ) + return append_children(own_shard, child_shards) def parse_markdown_file(file_name: str, file_content: str) -> StreamFile: @@ -209,7 +246,7 @@ def parse_markdown_file(file_name: str, file_content: str) -> StreamFile: line_count = len(file_content.splitlines()) if block_tokens := ast.children: - shard = parse_header_shards(block_tokens, line_count) + shard = parse_header_shards(block_tokens, 1, line_count) return StreamFile(shard=shard, filename=file_name) diff --git a/test/test_parse.py b/test/test_parse.py index d4caf07..8beef74 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -142,7 +142,7 @@ class TestParseProcess: ), ) - def test_parse_split_at_headin_if_marker_on_subheading(self): + def test_parse_split_at_heading_if_marker_on_subheading(self): file_text = "# Heading @Tag1\n\n## @Marker1 Subheading @Tag2\n\n# Heading @Tag3" assert parse_markdown_file(self.file_name, file_text) == StreamFile( @@ -159,21 +159,24 @@ class TestParseProcess: start_line=1, 
end_line=4, children=[ + Shard( + markers=[], + tags=[], + start_line=2, + end_line=2, + children=[], + ), Shard( markers=["Marker1"], tags=["Tag2"], start_line=3, end_line=4, children=[], - ) + ), ], ), Shard( - markers=[], - tags=["Tag3"], - start_line=5, - end_line=5, - children=[], + markers=[], tags=["Tag3"], start_line=5, end_line=5, children=[] ), ], ), @@ -192,3 +195,32 @@ class TestParseProcess: children=[], ), ) + + def test_continue_full_parsing_before_headings_start(self): + file_text = "Hello\n\n@Marker1 World!\n\n# @Marker2 I'm a heading!" + + assert parse_markdown_file(self.file_name, file_text) == StreamFile( + filename=self.file_name, + shard=Shard( + markers=[], + tags=[], + start_line=1, + end_line=5, + children=[ + Shard( + markers=["Marker1"], + tags=[], + start_line=3, + end_line=3, + children=[], + ), + Shard( + markers=["Marker2"], + tags=[], + start_line=5, + end_line=5, + children=[], + ), + ], + ), + )