NOTE(review): line breaks, indentation and blank context lines were rebuilt from a
whitespace-collapsed copy of this patch. Hunk counts are self-consistent, but the
placement of blank context lines is a best guess - regenerate against the real tree.

diff --git a/src/streamer/parse/parse.py b/src/streamer/parse/parse.py
index 09e2541..08dbb26 100644
--- a/src/streamer/parse/parse.py
+++ b/src/streamer/parse/parse.py
@@ -1,10 +1,10 @@
 from typing import Optional
 from mistletoe import Document
-from mistletoe.block_token import Paragraph, BlockToken, Heading
+from mistletoe.block_token import Paragraph, BlockToken, Heading, List, ListItem
 from collections import Counter
 
 from .markdown_tag import TagMarkdownRenderer
-from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
+from .extract_tag import extract_markers_and_tags, has_markers
 from .shard import Shard, StreamFile
 from .list import split_at
 
@@ -13,6 +13,42 @@ def get_line_number(block_token: BlockToken) -> int:
     return block_token.line_number  # type: ignore
 
 
+def build_shard(
+    start_line,
+    end_line,
+    markers: Optional[list[str]] = None,
+    tags: Optional[list[str]] = None,
+    children: Optional[list[Shard]] = None,
+) -> Shard:
+    """Build the shard for a line range, dropping a redundant wrapper.
+
+    If the shard would have no markers or tags of its own and exactly one
+    child spanning the same lines, that child is returned unchanged instead.
+    """
+    # None defaults instead of `[]`: a default list object is created once
+    # and would otherwise be shared by every shard built without arguments.
+    markers = [] if markers is None else markers
+    tags = [] if tags is None else tags
+    children = [] if children is None else children
+
+    if (
+        len(children) == 1
+        and len(tags) == 0
+        and len(markers) == 0
+        and children[0].start_line == start_line
+        and children[0].end_line == end_line
+    ):
+        return children[0]
+
+    return Shard(
+        markers=markers,
+        tags=tags,
+        children=children,
+        start_line=start_line,
+        end_line=end_line,
+    )
+
+
 def merge_into_first_shard(
     shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
 ):
@@ -71,51 +107,102 @@ def calculate_heading_level_for_next_split(
     )
 
 
-def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
-    markers, tags = extract_markers_and_tags(block_token)
-    return Shard(start_line=start_line, end_line=end_line, markers=markers, tags=tags)
+def parse_single_block_shards(
+    block_token: BlockToken, start_line: int, end_line: int
+) -> tuple[Optional[Shard], list[str]]:
+    """Parse a single block token into a shard and leftover tags.
+
+    A list yields a child shard for every item that produces one; a paragraph
+    or heading contributes its own markers and tags. Returns ``(None, tags)``
+    when there are neither markers nor children, so the caller can claim the
+    tags, and ``(shard, [])`` otherwise.
+    """
+    markers, tags, children = [], [], []
+
+    if isinstance(block_token, List):
+        list_items: list[ListItem] = (  # type: ignore
+            list(block_token.children) if block_token.children is not None else []
+        )
+        for index, list_item in enumerate(list_items):
+            list_item_start_line = get_line_number(list_item)
+            list_item_end_line = (
+                get_line_number(list_items[index + 1]) - 1
+                if index + 1 < len(list_items)
+                else end_line
+            )
+            list_item_shard, list_item_tags = parse_multiple_block_shards(
+                list_item.children,  # type: ignore
+                list_item_start_line,
+                list_item_end_line,
+            )
+            if list_item_shard is not None:
+                children.append(list_item_shard)
+            tags.extend(list_item_tags)
+
+    elif isinstance(block_token, (Paragraph, Heading)):
+        markers, tags = extract_markers_and_tags(block_token)
+
+    if len(markers) == 0 and len(children) == 0:
+        return None, tags
+
+    return build_shard(
+        start_line, end_line, markers=markers, tags=tags, children=children
+    ), []
 
 
-def parse_paragraph_shards(
-    block_tokens: list[BlockToken], start_line: int, end_line: int
-) -> Optional[Shard]:
+def parse_multiple_block_shards(
+    block_tokens: list[BlockToken],
+    start_line: int,
+    end_line: int,
+    enforce_shard: bool = False,
+) -> tuple[Optional[Shard], list[str]]:
+    """Parse consecutive block tokens into one shard and leftover tags.
+
+    Returns ``(None, tags)`` when no child shard was found and
+    ``enforce_shard`` is false; otherwise ``(shard, [])`` with the collected
+    tags handed to the shard.
+    """
+    # An empty token sequence (e.g. from a list item without content) would
+    # make block_tokens[0] below raise IndexError.
+    if len(block_tokens) == 0:
+        return (build_shard(start_line, end_line) if enforce_shard else None), []
+
     is_first_block_heading = isinstance(block_tokens[0], Heading) and has_markers(
         block_tokens[0]
     )
 
     paragraph_positions = find_paragraph_shard_positions(block_tokens)
 
-    children = []
-    added_tags = []
+    children, tags = [], []
 
     is_first_block_only_with_marker = False
 
     for i, token in enumerate(block_tokens):
         if i in paragraph_positions:
-            is_first_block_heading = i == 0
+            is_first_block_only_with_marker = i == 0
 
-        if i in paragraph_positions or (i == 0 and is_first_block_heading):
-            child_start_line = get_line_number(token)
-            child_end_line = (
-                get_line_number(block_tokens[i + 1]) - 1
-                if i + 1 < len(block_tokens)
-                else end_line
-            )
-
-            children.append(
-                parse_single_block_shards(token, child_start_line, child_end_line)
-            )
-        elif token.children:
-            added_tags.extend(extract_tags(token.children))
-
-    if len(children) == 0 and len(added_tags) == 0:
-        return None
-    if is_first_block_heading or is_first_block_only_with_marker:
-        return merge_into_first_shard(children, start_line, end_line, added_tags)
-    else:
-        return Shard(
-            start_line=start_line, end_line=end_line, children=children, tags=added_tags
+        child_start_line = get_line_number(token)
+        child_end_line = (
+            get_line_number(block_tokens[i + 1]) - 1
+            if i + 1 < len(block_tokens)
+            else end_line
         )
+        child_shard, child_tags = parse_single_block_shards(
+            token, child_start_line, child_end_line
+        )
+
+        if child_shard is not None:
+            children.append(child_shard)
+        if len(child_tags) > 0:
+            tags.extend(child_tags)
+
+    if len(children) == 0 and not enforce_shard:
+        return None, tags
+    if is_first_block_heading or is_first_block_only_with_marker:
+        return merge_into_first_shard(children, start_line, end_line, tags), []
+    else:
+        return build_shard(start_line, end_line, tags=tags, children=children), []
+
 
 def parse_header_shards(
     block_tokens: list[BlockToken],
@@ -124,12 +211,14 @@ def parse_header_shards(
     use_first_child_as_header: bool = False,
 ) -> Optional[Shard]:
     if len(block_tokens) == 0:
-        return Shard(start_line=start_line, end_line=end_line)
+        return build_shard(start_line, end_line)
 
     split_at_heading_level = calculate_heading_level_for_next_split(block_tokens)
 
     if split_at_heading_level is None:
-        return parse_paragraph_shards(block_tokens, start_line, end_line)
+        return parse_multiple_block_shards(
+            block_tokens, start_line, end_line, enforce_shard=True
+        )[0]
 
     heading_positions = find_headings_by_level(block_tokens, split_at_heading_level)
 
@@ -154,11 +243,11 @@ def parse_header_shards(
     if use_first_child_as_header and len(children) > 0:
         return merge_into_first_shard(children, start_line, end_line)
     else:
-        return Shard(start_line=start_line, end_line=end_line, children=children)
+        return build_shard(start_line, end_line, children=children)
 
 
 def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
-    shard = Shard(start_line=1, end_line=max([len(file_content.splitlines()), 1]))
+    shard = build_shard(1, max([len(file_content.splitlines()), 1]))
 
     with TagMarkdownRenderer():
         ast = Document(file_content)
diff --git a/test/test_parse.py b/test/test_parse.py
index e7b6562..7a5fb6e 100644
--- a/test/test_parse.py
+++ b/test/test_parse.py
@@ -213,3 +213,18 @@ class TestParseProcess:
                 ),
             ],
         )
+
+    def test_simple_list(self):
+        file_text = "* hello world\n * @Marker i've got a marker"
+
+        assert parse_markdown_file(self.file_name, file_text).shard == Shard(
+            markers=[],
+            tags=[],
+            start_line=1,
+            end_line=2,
+            children=[
+                Shard(
+                    markers=["Marker"], tags=[], start_line=2, end_line=2, children=[]
+                )
+            ],
+        )