feat: support lists in parsing

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
Konstantin Fickel 2025-06-22 11:36:58 +02:00
parent dc2a97d3b8
commit fa85017ce3
2 changed files with 111 additions and 36 deletions

View file

@ -1,10 +1,10 @@
from typing import Optional
from mistletoe import Document
from mistletoe.block_token import Paragraph, BlockToken, Heading
from mistletoe.block_token import Paragraph, BlockToken, Heading, List, ListItem
from collections import Counter
from .markdown_tag import TagMarkdownRenderer
from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
from .extract_tag import extract_markers_and_tags, has_markers
from .shard import Shard, StreamFile
from .list import split_at
@ -13,6 +13,31 @@ def get_line_number(block_token: BlockToken) -> int:
return block_token.line_number # type: ignore
def build_shard(
    start_line,
    end_line,
    markers: Optional[list[str]] = None,
    tags: Optional[list[str]] = None,
    children: Optional[list[Shard]] = None,
) -> Shard:
    """Construct a Shard for the given line range.

    Collapses to the single child when that child already covers exactly
    the same line span and this level would add no markers or tags —
    avoids wrapping a shard in an identical, information-free parent.

    Args:
        start_line: First line (1-based) covered by the shard.
        end_line: Last line covered by the shard.
        markers: Marker names attached at this level.
        tags: Tag names attached at this level.
        children: Child shards nested inside this one.

    Returns:
        Either the lone, span-identical child or a freshly built Shard.
    """
    # NOTE: defaults were previously mutable lists (`= []`), which are shared
    # across calls; since these lists end up stored inside the returned Shard,
    # mutating a shard built from defaults would have corrupted every later
    # default-built shard. Use None sentinels instead.
    markers = [] if markers is None else markers
    tags = [] if tags is None else tags
    children = [] if children is None else children
    if (
        len(children) == 1
        and len(tags) == 0
        and len(markers) == 0
        and children[0].start_line == start_line
        and children[0].end_line == end_line
    ):
        return children[0]
    return Shard(
        markers=markers,
        tags=tags,
        children=children,
        start_line=start_line,
        end_line=end_line,
    )
def merge_into_first_shard(
shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
):
@ -71,29 +96,61 @@ def calculate_heading_level_for_next_split(
)
def parse_single_block_shards(
    block_token: BlockToken, start_line: int, end_line: int
) -> tuple[Optional[Shard], list[str]]:
    """Parse one block token into a shard (or no shard) plus loose tags.

    For a List token, each list item is parsed recursively via
    parse_multiple_block_shards; an item's span ends one line before the
    next item starts (the last item runs to end_line). For Paragraph and
    Heading tokens, markers and tags are extracted directly.

    Args:
        block_token: The mistletoe block token to parse.
        start_line: First line (1-based) covered by this token.
        end_line: Last line covered by this token.

    Returns:
        (shard, tags): the shard is None when the token carries no markers
        and produced no child shards; in that case the extracted tags are
        returned loose so the caller can attach them to an enclosing shard.
        When a shard is built, its tags are folded in and the loose-tag
        list is empty.
    """
    # This span previously contained diff residue: the superseded old
    # signature (a `def` with no body) and an unreachable old `return`
    # statement; both are removed here.
    markers, tags, children = [], [], []
    if isinstance(block_token, List):
        list_items: list[ListItem] = (  # type: ignore
            list(block_token.children) if block_token.children is not None else []
        )
        for index, list_item in enumerate(list_items):
            list_item_start_line = get_line_number(list_item)
            # An item ends just before its successor begins; the final item
            # extends to the end of the whole list block.
            list_item_end_line = (
                get_line_number(list_items[index + 1]) - 1
                if index + 1 < len(list_items)
                else end_line
            )
            list_item_shard, list_item_tags = parse_multiple_block_shards(
                list_item.children,  # type: ignore
                list_item_start_line,
                list_item_end_line,
            )
            if list_item_shard is not None:
                children.append(list_item_shard)
            # Collected unconditionally so tags bubble up even from items
            # that yielded no shard. NOTE(review): the flattened diff makes
            # the original indentation ambiguous — confirm this was not
            # nested under the `is not None` check.
            tags.extend(list_item_tags)
    elif isinstance(block_token, (Paragraph, Heading)):
        markers, tags = extract_markers_and_tags(block_token)
    if len(markers) == 0 and len(children) == 0:
        # Nothing shard-worthy here; hand the tags back to the caller.
        return None, tags
    return build_shard(
        start_line, end_line, markers=markers, tags=tags, children=children
    ), []
def parse_paragraph_shards(
block_tokens: list[BlockToken], start_line: int, end_line: int
) -> Optional[Shard]:
def parse_multiple_block_shards(
block_tokens: list[BlockToken],
start_line: int,
end_line: int,
enforce_shard: bool = False,
) -> tuple[Optional[Shard], list[str]]:
is_first_block_heading = isinstance(block_tokens[0], Heading) and has_markers(
block_tokens[0]
)
paragraph_positions = find_paragraph_shard_positions(block_tokens)
children = []
added_tags = []
children, tags = [], []
is_first_block_only_with_marker = False
for i, token in enumerate(block_tokens):
if i in paragraph_positions:
is_first_block_heading = i == 0
is_first_block_only_with_marker = i == 0
if i in paragraph_positions or (i == 0 and is_first_block_heading):
child_start_line = get_line_number(token)
child_end_line = (
get_line_number(block_tokens[i + 1]) - 1
@ -101,20 +158,21 @@ def parse_paragraph_shards(
else end_line
)
children.append(
parse_single_block_shards(token, child_start_line, child_end_line)
child_shard, child_tags = parse_single_block_shards(
token, child_start_line, child_end_line
)
elif token.children:
added_tags.extend(extract_tags(token.children))
if len(children) == 0 and len(added_tags) == 0:
return None
if child_shard is not None:
children.append(child_shard)
if len(child_tags) > 0:
tags.extend(child_tags)
if len(children) == 0 and not enforce_shard:
return None, tags
if is_first_block_heading or is_first_block_only_with_marker:
return merge_into_first_shard(children, start_line, end_line, added_tags)
return merge_into_first_shard(children, start_line, end_line, tags), []
else:
return Shard(
start_line=start_line, end_line=end_line, children=children, tags=added_tags
)
return build_shard(start_line, end_line, tags=tags, children=children), []
def parse_header_shards(
@ -124,12 +182,14 @@ def parse_header_shards(
use_first_child_as_header: bool = False,
) -> Optional[Shard]:
if len(block_tokens) == 0:
return Shard(start_line=start_line, end_line=end_line)
return build_shard(start_line, end_line)
split_at_heading_level = calculate_heading_level_for_next_split(block_tokens)
if split_at_heading_level is None:
return parse_paragraph_shards(block_tokens, start_line, end_line)
return parse_multiple_block_shards(
block_tokens, start_line, end_line, enforce_shard=True
)[0]
heading_positions = find_headings_by_level(block_tokens, split_at_heading_level)
@ -154,11 +214,11 @@ def parse_header_shards(
if use_first_child_as_header and len(children) > 0:
return merge_into_first_shard(children, start_line, end_line)
else:
return Shard(start_line=start_line, end_line=end_line, children=children)
return build_shard(start_line, end_line, children=children)
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
shard = Shard(start_line=1, end_line=max([len(file_content.splitlines()), 1]))
shard = build_shard(1, max([len(file_content.splitlines()), 1]))
with TagMarkdownRenderer():
ast = Document(file_content)

View file

@ -213,3 +213,18 @@ class TestParseProcess:
),
],
)
def test_simple_list(self):
    """A nested marker item becomes a child shard spanning only its own line."""
    file_text = "* hello world\n * @Marker i've got a marker"
    marker_child = Shard(
        markers=["Marker"], tags=[], start_line=2, end_line=2, children=[]
    )
    expected = Shard(
        markers=[],
        tags=[],
        start_line=1,
        end_line=2,
        children=[marker_child],
    )
    result = parse_markdown_file(self.file_name, file_text)
    assert result.shard == expected