feat: apply full shard parsing to content before the first heading

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
Konstantin Fickel 2025-06-20 21:46:05 +02:00
parent 6d61d67d2e
commit 63ce959d4c
2 changed files with 85 additions and 16 deletions

View file

@ -26,11 +26,11 @@ class TagMarkdownRenderer(MarkdownRenderer):
class Shard(BaseModel):
markers: list[str]
tags: list[str]
markers: list[str] = []
tags: list[str] = []
start_line: int
end_line: int
children: list[Shard]
children: list[Shard] = []
class StreamFile(BaseModel):
@ -101,6 +101,26 @@ def to_shard(
)
def append_children(
    shard: Shard, new_children: list[Shard] | None = None
) -> Shard:
    """Return ``shard`` with ``new_children`` appended after its own children.

    When the combined child list holds exactly one shard and ``shard`` itself
    carries no markers and no tags, that single child is returned directly
    instead of being wrapped in a new parent.

    Args:
        shard: Shard whose markers, tags and line range are kept.
        new_children: Shards to append; ``None`` (the default) appends nothing.

    Returns:
        The lone child in the collapse case described above, otherwise a new
        ``Shard`` holding the combined children. ``shard`` is not modified.
    """
    # A None default avoids sharing one mutable list object between calls.
    combined_children = [*shard.children, *(new_children or [])]

    # A parent without markers or tags adds nothing around a single child.
    if len(combined_children) == 1 and not shard.markers and not shard.tags:
        return combined_children[0]

    return Shard(
        markers=shard.markers,
        tags=shard.tags,
        start_line=shard.start_line,
        end_line=shard.end_line,
        children=combined_children,
    )
def parse_paragraph_shards(block_tokens: list[BlockToken], end_line: int) -> Shard:
start_line = block_tokens[0].line_number
shard_starts = find_shard_positions(block_tokens)
@ -148,15 +168,21 @@ def find_heading_positions(
def parse_header_shards(
block_tokens: list[BlockToken], end_line: int, ignore_first_token: bool = False
block_tokens: list[BlockToken],
start_line: int,
end_line: int,
first_token_is_header: bool = False,
) -> Shard:
if len(block_tokens) == 0:
return Shard(start_line=start_line, end_line=end_line)
max_header_level_with_marker = optional_max(
map(
lambda heading: heading.level,
filter(
lambda block_token: isinstance(block_token, Heading)
and has_markers(block_token),
block_tokens[1:] if ignore_first_token else block_tokens,
block_tokens[1:] if first_token_is_header else block_tokens,
),
)
)
@ -169,12 +195,14 @@ def parse_header_shards(
lambda heading: heading.level,
filter(
lambda block_token: isinstance(block_token, Heading),
block_tokens[1:] if ignore_first_token else block_tokens,
block_tokens[1:] if first_token_is_header else block_tokens,
),
)
)
slice_positions = find_heading_positions(block_tokens, header_level_for_slicing)
if first_token_is_header:
slice_positions.append(1)
is_first_slice_part_of_parent_shard = 0 not in slice_positions
sliced_by_heading_level = split_at(block_tokens, slice_positions)
@ -190,16 +218,25 @@ def parse_header_shards(
child_shards = []
for i in range(len(child_elements)):
child_start_line = child_elements[i][0].line_number
child_end_line = (
child_elements[i + 1][0].line_number - 1
if i + 1 < len(child_elements)
else end_line
)
child_shards.append(
parse_header_shards(child_elements[i], child_end_line, True)
parse_header_shards(
child_elements[i],
child_start_line,
child_end_line,
first_token_is_header=True,
)
)
return to_shard(own_elements, block_tokens[0].line_number, end_line, child_shards)
own_shard = parse_header_shards(
own_elements, start_line, end_line, first_token_is_header=False
)
return append_children(own_shard, child_shards)
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
@ -209,7 +246,7 @@ def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
line_count = len(file_content.splitlines())
if block_tokens := ast.children:
shard = parse_header_shards(block_tokens, line_count)
shard = parse_header_shards(block_tokens, 1, line_count)
return StreamFile(shard=shard, filename=file_name)

View file

@ -142,7 +142,7 @@ class TestParseProcess:
),
)
def test_parse_split_at_headin_if_marker_on_subheading(self):
def test_parse_split_at_heading_if_marker_on_subheading(self):
file_text = "# Heading @Tag1\n\n## @Marker1 Subheading @Tag2\n\n# Heading @Tag3"
assert parse_markdown_file(self.file_name, file_text) == StreamFile(
@ -159,21 +159,24 @@ class TestParseProcess:
start_line=1,
end_line=4,
children=[
Shard(
markers=[],
tags=[],
start_line=2,
end_line=2,
children=[],
),
Shard(
markers=["Marker1"],
tags=["Tag2"],
start_line=3,
end_line=4,
children=[],
)
),
],
),
Shard(
markers=[],
tags=["Tag3"],
start_line=5,
end_line=5,
children=[],
markers=[], tags=["Tag3"], start_line=5, end_line=5, children=[]
),
],
),
@ -192,3 +195,32 @@ class TestParseProcess:
children=[],
),
)
def test_continue_full_parsing_before_headings_start(self):
    """A marker in a paragraph before the first heading gets its own shard."""
    file_text = "Hello\n\n@Marker1 World!\n\n# @Marker2 I'm a heading!"

    parsed = parse_markdown_file(self.file_name, file_text)

    # Expected children: the marked paragraph on line 3, then the marked
    # heading on line 5, both below an unmarked root spanning the file.
    paragraph_shard = Shard(
        markers=["Marker1"],
        tags=[],
        start_line=3,
        end_line=3,
        children=[],
    )
    heading_shard = Shard(
        markers=["Marker2"],
        tags=[],
        start_line=5,
        end_line=5,
        children=[],
    )
    root_shard = Shard(
        markers=[],
        tags=[],
        start_line=1,
        end_line=5,
        children=[paragraph_shard, heading_shard],
    )

    assert parsed == StreamFile(filename=self.file_name, shard=root_shard)