feat: support lists in parsing
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
dc2a97d3b8
commit
fa85017ce3
2 changed files with 111 additions and 36 deletions
|
|
@ -1,10 +1,10 @@
|
|||
from typing import Optional
|
||||
from mistletoe import Document
|
||||
from mistletoe.block_token import Paragraph, BlockToken, Heading
|
||||
from mistletoe.block_token import Paragraph, BlockToken, Heading, List, ListItem
|
||||
from collections import Counter
|
||||
|
||||
from .markdown_tag import TagMarkdownRenderer
|
||||
from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
|
||||
from .extract_tag import extract_markers_and_tags, has_markers
|
||||
from .shard import Shard, StreamFile
|
||||
from .list import split_at
|
||||
|
||||
|
|
@ -13,6 +13,31 @@ def get_line_number(block_token: BlockToken) -> int:
|
|||
return block_token.line_number # type: ignore
|
||||
|
||||
|
||||
def build_shard(
    start_line,
    end_line,
    markers: Optional[list[str]] = None,
    tags: Optional[list[str]] = None,
    children: Optional[list[Shard]] = None,
) -> Shard:
    """Build the shard spanning ``start_line`` to ``end_line``.

    If the shard would carry no markers and no tags and would wrap exactly
    one child covering the very same line range, that child is returned
    directly instead of adding a redundant wrapper level.

    Args:
        start_line: First line covered by the shard.
        end_line: Last line covered by the shard.
        markers: Markers of the shard; defaults to a fresh empty list.
        tags: Tags of the shard; defaults to a fresh empty list.
        children: Child shards; defaults to a fresh empty list.

    Returns:
        The single child when wrapping it would be redundant, otherwise a
        new ``Shard`` built from the given values.
    """
    # Use None sentinels rather than ``[]`` defaults: a default list is
    # created once at definition time and would be shared by every shard
    # built without explicit arguments.
    markers = [] if markers is None else markers
    tags = [] if tags is None else tags
    children = [] if children is None else children

    wraps_single_identical_child = (
        len(children) == 1
        and len(tags) == 0
        and len(markers) == 0
        and children[0].start_line == start_line
        and children[0].end_line == end_line
    )
    if wraps_single_identical_child:
        return children[0]

    return Shard(
        markers=markers,
        tags=tags,
        children=children,
        start_line=start_line,
        end_line=end_line,
    )
|
||||
|
||||
|
||||
def merge_into_first_shard(
|
||||
shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
|
||||
):
|
||||
|
|
@ -71,29 +96,61 @@ def calculate_heading_level_for_next_split(
|
|||
)
|
||||
|
||||
|
||||
def parse_single_block_shards(
    block_token: BlockToken, start_line: int, end_line: int
) -> tuple[Optional[Shard], list[str]]:
    """Parse a single block token into an optional shard and leftover tags.

    A ``List`` token is parsed item by item via
    ``parse_multiple_block_shards``; every resulting item shard becomes a
    child, and the tags returned for the items are collected. A
    ``Paragraph`` or ``Heading`` token contributes its own markers and tags.
    Any other token type contributes nothing.

    Args:
        block_token: The block token to parse.
        start_line: First line covered by the token.
        end_line: Last line covered by the token.

    Returns:
        ``(None, tags)`` when the token yields neither markers nor child
        shards, so the caller can attach the tags elsewhere; otherwise
        ``(shard, [])`` with the tags stored on the shard itself.
    """
    markers, tags, children = [], [], []

    if isinstance(block_token, List):
        list_items: list[ListItem] = (  # type: ignore
            list(block_token.children) if block_token.children is not None else []
        )
        for index, list_item in enumerate(list_items):
            item_tokens = list_item.children
            if not item_tokens:
                # parse_multiple_block_shards indexes block_tokens[0], so an
                # item without child tokens (presumably an empty bullet;
                # confirm against mistletoe) would raise instead of being
                # ignored.
                continue

            list_item_start_line = get_line_number(list_item)
            # An item ends right before the next item starts; the last item
            # runs to the end of the whole list.
            list_item_end_line = (
                get_line_number(list_items[index + 1]) - 1
                if index + 1 < len(list_items)
                else end_line
            )
            list_item_shard, list_item_tags = parse_multiple_block_shards(
                item_tokens,  # type: ignore
                list_item_start_line,
                list_item_end_line,
            )
            if list_item_shard is not None:
                children.append(list_item_shard)
            tags.extend(list_item_tags)

    elif isinstance(block_token, (Paragraph, Heading)):
        markers, tags = extract_markers_and_tags(block_token)

    if len(markers) == 0 and len(children) == 0:
        return None, tags

    return build_shard(
        start_line, end_line, markers=markers, tags=tags, children=children
    ), []
|
||||
|
||||
|
||||
def parse_paragraph_shards(
|
||||
block_tokens: list[BlockToken], start_line: int, end_line: int
|
||||
) -> Optional[Shard]:
|
||||
def parse_multiple_block_shards(
|
||||
block_tokens: list[BlockToken],
|
||||
start_line: int,
|
||||
end_line: int,
|
||||
enforce_shard: bool = False,
|
||||
) -> tuple[Optional[Shard], list[str]]:
|
||||
is_first_block_heading = isinstance(block_tokens[0], Heading) and has_markers(
|
||||
block_tokens[0]
|
||||
)
|
||||
|
||||
paragraph_positions = find_paragraph_shard_positions(block_tokens)
|
||||
children = []
|
||||
added_tags = []
|
||||
children, tags = [], []
|
||||
|
||||
is_first_block_only_with_marker = False
|
||||
|
||||
for i, token in enumerate(block_tokens):
|
||||
if i in paragraph_positions:
|
||||
is_first_block_heading = i == 0
|
||||
is_first_block_only_with_marker = i == 0
|
||||
|
||||
if i in paragraph_positions or (i == 0 and is_first_block_heading):
|
||||
child_start_line = get_line_number(token)
|
||||
child_end_line = (
|
||||
get_line_number(block_tokens[i + 1]) - 1
|
||||
|
|
@ -101,20 +158,21 @@ def parse_paragraph_shards(
|
|||
else end_line
|
||||
)
|
||||
|
||||
children.append(
|
||||
parse_single_block_shards(token, child_start_line, child_end_line)
|
||||
child_shard, child_tags = parse_single_block_shards(
|
||||
token, child_start_line, child_end_line
|
||||
)
|
||||
elif token.children:
|
||||
added_tags.extend(extract_tags(token.children))
|
||||
|
||||
if len(children) == 0 and len(added_tags) == 0:
|
||||
return None
|
||||
if child_shard is not None:
|
||||
children.append(child_shard)
|
||||
if len(child_tags) > 0:
|
||||
tags.extend(child_tags)
|
||||
|
||||
if len(children) == 0 and not enforce_shard:
|
||||
return None, tags
|
||||
if is_first_block_heading or is_first_block_only_with_marker:
|
||||
return merge_into_first_shard(children, start_line, end_line, added_tags)
|
||||
return merge_into_first_shard(children, start_line, end_line, tags), []
|
||||
else:
|
||||
return Shard(
|
||||
start_line=start_line, end_line=end_line, children=children, tags=added_tags
|
||||
)
|
||||
return build_shard(start_line, end_line, tags=tags, children=children), []
|
||||
|
||||
|
||||
def parse_header_shards(
|
||||
|
|
@ -124,12 +182,14 @@ def parse_header_shards(
|
|||
use_first_child_as_header: bool = False,
|
||||
) -> Optional[Shard]:
|
||||
if len(block_tokens) == 0:
|
||||
return Shard(start_line=start_line, end_line=end_line)
|
||||
return build_shard(start_line, end_line)
|
||||
|
||||
split_at_heading_level = calculate_heading_level_for_next_split(block_tokens)
|
||||
|
||||
if split_at_heading_level is None:
|
||||
return parse_paragraph_shards(block_tokens, start_line, end_line)
|
||||
return parse_multiple_block_shards(
|
||||
block_tokens, start_line, end_line, enforce_shard=True
|
||||
)[0]
|
||||
|
||||
heading_positions = find_headings_by_level(block_tokens, split_at_heading_level)
|
||||
|
||||
|
|
@ -154,11 +214,11 @@ def parse_header_shards(
|
|||
if use_first_child_as_header and len(children) > 0:
|
||||
return merge_into_first_shard(children, start_line, end_line)
|
||||
else:
|
||||
return Shard(start_line=start_line, end_line=end_line, children=children)
|
||||
return build_shard(start_line, end_line, children=children)
|
||||
|
||||
|
||||
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
|
||||
shard = Shard(start_line=1, end_line=max([len(file_content.splitlines()), 1]))
|
||||
shard = build_shard(1, max([len(file_content.splitlines()), 1]))
|
||||
|
||||
with TagMarkdownRenderer():
|
||||
ast = Document(file_content)
|
||||
|
|
|
|||
|
|
@ -213,3 +213,18 @@ class TestParseProcess:
|
|||
),
|
||||
],
|
||||
)
|
||||
|
||||
def test_simple_list(self):
    """A list item carrying a marker becomes the only child shard.

    The root shard spans both lines without markers or tags; its single
    child covers line 2 and holds the ``Marker`` marker.
    """
    file_text = "* hello world\n * @Marker i've got a marker"

    marker_item_shard = Shard(
        markers=["Marker"], tags=[], start_line=2, end_line=2, children=[]
    )
    expected_root = Shard(
        markers=[],
        tags=[],
        start_line=1,
        end_line=2,
        children=[marker_item_shard],
    )

    parsed_file = parse_markdown_file(self.file_name, file_text)

    assert parsed_file.shard == expected_root
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue