feat: support lists in parsing
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
dc2a97d3b8
commit
fa85017ce3
2 changed files with 111 additions and 36 deletions
|
|
@ -1,10 +1,10 @@
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from mistletoe import Document
|
from mistletoe import Document
|
||||||
from mistletoe.block_token import Paragraph, BlockToken, Heading
|
from mistletoe.block_token import Paragraph, BlockToken, Heading, List, ListItem
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
|
||||||
from .markdown_tag import TagMarkdownRenderer
|
from .markdown_tag import TagMarkdownRenderer
|
||||||
from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
|
from .extract_tag import extract_markers_and_tags, has_markers
|
||||||
from .shard import Shard, StreamFile
|
from .shard import Shard, StreamFile
|
||||||
from .list import split_at
|
from .list import split_at
|
||||||
|
|
||||||
|
|
@ -13,6 +13,31 @@ def get_line_number(block_token: BlockToken) -> int:
|
||||||
return block_token.line_number # type: ignore
|
return block_token.line_number # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
def build_shard(
    start_line,
    end_line,
    markers: Optional[list[str]] = None,
    tags: Optional[list[str]] = None,
    children: Optional[list[Shard]] = None,
) -> Shard:
    """Create a shard for the given line range, skipping redundant wrappers.

    If the would-be shard carries no markers and no tags and has exactly one
    child that already covers the very same ``start_line``/``end_line``, that
    child is returned as-is instead of being wrapped in another shard.

    Args:
        start_line: First line covered by the shard.
        end_line: Last line covered by the shard.
        markers: Markers attached to the shard; ``None`` means no markers.
        tags: Tags attached to the shard; ``None`` means no tags.
        children: Child shards; ``None`` means no children.

    Returns:
        The single equivalent child when wrapping would add nothing,
        otherwise a new ``Shard`` built from the arguments.
    """
    # Fresh lists per call: a literal ``[]`` default would be one shared
    # object that ends up stored inside every shard built with defaults.
    markers = [] if markers is None else markers
    tags = [] if tags is None else tags
    children = [] if children is None else children

    if len(children) == 1 and not tags and not markers:
        only_child = children[0]
        if only_child.start_line == start_line and only_child.end_line == end_line:
            # The wrapper would be indistinguishable from its only child.
            return only_child

    return Shard(
        markers=markers,
        tags=tags,
        children=children,
        start_line=start_line,
        end_line=end_line,
    )
|
||||||
|
|
||||||
|
|
||||||
def merge_into_first_shard(
|
def merge_into_first_shard(
|
||||||
shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
|
shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
|
||||||
):
|
):
|
||||||
|
|
@ -71,29 +96,61 @@ def calculate_heading_level_for_next_split(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
|
def parse_single_block_shards(
|
||||||
|
block_token: BlockToken, start_line: int, end_line: int
|
||||||
|
) -> tuple[Optional[Shard], list[str]]:
|
||||||
|
markers, tags, children = [], [], []
|
||||||
|
|
||||||
|
if isinstance(block_token, List):
|
||||||
|
list_items: list[ListItem] = ( # type: ignore
|
||||||
|
list(block_token.children) if block_token.children is not None else []
|
||||||
|
)
|
||||||
|
for index, list_item in enumerate(list_items):
|
||||||
|
list_item_start_line = get_line_number(list_item)
|
||||||
|
list_item_end_line = (
|
||||||
|
get_line_number(list_items[index + 1]) - 1
|
||||||
|
if index + 1 < len(list_items)
|
||||||
|
else end_line
|
||||||
|
)
|
||||||
|
list_item_shard, list_item_tags = parse_multiple_block_shards(
|
||||||
|
list_item.children, # type: ignore
|
||||||
|
list_item_start_line,
|
||||||
|
list_item_end_line,
|
||||||
|
)
|
||||||
|
if list_item_shard is not None:
|
||||||
|
children.append(list_item_shard)
|
||||||
|
tags.extend(list_item_tags)
|
||||||
|
|
||||||
|
elif isinstance(block_token, (Paragraph, Heading)):
|
||||||
markers, tags = extract_markers_and_tags(block_token)
|
markers, tags = extract_markers_and_tags(block_token)
|
||||||
return Shard(start_line=start_line, end_line=end_line, markers=markers, tags=tags)
|
|
||||||
|
if len(markers) == 0 and len(children) == 0:
|
||||||
|
return None, tags
|
||||||
|
|
||||||
|
return build_shard(
|
||||||
|
start_line, end_line, markers=markers, tags=tags, children=children
|
||||||
|
), []
|
||||||
|
|
||||||
|
|
||||||
def parse_paragraph_shards(
|
def parse_multiple_block_shards(
|
||||||
block_tokens: list[BlockToken], start_line: int, end_line: int
|
block_tokens: list[BlockToken],
|
||||||
) -> Optional[Shard]:
|
start_line: int,
|
||||||
|
end_line: int,
|
||||||
|
enforce_shard: bool = False,
|
||||||
|
) -> tuple[Optional[Shard], list[str]]:
|
||||||
is_first_block_heading = isinstance(block_tokens[0], Heading) and has_markers(
|
is_first_block_heading = isinstance(block_tokens[0], Heading) and has_markers(
|
||||||
block_tokens[0]
|
block_tokens[0]
|
||||||
)
|
)
|
||||||
|
|
||||||
paragraph_positions = find_paragraph_shard_positions(block_tokens)
|
paragraph_positions = find_paragraph_shard_positions(block_tokens)
|
||||||
children = []
|
children, tags = [], []
|
||||||
added_tags = []
|
|
||||||
|
|
||||||
is_first_block_only_with_marker = False
|
is_first_block_only_with_marker = False
|
||||||
|
|
||||||
for i, token in enumerate(block_tokens):
|
for i, token in enumerate(block_tokens):
|
||||||
if i in paragraph_positions:
|
if i in paragraph_positions:
|
||||||
is_first_block_heading = i == 0
|
is_first_block_only_with_marker = i == 0
|
||||||
|
|
||||||
if i in paragraph_positions or (i == 0 and is_first_block_heading):
|
|
||||||
child_start_line = get_line_number(token)
|
child_start_line = get_line_number(token)
|
||||||
child_end_line = (
|
child_end_line = (
|
||||||
get_line_number(block_tokens[i + 1]) - 1
|
get_line_number(block_tokens[i + 1]) - 1
|
||||||
|
|
@ -101,20 +158,21 @@ def parse_paragraph_shards(
|
||||||
else end_line
|
else end_line
|
||||||
)
|
)
|
||||||
|
|
||||||
children.append(
|
child_shard, child_tags = parse_single_block_shards(
|
||||||
parse_single_block_shards(token, child_start_line, child_end_line)
|
token, child_start_line, child_end_line
|
||||||
)
|
)
|
||||||
elif token.children:
|
|
||||||
added_tags.extend(extract_tags(token.children))
|
|
||||||
|
|
||||||
if len(children) == 0 and len(added_tags) == 0:
|
if child_shard is not None:
|
||||||
return None
|
children.append(child_shard)
|
||||||
|
if len(child_tags) > 0:
|
||||||
|
tags.extend(child_tags)
|
||||||
|
|
||||||
|
if len(children) == 0 and not enforce_shard:
|
||||||
|
return None, tags
|
||||||
if is_first_block_heading or is_first_block_only_with_marker:
|
if is_first_block_heading or is_first_block_only_with_marker:
|
||||||
return merge_into_first_shard(children, start_line, end_line, added_tags)
|
return merge_into_first_shard(children, start_line, end_line, tags), []
|
||||||
else:
|
else:
|
||||||
return Shard(
|
return build_shard(start_line, end_line, tags=tags, children=children), []
|
||||||
start_line=start_line, end_line=end_line, children=children, tags=added_tags
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_header_shards(
|
def parse_header_shards(
|
||||||
|
|
@ -124,12 +182,14 @@ def parse_header_shards(
|
||||||
use_first_child_as_header: bool = False,
|
use_first_child_as_header: bool = False,
|
||||||
) -> Optional[Shard]:
|
) -> Optional[Shard]:
|
||||||
if len(block_tokens) == 0:
|
if len(block_tokens) == 0:
|
||||||
return Shard(start_line=start_line, end_line=end_line)
|
return build_shard(start_line, end_line)
|
||||||
|
|
||||||
split_at_heading_level = calculate_heading_level_for_next_split(block_tokens)
|
split_at_heading_level = calculate_heading_level_for_next_split(block_tokens)
|
||||||
|
|
||||||
if split_at_heading_level is None:
|
if split_at_heading_level is None:
|
||||||
return parse_paragraph_shards(block_tokens, start_line, end_line)
|
return parse_multiple_block_shards(
|
||||||
|
block_tokens, start_line, end_line, enforce_shard=True
|
||||||
|
)[0]
|
||||||
|
|
||||||
heading_positions = find_headings_by_level(block_tokens, split_at_heading_level)
|
heading_positions = find_headings_by_level(block_tokens, split_at_heading_level)
|
||||||
|
|
||||||
|
|
@ -154,11 +214,11 @@ def parse_header_shards(
|
||||||
if use_first_child_as_header and len(children) > 0:
|
if use_first_child_as_header and len(children) > 0:
|
||||||
return merge_into_first_shard(children, start_line, end_line)
|
return merge_into_first_shard(children, start_line, end_line)
|
||||||
else:
|
else:
|
||||||
return Shard(start_line=start_line, end_line=end_line, children=children)
|
return build_shard(start_line, end_line, children=children)
|
||||||
|
|
||||||
|
|
||||||
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
|
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
|
||||||
shard = Shard(start_line=1, end_line=max([len(file_content.splitlines()), 1]))
|
shard = build_shard(1, max([len(file_content.splitlines()), 1]))
|
||||||
|
|
||||||
with TagMarkdownRenderer():
|
with TagMarkdownRenderer():
|
||||||
ast = Document(file_content)
|
ast = Document(file_content)
|
||||||
|
|
|
||||||
|
|
@ -213,3 +213,18 @@ class TestParseProcess:
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_simple_list(self):
    """Check the shard tree produced for a two-line bullet list.

    The second line carries ``@Marker``; the expected result is a root shard
    spanning lines 1-2 without markers or tags, whose only child is a shard
    for line 2 holding that marker.
    """
    file_text = "* hello world\n * @Marker i've got a marker"
    parsed_shard = parse_markdown_file(self.file_name, file_text).shard

    marker_shard = Shard(
        markers=["Marker"], tags=[], start_line=2, end_line=2, children=[]
    )
    expected_shard = Shard(
        markers=[],
        tags=[],
        start_line=1,
        end_line=2,
        children=[marker_shard],
    )

    assert parsed_shard == expected_shard
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue