feat: also parse within formatting
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
fa85017ce3
commit
8f5a000c5c
3 changed files with 186 additions and 34 deletions
|
|
@ -1,45 +1,84 @@
|
||||||
import re
|
import re
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
from mistletoe.block_token import BlockToken
|
from mistletoe.block_token import BlockToken
|
||||||
from mistletoe.span_token import RawText
|
from mistletoe.span_token import Emphasis, RawText, Strikethrough, Strong, Link
|
||||||
from mistletoe.token import Token
|
from mistletoe.token import Token
|
||||||
|
|
||||||
from .markdown_tag import Tag
|
from .markdown_tag import Tag
|
||||||
|
|
||||||
|
|
||||||
def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Return the ``content`` of every ``Tag`` that appears directly in *tokens*.

    Tokens of any other type are skipped; children are not inspected.
    """
    contents = []
    for candidate in tokens:
        if isinstance(candidate, Tag):
            contents.append(candidate.content)
    return contents


def extract_markers_and_tags_from_single_token(
    token: Token,
    marker_boundary_encountered: bool,
    return_at_first_marker: bool = False,
) -> tuple[list[str], list[str], bool]:
    """Collect markers and tags contributed by one token.

    A ``Tag`` seen before the marker boundary counts as a marker; a ``Tag``
    seen after it counts as a tag.

    Args:
        token: The token to inspect.
        marker_boundary_encountered: Whether the boundary had already been
            passed before this token.
        return_at_first_marker: Not used here directly; it is only forwarded
            to ``extract_markers_and_tags_from_tokens`` when descending into
            a formatting token's children.

    Returns:
        ``(markers, tags, boundary)`` where ``boundary`` is the boundary
        state after this token.
    """
    if isinstance(token, Tag):
        # The tag's position relative to the boundary decides its role; a
        # tag never moves the boundary itself.
        if marker_boundary_encountered:
            return [], [token.content], marker_boundary_encountered
        return [token.content], [], marker_boundary_encountered

    if isinstance(token, (Emphasis, Strong, Strikethrough, Link)):
        # Formatting wrappers are looked through: whatever their children
        # yield is reported as if it stood at this position.
        nested_markers, nested_tags, nested_boundary = (
            extract_markers_and_tags_from_tokens(
                token.children or [],
                marker_boundary_encountered,
                return_at_first_marker,
            )
        )
        return (
            nested_markers,
            nested_tags,
            marker_boundary_encountered or nested_boundary,
        )

    if isinstance(token, RawText) and re.match(r"^[\s]*$", token.content):
        # Whitespace-only text neither contributes nor ends the marker run.
        return [], [], marker_boundary_encountered

    # Any other token ends the marker run.
    return [], [], True
|
||||||
|
|
||||||
|
|
||||||
|
def extract_markers_and_tags_from_tokens(
    tokens: Iterable[Token],
    marker_boundary_encountered: bool,
    return_at_first_marker: bool = False,
) -> tuple[list[str], list[str], bool]:
    """Collect markers and tags from a sequence of tokens, in order.

    Each token is handed to ``extract_markers_and_tags_from_single_token``
    together with the boundary state left by the tokens before it.

    Args:
        tokens: Tokens to scan.
        marker_boundary_encountered: Boundary state before the first token.
        return_at_first_marker: If true, stop scanning as soon as at least
            one marker has been collected.

    Returns:
        ``(markers, tags, boundary)`` where ``boundary`` is the boundary
        state after the last token that was scanned.
    """
    collected_markers: list[str] = []
    collected_tags: list[str] = []
    boundary_seen = marker_boundary_encountered

    for token in tokens:
        found_markers, found_tags, token_boundary = (
            extract_markers_and_tags_from_single_token(
                token, boundary_seen, return_at_first_marker
            )
        )
        collected_markers += found_markers
        collected_tags += found_tags
        boundary_seen = marker_boundary_encountered or token_boundary

        # Early exit for callers that only need to know a marker exists.
        if return_at_first_marker and collected_markers:
            break

    return collected_markers, collected_tags, boundary_seen
|
||||||
|
|
||||||
|
|
||||||
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Return ``(markers, tags)`` found in the children of *block_token*.

    The scan starts with the marker boundary not yet encountered. Missing
    children (``None``) are treated as an empty sequence.
    """
    children = block_token.children or []
    found_markers, found_tags, _boundary = extract_markers_and_tags_from_tokens(
        children, False
    )
    return found_markers, found_tags
|
|
||||||
|
|
||||||
|
|
||||||
def has_markers(block_token: BlockToken) -> bool:
    """Return whether *block_token* carries at least one marker.

    The scan over the children is asked to stop at the first marker, since
    only its existence matters here. Missing children (``None``) are treated
    as an empty sequence.
    """
    found_markers, _tags, _boundary = extract_markers_and_tags_from_tokens(
        block_token.children or [], False, return_at_first_marker=True
    )
    return bool(found_markers)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]
|
__all__ = ["extract_markers_and_tags", "has_markers"]
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from mistletoe.span_token import SpanToken
|
||||||
|
|
||||||
class Tag(SpanToken):
    """Span token for an ``@``-prefixed tag such as ``@Marker``.

    The captured group is the text after ``@``: one or more characters that
    are not whitespace and not one of ``*``, backtick, ``~``, ``[`` or ``]``.
    """

    # ``\x60`` is the backtick. Excluding the formatting delimiters keeps a
    # tag from running into the closing marker of surrounding emphasis,
    # strikethrough, code or link syntax.
    pattern = re.compile(r"@([^\s*\x60~\[\]]+)")
    # NOTE(review): presumably tells mistletoe not to tokenize the matched
    # text any further - confirm against the mistletoe SpanToken docs.
    parse_inner = False
|
||||||
|
|
||||||
|
|
||||||
class TagMarkdownRenderer(MarkdownRenderer):
|
class TagMarkdownRenderer(MarkdownRenderer):
|
||||||
|
|
|
||||||
|
|
@ -44,11 +44,11 @@ class TestParseProcess:
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_parse_single_line_with_two_tags(self):
|
def test_parse_single_line_with_two_tags(self):
|
||||||
test_file = "@Tag1 @Tag2 Hello World"
|
test_file = "@Marker1 @Marker2 Hello World"
|
||||||
assert parse_markdown_file(self.file_name, test_file) == StreamFile(
|
assert parse_markdown_file(self.file_name, test_file) == StreamFile(
|
||||||
filename=self.file_name,
|
filename=self.file_name,
|
||||||
shard=Shard(
|
shard=Shard(
|
||||||
markers=["Tag1", "Tag2"],
|
markers=["Marker1", "Marker2"],
|
||||||
start_line=1,
|
start_line=1,
|
||||||
end_line=1,
|
end_line=1,
|
||||||
),
|
),
|
||||||
|
|
@ -149,7 +149,7 @@ class TestParseProcess:
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_continue_full_parsing_before_headings_start(self):
|
def test_parse_fullly_before_headings_start(self):
|
||||||
file_text = "Hello\n\n@Marker1 World!\n\n# @Marker2 I'm a heading!"
|
file_text = "Hello\n\n@Marker1 World!\n\n# @Marker2 I'm a heading!"
|
||||||
|
|
||||||
assert parse_markdown_file(self.file_name, file_text).shard == Shard(
|
assert parse_markdown_file(self.file_name, file_text).shard == Shard(
|
||||||
|
|
@ -171,7 +171,7 @@ class TestParseProcess:
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_complex_heading_structure(self):
|
def test_parse_complex_heading_structure(self):
|
||||||
file_text = "Preamble @Preamble\n## @Intro\n# @Title\n## @Chapter1\n## @Chapter2\n### Section 1\n### Section 2"
|
file_text = "Preamble @Preamble\n## @Intro\n# @Title\n## @Chapter1\n## @Chapter2\n### Section 1\n### Section 2"
|
||||||
|
|
||||||
assert parse_markdown_file(self.file_name, file_text).shard == Shard(
|
assert parse_markdown_file(self.file_name, file_text).shard == Shard(
|
||||||
|
|
@ -228,3 +228,116 @@ class TestParseProcess:
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_parse_complex_list(self):
    """Markers on nested list items produce correspondingly nested shards."""
    # NOTE(review): the leading whitespace of the nested items was lost in
    # the diff this was recovered from; 4 and 8 spaces are assumed here so
    # that the nesting matches the expected shard tree - confirm.
    markdown = (
        "* I'm the parent!\n"
        "    * @Marker1 I've got a marker\n"
        "\n"
        "    * I've got no marker!\n"
        "    * I've got a child with a marker!\n"
        "        * @Marker2 I'm the child with the marker\n"
    )

    parsed = parse_markdown_file(self.file_name, markdown)

    first_marked_item = Shard(
        markers=["Marker1"],
        tags=[],
        start_line=2,
        end_line=3,
        children=[],
    )
    marked_grandchild = Shard(
        markers=["Marker2"],
        tags=[],
        start_line=6,
        end_line=6,
        children=[],
    )
    item_with_marked_child = Shard(
        markers=[],
        tags=[],
        start_line=5,
        end_line=6,
        children=[marked_grandchild],
    )
    nested_list = Shard(
        markers=[],
        tags=[],
        start_line=2,
        end_line=6,
        children=[first_marked_item, item_with_marked_child],
    )

    assert parsed.shard == Shard(
        markers=[],
        tags=[],
        start_line=1,
        end_line=6,
        children=[nested_list],
    )
|
||||||
|
|
||||||
|
def test_parse_ignores_tags_in_code(self):
    """An ``@`` word inside a fenced code block yields no markers or tags."""
    markdown = "```\n@Marker\n```"

    parsed = parse_markdown_file(self.file_name, markdown)

    assert parsed.shard == Shard(
        markers=[],
        tags=[],
        start_line=1,
        end_line=3,
        children=[],
    )
|
||||||
|
|
||||||
|
def test_parse_finds_tags_in_italic_text(self):
    """A tag wrapped in ``*...*`` is still reported as a marker."""
    markdown = "*@ItalicMarker*"

    parsed = parse_markdown_file(self.file_name, markdown)

    assert parsed.shard == Shard(
        markers=["ItalicMarker"],
        tags=[],
        start_line=1,
        end_line=1,
        children=[],
    )
|
||||||
|
|
||||||
|
def test_parse_finds_tags_in_bold_text(self):
    """A tag wrapped in ``**...**`` is still reported as a marker."""
    markdown = "**@BoldMarker**"

    parsed = parse_markdown_file(self.file_name, markdown)

    assert parsed.shard == Shard(
        markers=["BoldMarker"],
        tags=[],
        start_line=1,
        end_line=1,
        children=[],
    )
|
||||||
|
|
||||||
|
def test_parse_finds_tags_in_strikethrough_text(self):
    """A tag wrapped in ``~~...~~`` is still reported as a marker."""
    markdown = "~~@StrikeMarker~~"

    parsed = parse_markdown_file(self.file_name, markdown)

    assert parsed.shard == Shard(
        markers=["StrikeMarker"],
        tags=[],
        start_line=1,
        end_line=1,
        children=[],
    )
|
||||||
|
|
||||||
|
def test_parse_finds_tags_in_link(self):
    """A tag used as link text is still reported as a marker."""
    markdown = "[@LinkMarker](https://konstantinfickel.de)"

    parsed = parse_markdown_file(self.file_name, markdown)

    assert parsed.shard == Shard(
        markers=["LinkMarker"],
        tags=[],
        start_line=1,
        end_line=1,
        children=[],
    )
|
||||||
|
|
||||||
|
def test_parse_continues_looking_for_markers_after_first_link_marker(self):
    """Two links separated only by a space both contribute a marker."""
    markdown = "[@LinkMarker1](https://konstantinfickel.de1) [@LinkMarker2](https://konstantinfickel.de)"

    parsed = parse_markdown_file(self.file_name, markdown)

    assert parsed.shard == Shard(
        markers=["LinkMarker1", "LinkMarker2"],
        tags=[],
        start_line=1,
        end_line=1,
        children=[],
    )
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue