refactor: split parse into multiple files

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
Konstantin Fickel 2025-06-21 16:43:40 +02:00
parent 082c13b046
commit dc2a97d3b8
6 changed files with 117 additions and 92 deletions

View file

@ -0,0 +1,4 @@
from .shard import Shard, StreamFile
from .parse import parse_markdown_file
__all__ = ["Shard", "StreamFile", "parse_markdown_file"]

View file

@ -0,0 +1,45 @@
import re
from typing import Iterable
from mistletoe.block_token import BlockToken
from mistletoe.span_token import RawText
from mistletoe.token import Token
from .markdown_tag import Tag
def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Collect the ``content`` of every ``Tag`` found in ``tokens``.

    Tokens that are not ``Tag`` instances are skipped; the order of the
    remaining tags follows the iteration order of ``tokens``.

    Args:
        tokens: Tokens to scan.

    Returns:
        The ``content`` attribute of each ``Tag`` encountered.
    """
    contents: list[str] = []
    for candidate in tokens:
        if isinstance(candidate, Tag):
            contents.append(candidate.content)
    return contents
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Split the tags of a block into leading markers and remaining tags.

    Tags seen before any other content count as markers; whitespace-only
    ``RawText`` children between them are ignored. As soon as any other child
    is encountered, every later tag counts as a regular tag.

    Args:
        block_token: Block whose direct ``children`` are inspected.

    Returns:
        ``(markers, tags)`` as lists of tag contents. Both lists are empty
        when ``block_token.children`` is ``None``.
    """
    children = block_token.children
    if children is None:
        return [], []

    leading, trailing = [], []
    in_marker_prefix = True
    for child in children:
        if isinstance(child, Tag):
            (leading if in_marker_prefix else trailing).append(child)
            continue
        is_blank_text = isinstance(child, RawText) and re.match(
            r"^[\s]*$", child.content
        )
        if not is_blank_text:
            # Real content ends the marker prefix for the rest of the block.
            in_marker_prefix = False
    return extract_tags(leading), extract_tags(trailing)
def has_markers(block_token: BlockToken) -> bool:
    """Report whether the block opens with a ``Tag``.

    Whitespace-only ``RawText`` children are skipped; the first child of any
    other kind decides the result.

    Args:
        block_token: Block whose direct ``children`` are inspected.

    Returns:
        ``True`` if the first deciding child is a ``Tag``; ``False`` if it is
        anything else, if there are no deciding children, or if ``children``
        is ``None``.
    """
    children = block_token.children
    if children is None:
        return False

    for node in children:
        if isinstance(node, Tag):
            return True
        if isinstance(node, RawText) and re.match(r"^[\s]*$", node.content):
            # Blank text cannot decide anything; look at the next child.
            continue
        return False
    return False
__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]

View file

@ -0,0 +1,13 @@
from itertools import pairwise
from typing import TypeVar
A = TypeVar("A")
def split_at(list_to_be_split: list[A], positions: list[int]):
positions = sorted(set([0, *positions, len(list_to_be_split)]))
return [list_to_be_split[left:right] for left, right in pairwise(positions)]
__all__ = ["split_at"]

View file

@ -0,0 +1,20 @@
import re
from mistletoe.markdown_renderer import Fragment, MarkdownRenderer
from mistletoe.span_token import SpanToken
class Tag(SpanToken):
    """Inline span token for an ``@name`` tag.

    The pattern captures the run of non-whitespace characters that follows
    the ``@`` sign.
    """

    # "@" followed by one or more non-whitespace characters (captured).
    pattern = re.compile(r"@([^\s]+)")
    # mistletoe flag: the matched text is not parsed for nested span tokens.
    parse_inner = False
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer that is additionally given the ``Tag`` span token."""

    def __init__(self):
        """Initialise the base renderer with ``Tag`` as an extra token class."""
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        """Yield the fragments that write ``token`` back out as ``@<content>``."""
        for text in ("@", token.content):
            yield Fragment(text)
__all__ = ["Tag", "TagMarkdownRenderer"]

View file

@ -1,83 +1,29 @@
from __future__ import annotations from typing import Optional
from typing import Iterable, Optional, TypeVar
from pydantic import BaseModel
from mistletoe import Document from mistletoe import Document
from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
from mistletoe.span_token import SpanToken, RawText
from mistletoe.block_token import Paragraph, BlockToken, Heading from mistletoe.block_token import Paragraph, BlockToken, Heading
from mistletoe.token import Token
from itertools import pairwise
from collections import Counter from collections import Counter
import re
from .markdown_tag import TagMarkdownRenderer
class Tag(SpanToken): from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
parse_inner = False from .shard import Shard, StreamFile
pattern = re.compile(r"@([^\s]+)") from .list import split_at
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer that is additionally given the ``Tag`` span token."""

    def __init__(self):
        """Initialise the base renderer with ``Tag`` as an extra token class."""
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        """Yield the fragments that write ``token`` back out as ``@<content>``."""
        for text in ("@", token.content):
            yield Fragment(text)
# One node of the shard tree: a line range together with the markers and tags
# attached to it and any nested child shards.
# (Documented with comments rather than a docstring so the generated pydantic
# schema stays unchanged.)
class Shard(BaseModel):
    # Marker strings; empty unless provided.
    markers: list[str] = []
    # Tag strings; empty unless provided.
    tags: list[str] = []
    # First and last line of the range this shard covers.
    # NOTE(review): 0- vs 1-based and inclusive vs exclusive are not visible
    # here — confirm against the parser.
    start_line: int
    end_line: int
    # Nested shards; the model is recursive.
    children: list[Shard] = []
# Pairs a file name with the root shard produced for that file.
class StreamFile(BaseModel):
    # Name of the file this record describes.
    filename: str
    # Root shard; defaults to None.
    # NOTE(review): presumably None means nothing was parsed — confirm.
    shard: Optional[Shard] = None
T = TypeVar("T")
def get_line_number(block_token: BlockToken) -> int:
    """Return the ``line_number`` attribute of ``block_token``.

    The attribute is not part of the static ``BlockToken`` type, hence the
    type-checker suppression on the access.
    """
    line_number: int = block_token.line_number  # type: ignore
    return line_number
def extract_tags(tokens: Iterable[Token]) -> list[str]: def merge_into_first_shard(
return [token.content for token in tokens if isinstance(token, Tag)] shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
):
return shards[0].model_copy(
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]: update={
markers, tags = [], [] "start_line": start_line,
is_marker = True "end_line": end_line,
"children": shards[1:],
if block_token.children is None: "tags": shards[0].tags + additional_tags,
return [], [] }
)
for token in block_token.children:
if isinstance(token, Tag):
if is_marker:
markers.append(token)
else:
tags.append(token)
elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)):
is_marker = False
return extract_tags(markers), extract_tags(tags)
def has_markers(block_token: BlockToken) -> bool:
    """Report whether the block opens with a ``Tag``.

    Whitespace-only ``RawText`` children are skipped; the first child of any
    other kind decides the result.

    Args:
        block_token: Block whose direct ``children`` are inspected.

    Returns:
        ``True`` if the first deciding child is a ``Tag``; ``False`` if it is
        anything else, if there are no deciding children, or if ``children``
        is ``None``.
    """
    children = block_token.children
    if children is None:
        return False

    for node in children:
        if isinstance(node, Tag):
            return True
        if isinstance(node, RawText) and re.match(r"^[\s]*$", node.content):
            # Blank text cannot decide anything; look at the next child.
            continue
        return False
    return False
def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]: def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
@ -88,15 +34,6 @@ def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
] ]
A = TypeVar("A")
def split_at(list_to_be_split: list[A], positions: list[int]):
positions = sorted(set([0, *positions, len(list_to_be_split)]))
return [list_to_be_split[left:right] for left, right in pairwise(positions)]
def find_headings_by_level( def find_headings_by_level(
block_tokens: list[BlockToken], header_level: int block_tokens: list[BlockToken], header_level: int
) -> list[int]: ) -> list[int]:
@ -134,19 +71,6 @@ def calculate_heading_level_for_next_split(
) )
def merge_into_first_shard(
    shards: list[Shard],
    start_line: int,
    end_line: int,
    additional_tags: Optional[list[str]] = None,
) -> Shard:
    """Return a copy of the first shard that adopts the other shards as children.

    The copy receives the given line range, ``shards[1:]`` as its children
    (replacing any children the first shard already had) and the first
    shard's tags followed by ``additional_tags``. Neither the input shards
    nor ``additional_tags`` are modified.

    Args:
        shards: Shards to merge; must contain at least one element.
        start_line: ``start_line`` of the merged shard.
        end_line: ``end_line`` of the merged shard.
        additional_tags: Extra tags appended after the first shard's own
            tags. ``None`` (the default) appends nothing.

    Returns:
        The ``model_copy`` of ``shards[0]`` with the updated fields.

    Raises:
        IndexError: If ``shards`` is empty.
    """
    # A None sentinel replaces the former ``[]`` default so that no list
    # object is shared between calls.
    extra_tags = [] if additional_tags is None else additional_tags
    first_shard = shards[0]
    return first_shard.model_copy(
        update={
            "start_line": start_line,
            "end_line": end_line,
            "children": shards[1:],
            "tags": first_shard.tags + extra_tags,
        }
    )
def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
    """Build a childless ``Shard`` for a single block.

    Args:
        block_token: Block whose markers and tags are extracted.
        start_line: ``start_line`` of the resulting shard.
        end_line: ``end_line`` of the resulting shard.

    Returns:
        A ``Shard`` carrying the given line range and the markers and tags
        returned by ``extract_markers_and_tags`` for the block.
    """
    found_markers, found_tags = extract_markers_and_tags(block_token)
    return Shard(
        start_line=start_line,
        end_line=end_line,
        markers=found_markers,
        tags=found_tags,
    )

View file

@ -0,0 +1,19 @@
from __future__ import annotations
from typing import Optional
from pydantic import BaseModel
# One node of the shard tree: a line range together with the markers and tags
# attached to it and any nested child shards.
# (Documented with comments rather than a docstring so the generated pydantic
# schema stays unchanged.)
class Shard(BaseModel):
    # Marker strings; empty unless provided.
    markers: list[str] = []
    # Tag strings; empty unless provided.
    tags: list[str] = []
    # First and last line of the range this shard covers.
    # NOTE(review): 0- vs 1-based and inclusive vs exclusive are not visible
    # here — confirm against the parser.
    start_line: int
    end_line: int
    # Nested shards; the model is recursive.
    children: list[Shard] = []
# Pairs a file name with the root shard produced for that file.
class StreamFile(BaseModel):
    # Name of the file this record describes.
    filename: str
    # Root shard; defaults to None.
    # NOTE(review): presumably None means nothing was parsed — confirm.
    shard: Optional[Shard] = None
__all__ = ["Shard", "StreamFile"]