feat: add initial parser data structure & test

Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
Konstantin Fickel 2025-06-20 14:36:02 +02:00
parent f73c6d16cb
commit 2091e5c98d
8 changed files with 375 additions and 30 deletions

View file

@ -1,7 +1,9 @@
import glob
import os
cwd = os.getcwd()
def run() -> None:
for file_name in glob.glob(f"{glob.escape(cwd)}/*.md"):
with open(file_name, "r") as file:

79
src/streamer/parse.py Normal file
View file

@ -0,0 +1,79 @@
from __future__ import annotations
from itertools import takewhile, dropwhile
from typing import Optional, TypeVar
from pydantic import BaseModel
from mistletoe import Document
from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
from mistletoe.span_token import SpanToken, RawText
from mistletoe.token import Token
import re
class Tag(SpanToken):
    """Inline span token for an ``@name`` tag.

    The pattern matches a literal ``@`` followed by one or more
    non-whitespace characters; those characters form capture group 1.
    """

    # Regex for the span: "@" plus a run of non-whitespace (group 1).
    pattern = re.compile(r"@([^\s]+)")

    # The matched text is kept as-is instead of being tokenized further.
    parse_inner = False
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer that additionally knows about the ``Tag`` token."""

    def __init__(self) -> None:
        # Hand the custom Tag token class to the base renderer.
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        """Yield the fragments of a tag: the ``@`` sigil, then its content."""
        sigil = Fragment("@")
        yield sigil
        body = Fragment(token.content)
        yield body
class Shard(BaseModel):
    """A piece of a document with its markers, tags and nested shards."""

    # Tag names found in the leading tag/whitespace run of the header token
    # (first element returned by extract_markers_and_tags).
    markers: list[str]

    # Tag names found after that leading run
    # (second element returned by extract_markers_and_tags).
    tags: list[str]

    # Text covered by the shard; parse_markdown_file stores the full file
    # content here.
    content: str

    # Line range of the shard; parse_markdown_file uses 1 and the number of
    # lines in the content.
    start_line: int
    end_line: int

    # Nested shards; parse_markdown_file currently always passes an empty list.
    children: list[Shard]
class StreamFile(BaseModel):
    """Result of parsing one file: its name and, if any, its root shard."""

    # Name of the file as passed in by the caller.
    filename: str

    # Root shard; stays None when parse_markdown_file finds no block tokens
    # in the document.
    shard: Optional[Shard] = None
# Generic type variable.
# NOTE(review): nothing visible in this module references T — confirm
# whether it is still needed before removing it.
T = TypeVar("T")
def extract_tags(tokens: list[Token]) -> list[str]:
    """Return the content of every ``Tag`` token in *tokens*, in order.

    Tokens that are not ``Tag`` instances are ignored.

    Args:
        tokens: Tokens to scan. Any iterable of tokens works; it is
            consumed exactly once.

    Returns:
        A new list holding the ``content`` of each ``Tag`` encountered.
        A real list is built (rather than a lazy ``map`` object) so the
        result matches the annotation, can be iterated more than once,
        and compares equal to other lists.
    """
    return [token.content for token in tokens if isinstance(token, Tag)]
def extract_markers_and_tags(header: Token) -> tuple[list[str], list[str]]:
    """Split the tags found in *header* into markers and ordinary tags.

    The children of *header* are divided at the first child that is
    neither a ``Tag`` nor a whitespace-only ``RawText``:

    * tags in the leading run (before that child) are the *markers*;
    * tags in the rest of the children are the *tags*.

    Args:
        header: Token whose direct children are inspected.

    Returns:
        A ``(markers, tags)`` tuple of two lists of tag contents.
        Both lists are empty when *header* has no children.
    """

    def is_marker_boundary(token: Token) -> bool:
        # A Tag, or raw text consisting solely of whitespace, keeps the
        # leading marker run going.
        if isinstance(token, Tag):
            return True
        return (
            isinstance(token, RawText)
            and re.match(r"^[\s]*$", token.content) is not None
        )

    # A token without children may expose ``None`` (or no attribute at
    # all); treat that as "no children" instead of failing on iteration.
    children = list(getattr(header, "children", None) or ())

    # The remainder is exactly what dropwhile() would produce, but slicing
    # avoids evaluating the predicate a second time over the same prefix.
    leading = list(takewhile(is_marker_boundary, children))
    remaining = children[len(leading):]

    # list(...) guarantees real lists even if extract_tags hands back a
    # lazy iterable.
    return list(extract_tags(leading)), list(extract_tags(remaining))
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
    """Parse markdown text into a ``StreamFile``.

    When the parsed document has at least one block token, the first one
    is used as the header from which markers and tags are extracted, and
    a single shard covering the whole content is produced. Otherwise the
    returned ``StreamFile`` carries no shard.

    Args:
        file_name: Name stored on the resulting ``StreamFile``.
        file_content: Markdown source to parse.

    Returns:
        A ``StreamFile`` for *file_name* with ``shard`` set or ``None``.
    """
    # Parsing runs inside the renderer context; TagMarkdownRenderer passes
    # Tag to the base renderer (presumably what makes "@tag" spans tokenize
    # — see mistletoe's custom-token mechanism).
    with TagMarkdownRenderer():
        document = Document(file_content)
        blocks = document.children
        if not blocks:
            return StreamFile(shard=None, filename=file_name)

        header_markers, header_tags = extract_markers_and_tags(blocks[0])
        line_count = len(file_content.splitlines())
        root = Shard(
            markers=header_markers,
            tags=header_tags,
            content=file_content,
            start_line=1,
            end_line=line_count,
            children=[],
        )
    return StreamFile(shard=root, filename=file_name)
# Names exported by a star-import of this module.
__all__ = ["Shard", "StreamFile", "parse_markdown_file"]