feat: add initial parser data structure & test
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
f73c6d16cb
commit
2091e5c98d
8 changed files with 375 additions and 30 deletions
|
|
@ -1,7 +1,9 @@
|
|||
import glob
|
||||
import os
|
||||
|
||||
cwd = os.getcwd()
|
||||
|
||||
|
||||
def run() -> None:
|
||||
for file_name in glob.glob(f"{glob.escape(cwd)}/*.md"):
|
||||
with open(file_name, "r") as file:
|
||||
|
|
|
|||
79
src/streamer/parse.py
Normal file
79
src/streamer/parse.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
from __future__ import annotations
|
||||
from itertools import takewhile, dropwhile
|
||||
from typing import Optional, TypeVar
|
||||
from pydantic import BaseModel
|
||||
from mistletoe import Document
|
||||
from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
|
||||
from mistletoe.span_token import SpanToken, RawText
|
||||
from mistletoe.token import Token
|
||||
import re
|
||||
|
||||
|
||||
class Tag(SpanToken):
    """Inline span token for an ``@name`` style tag.

    The pattern's single capture group is the run of non-whitespace
    characters that directly follows the ``@`` sign.
    """

    # Disable nested inline parsing for the matched text.
    parse_inner = False
    pattern = re.compile(r"@([^\s]+)")
|
||||
|
||||
|
||||
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer constructed with ``Tag`` as an extra token type."""

    def __init__(self):
        """Initialise the base renderer, passing it the ``Tag`` token class."""
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        """Yield the fragments for *token*: an ``@`` followed by its content."""
        for text in ("@", token.content):
            yield Fragment(text)
|
||||
|
||||
|
||||
class Shard(BaseModel):
    """A section of a document together with its markers, tags and sub-shards."""

    # Contents of the tags classified as markers.
    markers: list[str]
    # Contents of the remaining tags.
    tags: list[str]
    # Text covered by this shard.
    content: str
    # First and last line of the shard -- presumably 1-based and inclusive;
    # TODO confirm against the code that builds shards.
    start_line: int
    end_line: int
    # Nested shards; required, so pass an empty list when there are none.
    children: list[Shard]
|
||||
|
||||
|
||||
class StreamFile(BaseModel):
    """A named file together with the shard parsed from it, if any."""

    # Name of the file the content came from.
    filename: str
    # Top-level shard of the file; defaults to None when no shard is set.
    shard: Optional[Shard] = None
|
||||
|
||||
|
||||
# Generic type variable.
# NOTE(review): no definition visible in this module refers to it -- confirm
# whether it is still needed.
T = TypeVar("T")
|
||||
|
||||
|
||||
def extract_tags(tokens: list[Token]) -> list[str]:
    """Return the content of every ``Tag`` token in *tokens*, in order.

    Tokens of any other type are skipped.  Although the parameter is
    annotated as a list, any iterable of tokens is accepted; it is consumed
    exactly once.

    Returns:
        A real list, as the annotation promises, so the result can be
        indexed, measured with ``len()`` and iterated repeatedly.
    """
    return [token.content for token in tokens if isinstance(token, Tag)]
|
||||
|
||||
|
||||
def extract_markers_and_tags(header: Token) -> tuple[list[str], list[str]]:
    """Split the tags among *header*'s direct children into markers and tags.

    The leading run of children that consists only of ``Tag`` tokens and
    whitespace-only ``RawText`` tokens is the marker region; everything from
    the first other child onwards is the tag region.  Only direct children
    are inspected.

    Returns:
        ``(markers, tags)``: the contents of the ``Tag`` tokens found in the
        marker region and in the tag region respectively, each as a list.
        Both lists are empty when *header* has no children.
    """

    def in_marker_region(token: Token) -> bool:
        """Tell whether *token* may still belong to the leading marker run."""
        if isinstance(token, Tag):
            return True
        return (
            isinstance(token, RawText)
            and re.match(r"^[\s]*$", token.content) is not None
        )

    # Materialise the children once: both regions are derived from the same
    # sequence, so a one-shot iterable must not be exhausted by the first
    # pass.  A token whose ``children`` is missing or None yields two empty
    # results instead of raising.
    children = list(getattr(header, "children", None) or ())

    marker_region = takewhile(in_marker_region, children)
    tag_region = dropwhile(in_marker_region, children)

    # list() guarantees the annotated return type even when extract_tags
    # hands back a lazy iterator.
    return list(extract_tags(marker_region)), list(extract_tags(tag_region))
|
||||
|
||||
|
||||
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
    """Parse Markdown text into a ``StreamFile``.

    Args:
        file_name: Name stored on the resulting ``StreamFile``.
        file_content: Markdown source to parse.

    Returns:
        A ``StreamFile`` whose ``shard`` spans the whole content, from line 1
        to the number of lines in *file_content*, with markers and tags taken
        from the first block token of the document.  ``shard`` is ``None``
        when the parsed document has no block tokens.
    """
    shard = None

    # The document is built while the renderer context is active --
    # presumably this is what makes the parser recognise ``Tag`` tokens;
    # confirm against the mistletoe documentation.  The renderer object
    # itself is never used, so it is not bound to a name.
    with TagMarkdownRenderer():
        ast = Document(file_content)

        if block_tokens := ast.children:
            markers, tags = extract_markers_and_tags(block_tokens[0])
            shard = Shard(
                # Materialise explicitly so the model always receives real
                # lists, even if the helper returned lazy iterators.
                markers=list(markers),
                tags=list(tags),
                content=file_content,
                start_line=1,
                end_line=len(file_content.splitlines()),
                children=[],
            )

    return StreamFile(shard=shard, filename=file_name)
|
||||
|
||||
|
||||
# Names exported by ``from <module> import *``: the two data models and the
# parsing entry point.
__all__ = ["Shard", "StreamFile", "parse_markdown_file"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue