refactor: split parse into multiple files
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
082c13b046
commit
dc2a97d3b8
6 changed files with 117 additions and 92 deletions
4
src/streamer/parse/__init__.py
Normal file
4
src/streamer/parse/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
from .shard import Shard, StreamFile
|
||||||
|
from .parse import parse_markdown_file
|
||||||
|
|
||||||
|
__all__ = ["Shard", "StreamFile", "parse_markdown_file"]
|
||||||
45
src/streamer/parse/extract_tag.py
Normal file
45
src/streamer/parse/extract_tag.py
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
import re
|
||||||
|
from typing import Iterable
|
||||||
|
from mistletoe.block_token import BlockToken
|
||||||
|
from mistletoe.span_token import RawText
|
||||||
|
from mistletoe.token import Token
|
||||||
|
|
||||||
|
from .markdown_tag import Tag
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Collect the raw content of every ``Tag`` token in *tokens*."""
    collected: list[str] = []
    for candidate in tokens:
        if isinstance(candidate, Tag):
            collected.append(candidate.content)
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Split the ``Tag`` children of *block_token* into markers and tags.

    Tags that appear before any non-whitespace, non-tag content count as
    markers; every ``Tag`` encountered after such content counts as an
    ordinary tag. Whitespace-only ``RawText`` children are ignored.
    """
    if block_token.children is None:
        return [], []

    marker_tokens = []
    tag_tokens = []
    in_leading_run = True

    for child in block_token.children:
        if isinstance(child, Tag):
            target = marker_tokens if in_leading_run else tag_tokens
            target.append(child)
            continue
        if isinstance(child, RawText) and re.match(r"^[\s]*$", child.content):
            # Blank raw text does not end the leading marker run.
            continue
        in_leading_run = False

    return extract_tags(marker_tokens), extract_tags(tag_tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def has_markers(block_token: BlockToken) -> bool:
    """Return True when *block_token* begins with a ``Tag`` child.

    Whitespace-only ``RawText`` children are skipped; any other
    non-tag content before the first ``Tag`` means there are no markers.
    """
    children = block_token.children
    if children is None:
        return False

    for child in children:
        if isinstance(child, Tag):
            return True
        if not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)):
            # First substantive non-tag child: markers cannot follow.
            return False
    return False
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]
|
||||||
13
src/streamer/parse/list.py
Normal file
13
src/streamer/parse/list.py
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
from itertools import pairwise
|
||||||
|
from typing import TypeVar
|
||||||
|
|
||||||
|
A = TypeVar("A")
|
||||||
|
|
||||||
|
|
||||||
|
def split_at(list_to_be_split: list[A], positions: list[int]):
|
||||||
|
positions = sorted(set([0, *positions, len(list_to_be_split)]))
|
||||||
|
|
||||||
|
return [list_to_be_split[left:right] for left, right in pairwise(positions)]
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["split_at"]
|
||||||
20
src/streamer/parse/markdown_tag.py
Normal file
20
src/streamer/parse/markdown_tag.py
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
import re
|
||||||
|
from mistletoe.markdown_renderer import Fragment, MarkdownRenderer
|
||||||
|
from mistletoe.span_token import SpanToken
|
||||||
|
|
||||||
|
|
||||||
|
class Tag(SpanToken):
    """Inline ``@word`` span token (e.g. ``@todo``).

    Group 1 of ``pattern`` captures the text after the ``@``; the
    renderer and extractors read it back via ``token.content``.
    """

    # Keep the match as raw content; do not tokenize it further.
    parse_inner = False
    # "@" followed by one or more non-whitespace characters.
    pattern = re.compile(r"@([^\s]+)")
|
||||||
|
|
||||||
|
|
||||||
|
class TagMarkdownRenderer(MarkdownRenderer):
    """MarkdownRenderer that round-trips the custom ``Tag`` span token."""

    def __init__(self):
        # Register Tag as an extra span token with the base renderer.
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        # Emit the tag in its source form: "@" followed by the tag text
        # (token.content holds the text without the "@", per Tag.pattern).
        yield Fragment("@")
        yield Fragment(token.content)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Tag", "TagMarkdownRenderer"]
|
||||||
|
|
@ -1,83 +1,29 @@
|
||||||
from __future__ import annotations
|
from typing import Optional
|
||||||
from typing import Iterable, Optional, TypeVar
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from mistletoe import Document
|
from mistletoe import Document
|
||||||
from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
|
|
||||||
from mistletoe.span_token import SpanToken, RawText
|
|
||||||
from mistletoe.block_token import Paragraph, BlockToken, Heading
|
from mistletoe.block_token import Paragraph, BlockToken, Heading
|
||||||
from mistletoe.token import Token
|
|
||||||
from itertools import pairwise
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import re
|
|
||||||
|
|
||||||
|
from .markdown_tag import TagMarkdownRenderer
|
||||||
class Tag(SpanToken):
|
from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
|
||||||
parse_inner = False
|
from .shard import Shard, StreamFile
|
||||||
pattern = re.compile(r"@([^\s]+)")
|
from .list import split_at
|
||||||
|
|
||||||
|
|
||||||
class TagMarkdownRenderer(MarkdownRenderer):
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__(Tag)
|
|
||||||
|
|
||||||
def render_tag(self, token: Tag):
|
|
||||||
yield Fragment("@")
|
|
||||||
yield Fragment(token.content)
|
|
||||||
|
|
||||||
|
|
||||||
class Shard(BaseModel):
|
|
||||||
markers: list[str] = []
|
|
||||||
tags: list[str] = []
|
|
||||||
start_line: int
|
|
||||||
end_line: int
|
|
||||||
children: list[Shard] = []
|
|
||||||
|
|
||||||
|
|
||||||
class StreamFile(BaseModel):
|
|
||||||
filename: str
|
|
||||||
shard: Optional[Shard] = None
|
|
||||||
|
|
||||||
|
|
||||||
T = TypeVar("T")
|
|
||||||
|
|
||||||
|
|
||||||
def get_line_number(block_token: BlockToken) -> int:
    """Return the source line on which *block_token* starts.

    mistletoe declares ``line_number`` as optional, hence the ignore;
    the original relied on the same assumption that it is always set
    by the time we read it — TODO confirm.
    """
    line = block_token.line_number  # type: ignore
    return line
|
||||||
|
|
||||||
|
|
||||||
def extract_tags(tokens: Iterable[Token]) -> list[str]:
|
def merge_into_first_shard(
    shards: list[Shard],
    start_line: int,
    end_line: int,
    additional_tags: "list[str] | None" = None,
):
    """Fold *shards* into a copy of the first shard spanning the given lines.

    The first shard becomes the parent: it keeps its markers, gains
    *additional_tags* on top of its own tags, adopts the remaining shards
    as ``children``, and is re-spanned to ``start_line``..``end_line``.

    Fix: the default was a shared mutable ``[]`` (classic mutable-default
    pitfall); it is now ``None``, with unchanged behavior for callers.
    The annotation is a string so it needs no extra import to evaluate.
    """
    extra_tags = [] if additional_tags is None else additional_tags
    return shards[0].model_copy(
        update={
            "start_line": start_line,
            "end_line": end_line,
            "children": shards[1:],
            "tags": shards[0].tags + extra_tags,
        }
    )
|
||||||
for token in block_token.children:
|
|
||||||
if isinstance(token, Tag):
|
|
||||||
if is_marker:
|
|
||||||
markers.append(token)
|
|
||||||
else:
|
|
||||||
tags.append(token)
|
|
||||||
elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)):
|
|
||||||
is_marker = False
|
|
||||||
|
|
||||||
return extract_tags(markers), extract_tags(tags)
|
|
||||||
|
|
||||||
|
|
||||||
def has_markers(block_token: BlockToken) -> bool:
|
|
||||||
if block_token.children is None:
|
|
||||||
return False
|
|
||||||
|
|
||||||
for child in block_token.children:
|
|
||||||
if isinstance(child, Tag):
|
|
||||||
return True
|
|
||||||
elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)):
|
|
||||||
return False
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
|
def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
|
||||||
|
|
@ -88,15 +34,6 @@ def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
A = TypeVar("A")
|
|
||||||
|
|
||||||
|
|
||||||
def split_at(list_to_be_split: list[A], positions: list[int]):
|
|
||||||
positions = sorted(set([0, *positions, len(list_to_be_split)]))
|
|
||||||
|
|
||||||
return [list_to_be_split[left:right] for left, right in pairwise(positions)]
|
|
||||||
|
|
||||||
|
|
||||||
def find_headings_by_level(
|
def find_headings_by_level(
|
||||||
block_tokens: list[BlockToken], header_level: int
|
block_tokens: list[BlockToken], header_level: int
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
|
|
@ -134,19 +71,6 @@ def calculate_heading_level_for_next_split(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def merge_into_first_shard(
|
|
||||||
shards: list[Shard], start_line: int, end_line: int, additional_tags: list[str] = []
|
|
||||||
):
|
|
||||||
return shards[0].model_copy(
|
|
||||||
update={
|
|
||||||
"start_line": start_line,
|
|
||||||
"end_line": end_line,
|
|
||||||
"children": shards[1:],
|
|
||||||
"tags": shards[0].tags + additional_tags,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
    """Build a leaf Shard for one block token from its markers and tags."""
    found_markers, found_tags = extract_markers_and_tags(block_token)
    return Shard(
        start_line=start_line,
        end_line=end_line,
        markers=found_markers,
        tags=found_tags,
    )
|
||||||
19
src/streamer/parse/shard.py
Normal file
19
src/streamer/parse/shard.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import Optional
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class Shard(BaseModel):
    """A contiguous span of a markdown document, possibly nested.

    NOTE(review): line numbers appear to be 1-based, mirroring
    mistletoe's ``line_number`` — confirm against the parser.
    """

    # @-tags found before any other content of the shard's first block
    # (see extract_markers_and_tags). Pydantic copies mutable defaults
    # per instance, so the [] defaults are safe here.
    markers: list[str] = []
    # Remaining @-tags found in the shard.
    tags: list[str] = []
    start_line: int
    end_line: int
    # Nested sub-shards contained within this shard's span.
    children: list[Shard] = []
|
||||||
|
|
||||||
|
|
||||||
|
class StreamFile(BaseModel):
    """A parsed markdown file: its name plus an optional root shard."""

    filename: str
    # Root shard of the parsed document; None when no shard was attached.
    shard: Optional[Shard] = None
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Shard", "StreamFile"]
|
||||||
Loading…
Add table
Add a link
Reference in a new issue