refactor: split parse into multiple files
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
082c13b046
commit
dc2a97d3b8
6 changed files with 117 additions and 92 deletions
4
src/streamer/parse/__init__.py
Normal file
4
src/streamer/parse/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
from .shard import Shard, StreamFile
|
||||
from .parse import parse_markdown_file
|
||||
|
||||
__all__ = ["Shard", "StreamFile", "parse_markdown_file"]
|
||||
45
src/streamer/parse/extract_tag.py
Normal file
45
src/streamer/parse/extract_tag.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
import re
|
||||
from typing import Iterable
|
||||
from mistletoe.block_token import BlockToken
|
||||
from mistletoe.span_token import RawText
|
||||
from mistletoe.token import Token
|
||||
|
||||
from .markdown_tag import Tag
|
||||
|
||||
|
||||
def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Pull the text content out of every Tag token in *tokens*, in order."""
    return [span.content for span in tokens if isinstance(span, Tag)]
|
||||
|
||||
|
||||
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Split the Tag tokens of *block_token* into leading markers and plain tags.

    A Tag counts as a marker while only tags and whitespace-only RawText
    have been seen so far; once any other token appears, every later Tag
    is a plain tag. A childless block yields ``([], [])``.
    """
    children = block_token.children
    if children is None:
        return [], []

    leading = True
    marker_tokens = []
    tag_tokens = []
    for child in children:
        if isinstance(child, Tag):
            bucket = marker_tokens if leading else tag_tokens
            bucket.append(child)
            continue
        # Whitespace-only RawText does not end the leading marker run.
        if isinstance(child, RawText) and re.match(r"^[\s]*$", child.content):
            continue
        leading = False

    return extract_tags(marker_tokens), extract_tags(tag_tokens)
|
||||
|
||||
|
||||
def has_markers(block_token: BlockToken) -> bool:
    """Return True when the first significant child of *block_token* is a Tag.

    Whitespace-only RawText before the first significant token is ignored;
    a childless block (or one with no significant children) has no markers.
    """
    children = block_token.children
    if children is None:
        return False

    for child in children:
        # Blank text decides nothing either way — keep scanning.
        if isinstance(child, RawText) and re.match(r"^[\s]*$", child.content):
            continue
        return isinstance(child, Tag)
    return False
|
||||
|
||||
|
||||
__all__ = ["extract_tags", "extract_markers_and_tags", "has_markers"]
|
||||
13
src/streamer/parse/list.py
Normal file
13
src/streamer/parse/list.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
from itertools import pairwise
|
||||
from typing import TypeVar
|
||||
|
||||
A = TypeVar("A")
|
||||
|
||||
|
||||
def split_at(list_to_be_split: list[A], positions: list[int]):
|
||||
positions = sorted(set([0, *positions, len(list_to_be_split)]))
|
||||
|
||||
return [list_to_be_split[left:right] for left, right in pairwise(positions)]
|
||||
|
||||
|
||||
__all__ = ["split_at"]
|
||||
20
src/streamer/parse/markdown_tag.py
Normal file
20
src/streamer/parse/markdown_tag.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
import re
|
||||
from mistletoe.markdown_renderer import Fragment, MarkdownRenderer
|
||||
from mistletoe.span_token import SpanToken
|
||||
|
||||
|
||||
class Tag(SpanToken):
    """Inline "@word" span token.

    NOTE(review): the pattern consumes any run of non-whitespace after
    "@", and the capture group (without the "@") presumably becomes the
    token's ``content`` — the renderer re-adds the "@" when emitting.
    Confirm against mistletoe's SpanToken contract.
    """

    parse_inner = False  # tag content is raw text; no nested span parsing
    pattern = re.compile(r"@([^\s]+)")
|
||||
|
||||
|
||||
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer extended to round-trip Tag span tokens."""

    def __init__(self):
        # Hand Tag to the base renderer as an extra token type so it is
        # recognized during parsing/rendering (mistletoe renderer API).
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        # Emit the tag in its source form: "@" followed by its content.
        yield Fragment("@")
        yield Fragment(token.content)
|
||||
|
||||
|
||||
__all__ = ["Tag", "TagMarkdownRenderer"]
|
||||
|
|
@ -1,83 +1,29 @@
|
|||
from __future__ import annotations
|
||||
from typing import Iterable, Optional, TypeVar
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from mistletoe import Document
|
||||
from mistletoe.markdown_renderer import MarkdownRenderer, Fragment
|
||||
from mistletoe.span_token import SpanToken, RawText
|
||||
from mistletoe.block_token import Paragraph, BlockToken, Heading
|
||||
from mistletoe.token import Token
|
||||
from itertools import pairwise
|
||||
from collections import Counter
|
||||
import re
|
||||
|
||||
|
||||
class Tag(SpanToken):
    """Inline "@word" span token.

    NOTE(review): the pattern consumes any run of non-whitespace after
    "@"; the capture group (without the "@") presumably becomes the
    token's ``content`` — confirm against mistletoe's SpanToken contract.
    """

    parse_inner = False  # tag content is raw text; no nested span parsing
    pattern = re.compile(r"@([^\s]+)")
|
||||
|
||||
|
||||
class TagMarkdownRenderer(MarkdownRenderer):
    """Markdown renderer extended to round-trip Tag span tokens."""

    def __init__(self):
        # Hand Tag to the base renderer as an extra token type so it is
        # recognized during parsing/rendering (mistletoe renderer API).
        super().__init__(Tag)

    def render_tag(self, token: Tag):
        # Emit the tag in its source form: "@" followed by its content.
        yield Fragment("@")
        yield Fragment(token.content)
|
||||
|
||||
|
||||
class Shard(BaseModel):
    """A contiguous span of a parsed markdown file, with nested sub-shards."""

    # Pydantic deep-copies field defaults per instance, so the mutable []
    # defaults below are safe (unlike plain function defaults).
    markers: list[str] = []  # tag strings leading the shard's content
    tags: list[str] = []  # remaining tag strings attached to the shard
    start_line: int  # first source line covered by this shard
    end_line: int  # last source line covered — inclusive? TODO confirm
    children: list[Shard] = []  # shards nested under this one
|
||||
|
||||
|
||||
class StreamFile(BaseModel):
    """A markdown file identified by name plus its parsed root shard."""

    filename: str  # name/path of the source file
    # NOTE(review): None presumably means "no parsed content" — confirm.
    shard: Optional[Shard] = None
|
||||
|
||||
|
||||
T = TypeVar("T")
|
||||
from .markdown_tag import TagMarkdownRenderer
|
||||
from .extract_tag import extract_markers_and_tags, extract_tags, has_markers
|
||||
from .shard import Shard, StreamFile
|
||||
from .list import split_at
|
||||
|
||||
|
||||
def get_line_number(block_token: BlockToken) -> int:
    """Return the source line number recorded on *block_token*.

    NOTE(review): the ignore hides that mistletoe appears to declare
    ``line_number`` as possibly unset; callers presumably only pass
    tokens from a fully parsed document where it is set — confirm.
    """
    return block_token.line_number  # type: ignore
|
||||
|
||||
|
||||
def extract_tags(tokens: Iterable[Token]) -> list[str]:
    """Return the content of every Tag token in *tokens*, in order."""
    return [token.content for token in tokens if isinstance(token, Tag)]
|
||||
|
||||
|
||||
def extract_markers_and_tags(block_token: BlockToken) -> tuple[list[str], list[str]]:
    """Split the Tag tokens of *block_token* into (markers, tags).

    Tags appearing before any other significant content are markers;
    tags after the first non-tag, non-blank token are plain tags.
    Returns ``([], [])`` for a childless block.
    """
    markers, tags = [], []
    is_marker = True  # True until the first significant non-tag token

    if block_token.children is None:
        return [], []

    for token in block_token.children:
        if isinstance(token, Tag):
            if is_marker:
                markers.append(token)
            else:
                tags.append(token)
        # Whitespace-only RawText does not end the leading marker run.
        elif not (isinstance(token, RawText) and re.match(r"^[\s]*$", token.content)):
            is_marker = False

    return extract_tags(markers), extract_tags(tags)
|
||||
|
||||
|
||||
def has_markers(block_token: BlockToken) -> bool:
    """Return True when the first significant child of *block_token* is a Tag."""
    if block_token.children is None:
        return False

    for child in block_token.children:
        if isinstance(child, Tag):
            return True
        # Anything other than whitespace-only text ends the search.
        elif not (isinstance(child, RawText) and re.match(r"^[\s]*$", child.content)):
            return False
    return False
|
||||
def merge_into_first_shard(
    shards: list[Shard],
    start_line: int,
    end_line: int,
    additional_tags: list[str] | None = None,
):
    """Collapse *shards* into a copy of the first shard spanning the given lines.

    The remaining shards become the merged shard's children, and
    *additional_tags* (if any) are appended to the first shard's tags.
    *shards* must be non-empty; an empty list raises IndexError.
    """
    # None instead of a mutable [] default: a shared default list is the
    # classic mutable-default-argument trap waiting to bite if mutated.
    extra_tags = [] if additional_tags is None else additional_tags
    first = shards[0]
    return first.model_copy(
        update={
            "start_line": start_line,
            "end_line": end_line,
            "children": shards[1:],
            "tags": first.tags + extra_tags,
        }
    )
|
||||
|
||||
|
||||
def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
|
||||
|
|
@ -88,15 +34,6 @@ def find_paragraph_shard_positions(block_tokens: list[BlockToken]) -> list[int]:
|
|||
]
|
||||
|
||||
|
||||
# Type variable for the element type of the list being split.
A = TypeVar("A")


def split_at(list_to_be_split: list[A], positions: list[int]):
    """Split the list into consecutive slices at the given cut indices.

    0 and ``len(list_to_be_split)`` are always added as boundaries;
    duplicate cut points are dropped and the rest sorted, so the slices
    cover the whole list in order.
    """
    positions = sorted(set([0, *positions, len(list_to_be_split)]))

    return [list_to_be_split[left:right] for left, right in pairwise(positions)]
|
||||
|
||||
|
||||
def find_headings_by_level(
|
||||
block_tokens: list[BlockToken], header_level: int
|
||||
) -> list[int]:
|
||||
|
|
@ -134,19 +71,6 @@ def calculate_heading_level_for_next_split(
|
|||
)
|
||||
|
||||
|
||||
def merge_into_first_shard(
    shards: list[Shard],
    start_line: int,
    end_line: int,
    additional_tags: list[str] | None = None,
):
    """Collapse *shards* into a copy of the first shard spanning the given lines.

    The remaining shards become the merged shard's children, and
    *additional_tags* (if any) are appended to the first shard's tags.
    *shards* must be non-empty; an empty list raises IndexError.
    """
    # None instead of a mutable [] default: a shared default list is the
    # classic mutable-default-argument trap waiting to bite if mutated.
    extra_tags = [] if additional_tags is None else additional_tags
    first = shards[0]
    return first.model_copy(
        update={
            "start_line": start_line,
            "end_line": end_line,
            "children": shards[1:],
            "tags": first.tags + extra_tags,
        }
    )
|
||||
|
||||
|
||||
def parse_single_block_shards(block_token: BlockToken, start_line: int, end_line: int):
    """Build a childless Shard for one block token spanning the given lines."""
    markers, tags = extract_markers_and_tags(block_token)
    return Shard(start_line=start_line, end_line=end_line, markers=markers, tags=tags)
|
||||
19
src/streamer/parse/shard.py
Normal file
19
src/streamer/parse/shard.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class Shard(BaseModel):
    """A contiguous span of a parsed markdown file, with nested sub-shards."""

    # Pydantic deep-copies field defaults per instance, so the mutable []
    # defaults below are safe (unlike plain function defaults).
    markers: list[str] = []  # tag strings leading the shard's content
    tags: list[str] = []  # remaining tag strings attached to the shard
    start_line: int  # first source line covered by this shard
    end_line: int  # last source line covered — inclusive? TODO confirm
    children: list[Shard] = []  # shards nested under this one
|
||||
|
||||
|
||||
class StreamFile(BaseModel):
    """A markdown file identified by name plus its parsed root shard."""

    filename: str  # name/path of the source file
    # NOTE(review): None presumably means "no parsed content" — confirm.
    shard: Optional[Shard] = None
|
||||
|
||||
|
||||
__all__ = ["Shard", "StreamFile"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue