chore: refactor and simplify parsing
Signed-off-by: Konstantin Fickel <mail@konstantinfickel.de>
This commit is contained in:
parent
b7ad75f079
commit
695a28e715
2 changed files with 28 additions and 23 deletions
|
|
@ -91,6 +91,9 @@ def split_at(list_to_be_split: list[T], positions: list[int]):
|
||||||
def to_shard(
|
def to_shard(
|
||||||
tokens: list[Token], start_line: int, end_line: int, children: list[Shard] = []
|
tokens: list[Token], start_line: int, end_line: int, children: list[Shard] = []
|
||||||
) -> Shard:
|
) -> Shard:
|
||||||
|
if len(children) == 1 and len(tokens) == 0:
|
||||||
|
return children[0]
|
||||||
|
|
||||||
markers, tags = extract_markers_and_tags(tokens) if len(tokens) > 0 else ([], [])
|
markers, tags = extract_markers_and_tags(tokens) if len(tokens) > 0 else ([], [])
|
||||||
|
|
||||||
return Shard(
|
return Shard(
|
||||||
|
|
@ -102,13 +105,8 @@ def to_shard(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
|
def parse_paragraph_shards(block_tokens: list[BlockToken], end_line: int) -> Shard:
|
||||||
shard = None
|
start_line = block_tokens[0].line_number
|
||||||
with TagMarkdownRenderer():
|
|
||||||
ast = Document(file_content)
|
|
||||||
line_count = len(file_content.splitlines())
|
|
||||||
|
|
||||||
if block_tokens := ast.children:
|
|
||||||
shard_starts = find_shard_positions(block_tokens)
|
shard_starts = find_shard_positions(block_tokens)
|
||||||
|
|
||||||
child_shards: list[Shard] = []
|
child_shards: list[Shard] = []
|
||||||
|
|
@ -117,19 +115,26 @@ def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
|
||||||
for i in range(len(block_tokens)):
|
for i in range(len(block_tokens)):
|
||||||
token = block_tokens[i]
|
token = block_tokens[i]
|
||||||
if i in shard_starts:
|
if i in shard_starts:
|
||||||
end_line = (
|
shard_end_line = (
|
||||||
block_tokens[i + 1].line_number - 1
|
block_tokens[i + 1].line_number - 1
|
||||||
if i + 1 < len(block_tokens)
|
if i + 1 < len(block_tokens)
|
||||||
else line_count
|
else end_line
|
||||||
)
|
)
|
||||||
child_shards.append(to_shard([token], token.line_number, end_line))
|
child_shards.append(to_shard([token], token.line_number, shard_end_line))
|
||||||
else:
|
else:
|
||||||
own_elements.append(token)
|
own_elements.append(token)
|
||||||
|
|
||||||
if len(child_shards) == 1 and len(own_elements) == 0:
|
return to_shard(own_elements, start_line, end_line, children=child_shards)
|
||||||
shard = child_shards[0]
|
|
||||||
else:
|
|
||||||
shard = to_shard(own_elements, 1, line_count, children=child_shards)
|
def parse_markdown_file(file_name: str, file_content: str) -> StreamFile:
|
||||||
|
shard = None
|
||||||
|
with TagMarkdownRenderer():
|
||||||
|
ast = Document(file_content)
|
||||||
|
line_count = len(file_content.splitlines())
|
||||||
|
|
||||||
|
if block_tokens := ast.children:
|
||||||
|
shard = parse_paragraph_shards(block_tokens, line_count)
|
||||||
|
|
||||||
return StreamFile(shard=shard, filename=file_name)
|
return StreamFile(shard=shard, filename=file_name)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,14 +26,14 @@ class TestParseProcess:
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_parse_basic_multi_line_file(self):
|
def test_parse_basic_multi_line_file(self):
|
||||||
test_file = "Hello World\nHello again!"
|
test_file = "Hello World\n\nHello again!"
|
||||||
assert parse_markdown_file(self.file_name, test_file) == StreamFile(
|
assert parse_markdown_file(self.file_name, test_file) == StreamFile(
|
||||||
filename=self.file_name,
|
filename=self.file_name,
|
||||||
shard=Shard(
|
shard=Shard(
|
||||||
markers=[],
|
markers=[],
|
||||||
tags=[],
|
tags=[],
|
||||||
start_line=1,
|
start_line=1,
|
||||||
end_line=2,
|
end_line=3,
|
||||||
children=[],
|
children=[],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue