from streamer.parse import StreamFile, parse_markdown_file, Shard
from faker import Faker

fake = Faker()


class TestParseProcess:
    """Expectations for ``parse_markdown_file`` on small markdown inputs.

    Each test parses an inline document and compares the result against a
    hand-built ``StreamFile`` / ``Shard`` tree.
    """

    # Generated once when the class body is evaluated; shared by every test.
    file_name: str = fake.file_name(extension="md")

    def test_parse_empty_file(self):
        """An empty document is expected to produce a StreamFile with no shard."""
        parsed = parse_markdown_file(self.file_name, "")
        expected = StreamFile(filename=self.file_name, shard=None)
        assert parsed == expected

    def test_parse_basic_one_line_file(self):
        """A single plain line is expected as one shard covering line 1."""
        markdown = "Hello World"
        parsed = parse_markdown_file(self.file_name, markdown)
        root = Shard(
            markers=[],
            tags=[],
            start_line=1,
            end_line=1,
            children=[],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_basic_multi_line_file(self):
        """Two plain paragraphs are expected as one shard spanning lines 1-3."""
        markdown = "Hello World\n\nHello again!"
        parsed = parse_markdown_file(self.file_name, markdown)
        root = Shard(
            markers=[],
            tags=[],
            start_line=1,
            end_line=3,
            children=[],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_single_line_with_tag(self):
        """A leading ``@Tag`` is expected to be reported under ``markers``."""
        markdown = "@Tag Hello World"
        parsed = parse_markdown_file(self.file_name, markdown)
        root = Shard(
            markers=["Tag"],
            tags=[],
            start_line=1,
            end_line=1,
            children=[],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_single_line_with_two_tags(self):
        """Two leading ``@`` words are both expected under ``markers``, in order."""
        markdown = "@Tag1 @Tag2 Hello World"
        parsed = parse_markdown_file(self.file_name, markdown)
        root = Shard(
            markers=["Tag1", "Tag2"],
            tags=[],
            start_line=1,
            end_line=1,
            children=[],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_single_line_with_two_tags_and_misplaced_tag(self):
        """A trailing ``@Tag3`` after the text is expected under ``tags``."""
        markdown = "@Tag1 @Tag2 Hello World @Tag3"
        parsed = parse_markdown_file(self.file_name, markdown)
        root = Shard(
            markers=["Tag1", "Tag2"],
            tags=["Tag3"],
            start_line=1,
            end_line=1,
            children=[],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_split_paragraphs_into_shards(self):
        """Each marker-led paragraph is expected as a child of the root shard."""
        markdown = "Hello World!\n\n@Tag1 Block 1\n\n@Tag2 Block 2"
        parsed = parse_markdown_file(self.file_name, markdown)
        first_block = Shard(
            markers=["Tag1"],
            tags=[],
            start_line=3,
            end_line=3,
            children=[],
        )
        second_block = Shard(
            markers=["Tag2"],
            tags=[],
            start_line=5,
            end_line=5,
            children=[],
        )
        root = Shard(
            markers=[],
            tags=[],
            start_line=1,
            end_line=5,
            children=[first_block, second_block],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_split_paragraph_with_inner_tags_at_more_positions(self):
        """Tags in marker-less paragraphs are expected on the root shard.

        Only the paragraph starting with ``@Marker`` is expected as a child.
        """
        markdown = "Hello @Tag1 World!\n\n@Marker Block 1\n\nBlock 2 @Tag2"
        parsed = parse_markdown_file(self.file_name, markdown)
        marker_block = Shard(
            markers=["Marker"],
            tags=[],
            start_line=3,
            end_line=3,
            children=[],
        )
        root = Shard(
            markers=[],
            tags=["Tag1", "Tag2"],
            start_line=1,
            end_line=5,
            children=[marker_block],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_header_without_markers(self):
        """Headings without markers are expected to yield no child shards."""
        markdown = "# Heading\n\n## Subheading"
        parsed = parse_markdown_file(self.file_name, markdown)
        root = Shard(
            markers=[],
            tags=[],
            start_line=1,
            end_line=3,
            children=[],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_split_at_heading_if_marker_on_subheading(self):
        """A marker on a subheading is expected to produce nested shards.

        The expected tree has one shard per top-level heading, with the
        marked subheading nested under the first of them.
        """
        markdown = "# Heading @Tag1\n\n## @Marker1 Subheading @Tag2\n\n# Heading @Tag3"
        parsed = parse_markdown_file(self.file_name, markdown)
        subheading = Shard(
            markers=["Marker1"],
            tags=["Tag2"],
            start_line=3,
            end_line=4,
            children=[],
        )
        first_heading = Shard(
            markers=[],
            tags=["Tag1"],
            start_line=1,
            end_line=4,
            children=[subheading],
        )
        second_heading = Shard(
            markers=[], tags=["Tag3"], start_line=5, end_line=5, children=[]
        )
        root = Shard(
            markers=[],
            tags=[],
            start_line=1,
            end_line=5,
            children=[first_heading, second_heading],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_parse_only_parse_releveant_levels(self):
        """A marker-less subheading is expected to stay inside its parent shard.

        Its tag is expected alongside the heading's own tag on a single shard.
        """
        markdown = "# @Marker1 Heading @Tag1\n\n## Subheading @Tag2"
        parsed = parse_markdown_file(self.file_name, markdown)
        root = Shard(
            markers=["Marker1"],
            tags=["Tag1", "Tag2"],
            start_line=1,
            end_line=3,
            children=[],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected

    def test_continue_full_parsing_before_headings_start(self):
        """A marked paragraph before the first heading is still expected as a child.

        The marked heading that follows is expected as a sibling child.
        """
        markdown = "Hello\n\n@Marker1 World!\n\n# @Marker2 I'm a heading!"
        parsed = parse_markdown_file(self.file_name, markdown)
        paragraph = Shard(
            markers=["Marker1"],
            tags=[],
            start_line=3,
            end_line=3,
            children=[],
        )
        heading = Shard(
            markers=["Marker2"],
            tags=[],
            start_line=5,
            end_line=5,
            children=[],
        )
        root = Shard(
            markers=[],
            tags=[],
            start_line=1,
            end_line=5,
            children=[paragraph, heading],
        )
        expected = StreamFile(filename=self.file_name, shard=root)
        assert parsed == expected