refactor: rewrite in rust
All checks were successful
Continuous Integration / Lint, Check & Test (push) Successful in 1m38s
Continuous Integration / Build Package (push) Successful in 1m54s

This commit is contained in:
Konstantin Fickel 2026-03-29 18:19:15 +02:00
parent 20a3e8b437
commit ed493cff29
Signed by: kfickel
GPG key ID: A793722F9933C1A5
72 changed files with 5684 additions and 3688 deletions

5
src/extract/mod.rs Normal file
View file

@@ -0,0 +1,5 @@
mod parser;
mod tag_extraction;
pub use parser::parse_markdown_file;
pub use tag_extraction::{extract_markers_and_tags, has_markers};

739
src/extract/parser.rs Normal file
View file

@@ -0,0 +1,739 @@
use std::collections::HashMap;
use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
use crate::extract::tag_extraction::{extract_markers_and_tags, has_markers};
use crate::models::{Shard, StreamFile};
/// Information about a block element collected from the markdown parser.
#[derive(Debug, Clone)]
struct BlockInfo {
    /// 1-based line on which the block starts.
    start_line: usize,
    /// 1-based line on which the block ends (inclusive).
    end_line: usize,
    /// What kind of block-level element this is.
    block_type: BlockType,
    /// Parser events captured for this block; used later for marker/tag extraction.
    events: Vec<Event<'static>>,
}
/// Kind of block-level markdown element tracked during parsing.
#[derive(Debug, Clone, PartialEq)]
enum BlockType {
    Paragraph,
    /// Heading with its numeric level (1-6).
    Heading(usize),
    List,
    ListItem,
    CodeBlock,
    /// Currently never constructed; kept for future block kinds.
    #[allow(dead_code)]
    Other,
}
/// Build a shard, applying simplification rules.
///
/// When the shard would carry no markers or tags of its own and wrap exactly
/// one child covering the identical line range, the wrapper is dropped and
/// that child is returned directly.
fn build_shard(
    start_line: usize,
    end_line: usize,
    markers: Vec<String>,
    tags: Vec<String>,
    mut children: Vec<Shard>,
) -> Shard {
    let collapses_to_child = markers.is_empty()
        && tags.is_empty()
        && matches!(
            children.as_slice(),
            [only] if only.start_line == start_line && only.end_line == end_line
        );
    if collapses_to_child {
        children.pop().expect("guard ensured exactly one child")
    } else {
        Shard {
            markers,
            tags,
            start_line,
            end_line,
            children,
        }
    }
}
/// Merge shards where the first one becomes the parent with its markers/tags preserved.
///
/// The first shard is stretched to cover `start_line..=end_line`, adopts the
/// remaining shards as children, and gains `additional_tags`. An empty input
/// yields a plain shard carrying only `additional_tags`.
fn merge_into_first_shard(
    mut shards: Vec<Shard>,
    start_line: usize,
    end_line: usize,
    additional_tags: Vec<String>,
) -> Shard {
    if shards.is_empty() {
        return build_shard(start_line, end_line, Vec::new(), additional_tags, Vec::new());
    }
    // Everything after the head becomes the head's children.
    let rest = shards.split_off(1);
    let mut parent = shards.pop().expect("non-empty checked above");
    parent.start_line = start_line;
    parent.end_line = end_line;
    parent.children = rest;
    parent.tags.extend(additional_tags);
    parent
}
/// Parse a markdown file into a StreamFile with shard structure.
///
/// An empty file yields a single one-line shard; otherwise the file is parsed
/// (with strikethrough enabled) and split into a heading-based shard tree.
pub fn parse_markdown_file(file_name: &str, file_content: &str) -> StreamFile {
    // Empty input: nothing to parse, return the trivial one-line shard.
    if file_content.is_empty() {
        return StreamFile {
            file_name: file_name.to_string(),
            shard: Some(Shard::new(1, 1)),
        };
    }
    // lines() never yields fewer than one line for non-empty content, but
    // clamp anyway so end_line is always at least 1.
    let last_line = file_content.lines().count().max(1);
    let mut opts = Options::empty();
    opts.insert(Options::ENABLE_STRIKETHROUGH);
    let blocks = collect_blocks(file_content, Parser::new_ext(file_content, opts));
    let shard = match blocks.as_slice() {
        [] => Shard::new(1, last_line),
        _ => parse_header_shards(&blocks, 1, last_line, false)
            .unwrap_or_else(|| Shard::new(1, last_line)),
    };
    StreamFile {
        file_name: file_name.to_string(),
        shard: Some(shard),
    }
}
/// Collect block-level elements from the parser.
///
/// Walks the offset-annotated event stream and groups events into
/// `BlockInfo` records for paragraphs, headings, lists, list items and code
/// blocks, mapping each event's byte offset back to a 1-based line number.
/// List items are emitted immediately after their enclosing `List` block.
fn collect_blocks(content: &str, parser: Parser) -> Vec<BlockInfo> {
    let mut blocks = Vec::new();
    let mut current_block: Option<BlockInfo> = None;
    // Nesting depth of container tags; top-level blocks open/close at depth 0.
    let mut depth = 0;
    let mut list_items: Vec<BlockInfo> = Vec::new();
    let mut in_list = false;
    let mut list_start_line = 0;
    // Pre-compute the byte offset of every line start for offset-to-line mapping.
    let line_starts: Vec<usize> = std::iter::once(0)
        .chain(content.match_indices('\n').map(|(i, _)| i + 1))
        .collect();
    // partition_point counts line starts <= offset, which is exactly the
    // 1-based line number containing that offset.
    let offset_to_line =
        |offset: usize| -> usize { line_starts.partition_point(|&start| start <= offset) };
    for (event, range) in parser.into_offset_iter() {
        let line = offset_to_line(range.start);
        match &event {
            Event::Start(Tag::Paragraph) => {
                if depth == 0 {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::Paragraph,
                        events: Vec::new(),
                    });
                }
                depth += 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
            Event::End(TagEnd::Paragraph) => {
                depth -= 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                    block.end_line = line;
                }
                if depth == 0 {
                    if let Some(block) = current_block.take() {
                        // Paragraphs inside a list belong to the list item.
                        if in_list {
                            list_items.push(block);
                        } else {
                            blocks.push(block);
                        }
                    }
                }
            }
            Event::Start(Tag::Heading { level, .. }) => {
                let heading_level = heading_level_to_usize(*level);
                if depth == 0 {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::Heading(heading_level),
                        events: Vec::new(),
                    });
                }
                depth += 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
            Event::End(TagEnd::Heading(_)) => {
                depth -= 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                    block.end_line = line;
                }
                if depth == 0 {
                    if let Some(block) = current_block.take() {
                        blocks.push(block);
                    }
                }
            }
            Event::Start(Tag::List(_)) => {
                // Only the outermost list starts a new list context.
                if !in_list {
                    in_list = true;
                    list_start_line = line;
                    list_items.clear();
                }
                depth += 1;
            }
            Event::End(TagEnd::List(_)) => {
                depth -= 1;
                if depth == 0 && in_list {
                    in_list = false;
                    // Create a list block containing all list items
                    if !list_items.is_empty() {
                        blocks.push(BlockInfo {
                            start_line: list_start_line,
                            end_line: line,
                            block_type: BlockType::List,
                            events: vec![], // List events are handled through list_items
                        });
                        // Store list items for later processing
                        for item in list_items.drain(..) {
                            blocks.push(BlockInfo {
                                block_type: BlockType::ListItem,
                                ..item
                            });
                        }
                    }
                }
            }
            Event::Start(Tag::Item) => {
                if in_list {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::ListItem,
                        events: Vec::new(),
                    });
                }
            }
            Event::End(TagEnd::Item) => {
                if let Some(ref mut block) = current_block {
                    block.end_line = line;
                }
                if let Some(block) = current_block.take() {
                    list_items.push(block);
                }
            }
            Event::Start(Tag::CodeBlock(_)) => {
                if depth == 0 {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::CodeBlock,
                        events: Vec::new(),
                    });
                }
                depth += 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
            Event::End(TagEnd::CodeBlock) => {
                depth -= 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                    block.end_line = line;
                }
                if depth == 0 {
                    if let Some(block) = current_block.take() {
                        blocks.push(block);
                    }
                }
            }
            _ => {
                // Inline events (text, emphasis, links, ...) are attached to
                // whatever block is currently open.
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
        }
    }
    blocks
}
/// Convert a pulldown-cmark `HeadingLevel` into its numeric depth (1-6).
fn heading_level_to_usize(level: HeadingLevel) -> usize {
    match level {
        HeadingLevel::H1 => 1,
        HeadingLevel::H2 => 2,
        HeadingLevel::H3 => 3,
        HeadingLevel::H4 => 4,
        HeadingLevel::H5 => 5,
        HeadingLevel::H6 => 6,
    }
}
/// Check if a block has markers (an `@`-identifier before any real content).
fn block_has_markers(block: &BlockInfo) -> bool {
    let events = block.events.iter().cloned();
    has_markers(events)
}
/// Extract the `(markers, tags)` pair from a block's captured events.
fn extract_block_markers_and_tags(block: &BlockInfo) -> (Vec<String>, Vec<String>) {
    let events = block.events.iter().cloned();
    extract_markers_and_tags(events)
}
/// Find positions of paragraph blocks that have markers.
fn find_paragraph_shard_positions(blocks: &[BlockInfo]) -> Vec<usize> {
    let mut positions = Vec::new();
    for (index, block) in blocks.iter().enumerate() {
        if block.block_type == BlockType::Paragraph && block_has_markers(block) {
            positions.push(index);
        }
    }
    positions
}
/// Find positions of headings at a specific level.
fn find_headings_by_level(blocks: &[BlockInfo], level: usize) -> Vec<usize> {
    let mut positions = Vec::new();
    for (index, block) in blocks.iter().enumerate() {
        if matches!(block.block_type, BlockType::Heading(l) if l == level) {
            positions.push(index);
        }
    }
    positions
}
/// Calculate the heading level to split on for the next parsing step.
///
/// Returns `None` when no heading after the first block carries a marker.
/// Otherwise returns the minimum heading level that either occurs at least
/// twice in `blocks` or carries a marker (first block excluded).
fn calculate_heading_level_for_next_split(blocks: &[BlockInfo]) -> Option<usize> {
    // Heading levels with markers, excluding the first block. `get(1..)`
    // avoids the panic `blocks[1..]` would raise on an empty slice.
    let levels_with_markers: Vec<usize> = blocks
        .get(1..)
        .unwrap_or(&[])
        .iter()
        .filter_map(|block| match block.block_type {
            BlockType::Heading(level) if block_has_markers(block) => Some(level),
            _ => None,
        })
        .collect();
    if levels_with_markers.is_empty() {
        return None;
    }
    // Count how many headings exist at each level.
    let mut level_counts: HashMap<usize, usize> = HashMap::new();
    for block in blocks {
        if let BlockType::Heading(level) = block.block_type {
            *level_counts.entry(level).or_default() += 1;
        }
    }
    // Candidates: levels appearing at least twice, plus marker-bearing levels.
    level_counts
        .into_iter()
        .filter(|&(_, count)| count >= 2)
        .map(|(level, _)| level)
        .chain(levels_with_markers)
        .min()
}
/// Split a slice at the given positions.
///
/// Positions are treated as segment start indices; `0` and `items.len()` are
/// implied, duplicates collapse, and out-of-range positions are ignored
/// (rather than panicking when slicing). Empty segments are dropped, so an
/// empty input yields no segments.
fn split_at<T: Clone>(items: &[T], positions: &[usize]) -> Vec<Vec<T>> {
    let mut cuts: Vec<usize> = vec![0];
    // Ignore positions past the end of the slice instead of panicking.
    cuts.extend(positions.iter().copied().filter(|&p| p <= items.len()));
    cuts.push(items.len());
    cuts.sort_unstable();
    cuts.dedup();
    cuts.windows(2)
        .map(|window| items[window[0]..window[1]].to_vec())
        .filter(|segment| !segment.is_empty())
        .collect()
}
/// Parse blocks into a shard hierarchy based on headings.
///
/// Recursively splits `blocks` at the heading level chosen by
/// [`calculate_heading_level_for_next_split`]; when no split level exists the
/// blocks are handled flat via `parse_multiple_block_shards`. When
/// `use_first_child_as_header` is set, the first child becomes the parent of
/// its siblings.
fn parse_header_shards(
    blocks: &[BlockInfo],
    start_line: usize,
    end_line: usize,
    use_first_child_as_header: bool,
) -> Option<Shard> {
    if blocks.is_empty() {
        return Some(build_shard(start_line, end_line, vec![], vec![], vec![]));
    }
    // No heading level worth splitting on: parse the blocks as a flat group.
    let Some(heading_level) = calculate_heading_level_for_next_split(blocks) else {
        return parse_multiple_block_shards(blocks, start_line, end_line, true).0;
    };
    let heading_positions = find_headings_by_level(blocks, heading_level);
    let block_groups = split_at(blocks, &heading_positions);
    let mut children = Vec::new();
    for (i, group) in block_groups.iter().enumerate() {
        if group.is_empty() {
            continue;
        }
        let child_start_line = group[0].start_line;
        // A group ends one line before the next group starts, or at end_line
        // for the final group.
        let child_end_line = if i + 1 < block_groups.len() && !block_groups[i + 1].is_empty() {
            block_groups[i + 1][0].start_line - 1
        } else {
            end_line
        };
        if let Some(child_shard) = parse_header_shards(
            group,
            child_start_line,
            child_end_line,
            // Every group but the first starts with a heading; the first does
            // too when a heading sits at position 0.
            i > 0 || heading_positions.contains(&0),
        ) {
            children.push(child_shard);
        }
    }
    if use_first_child_as_header && !children.is_empty() {
        Some(merge_into_first_shard(
            children,
            start_line,
            end_line,
            vec![],
        ))
    } else {
        Some(build_shard(start_line, end_line, vec![], vec![], children))
    }
}
/// Parse multiple blocks into shards.
///
/// Returns the shard built from `blocks` (or `None` when no child shard was
/// produced and `enforce_shard` is false) together with any loose tags that
/// could not be attached to a child shard.
fn parse_multiple_block_shards(
    blocks: &[BlockInfo],
    start_line: usize,
    end_line: usize,
    enforce_shard: bool,
) -> (Option<Shard>, Vec<String>) {
    if blocks.is_empty() {
        if enforce_shard {
            return (
                Some(build_shard(start_line, end_line, vec![], vec![], vec![])),
                vec![],
            );
        }
        return (None, vec![]);
    }
    // A leading heading that itself carries markers becomes the parent shard.
    let is_first_block_heading =
        matches!(blocks[0].block_type, BlockType::Heading(_)) && block_has_markers(&blocks[0]);
    let paragraph_positions = find_paragraph_shard_positions(blocks);
    let mut children = Vec::new();
    let mut tags = Vec::new();
    // Overwritten on every marker-bearing paragraph, so it ends up true only
    // when the FIRST block is the last (i.e. only) marker paragraph seen.
    // NOTE(review): relies on later iterations resetting it to false — looks
    // intentional ("only"), but confirm against the intended semantics.
    let mut is_first_block_only_with_marker = false;
    for (i, block) in blocks.iter().enumerate() {
        if paragraph_positions.contains(&i) {
            is_first_block_only_with_marker = i == 0;
        }
        // A child covers from its own start to the line before the next
        // block, or to end_line for the final block.
        let child_start_line = block.start_line;
        let child_end_line = if i + 1 < blocks.len() {
            blocks[i + 1].start_line - 1
        } else {
            end_line
        };
        let (child_shard, child_tags) =
            parse_single_block_shard(block, child_start_line, child_end_line);
        if let Some(shard) = child_shard {
            children.push(shard);
        }
        tags.extend(child_tags);
    }
    if children.is_empty() && !enforce_shard {
        return (None, tags);
    }
    if is_first_block_heading || is_first_block_only_with_marker {
        (
            Some(merge_into_first_shard(children, start_line, end_line, tags)),
            vec![],
        )
    } else {
        (
            Some(build_shard(start_line, end_line, vec![], tags, children)),
            vec![],
        )
    }
}
/// Parse a single block into a shard.
///
/// Returns `(Some(shard), [])` when the block carries markers, otherwise
/// `(None, tags)` so that loose tags can be attached by the caller. Code
/// blocks and `Other` yield nothing.
fn parse_single_block_shard(
    block: &BlockInfo,
    start_line: usize,
    end_line: usize,
) -> (Option<Shard>, Vec<String>) {
    match block.block_type {
        // Paragraphs, headings, lists and list items are treated uniformly:
        // the two previously duplicated arms were merged.
        BlockType::Paragraph
        | BlockType::Heading(_)
        | BlockType::List
        | BlockType::ListItem => {
            let (markers, tags) = extract_block_markers_and_tags(block);
            if markers.is_empty() {
                (None, tags)
            } else {
                (
                    Some(build_shard(start_line, end_line, markers, tags, vec![])),
                    vec![],
                )
            }
        }
        // Exhaustive on purpose: adding a BlockType variant forces a decision
        // here instead of being silently dropped by a `_` arm.
        BlockType::CodeBlock | BlockType::Other => (None, vec![]),
    }
}
#[cfg(test)]
mod tests {
    //! End-to-end tests for `parse_markdown_file`: shard line ranges,
    //! marker/tag placement, and tag detection inside inline formatting.
    use super::*;

    /// Common fixture file name shared by every test.
    fn make_file_name() -> String {
        "test.md".to_string()
    }

    #[test]
    fn test_parse_empty_file() {
        let result = parse_markdown_file(&make_file_name(), "");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 1)),
            }
        );
    }

    #[test]
    fn test_parse_basic_one_line_file() {
        let result = parse_markdown_file(&make_file_name(), "Hello World");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 1)),
            }
        );
    }

    #[test]
    fn test_parse_basic_multi_line_file() {
        let result = parse_markdown_file(&make_file_name(), "Hello World\n\nHello again!");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 3)),
            }
        );
    }

    #[test]
    fn test_parse_single_line_with_tag() {
        // A leading @-identifier becomes a marker, not a tag.
        let result = parse_markdown_file(&make_file_name(), "@Tag Hello World");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["Tag".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_single_line_with_two_tags() {
        let result = parse_markdown_file(&make_file_name(), "@Marker1 @Marker2 Hello World");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["Marker1".to_string(), "Marker2".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_single_line_with_two_tags_and_misplaced_tag() {
        // An @-identifier after content is recorded as a tag instead.
        let result = parse_markdown_file(&make_file_name(), "@Tag1 @Tag2 Hello World @Tag3");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["Tag1".to_string(), "Tag2".to_string()],
                    tags: vec!["Tag3".to_string()],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_header_without_markers() {
        // Headings without markers do not create nested shards.
        let result = parse_markdown_file(&make_file_name(), "# Heading\n\n## Subheading");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 3)),
            }
        );
    }

    #[test]
    fn test_parse_ignores_tags_in_code() {
        let result = parse_markdown_file(&make_file_name(), "```\n@Marker\n```");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 3)),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_italic_text() {
        let result = parse_markdown_file(&make_file_name(), "*@ItalicMarker*");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["ItalicMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_bold_text() {
        let result = parse_markdown_file(&make_file_name(), "**@BoldMarker**");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["BoldMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_strikethrough_text() {
        let result = parse_markdown_file(&make_file_name(), "~~@StrikeMarker~~");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["StrikeMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_link() {
        let result = parse_markdown_file(&make_file_name(), "[@LinkMarker](https://example.com)");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["LinkMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_continues_looking_for_markers_after_first_link_marker() {
        let result = parse_markdown_file(
            &make_file_name(),
            "[@LinkMarker1](https://example.com) [@LinkMarker2](https://example.com)",
        );
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["LinkMarker1".to_string(), "LinkMarker2".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }
}

View file

@@ -0,0 +1,219 @@
use once_cell::sync::Lazy;
use pulldown_cmark::{Event, Tag, TagEnd};
use regex::Regex;
/// Regex pattern for matching @Tags.
/// Matches @ followed by any characters except whitespace, *, `, ~, [, ]
/// (presumably excluded because they are Markdown formatting delimiters —
/// see the emphasis/strikethrough/link tests in this module).
// Compiled lazily on first use. `tokenize` slices by match offsets, so the
// capture group is currently unused.
static TAG_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"@([^\s*`~\[\]]+)").unwrap());
/// Token type for tag extraction state machine.
#[derive(Debug, Clone)]
enum Token {
    /// An `@`-prefixed identifier; the stored name excludes the leading `@`.
    Tag(String),
    /// Any non-tag text containing at least one non-whitespace character.
    Content,
    /// A run consisting solely of whitespace.
    Whitespace,
}
/// Tokenizes text content into `Tag`, `Content`, and `Whitespace` tokens.
///
/// Spans between (and after) `TAG_PATTERN` matches become `Whitespace` when
/// they consist solely of whitespace characters, otherwise `Content`.
fn tokenize(text: &str) -> Vec<Token> {
    // Classify a non-tag span; shared by the in-between and trailing cases
    // (previously duplicated inline).
    fn classify(span: &str) -> Token {
        if span.chars().all(char::is_whitespace) {
            Token::Whitespace
        } else {
            Token::Content
        }
    }
    let mut tokens = Vec::new();
    let mut last_end = 0;
    for mat in TAG_PATTERN.find_iter(text) {
        // Handle any text between the previous match and this one.
        let before = &text[last_end..mat.start()];
        if !before.is_empty() {
            tokens.push(classify(before));
        }
        // Strip the leading '@' to get the tag name.
        tokens.push(Token::Tag(text[mat.start() + 1..mat.end()].to_string()));
        last_end = mat.end();
    }
    // Text after the final match is one more classified span; `last_end <
    // text.len()` already guarantees it is non-empty.
    if last_end < text.len() {
        tokens.push(classify(&text[last_end..]));
    }
    tokens
}
/// Extract markers and tags from a sequence of pulldown-cmark events.
///
/// Markers are @-prefixed identifiers that appear before any non-whitespace content.
/// Tags are @-prefixed identifiers that appear after content has started.
///
/// Returns (markers, tags).
pub fn extract_markers_and_tags<'a>(
    events: impl Iterator<Item = Event<'a>>,
) -> (Vec<String>, Vec<String>) {
    let mut markers = Vec::new();
    let mut tags = Vec::new();
    // Once real content has been seen, every further @-identifier is a tag.
    let mut content_seen = false;
    // True while inside a code or metadata block, whose text is never scanned.
    let mut inside_code = false;
    for event in events {
        match event {
            Event::Start(Tag::CodeBlock(_)) | Event::Start(Tag::MetadataBlock(_)) => {
                inside_code = true;
            }
            Event::End(TagEnd::CodeBlock) | Event::End(TagEnd::MetadataBlock(_)) => {
                inside_code = false;
            }
            Event::Code(_) => {
                // Inline code counts as content but is not scanned for tags.
                content_seen = true;
            }
            Event::Text(text) | Event::InlineHtml(text) if !inside_code => {
                for token in tokenize(&text) {
                    match token {
                        Token::Tag(name) if content_seen => tags.push(name),
                        Token::Tag(name) => markers.push(name),
                        Token::Content => content_seen = true,
                        Token::Whitespace => {}
                    }
                }
            }
            _ => {}
        }
    }
    (markers, tags)
}
/// Check if the events contain any markers (tags before content).
pub fn has_markers<'a>(events: impl Iterator<Item = Event<'a>>) -> bool {
    !extract_markers_and_tags(events).0.is_empty()
}
#[cfg(test)]
mod tests {
    //! Tests for the tokenizer-driven marker/tag extraction, driven through a
    //! real pulldown-cmark parse of small markdown snippets.
    use super::*;
    use pulldown_cmark::Parser;

    /// Parse `text` (with strikethrough enabled) and extract (markers, tags).
    fn extract_from_text(text: &str) -> (Vec<String>, Vec<String>) {
        let mut options = pulldown_cmark::Options::empty();
        options.insert(pulldown_cmark::Options::ENABLE_STRIKETHROUGH);
        let parser = Parser::new_ext(text, options);
        extract_markers_and_tags(parser)
    }

    #[test]
    fn test_extract_single_marker() {
        let (markers, tags) = extract_from_text("@Tag Hello World");
        assert_eq!(markers, vec!["Tag"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_two_markers() {
        let (markers, tags) = extract_from_text("@Marker1 @Marker2 Hello World");
        assert_eq!(markers, vec!["Marker1", "Marker2"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_markers_and_tags() {
        // Identifiers after content become tags rather than markers.
        let (markers, tags) = extract_from_text("@Tag1 @Tag2 Hello World @Tag3");
        assert_eq!(markers, vec!["Tag1", "Tag2"]);
        assert_eq!(tags, vec!["Tag3"]);
    }

    #[test]
    fn test_extract_inner_tags() {
        let (markers, tags) = extract_from_text("Hello @Tag1 World!");
        assert!(markers.is_empty());
        assert_eq!(tags, vec!["Tag1"]);
    }

    #[test]
    fn test_extract_ignores_code_blocks() {
        let (markers, tags) = extract_from_text("```\n@Marker\n```");
        assert!(markers.is_empty());
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_italic_marker() {
        let (markers, tags) = extract_from_text("*@ItalicMarker*");
        assert_eq!(markers, vec!["ItalicMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_bold_marker() {
        let (markers, tags) = extract_from_text("**@BoldMarker**");
        assert_eq!(markers, vec!["BoldMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_strikethrough_marker() {
        let (markers, tags) = extract_from_text("~~@StrikeMarker~~");
        assert_eq!(markers, vec!["StrikeMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_link_marker() {
        let (markers, tags) = extract_from_text("[@LinkMarker](https://example.com)");
        assert_eq!(markers, vec!["LinkMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_multiple_link_markers() {
        let (markers, tags) = extract_from_text(
            "[@LinkMarker1](https://example.com) [@LinkMarker2](https://example.com)",
        );
        assert_eq!(markers, vec!["LinkMarker1", "LinkMarker2"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_has_markers_true() {
        let parser = Parser::new("@Tag Hello");
        assert!(has_markers(parser));
    }

    #[test]
    fn test_has_markers_false() {
        let parser = Parser::new("Hello @Tag");
        assert!(!has_markers(parser));
    }

    #[test]
    fn test_empty_text() {
        let (markers, tags) = extract_from_text("");
        assert!(markers.is_empty());
        assert!(tags.is_empty());
    }

    #[test]
    fn test_no_tags() {
        let (markers, tags) = extract_from_text("Hello World");
        assert!(markers.is_empty());
        assert!(tags.is_empty());
    }
}