refactor: rewrite in rust
All checks were successful
Continuous Integration / Lint, Check & Test (push) Successful in 1m38s
Continuous Integration / Build Package (push) Successful in 1m54s

This commit is contained in:
Konstantin Fickel 2026-03-29 18:19:15 +02:00
parent 20a3e8b437
commit ed493cff29
Signed by: kfickel
GPG key ID: A793722F9933C1A5
72 changed files with 5684 additions and 3688 deletions

5
src/extract/mod.rs Normal file
View file

@@ -0,0 +1,5 @@
mod parser;
mod tag_extraction;
pub use parser::parse_markdown_file;
pub use tag_extraction::{extract_markers_and_tags, has_markers};

739
src/extract/parser.rs Normal file
View file

@@ -0,0 +1,739 @@
use std::collections::HashMap;
use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
use crate::extract::tag_extraction::{extract_markers_and_tags, has_markers};
use crate::models::{Shard, StreamFile};
/// Information about a block element collected from the markdown parser.
#[derive(Debug, Clone)]
struct BlockInfo {
    /// 1-based line on which the block starts.
    start_line: usize,
    /// 1-based line on which the block ends (inclusive).
    end_line: usize,
    /// What kind of block-level element this is.
    block_type: BlockType,
    /// Parser events captured for this block; used later for marker/tag extraction.
    events: Vec<Event<'static>>,
}
/// Kind of block-level markdown element tracked during parsing.
#[derive(Debug, Clone, PartialEq)]
enum BlockType {
    Paragraph,
    /// Heading with its numeric level (1-6).
    Heading(usize),
    List,
    ListItem,
    CodeBlock,
    /// Currently never constructed; kept for future block kinds.
    #[allow(dead_code)]
    Other,
}
/// Build a shard, applying simplification rules.
///
/// When the shard would carry no markers or tags of its own and wrap exactly
/// one child covering the identical line range, the wrapper is dropped and
/// that child is returned directly.
fn build_shard(
    start_line: usize,
    end_line: usize,
    markers: Vec<String>,
    tags: Vec<String>,
    mut children: Vec<Shard>,
) -> Shard {
    let collapses_to_child = markers.is_empty()
        && tags.is_empty()
        && matches!(
            children.as_slice(),
            [only] if only.start_line == start_line && only.end_line == end_line
        );
    if collapses_to_child {
        children.pop().expect("guard ensured exactly one child")
    } else {
        Shard {
            markers,
            tags,
            start_line,
            end_line,
            children,
        }
    }
}
/// Merge shards where the first one becomes the parent with its markers/tags preserved.
///
/// The first shard is stretched to cover `start_line..=end_line`, adopts the
/// remaining shards as children, and gains `additional_tags`. An empty input
/// yields a plain shard carrying only `additional_tags`.
fn merge_into_first_shard(
    mut shards: Vec<Shard>,
    start_line: usize,
    end_line: usize,
    additional_tags: Vec<String>,
) -> Shard {
    if shards.is_empty() {
        return build_shard(start_line, end_line, Vec::new(), additional_tags, Vec::new());
    }
    // Everything after the head becomes the head's children.
    let rest = shards.split_off(1);
    let mut parent = shards.pop().expect("non-empty checked above");
    parent.start_line = start_line;
    parent.end_line = end_line;
    parent.children = rest;
    parent.tags.extend(additional_tags);
    parent
}
/// Parse a markdown file into a StreamFile with shard structure.
///
/// An empty file yields a single one-line shard; otherwise the file is parsed
/// (with strikethrough enabled) and split into a heading-based shard tree.
pub fn parse_markdown_file(file_name: &str, file_content: &str) -> StreamFile {
    // Empty input: nothing to parse, return the trivial one-line shard.
    if file_content.is_empty() {
        return StreamFile {
            file_name: file_name.to_string(),
            shard: Some(Shard::new(1, 1)),
        };
    }
    // lines() never yields fewer than one line for non-empty content, but
    // clamp anyway so end_line is always at least 1.
    let last_line = file_content.lines().count().max(1);
    let mut opts = Options::empty();
    opts.insert(Options::ENABLE_STRIKETHROUGH);
    let blocks = collect_blocks(file_content, Parser::new_ext(file_content, opts));
    let shard = match blocks.as_slice() {
        [] => Shard::new(1, last_line),
        _ => parse_header_shards(&blocks, 1, last_line, false)
            .unwrap_or_else(|| Shard::new(1, last_line)),
    };
    StreamFile {
        file_name: file_name.to_string(),
        shard: Some(shard),
    }
}
/// Collect block-level elements from the parser.
///
/// Walks the offset-annotated event stream and groups events into
/// `BlockInfo` records for paragraphs, headings, lists, list items and code
/// blocks, mapping each event's byte offset back to a 1-based line number.
/// List items are emitted immediately after their enclosing `List` block.
fn collect_blocks(content: &str, parser: Parser) -> Vec<BlockInfo> {
    let mut blocks = Vec::new();
    let mut current_block: Option<BlockInfo> = None;
    // Nesting depth of container tags; top-level blocks open/close at depth 0.
    let mut depth = 0;
    let mut list_items: Vec<BlockInfo> = Vec::new();
    let mut in_list = false;
    let mut list_start_line = 0;
    // Pre-compute the byte offset of every line start for offset-to-line mapping.
    let line_starts: Vec<usize> = std::iter::once(0)
        .chain(content.match_indices('\n').map(|(i, _)| i + 1))
        .collect();
    // partition_point counts line starts <= offset, which is exactly the
    // 1-based line number containing that offset.
    let offset_to_line =
        |offset: usize| -> usize { line_starts.partition_point(|&start| start <= offset) };
    for (event, range) in parser.into_offset_iter() {
        let line = offset_to_line(range.start);
        match &event {
            Event::Start(Tag::Paragraph) => {
                if depth == 0 {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::Paragraph,
                        events: Vec::new(),
                    });
                }
                depth += 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
            Event::End(TagEnd::Paragraph) => {
                depth -= 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                    block.end_line = line;
                }
                if depth == 0 {
                    if let Some(block) = current_block.take() {
                        // Paragraphs inside a list belong to the list item.
                        if in_list {
                            list_items.push(block);
                        } else {
                            blocks.push(block);
                        }
                    }
                }
            }
            Event::Start(Tag::Heading { level, .. }) => {
                let heading_level = heading_level_to_usize(*level);
                if depth == 0 {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::Heading(heading_level),
                        events: Vec::new(),
                    });
                }
                depth += 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
            Event::End(TagEnd::Heading(_)) => {
                depth -= 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                    block.end_line = line;
                }
                if depth == 0 {
                    if let Some(block) = current_block.take() {
                        blocks.push(block);
                    }
                }
            }
            Event::Start(Tag::List(_)) => {
                // Only the outermost list starts a new list context.
                if !in_list {
                    in_list = true;
                    list_start_line = line;
                    list_items.clear();
                }
                depth += 1;
            }
            Event::End(TagEnd::List(_)) => {
                depth -= 1;
                if depth == 0 && in_list {
                    in_list = false;
                    // Create a list block containing all list items
                    if !list_items.is_empty() {
                        blocks.push(BlockInfo {
                            start_line: list_start_line,
                            end_line: line,
                            block_type: BlockType::List,
                            events: vec![], // List events are handled through list_items
                        });
                        // Store list items for later processing
                        for item in list_items.drain(..) {
                            blocks.push(BlockInfo {
                                block_type: BlockType::ListItem,
                                ..item
                            });
                        }
                    }
                }
            }
            Event::Start(Tag::Item) => {
                if in_list {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::ListItem,
                        events: Vec::new(),
                    });
                }
            }
            Event::End(TagEnd::Item) => {
                if let Some(ref mut block) = current_block {
                    block.end_line = line;
                }
                if let Some(block) = current_block.take() {
                    list_items.push(block);
                }
            }
            Event::Start(Tag::CodeBlock(_)) => {
                if depth == 0 {
                    current_block = Some(BlockInfo {
                        start_line: line,
                        end_line: line,
                        block_type: BlockType::CodeBlock,
                        events: Vec::new(),
                    });
                }
                depth += 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
            Event::End(TagEnd::CodeBlock) => {
                depth -= 1;
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                    block.end_line = line;
                }
                if depth == 0 {
                    if let Some(block) = current_block.take() {
                        blocks.push(block);
                    }
                }
            }
            _ => {
                // Inline events (text, emphasis, links, ...) are attached to
                // whatever block is currently open.
                if let Some(ref mut block) = current_block {
                    block.events.push(event.clone().into_static());
                }
            }
        }
    }
    blocks
}
/// Convert a pulldown-cmark `HeadingLevel` into its numeric depth (1-6).
fn heading_level_to_usize(level: HeadingLevel) -> usize {
    match level {
        HeadingLevel::H1 => 1,
        HeadingLevel::H2 => 2,
        HeadingLevel::H3 => 3,
        HeadingLevel::H4 => 4,
        HeadingLevel::H5 => 5,
        HeadingLevel::H6 => 6,
    }
}
/// Check if a block has markers (an `@`-identifier before any real content).
fn block_has_markers(block: &BlockInfo) -> bool {
    let events = block.events.iter().cloned();
    has_markers(events)
}
/// Extract the `(markers, tags)` pair from a block's captured events.
fn extract_block_markers_and_tags(block: &BlockInfo) -> (Vec<String>, Vec<String>) {
    let events = block.events.iter().cloned();
    extract_markers_and_tags(events)
}
/// Find positions of paragraph blocks that have markers.
fn find_paragraph_shard_positions(blocks: &[BlockInfo]) -> Vec<usize> {
    let mut positions = Vec::new();
    for (index, block) in blocks.iter().enumerate() {
        if block.block_type == BlockType::Paragraph && block_has_markers(block) {
            positions.push(index);
        }
    }
    positions
}
/// Find positions of headings at a specific level.
fn find_headings_by_level(blocks: &[BlockInfo], level: usize) -> Vec<usize> {
    let mut positions = Vec::new();
    for (index, block) in blocks.iter().enumerate() {
        if matches!(block.block_type, BlockType::Heading(l) if l == level) {
            positions.push(index);
        }
    }
    positions
}
/// Calculate the heading level to split on for the next parsing step.
///
/// Returns `None` when no heading after the first block carries a marker.
/// Otherwise returns the minimum heading level that either occurs at least
/// twice in `blocks` or carries a marker (first block excluded).
fn calculate_heading_level_for_next_split(blocks: &[BlockInfo]) -> Option<usize> {
    // Heading levels with markers, excluding the first block. `get(1..)`
    // avoids the panic `blocks[1..]` would raise on an empty slice.
    let levels_with_markers: Vec<usize> = blocks
        .get(1..)
        .unwrap_or(&[])
        .iter()
        .filter_map(|block| match block.block_type {
            BlockType::Heading(level) if block_has_markers(block) => Some(level),
            _ => None,
        })
        .collect();
    if levels_with_markers.is_empty() {
        return None;
    }
    // Count how many headings exist at each level.
    let mut level_counts: HashMap<usize, usize> = HashMap::new();
    for block in blocks {
        if let BlockType::Heading(level) = block.block_type {
            *level_counts.entry(level).or_default() += 1;
        }
    }
    // Candidates: levels appearing at least twice, plus marker-bearing levels.
    level_counts
        .into_iter()
        .filter(|&(_, count)| count >= 2)
        .map(|(level, _)| level)
        .chain(levels_with_markers)
        .min()
}
/// Split a slice at the given positions.
///
/// Positions are treated as segment start indices; `0` and `items.len()` are
/// implied, duplicates collapse, and out-of-range positions are ignored
/// (rather than panicking when slicing). Empty segments are dropped, so an
/// empty input yields no segments.
fn split_at<T: Clone>(items: &[T], positions: &[usize]) -> Vec<Vec<T>> {
    let mut cuts: Vec<usize> = vec![0];
    // Ignore positions past the end of the slice instead of panicking.
    cuts.extend(positions.iter().copied().filter(|&p| p <= items.len()));
    cuts.push(items.len());
    cuts.sort_unstable();
    cuts.dedup();
    cuts.windows(2)
        .map(|window| items[window[0]..window[1]].to_vec())
        .filter(|segment| !segment.is_empty())
        .collect()
}
/// Parse blocks into a shard hierarchy based on headings.
///
/// Recursively splits `blocks` at the heading level chosen by
/// [`calculate_heading_level_for_next_split`]; when no split level exists the
/// blocks are handled flat via `parse_multiple_block_shards`. When
/// `use_first_child_as_header` is set, the first child becomes the parent of
/// its siblings.
fn parse_header_shards(
    blocks: &[BlockInfo],
    start_line: usize,
    end_line: usize,
    use_first_child_as_header: bool,
) -> Option<Shard> {
    if blocks.is_empty() {
        return Some(build_shard(start_line, end_line, vec![], vec![], vec![]));
    }
    // No heading level worth splitting on: parse the blocks as a flat group.
    let Some(heading_level) = calculate_heading_level_for_next_split(blocks) else {
        return parse_multiple_block_shards(blocks, start_line, end_line, true).0;
    };
    let heading_positions = find_headings_by_level(blocks, heading_level);
    let block_groups = split_at(blocks, &heading_positions);
    let mut children = Vec::new();
    for (i, group) in block_groups.iter().enumerate() {
        if group.is_empty() {
            continue;
        }
        let child_start_line = group[0].start_line;
        // A group ends one line before the next group starts, or at end_line
        // for the final group.
        let child_end_line = if i + 1 < block_groups.len() && !block_groups[i + 1].is_empty() {
            block_groups[i + 1][0].start_line - 1
        } else {
            end_line
        };
        if let Some(child_shard) = parse_header_shards(
            group,
            child_start_line,
            child_end_line,
            // Every group but the first starts with a heading; the first does
            // too when a heading sits at position 0.
            i > 0 || heading_positions.contains(&0),
        ) {
            children.push(child_shard);
        }
    }
    if use_first_child_as_header && !children.is_empty() {
        Some(merge_into_first_shard(
            children,
            start_line,
            end_line,
            vec![],
        ))
    } else {
        Some(build_shard(start_line, end_line, vec![], vec![], children))
    }
}
/// Parse multiple blocks into shards.
///
/// Returns the shard built from `blocks` (or `None` when no child shard was
/// produced and `enforce_shard` is false) together with any loose tags that
/// could not be attached to a child shard.
fn parse_multiple_block_shards(
    blocks: &[BlockInfo],
    start_line: usize,
    end_line: usize,
    enforce_shard: bool,
) -> (Option<Shard>, Vec<String>) {
    if blocks.is_empty() {
        if enforce_shard {
            return (
                Some(build_shard(start_line, end_line, vec![], vec![], vec![])),
                vec![],
            );
        }
        return (None, vec![]);
    }
    // A leading heading that itself carries markers becomes the parent shard.
    let is_first_block_heading =
        matches!(blocks[0].block_type, BlockType::Heading(_)) && block_has_markers(&blocks[0]);
    let paragraph_positions = find_paragraph_shard_positions(blocks);
    let mut children = Vec::new();
    let mut tags = Vec::new();
    // Overwritten on every marker-bearing paragraph, so it ends up true only
    // when the FIRST block is the last (i.e. only) marker paragraph seen.
    // NOTE(review): relies on later iterations resetting it to false — looks
    // intentional ("only"), but confirm against the intended semantics.
    let mut is_first_block_only_with_marker = false;
    for (i, block) in blocks.iter().enumerate() {
        if paragraph_positions.contains(&i) {
            is_first_block_only_with_marker = i == 0;
        }
        // A child covers from its own start to the line before the next
        // block, or to end_line for the final block.
        let child_start_line = block.start_line;
        let child_end_line = if i + 1 < blocks.len() {
            blocks[i + 1].start_line - 1
        } else {
            end_line
        };
        let (child_shard, child_tags) =
            parse_single_block_shard(block, child_start_line, child_end_line);
        if let Some(shard) = child_shard {
            children.push(shard);
        }
        tags.extend(child_tags);
    }
    if children.is_empty() && !enforce_shard {
        return (None, tags);
    }
    if is_first_block_heading || is_first_block_only_with_marker {
        (
            Some(merge_into_first_shard(children, start_line, end_line, tags)),
            vec![],
        )
    } else {
        (
            Some(build_shard(start_line, end_line, vec![], tags, children)),
            vec![],
        )
    }
}
/// Parse a single block into a shard.
///
/// Returns `(Some(shard), [])` when the block carries markers, otherwise
/// `(None, tags)` so that loose tags can be attached by the caller. Code
/// blocks and `Other` yield nothing.
fn parse_single_block_shard(
    block: &BlockInfo,
    start_line: usize,
    end_line: usize,
) -> (Option<Shard>, Vec<String>) {
    match block.block_type {
        // Paragraphs, headings, lists and list items are treated uniformly:
        // the two previously duplicated arms were merged.
        BlockType::Paragraph
        | BlockType::Heading(_)
        | BlockType::List
        | BlockType::ListItem => {
            let (markers, tags) = extract_block_markers_and_tags(block);
            if markers.is_empty() {
                (None, tags)
            } else {
                (
                    Some(build_shard(start_line, end_line, markers, tags, vec![])),
                    vec![],
                )
            }
        }
        // Exhaustive on purpose: adding a BlockType variant forces a decision
        // here instead of being silently dropped by a `_` arm.
        BlockType::CodeBlock | BlockType::Other => (None, vec![]),
    }
}
#[cfg(test)]
mod tests {
    //! End-to-end tests for `parse_markdown_file`: shard line ranges,
    //! marker/tag placement, and tag detection inside inline formatting.
    use super::*;

    /// Common fixture file name shared by every test.
    fn make_file_name() -> String {
        "test.md".to_string()
    }

    #[test]
    fn test_parse_empty_file() {
        let result = parse_markdown_file(&make_file_name(), "");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 1)),
            }
        );
    }

    #[test]
    fn test_parse_basic_one_line_file() {
        let result = parse_markdown_file(&make_file_name(), "Hello World");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 1)),
            }
        );
    }

    #[test]
    fn test_parse_basic_multi_line_file() {
        let result = parse_markdown_file(&make_file_name(), "Hello World\n\nHello again!");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 3)),
            }
        );
    }

    #[test]
    fn test_parse_single_line_with_tag() {
        // A leading @-identifier becomes a marker, not a tag.
        let result = parse_markdown_file(&make_file_name(), "@Tag Hello World");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["Tag".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_single_line_with_two_tags() {
        let result = parse_markdown_file(&make_file_name(), "@Marker1 @Marker2 Hello World");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["Marker1".to_string(), "Marker2".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_single_line_with_two_tags_and_misplaced_tag() {
        // An @-identifier after content is recorded as a tag instead.
        let result = parse_markdown_file(&make_file_name(), "@Tag1 @Tag2 Hello World @Tag3");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["Tag1".to_string(), "Tag2".to_string()],
                    tags: vec!["Tag3".to_string()],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_header_without_markers() {
        // Headings without markers do not create nested shards.
        let result = parse_markdown_file(&make_file_name(), "# Heading\n\n## Subheading");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 3)),
            }
        );
    }

    #[test]
    fn test_parse_ignores_tags_in_code() {
        let result = parse_markdown_file(&make_file_name(), "```\n@Marker\n```");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard::new(1, 3)),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_italic_text() {
        let result = parse_markdown_file(&make_file_name(), "*@ItalicMarker*");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["ItalicMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_bold_text() {
        let result = parse_markdown_file(&make_file_name(), "**@BoldMarker**");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["BoldMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_strikethrough_text() {
        let result = parse_markdown_file(&make_file_name(), "~~@StrikeMarker~~");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["StrikeMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_finds_tags_in_link() {
        let result = parse_markdown_file(&make_file_name(), "[@LinkMarker](https://example.com)");
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["LinkMarker".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_continues_looking_for_markers_after_first_link_marker() {
        let result = parse_markdown_file(
            &make_file_name(),
            "[@LinkMarker1](https://example.com) [@LinkMarker2](https://example.com)",
        );
        assert_eq!(
            result,
            StreamFile {
                file_name: make_file_name(),
                shard: Some(Shard {
                    markers: vec!["LinkMarker1".to_string(), "LinkMarker2".to_string()],
                    tags: vec![],
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }
}

View file

@@ -0,0 +1,219 @@
use once_cell::sync::Lazy;
use pulldown_cmark::{Event, Tag, TagEnd};
use regex::Regex;
/// Regex pattern for matching @Tags.
/// Matches @ followed by any characters except whitespace, *, `, ~, [, ]
/// (presumably excluded because they are Markdown formatting delimiters —
/// see the emphasis/strikethrough/link tests in this module).
// Compiled lazily on first use. `tokenize` slices by match offsets, so the
// capture group is currently unused.
static TAG_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"@([^\s*`~\[\]]+)").unwrap());
/// Token type for tag extraction state machine.
#[derive(Debug, Clone)]
enum Token {
    /// An `@`-prefixed identifier; the stored name excludes the leading `@`.
    Tag(String),
    /// Any non-tag text containing at least one non-whitespace character.
    Content,
    /// A run consisting solely of whitespace.
    Whitespace,
}
/// Tokenizes text content into `Tag`, `Content`, and `Whitespace` tokens.
///
/// Spans between (and after) `TAG_PATTERN` matches become `Whitespace` when
/// they consist solely of whitespace characters, otherwise `Content`.
fn tokenize(text: &str) -> Vec<Token> {
    // Classify a non-tag span; shared by the in-between and trailing cases
    // (previously duplicated inline).
    fn classify(span: &str) -> Token {
        if span.chars().all(char::is_whitespace) {
            Token::Whitespace
        } else {
            Token::Content
        }
    }
    let mut tokens = Vec::new();
    let mut last_end = 0;
    for mat in TAG_PATTERN.find_iter(text) {
        // Handle any text between the previous match and this one.
        let before = &text[last_end..mat.start()];
        if !before.is_empty() {
            tokens.push(classify(before));
        }
        // Strip the leading '@' to get the tag name.
        tokens.push(Token::Tag(text[mat.start() + 1..mat.end()].to_string()));
        last_end = mat.end();
    }
    // Text after the final match is one more classified span; `last_end <
    // text.len()` already guarantees it is non-empty.
    if last_end < text.len() {
        tokens.push(classify(&text[last_end..]));
    }
    tokens
}
/// Extract markers and tags from a sequence of pulldown-cmark events.
///
/// Markers are @-prefixed identifiers that appear before any non-whitespace content.
/// Tags are @-prefixed identifiers that appear after content has started.
///
/// Returns (markers, tags).
pub fn extract_markers_and_tags<'a>(
    events: impl Iterator<Item = Event<'a>>,
) -> (Vec<String>, Vec<String>) {
    let mut markers = Vec::new();
    let mut tags = Vec::new();
    // Once real content has been seen, every further @-identifier is a tag.
    let mut content_seen = false;
    // True while inside a code or metadata block, whose text is never scanned.
    let mut inside_code = false;
    for event in events {
        match event {
            Event::Start(Tag::CodeBlock(_)) | Event::Start(Tag::MetadataBlock(_)) => {
                inside_code = true;
            }
            Event::End(TagEnd::CodeBlock) | Event::End(TagEnd::MetadataBlock(_)) => {
                inside_code = false;
            }
            Event::Code(_) => {
                // Inline code counts as content but is not scanned for tags.
                content_seen = true;
            }
            Event::Text(text) | Event::InlineHtml(text) if !inside_code => {
                for token in tokenize(&text) {
                    match token {
                        Token::Tag(name) if content_seen => tags.push(name),
                        Token::Tag(name) => markers.push(name),
                        Token::Content => content_seen = true,
                        Token::Whitespace => {}
                    }
                }
            }
            _ => {}
        }
    }
    (markers, tags)
}
/// Check if the events contain any markers (tags before content).
pub fn has_markers<'a>(events: impl Iterator<Item = Event<'a>>) -> bool {
    !extract_markers_and_tags(events).0.is_empty()
}
#[cfg(test)]
mod tests {
    //! Tests for the tokenizer-driven marker/tag extraction, driven through a
    //! real pulldown-cmark parse of small markdown snippets.
    use super::*;
    use pulldown_cmark::Parser;

    /// Parse `text` (with strikethrough enabled) and extract (markers, tags).
    fn extract_from_text(text: &str) -> (Vec<String>, Vec<String>) {
        let mut options = pulldown_cmark::Options::empty();
        options.insert(pulldown_cmark::Options::ENABLE_STRIKETHROUGH);
        let parser = Parser::new_ext(text, options);
        extract_markers_and_tags(parser)
    }

    #[test]
    fn test_extract_single_marker() {
        let (markers, tags) = extract_from_text("@Tag Hello World");
        assert_eq!(markers, vec!["Tag"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_two_markers() {
        let (markers, tags) = extract_from_text("@Marker1 @Marker2 Hello World");
        assert_eq!(markers, vec!["Marker1", "Marker2"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_markers_and_tags() {
        // Identifiers after content become tags rather than markers.
        let (markers, tags) = extract_from_text("@Tag1 @Tag2 Hello World @Tag3");
        assert_eq!(markers, vec!["Tag1", "Tag2"]);
        assert_eq!(tags, vec!["Tag3"]);
    }

    #[test]
    fn test_extract_inner_tags() {
        let (markers, tags) = extract_from_text("Hello @Tag1 World!");
        assert!(markers.is_empty());
        assert_eq!(tags, vec!["Tag1"]);
    }

    #[test]
    fn test_extract_ignores_code_blocks() {
        let (markers, tags) = extract_from_text("```\n@Marker\n```");
        assert!(markers.is_empty());
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_italic_marker() {
        let (markers, tags) = extract_from_text("*@ItalicMarker*");
        assert_eq!(markers, vec!["ItalicMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_bold_marker() {
        let (markers, tags) = extract_from_text("**@BoldMarker**");
        assert_eq!(markers, vec!["BoldMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_strikethrough_marker() {
        let (markers, tags) = extract_from_text("~~@StrikeMarker~~");
        assert_eq!(markers, vec!["StrikeMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_link_marker() {
        let (markers, tags) = extract_from_text("[@LinkMarker](https://example.com)");
        assert_eq!(markers, vec!["LinkMarker"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_extract_multiple_link_markers() {
        let (markers, tags) = extract_from_text(
            "[@LinkMarker1](https://example.com) [@LinkMarker2](https://example.com)",
        );
        assert_eq!(markers, vec!["LinkMarker1", "LinkMarker2"]);
        assert!(tags.is_empty());
    }

    #[test]
    fn test_has_markers_true() {
        let parser = Parser::new("@Tag Hello");
        assert!(has_markers(parser));
    }

    #[test]
    fn test_has_markers_false() {
        let parser = Parser::new("Hello @Tag");
        assert!(!has_markers(parser));
    }

    #[test]
    fn test_empty_text() {
        let (markers, tags) = extract_from_text("");
        assert!(markers.is_empty());
        assert!(tags.is_empty());
    }

    #[test]
    fn test_no_tags() {
        let (markers, tags) = extract_from_text("Hello World");
        assert!(markers.is_empty());
        assert!(tags.is_empty());
    }
}