refactor: rewrite in rust
This commit is contained in:
parent
20a3e8b437
commit
ed493cff29
72 changed files with 5684 additions and 3688 deletions
5
src/extract/mod.rs
Normal file
5
src/extract/mod.rs
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
mod parser;
|
||||
mod tag_extraction;
|
||||
|
||||
pub use parser::parse_markdown_file;
|
||||
pub use tag_extraction::{extract_markers_and_tags, has_markers};
|
||||
739
src/extract/parser.rs
Normal file
739
src/extract/parser.rs
Normal file
|
|
@ -0,0 +1,739 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
|
||||
|
||||
use crate::extract::tag_extraction::{extract_markers_and_tags, has_markers};
|
||||
use crate::models::{Shard, StreamFile};
|
||||
|
||||
/// Information about a block element.
///
/// Records where one top-level markdown block (paragraph, heading, list,
/// code block, ...) sits in the source file, together with owned copies of
/// the pulldown-cmark events seen inside it so markers/tags can be
/// extracted from it later without re-parsing.
#[derive(Debug, Clone)]
struct BlockInfo {
    // 1-based line on which the block starts (from offset_to_line).
    start_line: usize,
    // 1-based line of the last event observed inside the block.
    end_line: usize,
    // Which kind of block this is; drives shard construction.
    block_type: BlockType,
    // Events captured inside the block, promoted to 'static ownership.
    events: Vec<Event<'static>>,
}
|
||||
|
||||
/// Kind of a top-level markdown block collected by `collect_blocks`.
#[derive(Debug, Clone, PartialEq)]
enum BlockType {
    /// A plain paragraph.
    Paragraph,
    /// A heading with its level (1 for `#` through 6 for `######`).
    Heading(usize),
    /// A whole list; its items are pushed separately as `ListItem` blocks.
    List,
    /// A single item belonging to the most recent `List` block.
    ListItem,
    /// A fenced or indented code block.
    CodeBlock,
    /// Catch-all variant; currently never constructed, kept for future use.
    #[allow(dead_code)]
    Other,
}
|
||||
|
||||
/// Build a shard, applying simplification rules.
|
||||
/// If the shard has exactly one child with the same line range and no markers/tags,
|
||||
/// return that child instead.
|
||||
fn build_shard(
|
||||
start_line: usize,
|
||||
end_line: usize,
|
||||
markers: Vec<String>,
|
||||
tags: Vec<String>,
|
||||
children: Vec<Shard>,
|
||||
) -> Shard {
|
||||
if children.len() == 1
|
||||
&& tags.is_empty()
|
||||
&& markers.is_empty()
|
||||
&& children[0].start_line == start_line
|
||||
&& children[0].end_line == end_line
|
||||
{
|
||||
return children.into_iter().next().unwrap();
|
||||
}
|
||||
|
||||
Shard {
|
||||
markers,
|
||||
tags,
|
||||
start_line,
|
||||
end_line,
|
||||
children,
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge shards where the first one becomes the parent with its markers/tags preserved.
|
||||
fn merge_into_first_shard(
|
||||
mut shards: Vec<Shard>,
|
||||
start_line: usize,
|
||||
end_line: usize,
|
||||
additional_tags: Vec<String>,
|
||||
) -> Shard {
|
||||
if shards.is_empty() {
|
||||
return build_shard(start_line, end_line, vec![], additional_tags, vec![]);
|
||||
}
|
||||
|
||||
let mut first = shards.remove(0);
|
||||
first.start_line = start_line;
|
||||
first.end_line = end_line;
|
||||
first.children = shards;
|
||||
first.tags.extend(additional_tags);
|
||||
first
|
||||
}
|
||||
|
||||
/// Parse a markdown file into a StreamFile with shard structure.
|
||||
pub fn parse_markdown_file(file_name: &str, file_content: &str) -> StreamFile {
|
||||
let line_count = std::cmp::max(file_content.lines().count(), 1);
|
||||
let end_line = line_count;
|
||||
|
||||
// Handle empty file
|
||||
if file_content.is_empty() {
|
||||
return StreamFile {
|
||||
file_name: file_name.to_string(),
|
||||
shard: Some(Shard::new(1, 1)),
|
||||
};
|
||||
}
|
||||
|
||||
// Parse the markdown with offset tracking
|
||||
let mut options = Options::empty();
|
||||
options.insert(Options::ENABLE_STRIKETHROUGH);
|
||||
let parser = Parser::new_ext(file_content, options);
|
||||
|
||||
// Collect blocks with their line information
|
||||
let blocks = collect_blocks(file_content, parser);
|
||||
|
||||
// Parse into shard structure
|
||||
let shard = if blocks.is_empty() {
|
||||
Shard::new(1, end_line)
|
||||
} else {
|
||||
parse_header_shards(&blocks, 1, end_line, false).unwrap_or_else(|| Shard::new(1, end_line))
|
||||
};
|
||||
|
||||
StreamFile {
|
||||
file_name: file_name.to_string(),
|
||||
shard: Some(shard),
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect block-level elements from the parser.
|
||||
fn collect_blocks(content: &str, parser: Parser) -> Vec<BlockInfo> {
|
||||
let mut blocks = Vec::new();
|
||||
let mut current_block: Option<BlockInfo> = None;
|
||||
let _current_events: Vec<Event<'static>> = Vec::new();
|
||||
let mut depth = 0;
|
||||
let mut list_items: Vec<BlockInfo> = Vec::new();
|
||||
let mut in_list = false;
|
||||
let mut list_start_line = 0;
|
||||
|
||||
// Pre-compute line starts for offset-to-line mapping
|
||||
let line_starts: Vec<usize> = std::iter::once(0)
|
||||
.chain(content.match_indices('\n').map(|(i, _)| i + 1))
|
||||
.collect();
|
||||
|
||||
let offset_to_line =
|
||||
|offset: usize| -> usize { line_starts.partition_point(|&start| start <= offset) };
|
||||
|
||||
for (event, range) in parser.into_offset_iter() {
|
||||
let line = offset_to_line(range.start);
|
||||
|
||||
match &event {
|
||||
Event::Start(Tag::Paragraph) => {
|
||||
if depth == 0 {
|
||||
current_block = Some(BlockInfo {
|
||||
start_line: line,
|
||||
end_line: line,
|
||||
block_type: BlockType::Paragraph,
|
||||
events: Vec::new(),
|
||||
});
|
||||
}
|
||||
depth += 1;
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.events.push(event.clone().into_static());
|
||||
}
|
||||
}
|
||||
Event::End(TagEnd::Paragraph) => {
|
||||
depth -= 1;
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.events.push(event.clone().into_static());
|
||||
block.end_line = line;
|
||||
}
|
||||
if depth == 0 {
|
||||
if let Some(block) = current_block.take() {
|
||||
if in_list {
|
||||
list_items.push(block);
|
||||
} else {
|
||||
blocks.push(block);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Event::Start(Tag::Heading { level, .. }) => {
|
||||
let heading_level = heading_level_to_usize(*level);
|
||||
if depth == 0 {
|
||||
current_block = Some(BlockInfo {
|
||||
start_line: line,
|
||||
end_line: line,
|
||||
block_type: BlockType::Heading(heading_level),
|
||||
events: Vec::new(),
|
||||
});
|
||||
}
|
||||
depth += 1;
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.events.push(event.clone().into_static());
|
||||
}
|
||||
}
|
||||
Event::End(TagEnd::Heading(_)) => {
|
||||
depth -= 1;
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.events.push(event.clone().into_static());
|
||||
block.end_line = line;
|
||||
}
|
||||
if depth == 0 {
|
||||
if let Some(block) = current_block.take() {
|
||||
blocks.push(block);
|
||||
}
|
||||
}
|
||||
}
|
||||
Event::Start(Tag::List(_)) => {
|
||||
if !in_list {
|
||||
in_list = true;
|
||||
list_start_line = line;
|
||||
list_items.clear();
|
||||
}
|
||||
depth += 1;
|
||||
}
|
||||
Event::End(TagEnd::List(_)) => {
|
||||
depth -= 1;
|
||||
if depth == 0 && in_list {
|
||||
in_list = false;
|
||||
// Create a list block containing all list items
|
||||
if !list_items.is_empty() {
|
||||
blocks.push(BlockInfo {
|
||||
start_line: list_start_line,
|
||||
end_line: line,
|
||||
block_type: BlockType::List,
|
||||
events: vec![], // List events are handled through list_items
|
||||
});
|
||||
// Store list items for later processing
|
||||
for item in list_items.drain(..) {
|
||||
blocks.push(BlockInfo {
|
||||
block_type: BlockType::ListItem,
|
||||
..item
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Event::Start(Tag::Item) => {
|
||||
if in_list {
|
||||
current_block = Some(BlockInfo {
|
||||
start_line: line,
|
||||
end_line: line,
|
||||
block_type: BlockType::ListItem,
|
||||
events: Vec::new(),
|
||||
});
|
||||
}
|
||||
}
|
||||
Event::End(TagEnd::Item) => {
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.end_line = line;
|
||||
}
|
||||
if let Some(block) = current_block.take() {
|
||||
list_items.push(block);
|
||||
}
|
||||
}
|
||||
Event::Start(Tag::CodeBlock(_)) => {
|
||||
if depth == 0 {
|
||||
current_block = Some(BlockInfo {
|
||||
start_line: line,
|
||||
end_line: line,
|
||||
block_type: BlockType::CodeBlock,
|
||||
events: Vec::new(),
|
||||
});
|
||||
}
|
||||
depth += 1;
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.events.push(event.clone().into_static());
|
||||
}
|
||||
}
|
||||
Event::End(TagEnd::CodeBlock) => {
|
||||
depth -= 1;
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.events.push(event.clone().into_static());
|
||||
block.end_line = line;
|
||||
}
|
||||
if depth == 0 {
|
||||
if let Some(block) = current_block.take() {
|
||||
blocks.push(block);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if let Some(ref mut block) = current_block {
|
||||
block.events.push(event.clone().into_static());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
blocks
|
||||
}
|
||||
|
||||
fn heading_level_to_usize(level: HeadingLevel) -> usize {
|
||||
match level {
|
||||
HeadingLevel::H1 => 1,
|
||||
HeadingLevel::H2 => 2,
|
||||
HeadingLevel::H3 => 3,
|
||||
HeadingLevel::H4 => 4,
|
||||
HeadingLevel::H5 => 5,
|
||||
HeadingLevel::H6 => 6,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a block has markers.
|
||||
fn block_has_markers(block: &BlockInfo) -> bool {
|
||||
has_markers(block.events.iter().cloned())
|
||||
}
|
||||
|
||||
/// Extract markers and tags from a block.
|
||||
fn extract_block_markers_and_tags(block: &BlockInfo) -> (Vec<String>, Vec<String>) {
|
||||
extract_markers_and_tags(block.events.iter().cloned())
|
||||
}
|
||||
|
||||
/// Find positions of paragraph blocks that have markers.
|
||||
fn find_paragraph_shard_positions(blocks: &[BlockInfo]) -> Vec<usize> {
|
||||
blocks
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, block)| block.block_type == BlockType::Paragraph && block_has_markers(block))
|
||||
.map(|(i, _)| i)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Find positions of headings at a specific level.
|
||||
fn find_headings_by_level(blocks: &[BlockInfo], level: usize) -> Vec<usize> {
|
||||
blocks
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, block)| matches!(block.block_type, BlockType::Heading(l) if l == level))
|
||||
.map(|(i, _)| i)
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Calculate the heading level to split on for the next parsing step.
|
||||
fn calculate_heading_level_for_next_split(blocks: &[BlockInfo]) -> Option<usize> {
|
||||
// Find heading levels that have markers (excluding first block)
|
||||
let levels_with_markers: Vec<usize> = blocks[1..]
|
||||
.iter()
|
||||
.filter_map(|block| {
|
||||
if let BlockType::Heading(level) = block.block_type {
|
||||
if block_has_markers(block) {
|
||||
return Some(level);
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
.collect();
|
||||
|
||||
if levels_with_markers.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Count headings at each level
|
||||
let mut level_counts: HashMap<usize, usize> = HashMap::new();
|
||||
for block in blocks {
|
||||
if let BlockType::Heading(level) = block.block_type {
|
||||
*level_counts.entry(level).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the minimum level that either:
|
||||
// - Has count >= 2
|
||||
// - Has a marker (excluding first block)
|
||||
let levels_with_multiple: Vec<usize> = level_counts
|
||||
.into_iter()
|
||||
.filter(|(_, count)| *count >= 2)
|
||||
.map(|(level, _)| level)
|
||||
.collect();
|
||||
|
||||
let mut candidates = levels_with_multiple;
|
||||
candidates.extend(levels_with_markers);
|
||||
|
||||
candidates.into_iter().min()
|
||||
}
|
||||
|
||||
/// Split a slice at the given positions.
///
/// Positions are treated as cut points: group boundaries are `0`, each
/// position, and `items.len()`, sorted and deduplicated. Empty groups are
/// dropped, so out-of-order or duplicate positions are harmless.
fn split_at<T: Clone>(items: &[T], positions: &[usize]) -> Vec<Vec<T>> {
    let mut cuts = Vec::with_capacity(positions.len() + 2);
    cuts.push(0);
    cuts.extend_from_slice(positions);
    cuts.push(items.len());
    cuts.sort_unstable();
    cuts.dedup();

    let mut groups = Vec::new();
    for pair in cuts.windows(2) {
        let (lo, hi) = (pair[0], pair[1]);
        if lo < hi {
            groups.push(items[lo..hi].to_vec());
        }
    }
    groups
}
|
||||
|
||||
/// Parse blocks into shard hierarchy based on headings.
///
/// Recursively splits `blocks` at the heading level chosen by
/// `calculate_heading_level_for_next_split`; when no level qualifies, the
/// blocks are handled flat via `parse_multiple_block_shards`. When
/// `use_first_child_as_header` is set (the group started at a heading), the
/// first child becomes the parent shard so the heading's markers/tags are
/// preserved on the enclosing span.
fn parse_header_shards(
    blocks: &[BlockInfo],
    start_line: usize,
    end_line: usize,
    use_first_child_as_header: bool,
) -> Option<Shard> {
    // No blocks: produce a bare shard covering the whole span.
    if blocks.is_empty() {
        return Some(build_shard(start_line, end_line, vec![], vec![], vec![]));
    }

    let split_at_heading_level = calculate_heading_level_for_next_split(blocks);

    // No heading to split on: treat the blocks as one flat sequence
    // (enforcing that a shard is always produced).
    if split_at_heading_level.is_none() {
        return parse_multiple_block_shards(blocks, start_line, end_line, true).0;
    }

    let heading_level = split_at_heading_level.unwrap();
    let heading_positions = find_headings_by_level(blocks, heading_level);
    let block_groups = split_at(blocks, &heading_positions);

    let mut children = Vec::new();

    for (i, group) in block_groups.iter().enumerate() {
        if group.is_empty() {
            continue;
        }

        // Each group runs from its first block to just before the next
        // group's first block; the last group extends to end_line.
        let child_start_line = group[0].start_line;
        let child_end_line = if i + 1 < block_groups.len() && !block_groups[i + 1].is_empty() {
            block_groups[i + 1][0].start_line - 1
        } else {
            end_line
        };

        // Every group after the first begins with the split heading; the
        // first group does too only when the very first block is a heading
        // at the split level (position 0). Those groups promote their
        // heading child to parent.
        if let Some(child_shard) = parse_header_shards(
            group,
            child_start_line,
            child_end_line,
            i > 0 || heading_positions.contains(&0),
        ) {
            children.push(child_shard);
        }
    }

    if use_first_child_as_header && !children.is_empty() {
        Some(merge_into_first_shard(
            children,
            start_line,
            end_line,
            vec![],
        ))
    } else {
        Some(build_shard(start_line, end_line, vec![], vec![], children))
    }
}
|
||||
|
||||
/// Parse multiple blocks into shards.
///
/// Produces one child shard per marked block (via
/// `parse_single_block_shard`) and bubbles loose tags upward. Returns
/// `(shard, leftover_tags)`; when `enforce_shard` is false and no child has
/// markers, `(None, tags)` is returned so the caller can absorb the tags.
fn parse_multiple_block_shards(
    blocks: &[BlockInfo],
    start_line: usize,
    end_line: usize,
    enforce_shard: bool,
) -> (Option<Shard>, Vec<String>) {
    if blocks.is_empty() {
        if enforce_shard {
            return (
                Some(build_shard(start_line, end_line, vec![], vec![], vec![])),
                vec![],
            );
        }
        return (None, vec![]);
    }

    // A leading heading with markers promotes its shard to parent below.
    let is_first_block_heading =
        matches!(blocks[0].block_type, BlockType::Heading(_)) && block_has_markers(&blocks[0]);

    let paragraph_positions = find_paragraph_shard_positions(blocks);
    let mut children = Vec::new();
    let mut tags = Vec::new();
    let mut is_first_block_only_with_marker = false;

    for (i, block) in blocks.iter().enumerate() {
        // NOTE(review): a later marked paragraph resets this flag to false,
        // so it ends up true only when the *sole* marked paragraph is the
        // first block — consistent with the "only" in the name; confirm
        // this is the intended semantics.
        if paragraph_positions.contains(&i) {
            is_first_block_only_with_marker = i == 0;
        }

        // Each block's span runs up to the line before the next block;
        // the final block extends to end_line.
        let child_start_line = block.start_line;
        let child_end_line = if i + 1 < blocks.len() {
            blocks[i + 1].start_line - 1
        } else {
            end_line
        };

        let (child_shard, child_tags) =
            parse_single_block_shard(block, child_start_line, child_end_line);

        if let Some(shard) = child_shard {
            children.push(shard);
        }
        // Tags from markerless blocks accumulate on the parent.
        tags.extend(child_tags);
    }

    if children.is_empty() && !enforce_shard {
        return (None, tags);
    }

    if is_first_block_heading || is_first_block_only_with_marker {
        (
            Some(merge_into_first_shard(children, start_line, end_line, tags)),
            vec![],
        )
    } else {
        (
            Some(build_shard(start_line, end_line, vec![], tags, children)),
            vec![],
        )
    }
}
|
||||
|
||||
/// Parse a single block into a shard.
|
||||
fn parse_single_block_shard(
|
||||
block: &BlockInfo,
|
||||
start_line: usize,
|
||||
end_line: usize,
|
||||
) -> (Option<Shard>, Vec<String>) {
|
||||
match block.block_type {
|
||||
BlockType::Paragraph | BlockType::Heading(_) => {
|
||||
let (markers, tags) = extract_block_markers_and_tags(block);
|
||||
if markers.is_empty() {
|
||||
(None, tags)
|
||||
} else {
|
||||
(
|
||||
Some(build_shard(start_line, end_line, markers, tags, vec![])),
|
||||
vec![],
|
||||
)
|
||||
}
|
||||
}
|
||||
BlockType::List | BlockType::ListItem => {
|
||||
// List handling is complex - for now, extract any markers/tags
|
||||
let (markers, tags) = extract_block_markers_and_tags(block);
|
||||
if markers.is_empty() {
|
||||
(None, tags)
|
||||
} else {
|
||||
(
|
||||
Some(build_shard(start_line, end_line, markers, tags, vec![])),
|
||||
vec![],
|
||||
)
|
||||
}
|
||||
}
|
||||
_ => (None, vec![]),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    const FILE_NAME: &str = "test.md";

    /// Parse `content` and assert the result is a plain shard spanning
    /// `start..=end` with no markers, tags, or children.
    fn assert_plain_shard(content: &str, start: usize, end: usize) {
        assert_eq!(
            parse_markdown_file(FILE_NAME, content),
            StreamFile {
                file_name: FILE_NAME.to_string(),
                shard: Some(Shard::new(start, end)),
            }
        );
    }

    /// Parse single-line `content` and assert the resulting shard carries
    /// exactly the given markers and tags.
    fn assert_marked_shard(content: &str, markers: &[&str], tags: &[&str]) {
        assert_eq!(
            parse_markdown_file(FILE_NAME, content),
            StreamFile {
                file_name: FILE_NAME.to_string(),
                shard: Some(Shard {
                    markers: markers.iter().map(ToString::to_string).collect(),
                    tags: tags.iter().map(ToString::to_string).collect(),
                    start_line: 1,
                    end_line: 1,
                    children: vec![],
                }),
            }
        );
    }

    #[test]
    fn test_parse_empty_file() {
        assert_plain_shard("", 1, 1);
    }

    #[test]
    fn test_parse_basic_one_line_file() {
        assert_plain_shard("Hello World", 1, 1);
    }

    #[test]
    fn test_parse_basic_multi_line_file() {
        assert_plain_shard("Hello World\n\nHello again!", 1, 3);
    }

    #[test]
    fn test_parse_single_line_with_tag() {
        assert_marked_shard("@Tag Hello World", &["Tag"], &[]);
    }

    #[test]
    fn test_parse_single_line_with_two_tags() {
        assert_marked_shard("@Marker1 @Marker2 Hello World", &["Marker1", "Marker2"], &[]);
    }

    #[test]
    fn test_parse_single_line_with_two_tags_and_misplaced_tag() {
        assert_marked_shard("@Tag1 @Tag2 Hello World @Tag3", &["Tag1", "Tag2"], &["Tag3"]);
    }

    #[test]
    fn test_parse_header_without_markers() {
        assert_plain_shard("# Heading\n\n## Subheading", 1, 3);
    }

    #[test]
    fn test_parse_ignores_tags_in_code() {
        assert_plain_shard("```\n@Marker\n```", 1, 3);
    }

    #[test]
    fn test_parse_finds_tags_in_italic_text() {
        assert_marked_shard("*@ItalicMarker*", &["ItalicMarker"], &[]);
    }

    #[test]
    fn test_parse_finds_tags_in_bold_text() {
        assert_marked_shard("**@BoldMarker**", &["BoldMarker"], &[]);
    }

    #[test]
    fn test_parse_finds_tags_in_strikethrough_text() {
        assert_marked_shard("~~@StrikeMarker~~", &["StrikeMarker"], &[]);
    }

    #[test]
    fn test_parse_finds_tags_in_link() {
        assert_marked_shard("[@LinkMarker](https://example.com)", &["LinkMarker"], &[]);
    }

    #[test]
    fn test_parse_continues_looking_for_markers_after_first_link_marker() {
        assert_marked_shard(
            "[@LinkMarker1](https://example.com) [@LinkMarker2](https://example.com)",
            &["LinkMarker1", "LinkMarker2"],
            &[],
        );
    }
}
|
||||
219
src/extract/tag_extraction.rs
Normal file
219
src/extract/tag_extraction.rs
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
use once_cell::sync::Lazy;
|
||||
use pulldown_cmark::{Event, Tag, TagEnd};
|
||||
use regex::Regex;
|
||||
|
||||
/// Regex pattern for matching @Tags.
/// Matches @ followed by any characters except whitespace, *, `, ~, [, ]
/// (the markdown punctuation that would delimit the tag). Compiled lazily,
/// exactly once, on first use.
static TAG_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"@([^\s*`~\[\]]+)").unwrap());
|
||||
|
||||
/// Token type for tag extraction state machine.
#[derive(Debug, Clone)]
enum Token {
    /// An `@`-prefixed identifier, stored without the leading `@`.
    Tag(String),
    /// Any non-tag, non-whitespace text; marks the marker/tag boundary.
    Content,
    /// A run of pure whitespace between tags/content.
    Whitespace,
}
|
||||
|
||||
/// Tokenizes text content into Tags, Content, and Whitespace tokens.
|
||||
fn tokenize(text: &str) -> Vec<Token> {
|
||||
let mut tokens = Vec::new();
|
||||
let mut last_end = 0;
|
||||
|
||||
for mat in TAG_PATTERN.find_iter(text) {
|
||||
// Handle content before the match
|
||||
let before = &text[last_end..mat.start()];
|
||||
if !before.is_empty() {
|
||||
if before.chars().all(|c| c.is_whitespace()) {
|
||||
tokens.push(Token::Whitespace);
|
||||
} else {
|
||||
tokens.push(Token::Content);
|
||||
}
|
||||
}
|
||||
|
||||
// Extract the tag name (without the @)
|
||||
let tag_name = &text[mat.start() + 1..mat.end()];
|
||||
tokens.push(Token::Tag(tag_name.to_string()));
|
||||
last_end = mat.end();
|
||||
}
|
||||
|
||||
// Handle remaining content after last match
|
||||
if last_end < text.len() {
|
||||
let remaining = &text[last_end..];
|
||||
if !remaining.is_empty() {
|
||||
if remaining.chars().all(|c| c.is_whitespace()) {
|
||||
tokens.push(Token::Whitespace);
|
||||
} else {
|
||||
tokens.push(Token::Content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tokens
|
||||
}
|
||||
|
||||
/// Extract markers and tags from a sequence of pulldown-cmark events.
///
/// Markers are @-prefixed identifiers that appear before any non-whitespace content.
/// Tags are @-prefixed identifiers that appear after content has started.
///
/// Text inside code blocks and metadata blocks is skipped entirely; inline
/// code contributes nothing but counts as content (so anything after it is
/// a tag, not a marker).
///
/// Returns (markers, tags).
pub fn extract_markers_and_tags<'a>(
    events: impl Iterator<Item = Event<'a>>,
) -> (Vec<String>, Vec<String>) {
    let mut markers = Vec::new();
    let mut tags = Vec::new();
    // Becomes true once real content is seen; flips marker -> tag mode.
    let mut boundary_crossed = false;
    // True while inside a code/metadata block, whose text is ignored.
    let mut in_code = false;

    for event in events {
        match event {
            Event::Start(Tag::CodeBlock(_)) | Event::Start(Tag::MetadataBlock(_)) => {
                in_code = true;
            }
            Event::End(TagEnd::CodeBlock) | Event::End(TagEnd::MetadataBlock(_)) => {
                in_code = false;
            }
            Event::Code(_) => {
                // Inline code is a content boundary but we don't extract tags from it
                boundary_crossed = true;
            }
            // Note: when in_code is true the guard fails and the text
            // falls through to the catch-all arm below (i.e. is ignored).
            Event::Text(text) | Event::InlineHtml(text) if !in_code => {
                for token in tokenize(&text) {
                    match token {
                        Token::Whitespace => {}
                        Token::Tag(name) => {
                            if boundary_crossed {
                                tags.push(name);
                            } else {
                                markers.push(name);
                            }
                        }
                        Token::Content => {
                            boundary_crossed = true;
                        }
                    }
                }
            }
            _ => {}
        }
    }

    (markers, tags)
}
|
||||
|
||||
/// Check if the events contain any markers (tags before content).
|
||||
pub fn has_markers<'a>(events: impl Iterator<Item = Event<'a>>) -> bool {
|
||||
let (markers, _) = extract_markers_and_tags(events);
|
||||
!markers.is_empty()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use pulldown_cmark::Parser;

    /// Run the extractor over `text` parsed with strikethrough enabled.
    fn extract_from_text(text: &str) -> (Vec<String>, Vec<String>) {
        let mut options = pulldown_cmark::Options::empty();
        options.insert(pulldown_cmark::Options::ENABLE_STRIKETHROUGH);
        extract_markers_and_tags(Parser::new_ext(text, options))
    }

    /// Assert that `text` yields exactly the expected markers and tags.
    fn check(text: &str, markers: &[&str], tags: &[&str]) {
        let (found_markers, found_tags) = extract_from_text(text);
        assert_eq!(found_markers, markers);
        assert_eq!(found_tags, tags);
    }

    #[test]
    fn test_extract_single_marker() {
        check("@Tag Hello World", &["Tag"], &[]);
    }

    #[test]
    fn test_extract_two_markers() {
        check("@Marker1 @Marker2 Hello World", &["Marker1", "Marker2"], &[]);
    }

    #[test]
    fn test_extract_markers_and_tags() {
        check("@Tag1 @Tag2 Hello World @Tag3", &["Tag1", "Tag2"], &["Tag3"]);
    }

    #[test]
    fn test_extract_inner_tags() {
        check("Hello @Tag1 World!", &[], &["Tag1"]);
    }

    #[test]
    fn test_extract_ignores_code_blocks() {
        check("```\n@Marker\n```", &[], &[]);
    }

    #[test]
    fn test_extract_italic_marker() {
        check("*@ItalicMarker*", &["ItalicMarker"], &[]);
    }

    #[test]
    fn test_extract_bold_marker() {
        check("**@BoldMarker**", &["BoldMarker"], &[]);
    }

    #[test]
    fn test_extract_strikethrough_marker() {
        check("~~@StrikeMarker~~", &["StrikeMarker"], &[]);
    }

    #[test]
    fn test_extract_link_marker() {
        check("[@LinkMarker](https://example.com)", &["LinkMarker"], &[]);
    }

    #[test]
    fn test_extract_multiple_link_markers() {
        check(
            "[@LinkMarker1](https://example.com) [@LinkMarker2](https://example.com)",
            &["LinkMarker1", "LinkMarker2"],
            &[],
        );
    }

    #[test]
    fn test_has_markers_true() {
        assert!(has_markers(Parser::new("@Tag Hello")));
    }

    #[test]
    fn test_has_markers_false() {
        assert!(!has_markers(Parser::new("Hello @Tag")));
    }

    #[test]
    fn test_empty_text() {
        check("", &[], &[]);
    }

    #[test]
    fn test_no_tags() {
        check("Hello World", &[], &[]);
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue