feat(localize): extract file_type from filename prefix

Add `extract_file_type_from_file_name` to parse prefixes like `_daily`
from filenames (e.g. `20260412-123456_daily.md` → `"daily"`).

Insert the result into `initial_location` in `localize_stream_file` so
that localized shards whose filename has a type prefix carry a `file_type`
dimension value; other files get no `file_type` entry.

Also register the `file_type` dimension in `TaskConfiguration` so the
propagation contract is documented.
This commit is contained in:
Konstantin Fickel 2026-04-13 19:30:59 +02:00
parent e15e6f1053
commit b653590c36
Signed by: kfickel
GPG key ID: A793722F9933C1A5
4 changed files with 86 additions and 2 deletions

View file

@ -9,6 +9,11 @@ use std::path::Path;
static FILE_NAME_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^(?P<date>\d{8})(?:-(?P<time>\d{4,6}))?.+\.md$").unwrap());
/// Regex for extracting a file-type prefix from file names.
///
/// Anchored at the start of the name, it expects an 8-digit date, an optional
/// `-` followed by 4 to 6 digits, and then `_` plus one or more ASCII letters or
/// digits. That last run is capture group 1 (the file type). The pattern is not
/// anchored at the end, so whatever follows the type (title, extension) is ignored.
///
/// Matches filenames like `20260412-123456_daily.md` or `20260412_daily Some Title.md`.
static FILE_TYPE_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^\d{8}(?:-\d{4,6})?_([a-zA-Z0-9]+)").unwrap());
/// Regex for validating datetime marker format (14 digits).
///
/// Anchored at both ends, so the entire marker must be exactly 14 digits with
/// nothing before or after.
/// NOTE(review): presumably laid out as `YYYYMMDDHHMMSS` — confirm against the parser.
static DATETIME_MARKER_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{14}$").unwrap());
@ -62,6 +67,28 @@ pub fn extract_datetime_from_file_name(file_name: &str, tz: Tz) -> Option<DateTi
.and_then(|dt| naive_to_utc(dt, tz))
}
/// Pull the file-type tag out of a file name.
///
/// Only the final path component is inspected; if no final component can be
/// determined, the input string is used unchanged. The tag is the run of ASCII
/// letters and digits that directly follows an underscore placed right after
/// the leading 8-digit date and its optional `-` + 4–6 digit time part.
/// Anything after that run (title text, extension) is ignored.
///
/// Returns `None` when the name does not start with such a timestamp or when
/// no `_tag` segment follows it.
///
/// # Examples
/// - `"20260412-123456_daily.md"` → `Some("daily")`
/// - `"20260412_daily Some Title.md"` → `Some("daily")`
/// - `"20260412-123456 Some Title.md"` → `None`
/// - `"/path/to/20260412-123456_daily.md"` → `Some("daily")`
pub fn extract_file_type_from_file_name(file_name: &str) -> Option<String> {
    // Strip any directory components; fall back to the raw input otherwise.
    let leaf = match Path::new(file_name).file_name().and_then(|os| os.to_str()) {
        Some(name) => name,
        None => file_name,
    };

    let captures = FILE_TYPE_REGEX.captures(leaf)?;
    let tag = captures.get(1)?;
    Some(tag.as_str().to_string())
}
/// Parse a 14-digit marker string as a NaiveDateTime without timezone conversion.
fn parse_naive_datetime_from_marker(marker: &str) -> Option<NaiveDateTime> {
if !DATETIME_MARKER_REGEX.is_match(marker) {
@ -155,6 +182,51 @@ mod tests {
use chrono::TimeZone;
use chrono_tz::UTC;
#[test]
fn test_extract_file_type_with_time() {
    // Date and time present, type segment sits directly before the extension.
    let file_type = extract_file_type_from_file_name("20260412-123456_daily.md");
    assert_eq!(file_type.as_deref(), Some("daily"));
}
#[test]
fn test_extract_file_type_with_time_and_title() {
    // A title after the type segment must not leak into the extracted value.
    let file_type = extract_file_type_from_file_name("20260412-123456_daily Some Title.md");
    assert_eq!(file_type.as_deref(), Some("daily"));
}
#[test]
fn test_extract_file_type_without_time() {
    // The time component is optional; a bare date followed by `_type` is enough.
    let file_type = extract_file_type_from_file_name("20260412_daily.md");
    assert_eq!(file_type.as_deref(), Some("daily"));
}
#[test]
fn test_extract_file_type_without_prefix() {
    // A space rather than an underscore follows the timestamp: no type segment.
    let file_type = extract_file_type_from_file_name("20260412-123456 Some Title.md");
    assert_eq!(file_type, None);
}
#[test]
fn test_extract_file_type_with_full_path() {
    // Leading directories are ignored; only the final path component is matched.
    let file_type = extract_file_type_from_file_name("/path/to/20260412-123456_daily.md");
    assert_eq!(file_type.as_deref(), Some("daily"));
}
#[test]
fn test_extract_file_type_no_timestamp() {
    // Without a leading timestamp the name can never yield a file type.
    let file_type = extract_file_type_from_file_name("notes.md");
    assert_eq!(file_type, None);
}
#[test]
fn test_extract_date_from_file_name_valid() {
let file_name = "20230101-123456 Some Text.md";

View file

@ -9,7 +9,7 @@ pub use configuration::{
};
pub use datetime::{
extract_date_from_marker, extract_datetime_from_file_name, extract_datetime_from_marker,
extract_datetime_from_marker_list, extract_time_from_marker,
extract_datetime_from_marker_list, extract_file_type_from_file_name, extract_time_from_marker,
};
pub use preconfigured::TaskConfiguration;
pub use shard::{localize_shard, localize_stream_file};

View file

@ -20,6 +20,12 @@ pub static TaskConfiguration: Lazy<RepositoryConfiguration> = Lazy::new(|| {
.with_comment("Project the task is attached to")
.with_propagate(true),
)
.with_dimension(
"file_type",
Dimension::new("File Type")
.with_comment("Type of file derived from filename prefix (e.g. 'daily')")
.with_propagate(true),
)
.with_marker(
"Task",
Marker::new("Task").with_placements(vec![

View file

@ -5,7 +5,10 @@ use indexmap::{IndexMap, IndexSet};
use crate::error::StreamdError;
use crate::models::{LocalizedShard, RepositoryConfiguration, Shard, StreamFile};
use super::datetime::{extract_datetime_from_file_name, extract_datetime_from_marker_list};
use super::datetime::{
extract_datetime_from_file_name, extract_datetime_from_marker_list,
extract_file_type_from_file_name,
};
/// Localize a shard within the repository's coordinate system.
///
@ -102,6 +105,9 @@ pub fn localize_stream_file(
let mut initial_location = IndexMap::new();
initial_location.insert("file".to_string(), stream_file.file_name.clone());
if let Some(file_type) = extract_file_type_from_file_name(&stream_file.file_name) {
initial_location.insert("file_type".to_string(), file_type);
}
Ok(localize_shard(
shard,