diff --git a/crates/core/src/parser/level.rs b/crates/core/src/parser/level.rs new file mode 100644 index 0000000..3c0c610 --- /dev/null +++ b/crates/core/src/parser/level.rs @@ -0,0 +1,212 @@ +// ─── level.rs ──────────────────────────────────────────────────────────────── +// Detects the log level from a line of text. +// +// Two detection strategies: +// 1. JSON lines: delegate to `super::json::parse_line` and extract the level field. +// 2. Plain-text lines: keyword scan with word-boundary checks. +// ────────────────────────────────────────────────────────────────────────────── + +use crate::types::LogLevel; + +/// Keywords to scan for, ordered longest-first so that "WARNING" is tried +/// before "WARN" at the same byte offset (earliest match wins). +const KEYWORDS: &[&str] = &[ + "INFORMATION", + "SEVERE", + "WARNING", + "FATAL", + "ERROR", + "DEBUG", + "TRACE", + "WARN", + "INFO", + "ERR", + "WRN", + "DBG", + "TRC", +]; + +/// Maximum number of bytes to scan in a non-JSON line. +/// Lines longer than this are safely truncated at a char boundary. +const SCAN_LIMIT: usize = 200; + +// ─── detect_level ─────────────────────────────────────────────────────────── +/// Detect the log level of a line. +/// +/// Returns `Some(level)` if a level is found, `None` otherwise. +/// For valid JSON lines the result comes from the parsed `level` field +/// (which may itself be `None` if the field is absent). +pub fn detect_level(line: &str) -> Option { + if line.trim().is_empty() { + return None; + } + + // Try JSON first. If parse_line returns Some, the line is valid JSON — + // trust its level field (even if None) and do NOT fall through to keyword scan. + if let Some(entry) = super::json::parse_line(line) { + return entry.level; + } + + // Non-JSON: keyword scan. + detect_level_from_text(line) +} + +// ─── detect_level_from_text ───────────────────────────────────────────────── +/// Keyword-based level detection for non-JSON lines. 
+fn detect_level_from_text(line: &str) -> Option { + // Safe truncation to SCAN_LIMIT bytes. + let scan = if line.len() > SCAN_LIMIT { + let mut end = SCAN_LIMIT; + while !line.is_char_boundary(end) { + end -= 1; + } + &line[..end] + } else { + line + }; + + let lower = scan.to_ascii_lowercase(); + + // Track earliest match. + let mut best_pos = usize::MAX; + let mut best_keyword = ""; + + for &kw in KEYWORDS { + let kw_lower = kw.to_ascii_lowercase(); + let mut start = 0; + while let Some(pos) = lower[start..].find(&kw_lower) { + let abs_pos = start + pos; + if is_word_boundary(&lower, abs_pos, kw_lower.len()) { + if abs_pos < best_pos { + best_pos = abs_pos; + best_keyword = kw; + } + break; // only need earliest occurrence of this keyword + } + // Move past this match to keep searching for a later occurrence with boundary. + start = abs_pos + 1; + } + } + + if best_keyword.is_empty() { + return None; + } + + // LogLevel::FromStr is case-insensitive and maps all aliases correctly. + Some( + best_keyword + .parse::() + .unwrap_or_else(|e| match e {}), + ) +} + +// ─── is_word_boundary ─────────────────────────────────────────────────────── +/// Check that the match at `start..start+len` is surrounded by non-alphabetic +/// characters (or the string edge). 
///
/// Only ASCII letters count as "alphabetic" here: digits, punctuation,
/// whitespace and non-ASCII bytes on either side all qualify as a boundary.
fn is_word_boundary(text: &str, start: usize, len: usize) -> bool {
    let bytes = text.as_bytes();
    let is_letter_at = |idx: usize| bytes[idx].is_ascii_alphabetic();

    let end = start + len;
    // Left edge: start of string, or the preceding byte is not a letter.
    let clear_before = start == 0 || !is_letter_at(start - 1);
    // Right edge: end of string, or the following byte is not a letter.
    let clear_after = end >= bytes.len() || !is_letter_at(end);

    clear_before && clear_after
}

// ─── tests ──────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    // ── JSON lines ──────────────────────────────────────────────────────────

    #[test]
    fn test_detect_level_json_error() {
        let line = r#"{"level":"ERROR","message":"fail"}"#;
        assert_eq!(detect_level(line), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_json_info() {
        let line = r#"{"level":"info","msg":"hello"}"#;
        assert_eq!(detect_level(line), Some(LogLevel::Info));
    }

    #[test]
    fn test_detect_level_json_no_level() {
        let line = r#"{"message":"no level here"}"#;
        assert_eq!(detect_level(line), None);
    }

    // ── plain-text keyword scan ─────────────────────────────────────────────

    #[test]
    fn test_detect_level_keyword_error() {
        assert_eq!(
            detect_level("ERROR: connection failed"),
            Some(LogLevel::Error)
        );
    }

    #[test]
    fn test_detect_level_keyword_warn() {
        assert_eq!(
            detect_level("WARN something happened"),
            Some(LogLevel::Warn)
        );
    }

    #[test]
    fn test_detect_level_keyword_case_insensitive() {
        assert_eq!(detect_level("error: failed"), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_empty() {
        assert_eq!(detect_level(""), None);
    }

    #[test]
    fn test_detect_level_no_level() {
        assert_eq!(detect_level("just some text"), None);
    }

    #[test]
    fn test_detect_level_false_positive() {
        // "errors" contains "error", but the trailing 's' means the match is
        // not on a word boundary.
        assert_eq!(detect_level("GET /api/errors/404"), None);
    }

    #[test]
    fn test_detect_level_prefixed() {
        assert_eq!(
            detect_level("2024-01-01 ERROR something"),
            Some(LogLevel::Error)
        );
    }

    #[test]
    fn test_detect_level_json_fallback() {
        // Valid JSON without a level field whose msg contains "ERROR": the
        // result must be None (the JSON parse result is trusted; there is no
        // fall-through to the keyword scan).
        let line = r#"{"msg":"ERROR happened"}"#;
        assert_eq!(detect_level(line), None);
    }

    #[test]
    fn test_detect_level_severe() {
        assert_eq!(detect_level("SEVERE: system crash"), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_fatal() {
        assert_eq!(detect_level("FATAL: unrecoverable"), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_whitespace_only() {
        assert_eq!(detect_level(" \t "), None);
    }

    #[test]
    fn test_detect_level_multibyte_prefix() {
        // 63 Chinese chars = 189 bytes (3 bytes each). Adding " ERROR something"
        // (16 bytes) gives a 205-byte line, so truncation kicks in.
        // "ERROR" occupies bytes 190..195, inside the 200-byte scan limit.
        let prefix: String = "你".repeat(63); // 189 bytes
        let line = format!("{prefix} ERROR something");
        assert_eq!(detect_level(&line), Some(LogLevel::Error));
    }
}

// ─── companion change: crates/core/src/parser/mod.rs ────────────────────────
// The same patch registers this module in `parser/mod.rs`, directly after the
// existing `json` submodule (declared and exported there; defined in json.rs;
// responsible for parsing JSON-formatted log lines):
//
//     pub mod json;
//     pub mod level;