feat(core): add detect_level() for plain text logs

Add level detection that tries JSON parsing first (trusts level field), then falls back to keyword scanning with word-boundary checks. Supports SEVERE, FATAL, ERROR, WARN, INFO, DEBUG, TRACE and their abbreviations.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
dailz
2026-04-12 10:50:59 +08:00
parent 105e428a43
commit c914912389
2 changed files with 213 additions and 0 deletions

View File

@@ -0,0 +1,212 @@
// ─── level.rs ────────────────────────────────────────────────────────────────
// Detects the log level from a line of text.
//
// Two detection strategies:
// 1. JSON lines: delegate to `super::json::parse_line` and extract the level field.
// 2. Plain-text lines: keyword scan with word-boundary checks.
// ──────────────────────────────────────────────────────────────────────────────
use crate::types::LogLevel;
/// Level keywords recognised by the plain-text scan, written in upper-case ASCII.
///
/// Full level names are listed ahead of their short forms. The scan keeps the
/// match with the smallest byte offset; when two entries match at the same
/// offset, the one listed earlier here wins (so "WARNING" outranks "WARN").
const KEYWORDS: &[&str] = &[
    // Full level names.
    "INFORMATION", "SEVERE", "WARNING", "FATAL", "ERROR", "DEBUG", "TRACE",
    // Short forms and abbreviations.
    "WARN", "INFO", "ERR", "WRN", "DBG", "TRC",
];
/// Upper bound, in bytes, on how much of a non-JSON line is keyword-scanned.
///
/// For longer lines only a prefix of at most this many bytes is examined; the
/// cut is moved back to a char boundary so the prefix stays valid UTF-8.
const SCAN_LIMIT: usize = 200;
// ─── detect_level ───────────────────────────────────────────────────────────
/// Determine the log level of a single log line.
///
/// Blank or whitespace-only lines yield `None`. Otherwise the line is first
/// handed to `super::json::parse_line`:
///
/// * If that returns an entry, the entry's `level` field is returned as-is,
///   including `None` when the field is absent. The keyword scan is
///   deliberately NOT consulted in that case.
/// * If it returns `None`, the line is treated as plain text and scanned for
///   level keywords.
pub fn detect_level(line: &str) -> Option<LogLevel> {
    if line.trim().is_empty() {
        return None;
    }

    match super::json::parse_line(line) {
        // Parsed as JSON: trust the structured level, even if it is missing.
        Some(entry) => entry.level,
        // Not JSON: fall back to the keyword scan.
        None => detect_level_from_text(line),
    }
}
// ─── detect_level_from_text ─────────────────────────────────────────────────
/// Keyword-based level detection for non-JSON lines.
///
/// Only the first `SCAN_LIMIT` bytes of `line` are searched (the cut is moved
/// back to a char boundary). Every entry of `KEYWORDS` is looked up
/// case-insensitively, and a hit only counts when it is not embedded in a
/// longer alphabetic word. The hit with the smallest byte offset wins; on a tie
/// the keyword listed first in `KEYWORDS` wins.
///
/// Word boundaries are checked against the *full* line rather than the
/// truncated window. Otherwise a word cut in half by the scan limit (for
/// example "errors" with the limit falling right after "error") would look like
/// a standalone keyword and produce a false positive.
///
/// Returns `None` when no keyword is found inside the scan window.
fn detect_level_from_text(line: &str) -> Option<LogLevel> {
    // Clamp the scan window to SCAN_LIMIT bytes, backing off so that the slice
    // below never splits a multi-byte character. `is_char_boundary` is true for
    // both 0 and `line.len()`, so the loop always terminates.
    let mut window_end = line.len().min(SCAN_LIMIT);
    while !line.is_char_boundary(window_end) {
        window_end -= 1;
    }

    // KEYWORDS are stored in upper-case ASCII, so upper-casing the window once
    // gives a case-insensitive search without allocating per keyword.
    // ASCII case mapping preserves byte offsets, so positions found in
    // `haystack` are valid positions in `line` as well.
    let haystack = line[..window_end].to_ascii_uppercase();

    // Earliest accepted match so far: (byte offset, keyword).
    let mut best: Option<(usize, &str)> = None;

    for &kw in KEYWORDS {
        let mut from = 0;
        while let Some(rel) = haystack[from..].find(kw) {
            let pos = from + rel;
            // Check against the original line so the truncation point is not
            // mistaken for the end of a word.
            if is_word_boundary(line, pos, kw.len()) {
                let is_earlier = match best {
                    Some((best_pos, _)) => pos < best_pos,
                    None => true,
                };
                if is_earlier {
                    best = Some((pos, kw));
                }
                break; // later occurrences of this keyword cannot be earlier
            }
            // Embedded in a longer word: resume just past this hit. The hit
            // starts with an ASCII byte, so `pos + 1` is a char boundary.
            from = pos + 1;
        }
    }

    let (_, keyword) = best?;
    // LogLevel::FromStr is case-insensitive and maps all aliases correctly;
    // its error type is uninhabited, hence the empty match.
    Some(
        keyword
            .parse::<LogLevel>()
            .unwrap_or_else(|e| match e {}),
    )
}
// ─── is_word_boundary ───────────────────────────────────────────────────────
/// Report whether the span `start..start + len` of `text` stands alone as a word.
///
/// The span qualifies when the byte immediately before it and the byte
/// immediately after it are each either missing (string edge) or not an ASCII
/// letter. Digits, punctuation, whitespace and non-ASCII bytes all count as
/// separators.
fn is_word_boundary(text: &str, start: usize, len: usize) -> bool {
    let bytes = text.as_bytes();

    let letter_before = start != 0 && bytes[start - 1].is_ascii_alphabetic();

    let end = start + len;
    let letter_after = matches!(bytes.get(end), Some(b) if b.is_ascii_alphabetic());

    !letter_before && !letter_after
}
// ─── tests ──────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    // ── JSON lines: the parsed `level` field is authoritative ──────────────

    #[test]
    fn test_detect_level_json_error() {
        let line = r#"{"level":"ERROR","message":"fail"}"#;
        assert_eq!(detect_level(line), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_json_info() {
        let line = r#"{"level":"info","msg":"hello"}"#;
        assert_eq!(detect_level(line), Some(LogLevel::Info));
    }

    #[test]
    fn test_detect_level_json_no_level() {
        let line = r#"{"message":"no level here"}"#;
        assert_eq!(detect_level(line), None);
    }

    #[test]
    fn test_detect_level_json_fallback() {
        // JSON without level field but msg contains "ERROR" — must return None
        // (trust JSON parse result, don't fall through to keyword scan).
        let line = r#"{"msg":"ERROR happened"}"#;
        assert_eq!(detect_level(line), None);
    }

    // ── Plain-text lines: keyword scan ─────────────────────────────────────

    #[test]
    fn test_detect_level_keyword_error() {
        assert_eq!(
            detect_level("ERROR: connection failed"),
            Some(LogLevel::Error)
        );
    }

    #[test]
    fn test_detect_level_keyword_warn() {
        assert_eq!(
            detect_level("WARN something happened"),
            Some(LogLevel::Warn)
        );
    }

    #[test]
    fn test_detect_level_keyword_case_insensitive() {
        assert_eq!(detect_level("error: failed"), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_empty() {
        assert_eq!(detect_level(""), None);
    }

    #[test]
    fn test_detect_level_whitespace_only() {
        assert_eq!(detect_level(" \t "), None);
    }

    #[test]
    fn test_detect_level_no_level() {
        assert_eq!(detect_level("just some text"), None);
    }

    #[test]
    fn test_detect_level_false_positive() {
        // "errors" contains "error" but it's not a word boundary match.
        assert_eq!(detect_level("GET /api/errors/404"), None);
    }

    #[test]
    fn test_detect_level_prefixed() {
        assert_eq!(
            detect_level("2024-01-01 ERROR something"),
            Some(LogLevel::Error)
        );
    }

    #[test]
    fn test_detect_level_severe() {
        assert_eq!(detect_level("SEVERE: system crash"), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_fatal() {
        assert_eq!(detect_level("FATAL: unrecoverable"), Some(LogLevel::Error));
    }

    // ── Scan-limit / multi-byte handling ───────────────────────────────────

    #[test]
    fn test_detect_level_multibyte_prefix() {
        // 63 CJK chars at 3 bytes each = 189 bytes; with " ERROR something"
        // (16 bytes) appended the line is 205 bytes, so it exceeds SCAN_LIMIT
        // while "ERROR" (bytes 190..195) still lies inside the scan window.
        let prefix: String = "中".repeat(63); // 189 bytes
        assert_eq!(prefix.len(), 189);
        let line = format!("{prefix} ERROR something");
        assert!(line.len() > SCAN_LIMIT);
        assert_eq!(detect_level(&line), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_truncation_inside_multibyte_char() {
        // "ERROR " is 6 bytes and 65 three-byte chars add 195 more, so the
        // char occupying bytes 198..201 straddles the scan limit and the scan
        // window has to back off to a char boundary instead of panicking.
        let line = format!("ERROR {}", "中".repeat(65));
        assert_eq!(line.len(), 201);
        assert!(!line.is_char_boundary(SCAN_LIMIT));
        assert_eq!(detect_level(&line), Some(LogLevel::Error));
    }
}

View File

@@ -10,3 +10,4 @@
// Declares and exports the `json` submodule (defined in json.rs).
// That module is responsible for parsing log lines in JSON format.
pub mod json;
// Declares and exports the `level` submodule (defined in level.rs),
// which detects the log level of a line of text.
pub mod level;