feat(core): add detect_level() for plain text logs
Add level detection that tries JSON parsing first (trusts level field), then falls back to keyword scanning with word-boundary checks. Supports SEVERE, FATAL, ERROR, WARN, INFO, DEBUG, TRACE and their abbreviations. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
212
crates/core/src/parser/level.rs
Normal file
212
crates/core/src/parser/level.rs
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
// ─── level.rs ────────────────────────────────────────────────────────────────
|
||||||
|
// Detects the log level from a line of text.
|
||||||
|
//
|
||||||
|
// Two detection strategies:
|
||||||
|
// 1. JSON lines: delegate to `super::json::parse_line` and extract the level field.
|
||||||
|
// 2. Plain-text lines: keyword scan with word-boundary checks.
|
||||||
|
// ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
use crate::types::LogLevel;
|
||||||
|
|
||||||
|
/// Level keywords recognised by the plain-text scanner.
///
/// Entries are listed with the full level names first, then the short forms,
/// then the three-letter abbreviations (roughly longest to shortest). The
/// scanner keeps the match with the smallest byte offset and, should two
/// entries ever match at the same offset, the one listed first.
const KEYWORDS: &[&str] = &[
    // Full level names.
    "INFORMATION",
    "SEVERE",
    "WARNING",
    "FATAL",
    "ERROR",
    "DEBUG",
    "TRACE",
    // Short forms.
    "WARN",
    "INFO",
    // Three-letter abbreviations.
    "ERR",
    "WRN",
    "DBG",
    "TRC",
];
|
||||||
|
|
||||||
|
/// Upper bound, in bytes, on how much of a non-JSON line the keyword scan
/// inspects. Longer lines are cut at the nearest char boundary at or below
/// this offset before they are scanned.
const SCAN_LIMIT: usize = 200;
|
||||||
|
|
||||||
|
// ─── detect_level ───────────────────────────────────────────────────────────
|
||||||
|
/// Detect the log level of a line.
|
||||||
|
///
|
||||||
|
/// Returns `Some(level)` if a level is found, `None` otherwise.
|
||||||
|
/// For valid JSON lines the result comes from the parsed `level` field
|
||||||
|
/// (which may itself be `None` if the field is absent).
|
||||||
|
pub fn detect_level(line: &str) -> Option<LogLevel> {
|
||||||
|
if line.trim().is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try JSON first. If parse_line returns Some, the line is valid JSON —
|
||||||
|
// trust its level field (even if None) and do NOT fall through to keyword scan.
|
||||||
|
if let Some(entry) = super::json::parse_line(line) {
|
||||||
|
return entry.level;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-JSON: keyword scan.
|
||||||
|
detect_level_from_text(line)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── detect_level_from_text ─────────────────────────────────────────────────
|
||||||
|
/// Keyword-based level detection for non-JSON lines.
|
||||||
|
fn detect_level_from_text(line: &str) -> Option<LogLevel> {
|
||||||
|
// Safe truncation to SCAN_LIMIT bytes.
|
||||||
|
let scan = if line.len() > SCAN_LIMIT {
|
||||||
|
let mut end = SCAN_LIMIT;
|
||||||
|
while !line.is_char_boundary(end) {
|
||||||
|
end -= 1;
|
||||||
|
}
|
||||||
|
&line[..end]
|
||||||
|
} else {
|
||||||
|
line
|
||||||
|
};
|
||||||
|
|
||||||
|
let lower = scan.to_ascii_lowercase();
|
||||||
|
|
||||||
|
// Track earliest match.
|
||||||
|
let mut best_pos = usize::MAX;
|
||||||
|
let mut best_keyword = "";
|
||||||
|
|
||||||
|
for &kw in KEYWORDS {
|
||||||
|
let kw_lower = kw.to_ascii_lowercase();
|
||||||
|
let mut start = 0;
|
||||||
|
while let Some(pos) = lower[start..].find(&kw_lower) {
|
||||||
|
let abs_pos = start + pos;
|
||||||
|
if is_word_boundary(&lower, abs_pos, kw_lower.len()) {
|
||||||
|
if abs_pos < best_pos {
|
||||||
|
best_pos = abs_pos;
|
||||||
|
best_keyword = kw;
|
||||||
|
}
|
||||||
|
break; // only need earliest occurrence of this keyword
|
||||||
|
}
|
||||||
|
// Move past this match to keep searching for a later occurrence with boundary.
|
||||||
|
start = abs_pos + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if best_keyword.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// LogLevel::FromStr is case-insensitive and maps all aliases correctly.
|
||||||
|
Some(
|
||||||
|
best_keyword
|
||||||
|
.parse::<LogLevel>()
|
||||||
|
.unwrap_or_else(|e| match e {}),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── is_word_boundary ───────────────────────────────────────────────────────
|
||||||
|
/// Report whether the `len`-byte match starting at byte offset `start` of
/// `text` stands alone as a word.
///
/// The byte immediately before the match and the byte immediately after it
/// must each be either absent (string edge) or something other than an ASCII
/// letter. Digits, punctuation, whitespace and non-ASCII bytes all count as
/// boundaries.
fn is_word_boundary(text: &str, start: usize, len: usize) -> bool {
    let bytes = text.as_bytes();

    // Left side: reject when an ASCII letter directly precedes the match.
    if start != 0 && bytes[start - 1].is_ascii_alphabetic() {
        return false;
    }

    // Right side: accept at the string edge or before any non-letter byte.
    let end = start + len;
    end >= bytes.len() || !bytes[end].is_ascii_alphabetic()
}
|
||||||
|
|
||||||
|
// ─── tests ──────────────────────────────────────────────────────────────────
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // ── JSON lines ──────────────────────────────────────────────────────────

    #[test]
    fn test_detect_level_json_error() {
        let line = r#"{"level":"ERROR","message":"fail"}"#;
        assert_eq!(detect_level(line), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_json_info() {
        let line = r#"{"level":"info","msg":"hello"}"#;
        assert_eq!(detect_level(line), Some(LogLevel::Info));
    }

    #[test]
    fn test_detect_level_json_no_level() {
        let line = r#"{"message":"no level here"}"#;
        assert_eq!(detect_level(line), None);
    }

    #[test]
    fn test_detect_level_json_fallback() {
        // JSON without level field but msg contains "ERROR" — must return None
        // (trust JSON parse result, don't fall through to keyword scan).
        let line = r#"{"msg":"ERROR happened"}"#;
        assert_eq!(detect_level(line), None);
    }

    // ── Plain-text keyword scan ─────────────────────────────────────────────

    #[test]
    fn test_detect_level_keyword_error() {
        assert_eq!(
            detect_level("ERROR: connection failed"),
            Some(LogLevel::Error)
        );
    }

    #[test]
    fn test_detect_level_keyword_warn() {
        assert_eq!(
            detect_level("WARN something happened"),
            Some(LogLevel::Warn)
        );
    }

    #[test]
    fn test_detect_level_keyword_case_insensitive() {
        assert_eq!(detect_level("error: failed"), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_prefixed() {
        assert_eq!(
            detect_level("2024-01-01 ERROR something"),
            Some(LogLevel::Error)
        );
    }

    #[test]
    fn test_detect_level_severe() {
        assert_eq!(detect_level("SEVERE: system crash"), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_fatal() {
        assert_eq!(detect_level("FATAL: unrecoverable"), Some(LogLevel::Error));
    }

    // ── Lines without a level ───────────────────────────────────────────────

    #[test]
    fn test_detect_level_empty() {
        assert_eq!(detect_level(""), None);
    }

    #[test]
    fn test_detect_level_whitespace_only() {
        assert_eq!(detect_level("   \t  "), None);
    }

    #[test]
    fn test_detect_level_no_level() {
        assert_eq!(detect_level("just some text"), None);
    }

    #[test]
    fn test_detect_level_false_positive() {
        // "errors" contains "error" but it's not a word boundary match.
        assert_eq!(detect_level("GET /api/errors/404"), None);
    }

    // ── Scan-window truncation ──────────────────────────────────────────────

    #[test]
    fn test_detect_level_multibyte_prefix() {
        // 63 Chinese chars = 189 bytes (3 bytes each); with " ERROR something"
        // (16 bytes) the line is 205 bytes long, so truncation kicks in.
        // "ERROR" at bytes 190..195 is still within the 200-byte scan limit.
        let prefix: String = "你".repeat(63); // 189 bytes
        let line = format!("{prefix} ERROR something");
        assert_eq!(detect_level(&line), Some(LogLevel::Error));
    }

    #[test]
    fn test_detect_level_truncation_inside_multibyte_char() {
        // "ERROR " is 6 bytes, followed by 70 three-byte chars (216 bytes total).
        // Characters start at offsets 6, 9, …, 198, 201, so byte 200 lies in the
        // middle of a character: the scan window has to back off to byte 198
        // instead of slicing through the character.
        let line = format!("ERROR {}", "你".repeat(70));
        assert!(!line.is_char_boundary(200));
        assert_eq!(detect_level(&line), Some(LogLevel::Error));
    }
}
|
||||||
@@ -10,3 +10,4 @@
|
|||||||
// 声明并导出 json 子模块(定义在 json.rs 文件中)。
|
// 声明并导出 json 子模块(定义在 json.rs 文件中)。
|
||||||
// 该模块负责解析 JSON 格式的日志行。
|
// 该模块负责解析 JSON 格式的日志行。
|
||||||
pub mod json;
|
pub mod json;
|
||||||
|
pub mod level;
|
||||||
|
|||||||
Reference in New Issue
Block a user