diff --git a/Cargo.lock b/Cargo.lock index e1c8b8a..7cea9de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2333,6 +2333,7 @@ dependencies = [ "tempfile", "thiserror 2.0.18", "toml", + "unicode-width", "xxhash-rust", ] diff --git a/Cargo.toml b/Cargo.toml index b97d064..24edfa7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,3 +27,4 @@ textwrap = "0.16" tempfile = "3" xxhash-rust = { version = "0.8", features = ["xxh3"] } bincode = "1" +unicode-width = "0.2" diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 0cdba3b..bc695e2 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -17,6 +17,7 @@ memmap2.workspace = true directories.workspace = true xxhash-rust.workspace = true bincode.workspace = true +unicode-width.workspace = true [dev-dependencies] insta.workspace = true diff --git a/crates/core/src/io/wrap.rs b/crates/core/src/io/wrap.rs index a33b09b..94e269c 100644 --- a/crates/core/src/io/wrap.rs +++ b/crates/core/src/io/wrap.rs @@ -2,9 +2,12 @@ /// Lines exceeding this are returned as-is to avoid pathological cases. pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024; -/// Split a line into chunks of exactly `width` characters (display columns). +/// Split a line into chunks of at most `width` display columns (a lone character wider than `width` still gets its own chunk). /// For a log viewer, we want character-level wrapping, not word-level. +/// Uses `unicode-width` for correct CJK/emoji/zero-width handling. pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> { + use unicode_width::UnicodeWidthChar; + if width == 0 { return vec![String::new()]; } @@ -15,7 +18,15 @@ pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> { let mut row = String::new(); let mut col = 0; for ch in line.chars() { - let w = if ch == '\t' { 4 } else { 1 }; + let w = if ch == '\t' { + 4 + } else if ch.is_control() { + // Control characters (except tab): width 0, still pushed to preserve content. + // Visible rendering is the caller's responsibility.
+ 0 + } else { + ch.width().unwrap_or(0) + }; if col + w > width && !row.is_empty() { result.push(std::mem::take(&mut row)); col = 0; @@ -132,4 +143,48 @@ mod tests { fn test_max_wrap_input_len_constant() { assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024); } + + #[test] + fn test_wrap_cjk_chars() { + let result = wrap_line_chars("你好", 3); + assert_eq!(result, vec!["你", "好"]); + } + + #[test] + fn test_wrap_cjk_ascii_mixed() { + let result = wrap_line_chars("a你好", 4); + assert_eq!(result, vec!["a你", "好"]); + } + + #[test] + fn test_wrap_zero_width_char() { + let result = wrap_line_chars("a\u{200B}b", 2); + assert_eq!(result, vec!["a\u{200B}b"]); + } + + #[test] + fn test_wrap_emoji() { + let result = wrap_line_chars("😀a", 3); + assert_eq!(result, vec!["😀a"]); + } + + #[test] + fn test_wrap_emoji_exact_wrap() { + let result = wrap_line_chars("😀a", 2); + assert_eq!(result, vec!["😀", "a"]); + } + + #[test] + fn test_wrap_combining_mark() { + // Scalar-width wrapping: the combining mark (width 0) is emitted with the next base char, + // not the one it modifies, because that preceding base char already triggered a flush. + let result = wrap_line_chars("a\u{0301}b", 1); + assert_eq!(result, vec!["a", "\u{0301}b"]); + } + + #[test] + fn test_wrap_cjk_width_one() { + let result = wrap_line_chars("你好", 1); + assert_eq!(result, vec!["你", "好"]); + } }