fix(io): use unicode-width for correct CJK/emoji/zero-width display width (closes #12)

This commit is contained in:
dailz
2026-06-07 12:50:17 +08:00
parent d4679a7543
commit b58d66f2aa
4 changed files with 60 additions and 2 deletions

1
Cargo.lock generated
View File

@@ -2333,6 +2333,7 @@ dependencies = [
"tempfile", "tempfile",
"thiserror 2.0.18", "thiserror 2.0.18",
"toml", "toml",
"unicode-width",
"xxhash-rust", "xxhash-rust",
] ]

View File

@@ -27,3 +27,4 @@ textwrap = "0.16"
tempfile = "3" tempfile = "3"
xxhash-rust = { version = "0.8", features = ["xxh3"] } xxhash-rust = { version = "0.8", features = ["xxh3"] }
bincode = "1" bincode = "1"
unicode-width = "0.2"

View File

@@ -17,6 +17,7 @@ memmap2.workspace = true
directories.workspace = true directories.workspace = true
xxhash-rust.workspace = true xxhash-rust.workspace = true
bincode.workspace = true bincode.workspace = true
unicode-width.workspace = true
[dev-dependencies] [dev-dependencies]
insta.workspace = true insta.workspace = true

View File

@@ -2,9 +2,12 @@
/// Lines exceeding this are returned as-is to avoid pathological cases. /// Lines exceeding this are returned as-is to avoid pathological cases.
pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024; pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024;
/// Split a line into chunks of exactly `width` characters (display columns). /// Split a line into chunks of exactly `width` display columns.
/// For a log viewer, we want character-level wrapping, not word-level. /// For a log viewer, we want character-level wrapping, not word-level.
/// Uses `unicode-width` for correct CJK/emoji/zero-width handling.
pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> { pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
use unicode_width::UnicodeWidthChar;
if width == 0 { if width == 0 {
return vec![String::new()]; return vec![String::new()];
} }
@@ -15,7 +18,15 @@ pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
let mut row = String::new(); let mut row = String::new();
let mut col = 0; let mut col = 0;
for ch in line.chars() { for ch in line.chars() {
let w = if ch == '\t' { 4 } else { 1 }; let w = if ch == '\t' {
4
} else if ch.is_control() {
// Control characters (except tab): width 0, still pushed to preserve content.
// Visible rendering is the caller's responsibility.
0
} else {
ch.width().unwrap_or(0)
};
if col + w > width && !row.is_empty() { if col + w > width && !row.is_empty() {
result.push(std::mem::take(&mut row)); result.push(std::mem::take(&mut row));
col = 0; col = 0;
@@ -132,4 +143,48 @@ mod tests {
fn test_max_wrap_input_len_constant() { fn test_max_wrap_input_len_constant() {
assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024); assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024);
} }
#[test]
fn test_wrap_cjk_chars() {
    // Each of these CJK characters is two display columns wide, so a 3-column
    // row has room for only one: adding the second would need 4 columns and
    // therefore starts a new row. No character may be dropped from the output.
    let result = wrap_line_chars("你好", 3);
    assert_eq!(result, vec!["你", "好"]);
}
#[test]
fn test_wrap_cjk_ascii_mixed() {
    // 'a' (1 column) plus the first CJK character (2 columns) uses 3 of the
    // 4 available columns; the second CJK character would overflow the row,
    // so it is carried onto a row of its own.
    let result = wrap_line_chars("a你好", 4);
    assert_eq!(result, vec!["a你", "好"]);
}
#[test]
fn test_wrap_zero_width_char() {
    // The expectation pins that U+200B adds no columns: all three scalars
    // stay together on a single 2-column row.
    let input = "a\u{200B}b";
    let rows = wrap_line_chars(input, 2);
    assert_eq!(rows, vec!["a\u{200B}b"]);
}
#[test]
fn test_wrap_emoji() {
    // Emoji followed by one ASCII letter is expected to fit a 3-column row
    // without wrapping.
    let rows = wrap_line_chars("😀a", 3);
    let expected = vec!["😀a"];
    assert_eq!(rows, expected);
}
#[test]
fn test_wrap_emoji_exact_wrap() {
    // With only 2 columns the emoji is expected to fill the row by itself,
    // pushing the trailing letter onto the next row.
    let rows = wrap_line_chars("😀a", 2);
    let expected = vec!["😀", "a"];
    assert_eq!(rows, expected);
}
#[test]
fn test_wrap_combining_mark() {
    // Wrapping works per scalar value, not per grapheme cluster. At width 1
    // the base letter 'a' is expected to end its row on its own, so the
    // zero-width combining acute accent (U+0301) lands at the start of the
    // following row together with 'b' rather than staying attached to 'a'.
    let rows = wrap_line_chars("a\u{0301}b", 1);
    let expected = vec!["a", "\u{0301}b"];
    assert_eq!(rows, expected);
}
#[test]
fn test_wrap_cjk_width_one() {
    // A 2-column character can never fit a 1-column row. The wrap check only
    // flushes a non-empty row, so each character is still emitted, alone on
    // its own row, instead of being dropped.
    let result = wrap_line_chars("你好", 1);
    assert_eq!(result, vec!["你", "好"]);
}
} }