fix(io): use unicode-width for correct CJK/emoji/zero-width display width (closes #12)
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2333,6 +2333,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"toml",
|
||||
"unicode-width",
|
||||
"xxhash-rust",
|
||||
]
|
||||
|
||||
|
||||
@@ -27,3 +27,4 @@ textwrap = "0.16"
|
||||
tempfile = "3"
|
||||
xxhash-rust = { version = "0.8", features = ["xxh3"] }
|
||||
bincode = "1"
|
||||
unicode-width = "0.2"
|
||||
|
||||
@@ -17,6 +17,7 @@ memmap2.workspace = true
|
||||
directories.workspace = true
|
||||
xxhash-rust.workspace = true
|
||||
bincode.workspace = true
|
||||
unicode-width.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
insta.workspace = true
|
||||
|
||||
@@ -2,9 +2,12 @@
|
||||
/// Lines exceeding this are returned as-is to avoid pathological cases.
|
||||
pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024;
|
||||
|
||||
/// Split a line into chunks of exactly `width` characters (display columns).
|
||||
/// Split a line into chunks of at most `width` display columns (a lone character wider than `width` still gets a chunk of its own).
|
||||
/// For a log viewer, we want character-level wrapping, not word-level.
|
||||
/// Uses `unicode-width` for correct CJK/emoji/zero-width handling.
|
||||
pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
|
||||
use unicode_width::UnicodeWidthChar;
|
||||
|
||||
if width == 0 {
|
||||
return vec![String::new()];
|
||||
}
|
||||
@@ -15,7 +18,15 @@ pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
|
||||
let mut row = String::new();
|
||||
let mut col = 0;
|
||||
for ch in line.chars() {
|
||||
let w = if ch == '\t' { 4 } else { 1 };
|
||||
let w = if ch == '\t' {
|
||||
4
|
||||
} else if ch.is_control() {
|
||||
// Control characters (except tab): width 0, still pushed to preserve content.
|
||||
// Visible rendering is the caller's responsibility.
|
||||
0
|
||||
} else {
|
||||
ch.width().unwrap_or(0)
|
||||
};
|
||||
if col + w > width && !row.is_empty() {
|
||||
result.push(std::mem::take(&mut row));
|
||||
col = 0;
|
||||
@@ -132,4 +143,48 @@ mod tests {
|
||||
fn test_max_wrap_input_len_constant() {
|
||||
assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrap_cjk_chars() {
|
||||
let result = wrap_line_chars("你好", 3);
|
||||
assert_eq!(result, vec!["你", "好"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrap_cjk_ascii_mixed() {
|
||||
let result = wrap_line_chars("a你好", 4);
|
||||
assert_eq!(result, vec!["a你", "好"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrap_zero_width_char() {
|
||||
let result = wrap_line_chars("a\u{200B}b", 2);
|
||||
assert_eq!(result, vec!["a\u{200B}b"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrap_emoji() {
|
||||
let result = wrap_line_chars("😀a", 3);
|
||||
assert_eq!(result, vec!["😀a"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrap_emoji_exact_wrap() {
|
||||
let result = wrap_line_chars("😀a", 2);
|
||||
assert_eq!(result, vec!["😀", "a"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrap_combining_mark() {
|
||||
// Scalar-width wrapping: combining mark (width 0) stays with next base char,
|
||||
// not the preceding one, because the preceding base char ('a') had already triggered a flush.
|
||||
let result = wrap_line_chars("a\u{0301}b", 1);
|
||||
assert_eq!(result, vec!["a", "\u{0301}b"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrap_cjk_width_one() {
|
||||
let result = wrap_line_chars("你好", 1);
|
||||
assert_eq!(result, vec!["你", "好"]);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user