fix(io): use unicode-width for correct CJK/emoji/zero-width display width (closes #12)
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2333,6 +2333,7 @@ dependencies = [
|
|||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror 2.0.18",
|
"thiserror 2.0.18",
|
||||||
"toml",
|
"toml",
|
||||||
|
"unicode-width",
|
||||||
"xxhash-rust",
|
"xxhash-rust",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -27,3 +27,4 @@ textwrap = "0.16"
|
|||||||
tempfile = "3"
|
tempfile = "3"
|
||||||
xxhash-rust = { version = "0.8", features = ["xxh3"] }
|
xxhash-rust = { version = "0.8", features = ["xxh3"] }
|
||||||
bincode = "1"
|
bincode = "1"
|
||||||
|
unicode-width = "0.2"
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ memmap2.workspace = true
|
|||||||
directories.workspace = true
|
directories.workspace = true
|
||||||
xxhash-rust.workspace = true
|
xxhash-rust.workspace = true
|
||||||
bincode.workspace = true
|
bincode.workspace = true
|
||||||
|
unicode-width.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
insta.workspace = true
|
insta.workspace = true
|
||||||
|
|||||||
@@ -2,9 +2,12 @@
|
|||||||
/// Lines exceeding this are returned as-is to avoid pathological cases.
|
/// Lines exceeding this are returned as-is to avoid pathological cases.
|
||||||
pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024;
|
pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024;
|
||||||
|
|
||||||
/// Split a line into chunks of exactly `width` characters (display columns).
|
/// Split a line into chunks of at most `width` display columns; a single character wider than `width` still gets a chunk of its own.
|
||||||
/// For a log viewer, we want character-level wrapping, not word-level.
|
/// For a log viewer, we want character-level wrapping, not word-level.
|
||||||
|
/// Uses `unicode-width` for correct CJK/emoji/zero-width handling.
|
||||||
pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
|
pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
|
||||||
|
use unicode_width::UnicodeWidthChar;
|
||||||
|
|
||||||
if width == 0 {
|
if width == 0 {
|
||||||
return vec![String::new()];
|
return vec![String::new()];
|
||||||
}
|
}
|
||||||
@@ -15,7 +18,15 @@ pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
|
|||||||
let mut row = String::new();
|
let mut row = String::new();
|
||||||
let mut col = 0;
|
let mut col = 0;
|
||||||
for ch in line.chars() {
|
for ch in line.chars() {
|
||||||
let w = if ch == '\t' { 4 } else { 1 };
|
let w = if ch == '\t' {
|
||||||
|
4
|
||||||
|
} else if ch.is_control() {
|
||||||
|
// Control characters (except tab): width 0, still pushed to preserve content.
|
||||||
|
// Visible rendering is the caller's responsibility.
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
ch.width().unwrap_or(0)
|
||||||
|
};
|
||||||
if col + w > width && !row.is_empty() {
|
if col + w > width && !row.is_empty() {
|
||||||
result.push(std::mem::take(&mut row));
|
result.push(std::mem::take(&mut row));
|
||||||
col = 0;
|
col = 0;
|
||||||
@@ -132,4 +143,48 @@ mod tests {
|
|||||||
fn test_max_wrap_input_len_constant() {
|
fn test_max_wrap_input_len_constant() {
|
||||||
assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024);
|
assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_wrap_cjk_chars() {
|
||||||
|
let result = wrap_line_chars("你好", 3);
|
||||||
|
assert_eq!(result, vec!["你", "好"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_wrap_cjk_ascii_mixed() {
|
||||||
|
let result = wrap_line_chars("a你好", 4);
|
||||||
|
assert_eq!(result, vec!["a你", "好"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_wrap_zero_width_char() {
|
||||||
|
let result = wrap_line_chars("a\u{200B}b", 2);
|
||||||
|
assert_eq!(result, vec!["a\u{200B}b"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_wrap_emoji() {
|
||||||
|
let result = wrap_line_chars("😀a", 3);
|
||||||
|
assert_eq!(result, vec!["😀a"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_wrap_emoji_exact_wrap() {
|
||||||
|
let result = wrap_line_chars("😀a", 2);
|
||||||
|
assert_eq!(result, vec!["😀", "a"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_wrap_combining_mark() {
|
||||||
|
// Scalar-width wrapping: combining mark (width 0) stays with next base char,
|
||||||
|
// not the preceding one, because the base char already triggered a flush.
|
||||||
|
let result = wrap_line_chars("a\u{0301}b", 1);
|
||||||
|
assert_eq!(result, vec!["a", "\u{0301}b"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_wrap_cjk_width_one() {
|
||||||
|
let result = wrap_line_chars("你好", 1);
|
||||||
|
assert_eq!(result, vec!["你", "好"]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user