From 210eecfa66ce006673b3fc8dcfa106095004f632 Mon Sep 17 00:00:00 2001 From: dailz Date: Tue, 14 Apr 2026 09:06:52 +0800 Subject: [PATCH] feat(core): extract wrap utilities and extend LineIndex for progressive loading Move wrap_line_chars and format_json_line from app.rs to core/io/wrap.rs with MAX_WRAP_INPUT_LEN guard. Add serde derives, pub getters, and extend_from_bytes() to LineIndex for incremental index building. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- crates/core/src/io/line_index.rs | 254 +++++++++++++++++++++++++++++++ crates/core/src/io/wrap.rs | 135 ++++++++++++++++ 2 files changed, 389 insertions(+) create mode 100644 crates/core/src/io/wrap.rs diff --git a/crates/core/src/io/line_index.rs b/crates/core/src/io/line_index.rs index d6aa701..0ba2fcd 100644 --- a/crates/core/src/io/line_index.rs +++ b/crates/core/src/io/line_index.rs @@ -16,6 +16,7 @@ const BLOCK_SIZE: usize = 256; // ─── LineIndex 结构体定义 ──────────────────────────────────────────────────── +#[derive(serde::Serialize, serde::Deserialize)] pub struct LineIndex { // 采样偏移量:每 BLOCK_SIZE 行记录一个起始字节偏移。 // sampled_offsets[i] 存储第 (i * BLOCK_SIZE) 行的字节起始位置。 @@ -139,11 +140,91 @@ impl LineIndex { }) } + // ─── extend_from_bytes:用追加的数据增量更新索引 ───────────────────── + /// Extend the index with new content appended after the existing data. + /// `new_data` is the bytes that were appended. + /// `start_offset` is the byte offset where `new_data` starts in the file. + pub fn extend_from_bytes(&mut self, new_data: &[u8], start_offset: u64) { + if new_data.is_empty() { + return; + } + + let old_total = self.total_lines; + let old_had_trailing = self.has_trailing_newline; + + // Determine whether the first byte of new_data starts a new line + // or continues the old partial last line. 
+ let starts_new_line = old_total == 0 || old_had_trailing; + + // If the junction falls on a block boundary, record the start offset + // (analogous to from_bytes always pushing offset 0 for line 0). + if starts_new_line && (old_total as usize) % BLOCK_SIZE == 0 { + self.sampled_offsets.push(start_offset); + } + + // Scan new_data for newlines, recording sampled offsets at block boundaries. + let mut next_line_idx = if starts_new_line { + (old_total + 1) as usize + } else { + old_total as usize + }; + + let mut new_newlines: usize = 0; + for pos in memchr::memchr_iter(b'\n', new_data) { + new_newlines += 1; + if next_line_idx.is_multiple_of(BLOCK_SIZE) { + self.sampled_offsets.push(start_offset + pos as u64 + 1); + } + next_line_idx += 1; + } + + let new_has_trailing = new_data.last().is_some_and(|&b| b == b'\n'); + + // Compute how many new lines the appended data contributes. + // - starts_new_line: the new data begins a fresh line. Each \n ends a line. + // If the new data doesn't end with \n there's one extra trailing-partial line. + // - !starts_new_line: the new data continues an old partial line. The first \n + // closes that old line (already counted). Remaining \n's each close a new line. + let added = if starts_new_line { + new_newlines + if new_has_trailing { 0 } else { 1 } + } else if new_has_trailing { + new_newlines.saturating_sub(1) + } else { + new_newlines + }; + + self.total_lines += added as u64; + self.has_trailing_newline = new_has_trailing; + + // Trailing pop: if total_lines landed exactly on a block boundary and the + // file ends with \n, the offset we pushed for that boundary points to a + // non-existent trailing line — remove it (same logic as from_bytes). 
+ if self.has_trailing_newline + && self.total_lines > 0 + && (self.total_lines as usize).is_multiple_of(BLOCK_SIZE) + { + self.sampled_offsets.pop(); + } + } + // ─── line_count:返回总行数 ─────────────────────────────────────────── pub fn line_count(&self) -> usize { self.total_lines as usize } + // ─── getter 方法 ──────────────────────────────────────────────────── + pub(crate) fn sampled_offsets(&self) -> &[u64] { + &self.sampled_offsets + } + + pub(crate) fn total_lines(&self) -> u64 { + self.total_lines + } + + pub(crate) fn has_trailing_newline(&self) -> bool { + self.has_trailing_newline + } + // ─── get_line:根据行号获取行内容 ───────────────────────────────────── // 通过稀疏索引定位到所在块的起始位置,然后向前扫描少量换行符来定位目标行。 pub fn get_line<'a>(&self, data: &'a [u8], idx: usize) -> Option<&'a str> { @@ -490,4 +571,177 @@ mod tests { assert_eq!(idx.get_line(&data, 256), Some("line256")); assert_eq!(idx.get_line(&data, 299), Some("line299")); } + + #[test] + fn test_lineindex_getters() { + let data = make_lines(300); + let idx = LineIndex::from_bytes(&data); + assert_eq!(idx.total_lines(), 300); + assert!(idx.has_trailing_newline()); + assert_eq!(idx.sampled_offsets().len(), 2); + assert_eq!(idx.sampled_offsets()[0], 0); + } + + // ─── extend_from_bytes tests ────────────────────────────────────────── + + #[test] + fn test_extend_from_bytes_basic() { + let mut idx = LineIndex::from_bytes(b"aaa\nbbb\n"); + assert_eq!(idx.line_count(), 2); + idx.extend_from_bytes(b"ccc\nddd\n", 8); + assert_eq!(idx.line_count(), 4); + assert_eq!(idx.has_trailing_newline(), true); + } + + #[test] + fn test_extend_from_bytes_trailing_to_trailing() { + let mut idx = LineIndex::from_bytes(b"hello\n"); + assert_eq!(idx.line_count(), 1); + idx.extend_from_bytes(b"world\n", 6); + assert_eq!(idx.line_count(), 2); + assert_eq!(idx.has_trailing_newline(), true); + } + + #[test] + fn test_extend_from_bytes_trailing_to_no_trailing() { + let mut idx = LineIndex::from_bytes(b"hello\n"); + assert_eq!(idx.line_count(), 1); + 
// ─── getter tests ───

/// The crate-internal getters expose the raw index fields.
#[test]
fn test_lineindex_getters() {
    let data = make_lines(300);
    let idx = LineIndex::from_bytes(&data);
    assert_eq!(idx.total_lines(), 300);
    assert!(idx.has_trailing_newline());
    assert_eq!(idx.sampled_offsets().len(), 2);
    assert_eq!(idx.sampled_offsets()[0], 0);
}

// ─── extend_from_bytes tests ───

/// Appending complete lines to complete lines adds one line per newline.
#[test]
fn test_extend_from_bytes_basic() {
    let mut idx = LineIndex::from_bytes(b"aaa\nbbb\n");
    assert_eq!(idx.line_count(), 2);
    idx.extend_from_bytes(b"ccc\nddd\n", 8);
    assert_eq!(idx.line_count(), 4);
    assert!(idx.has_trailing_newline());
}

/// Newline-terminated data followed by newline-terminated data.
#[test]
fn test_extend_from_bytes_trailing_to_trailing() {
    let mut idx = LineIndex::from_bytes(b"hello\n");
    assert_eq!(idx.line_count(), 1);
    idx.extend_from_bytes(b"world\n", 6);
    assert_eq!(idx.line_count(), 2);
    assert!(idx.has_trailing_newline());
}

/// Newline-terminated data followed by an unterminated line.
#[test]
fn test_extend_from_bytes_trailing_to_no_trailing() {
    let mut idx = LineIndex::from_bytes(b"hello\n");
    assert_eq!(idx.line_count(), 1);
    idx.extend_from_bytes(b"world", 6);
    assert_eq!(idx.line_count(), 2);
    assert!(!idx.has_trailing_newline());
}

/// The appended data first terminates the old partial line, then adds one more.
#[test]
fn test_extend_from_bytes_no_trailing_to_trailing() {
    let mut idx = LineIndex::from_bytes(b"hello");
    assert_eq!(idx.line_count(), 1);
    idx.extend_from_bytes(b"\nworld\n", 5);
    assert_eq!(idx.line_count(), 2);
    assert!(idx.has_trailing_newline());
}

/// Appending to a partial line without any newline keeps the line count.
#[test]
fn test_extend_from_bytes_no_trailing_merge() {
    let mut idx = LineIndex::from_bytes(b"hel");
    assert_eq!(idx.line_count(), 1);
    idx.extend_from_bytes(b"lo", 3);
    assert_eq!(idx.line_count(), 1);
    assert!(!idx.has_trailing_newline());
}

/// An empty append must not change the line count or the sampled offsets.
#[test]
fn test_extend_from_bytes_empty() {
    let mut idx = LineIndex::from_bytes(b"hello\n");
    let lines_before = idx.line_count();
    let offsets_before = idx.sampled_offsets().to_vec();
    idx.extend_from_bytes(b"", 6);
    assert_eq!(idx.line_count(), lines_before);
    assert_eq!(idx.sampled_offsets(), offsets_before.as_slice());
}

/// Extending an index built from empty input.
#[test]
fn test_extend_from_bytes_from_empty() {
    let mut idx = LineIndex::from_bytes(b"");
    assert_eq!(idx.line_count(), 0);
    idx.extend_from_bytes(b"aaa\nbbb\n", 0);
    assert_eq!(idx.line_count(), 2);
    assert!(idx.has_trailing_newline());
}

/// Building incrementally must yield the same index as indexing the
/// concatenated data in one go.
#[test]
fn test_extend_from_bytes_matches_from_bytes() {
    let cases: Vec<(&[u8], &[u8])> = vec![
        (b"hello\n", b"world\n"),
        (b"hello", b"\nworld\n"),
        (b"hello", b"world"),
        (b"", b"aaa\nbbb\n"),
        (b"aaa\n", b""),
        (b"a\nb\nc", b"\nd\ne\n"),
        (b"a\nb\nc\n", b"d\ne\nf"),
    ];

    for (i, (old, new)) in cases.iter().enumerate() {
        let combined: Vec<u8> = old.iter().chain(new.iter()).copied().collect();
        let full_idx = LineIndex::from_bytes(&combined);

        let mut ext_idx = LineIndex::from_bytes(old);
        ext_idx.extend_from_bytes(new, old.len() as u64);

        assert_eq!(
            ext_idx.total_lines, full_idx.total_lines,
            "case {}: total_lines mismatch (old={:?}, new={:?})",
            i, old, new
        );
        assert_eq!(
            ext_idx.has_trailing_newline, full_idx.has_trailing_newline,
            "case {}: has_trailing_newline mismatch",
            i
        );
        assert_eq!(
            ext_idx.sampled_offsets, full_idx.sampled_offsets,
            "case {}: sampled_offsets mismatch",
            i
        );
    }
}

/// Old data ends exactly on a block boundary (256 lines) before the append.
#[test]
fn test_extend_from_bytes_256_block_boundary() {
    let old = make_lines(256);
    let new = make_lines(256);
    let combined: Vec<u8> = old.iter().chain(new.iter()).copied().collect();

    let full_idx = LineIndex::from_bytes(&combined);
    let mut ext_idx = LineIndex::from_bytes(&old);
    ext_idx.extend_from_bytes(&new, old.len() as u64);

    assert_eq!(ext_idx.total_lines, full_idx.total_lines);
    assert_eq!(ext_idx.sampled_offsets, full_idx.sampled_offsets);
    assert_eq!(ext_idx.has_trailing_newline, full_idx.has_trailing_newline);

    assert_eq!(ext_idx.get_line(&combined, 0), Some("line0"));
    assert_eq!(ext_idx.get_line(&combined, 255), Some("line255"));
    assert_eq!(ext_idx.get_line(&combined, 256), Some("line0"));
    assert_eq!(ext_idx.get_line(&combined, 511), Some("line255"));
}

/// The block boundary is crossed inside the appended data (300 + 300 lines).
#[test]
fn test_extend_from_bytes_300_plus_300() {
    let old = make_lines(300);
    let new = make_lines(300);
    let combined: Vec<u8> = old.iter().chain(new.iter()).copied().collect();

    let full_idx = LineIndex::from_bytes(&combined);
    let mut ext_idx = LineIndex::from_bytes(&old);
    ext_idx.extend_from_bytes(&new, old.len() as u64);

    assert_eq!(ext_idx.total_lines, full_idx.total_lines);
    assert_eq!(ext_idx.sampled_offsets, full_idx.sampled_offsets);

    assert_eq!(ext_idx.get_line(&combined, 299), Some("line299"));
    assert_eq!(ext_idx.get_line(&combined, 300), Some("line0"));
    assert_eq!(ext_idx.get_line(&combined, 599), Some("line299"));
}

/// A serialized and deserialized index keeps its fields and still resolves lines.
#[test]
fn test_lineindex_serde_roundtrip() {
    let data = make_lines(300);
    let original = LineIndex::from_bytes(&data);

    let bytes = bincode::serialize(&original).expect("serialize");
    let restored: LineIndex = bincode::deserialize(&bytes).expect("deserialize");

    assert_eq!(restored.total_lines(), original.total_lines());
    assert_eq!(
        restored.has_trailing_newline(),
        original.has_trailing_newline()
    );
    assert_eq!(restored.sampled_offsets(), original.sampled_offsets());
    assert_eq!(restored.line_count(), original.line_count());

    // Verify the restored index still works for line lookups.
    assert_eq!(restored.get_line(&data, 0), Some("line0"));
    assert_eq!(restored.get_line(&data, 255), Some("line255"));
    assert_eq!(restored.get_line(&data, 299), Some("line299"));
}
/// Maximum input length, in bytes, for wrap/format operations (10 MB).
///
/// [`wrap_line_chars`] returns longer lines as a single, unwrapped row so that
/// a pathological line cannot explode into millions of tiny rows.
pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024;

/// Splits `line` into rows of at most `width` columns.
///
/// Wrapping is character-level, not word-level, which suits a log viewer.
///
/// * Every `char` counts as one column. Wide glyphs (e.g. CJK) are not
///   measured by their terminal display width.
/// * A tab is expanded to four spaces. When `width` is at least four the
///   expansion is kept together on one row (wrapping first if it would not
///   fit); when `width` is smaller the spaces spill over into following rows so
///   that no row ever exceeds `width`.
///
/// Returns a single empty row when `width` is zero or `line` is empty, and
/// the unmodified line as one row when it exceeds [`MAX_WRAP_INPUT_LEN`] bytes.
/// The result always contains at least one row.
pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
    /// Number of columns a tab character expands to.
    const TAB_WIDTH: usize = 4;

    if width == 0 || line.is_empty() {
        return vec![String::new()];
    }
    if line.len() > MAX_WRAP_INPUT_LEN {
        return vec![line.to_string()];
    }

    let mut rows = Vec::new();
    let mut row = String::new();
    let mut col = 0usize;

    for ch in line.chars() {
        // A tab is emitted as TAB_WIDTH space cells; anything else is one cell.
        let (cell, cells) = if ch == '\t' { (' ', TAB_WIDTH) } else { (ch, 1) };

        // Keep a multi-cell unit together when it can fit on a row at all:
        // start a new row first if the current one lacks the room.
        if cells <= width && col > 0 && col + cells > width {
            rows.push(std::mem::take(&mut row));
            col = 0;
        }

        for _ in 0..cells {
            row.push(cell);
            col += 1;
            if col >= width {
                rows.push(std::mem::take(&mut row));
                col = 0;
            }
        }
    }

    if !row.is_empty() {
        rows.push(row);
    }
    rows
}
+pub fn format_json_line(line: &str) -> String { + if line.trim().is_empty() { + return String::new(); + } + // Quick pre-check: only try parsing if it starts with '{' + if !line.trim_start().starts_with('{') { + return line.to_string(); + } + match serde_json::from_str::(line) { + Ok(value) if value.is_object() => { + serde_json::to_string_pretty(&value).unwrap_or_else(|_| line.to_string()) + } + _ => line.to_string(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_wrap_empty_line() { + let result = wrap_line_chars("", 80); + assert_eq!(result, vec![""]); + } + + #[test] + fn test_wrap_zero_width() { + let result = wrap_line_chars("hello", 0); + assert_eq!(result, vec![""]); + } + + #[test] + fn test_wrap_short_line() { + let result = wrap_line_chars("hello", 80); + assert_eq!(result, vec!["hello"]); + } + + #[test] + fn test_wrap_exact_width() { + let result = wrap_line_chars("abc", 3); + assert_eq!(result, vec!["abc"]); + } + + #[test] + fn test_wrap_multi_row() { + let result = wrap_line_chars("abcdef", 3); + assert_eq!(result, vec!["abc", "def"]); + } + + #[test] + fn test_wrap_with_tab() { + let result = wrap_line_chars("a\tb", 4); + assert_eq!(result, vec!["a", " ", "b"]); + } + + #[test] + fn test_format_json_empty() { + assert_eq!(format_json_line(""), ""); + assert_eq!(format_json_line(" "), ""); + } + + #[test] + fn test_format_json_non_json() { + assert_eq!(format_json_line("hello world"), "hello world"); + } + + #[test] + fn test_format_json_valid_object() { + let input = r#"{"key":"value"}"#; + let output = format_json_line(input); + assert!( + output.contains('\n'), + "pretty-printed JSON should have newlines" + ); + assert!(output.contains("key")); + assert!(output.contains("value")); + } + + #[test] + fn test_format_json_array_unchanged() { + let input = r#"[1,2,3]"#; + assert_eq!(format_json_line(input), input); + } + + #[test] + fn test_max_wrap_input_len_constant() { + assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024); 
+ } +}