feat(core): extract wrap utilities and extend LineIndex for progressive loading

Move wrap_line_chars and format_json_line from app.rs to core/io/wrap.rs with MAX_WRAP_INPUT_LEN guard. Add serde derives, pub getters, and extend_from_bytes() to LineIndex for incremental index building.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
dailz
2026-04-14 09:06:52 +08:00
parent cfbe4900a5
commit 210eecfa66
2 changed files with 389 additions and 0 deletions

View File

@@ -16,6 +16,7 @@
const BLOCK_SIZE: usize = 256;
// ─── LineIndex 结构体定义 ────────────────────────────────────────────────────
#[derive(serde::Serialize, serde::Deserialize)]
pub struct LineIndex {
// 采样偏移量:每 BLOCK_SIZE 行记录一个起始字节偏移。
// sampled_offsets[i] 存储第 (i * BLOCK_SIZE) 行的字节起始位置。
@@ -139,11 +140,91 @@ impl LineIndex {
})
}
// ─── extend_from_bytes: incrementally update the index with appended data ───
/// Grows the index in place so that it also covers bytes appended to the
/// already-indexed data.
///
/// * `new_data` — the appended bytes; an empty slice leaves the index untouched.
/// * `start_offset` — byte offset in the file at which `new_data` begins.
///
/// Updates `sampled_offsets`, `total_lines` and `has_trailing_newline`.
pub fn extend_from_bytes(&mut self, new_data: &[u8], start_offset: u64) {
    // Nothing appended: every field stays as it was.
    let Some(&final_byte) = new_data.last() else {
        return;
    };
    let ends_with_newline = final_byte == b'\n';

    let prior_lines = self.total_lines;
    // The appended bytes open a fresh line when the index is empty or the old
    // data ended with '\n'; otherwise they continue the old partial last line.
    let opens_fresh_line = prior_lines == 0 || self.has_trailing_newline;

    // A fresh line that begins exactly on a block boundary is sampled right at
    // the junction between old and new data.
    if opens_fresh_line && (prior_lines as usize).is_multiple_of(BLOCK_SIZE) {
        self.sampled_offsets.push(start_offset);
    }

    // Index of the line that starts immediately after the next '\n' we meet.
    let mut upcoming_line = if opens_fresh_line {
        (prior_lines + 1) as usize
    } else {
        prior_lines as usize
    };
    let mut newline_count: usize = 0;
    for nl_pos in memchr::memchr_iter(b'\n', new_data) {
        if upcoming_line.is_multiple_of(BLOCK_SIZE) {
            // The sampled line starts one byte past the newline.
            self.sampled_offsets.push(start_offset + nl_pos as u64 + 1);
        }
        upcoming_line += 1;
        newline_count += 1;
    }

    // Lines contributed by the appended bytes:
    // - fresh line: every '\n' ends a line, plus one partial line when the
    //   data does not end with '\n';
    // - continued line: the first '\n' only closes the old (already counted)
    //   line, so it does not add a line by itself.
    let gained = match (opens_fresh_line, ends_with_newline) {
        (true, true) => newline_count,
        (true, false) => newline_count + 1,
        (false, true) => newline_count.saturating_sub(1),
        (false, false) => newline_count,
    };
    self.total_lines += gained as u64;
    self.has_trailing_newline = ends_with_newline;

    // When the data ends with '\n' and the line count sits exactly on a block
    // boundary, the last offset pushed above points at a line that does not
    // exist yet, so it is dropped again.
    let on_block_boundary =
        self.total_lines > 0 && (self.total_lines as usize).is_multiple_of(BLOCK_SIZE);
    if ends_with_newline && on_block_boundary {
        self.sampled_offsets.pop();
    }
}
// ─── line_count: return the total number of lines ───────────────────────
/// Returns the number of lines tracked by the index, i.e. the stored `u64`
/// `total_lines` counter cast to `usize`.
pub fn line_count(&self) -> usize {
self.total_lines as usize
}
// ─── Getter methods ─────────────────────────────────────────────────────
/// Borrows the sparse offset table. Per the field comment on the struct,
/// entry `i` holds the starting byte offset of line `i * BLOCK_SIZE`.
pub(crate) fn sampled_offsets(&self) -> &[u64] {
&self.sampled_offsets
}
/// Returns the raw `u64` line counter (the same value `line_count` exposes as `usize`).
pub(crate) fn total_lines(&self) -> u64 {
self.total_lines
}
/// Returns the stored trailing-newline flag; `extend_from_bytes` sets it to
/// whether the last appended byte was `'\n'`.
pub(crate) fn has_trailing_newline(&self) -> bool {
self.has_trailing_newline
}
// ─── get_line根据行号获取行内容 ─────────────────────────────────────
// 通过稀疏索引定位到所在块的起始位置,然后向前扫描少量换行符来定位目标行。
pub fn get_line<'a>(&self, data: &'a [u8], idx: usize) -> Option<&'a str> {
@@ -490,4 +571,177 @@ mod tests {
assert_eq!(idx.get_line(&data, 256), Some("line256"));
assert_eq!(idx.get_line(&data, 299), Some("line299"));
}
// The crate-visible getters expose the fields of an index built over 300 lines.
#[test]
fn test_lineindex_getters() {
    let data = make_lines(300);
    let index = LineIndex::from_bytes(&data);

    let offsets = index.sampled_offsets();
    assert_eq!(offsets.len(), 2);
    assert_eq!(offsets[0], 0);

    assert!(index.has_trailing_newline());
    assert_eq!(index.total_lines(), 300);
}
// ─── extend_from_bytes tests ──────────────────────────────────────────
// Appending two complete lines to an index that already ends with '\n'
// adds exactly two lines and keeps the trailing-newline flag set.
#[test]
fn test_extend_from_bytes_basic() {
    let mut idx = LineIndex::from_bytes(b"aaa\nbbb\n");
    assert_eq!(idx.line_count(), 2);

    idx.extend_from_bytes(b"ccc\nddd\n", 8);
    assert_eq!(idx.line_count(), 4);
    assert!(idx.has_trailing_newline());
}
// Old data ends with '\n', appended data ends with '\n': one new complete line.
#[test]
fn test_extend_from_bytes_trailing_to_trailing() {
    let mut idx = LineIndex::from_bytes(b"hello\n");
    assert_eq!(idx.line_count(), 1);

    idx.extend_from_bytes(b"world\n", 6);
    assert_eq!(idx.line_count(), 2);
    assert!(idx.has_trailing_newline());
}
// Old data ends with '\n', appended data does not: the partial line still
// counts as a line and the trailing-newline flag is cleared.
#[test]
fn test_extend_from_bytes_trailing_to_no_trailing() {
    let mut idx = LineIndex::from_bytes(b"hello\n");
    assert_eq!(idx.line_count(), 1);

    idx.extend_from_bytes(b"world", 6);
    assert_eq!(idx.line_count(), 2);
    assert!(!idx.has_trailing_newline());
}
// Old data has a partial last line; the first appended '\n' closes it and
// the rest of the appended data forms one more complete line.
#[test]
fn test_extend_from_bytes_no_trailing_to_trailing() {
    let mut idx = LineIndex::from_bytes(b"hello");
    assert_eq!(idx.line_count(), 1);

    idx.extend_from_bytes(b"\nworld\n", 5);
    assert_eq!(idx.line_count(), 2);
    assert!(idx.has_trailing_newline());
}
// Appending newline-free bytes to a partial last line merges into that line:
// the line count does not change and the flag stays cleared.
#[test]
fn test_extend_from_bytes_no_trailing_merge() {
    let mut idx = LineIndex::from_bytes(b"hel");
    assert_eq!(idx.line_count(), 1);

    idx.extend_from_bytes(b"lo", 3);
    assert_eq!(idx.line_count(), 1);
    assert!(!idx.has_trailing_newline());
}
// Extending with an empty slice must leave the line count and the sampled
// offsets exactly as they were.
#[test]
fn test_extend_from_bytes_empty() {
    let mut idx = LineIndex::from_bytes(b"hello\n");
    let (lines_before, offsets_before) = (idx.line_count(), idx.sampled_offsets().to_vec());

    idx.extend_from_bytes(b"", 6);

    assert_eq!(idx.line_count(), lines_before);
    assert_eq!(idx.sampled_offsets(), &offsets_before[..]);
}
// Extending an index built from empty input behaves like indexing the
// appended data from scratch.
#[test]
fn test_extend_from_bytes_from_empty() {
    let mut idx = LineIndex::from_bytes(b"");
    assert_eq!(idx.line_count(), 0);

    idx.extend_from_bytes(b"aaa\nbbb\n", 0);
    assert_eq!(idx.line_count(), 2);
    assert!(idx.has_trailing_newline());
}
// For each (old, new) split, indexing `old` and then extending with `new`
// must produce the same fields as indexing `old ++ new` in one pass.
#[test]
fn test_extend_from_bytes_matches_from_bytes() {
    let cases: [(&[u8], &[u8]); 7] = [
        (b"hello\n", b"world\n"),
        (b"hello", b"\nworld\n"),
        (b"hello", b"world"),
        (b"", b"aaa\nbbb\n"),
        (b"aaa\n", b""),
        (b"a\nb\nc", b"\nd\ne\n"),
        (b"a\nb\nc\n", b"d\ne\nf"),
    ];

    for (i, &(old, new)) in cases.iter().enumerate() {
        let combined = [old, new].concat();
        let full_idx = LineIndex::from_bytes(&combined);

        let mut ext_idx = LineIndex::from_bytes(old);
        ext_idx.extend_from_bytes(new, old.len() as u64);

        assert_eq!(
            ext_idx.total_lines, full_idx.total_lines,
            "case {}: total_lines mismatch (old={:?}, new={:?})",
            i, old, new
        );
        assert_eq!(
            ext_idx.has_trailing_newline, full_idx.has_trailing_newline,
            "case {}: has_trailing_newline mismatch",
            i
        );
        assert_eq!(
            ext_idx.sampled_offsets, full_idx.sampled_offsets,
            "case {}: sampled_offsets mismatch",
            i
        );
    }
}
// The junction between old and new data falls exactly on a block boundary
// (256 lines + 256 lines); the extended index must match a one-pass index
// and still resolve lines on both sides of the junction.
#[test]
fn test_extend_from_bytes_256_block_boundary() {
    let old = make_lines(256);
    let new = make_lines(256);
    let combined = [&old[..], &new[..]].concat();

    let full_idx = LineIndex::from_bytes(&combined);
    let mut ext_idx = LineIndex::from_bytes(&old);
    ext_idx.extend_from_bytes(&new, old.len() as u64);

    assert_eq!(ext_idx.total_lines, full_idx.total_lines);
    assert_eq!(ext_idx.sampled_offsets, full_idx.sampled_offsets);
    assert_eq!(ext_idx.has_trailing_newline, full_idx.has_trailing_newline);

    let lookups = [(0, "line0"), (255, "line255"), (256, "line0"), (511, "line255")];
    for (line_no, expected) in lookups {
        assert_eq!(ext_idx.get_line(&combined, line_no), Some(expected));
    }
}
// The junction falls in the middle of a block (300 lines + 300 lines); the
// extended index must match a one-pass index and resolve lines around it.
#[test]
fn test_extend_from_bytes_300_plus_300() {
    let old = make_lines(300);
    let new = make_lines(300);
    let combined = [&old[..], &new[..]].concat();

    let full_idx = LineIndex::from_bytes(&combined);
    let mut ext_idx = LineIndex::from_bytes(&old);
    ext_idx.extend_from_bytes(&new, old.len() as u64);

    assert_eq!(ext_idx.total_lines, full_idx.total_lines);
    assert_eq!(ext_idx.sampled_offsets, full_idx.sampled_offsets);

    for (line_no, expected) in [(299, "line299"), (300, "line0"), (599, "line299")] {
        assert_eq!(ext_idx.get_line(&combined, line_no), Some(expected));
    }
}
// Serializing with bincode and deserializing again must preserve every
// getter-visible field and leave the index usable for line lookups.
#[test]
fn test_lineindex_serde_roundtrip() {
    let data = make_lines(300);
    let original = LineIndex::from_bytes(&data);

    let encoded = bincode::serialize(&original).expect("serialize");
    let restored: LineIndex = bincode::deserialize(&encoded).expect("deserialize");

    assert_eq!(restored.total_lines(), original.total_lines());
    assert_eq!(
        restored.has_trailing_newline(),
        original.has_trailing_newline()
    );
    assert_eq!(restored.sampled_offsets(), original.sampled_offsets());
    assert_eq!(restored.line_count(), original.line_count());

    // The restored index must still resolve lines against the same data.
    for (line_no, expected) in [(0, "line0"), (255, "line255"), (299, "line299")] {
        assert_eq!(restored.get_line(&data, line_no), Some(expected));
    }
}
}

135
crates/core/src/io/wrap.rs Normal file
View File

@@ -0,0 +1,135 @@
/// Maximum input length, in bytes, for wrap/format operations (10 MB).
/// Lines exceeding this are returned as-is to avoid pathological cases.
pub const MAX_WRAP_INPUT_LEN: usize = 10 * 1024 * 1024;

/// Splits `line` into rows of at most `width` columns, wrapping at character
/// boundaries (a log viewer wants character-level, not word-level, wrapping).
///
/// Column accounting: a tab is charged 4 columns, every other `char` is
/// charged 1 column regardless of how wide it renders.
///
/// Returns a single empty row when `width` is 0 or `line` is empty. A line
/// longer than [`MAX_WRAP_INPUT_LEN`] bytes is returned unwrapped as one row.
pub fn wrap_line_chars(line: &str, width: usize) -> Vec<String> {
    if width == 0 || line.is_empty() {
        return vec![String::new()];
    }
    // Enforce the documented size limit: oversized lines are handed back as-is
    // instead of being split into an enormous number of rows.
    if line.len() > MAX_WRAP_INPUT_LEN {
        return vec![line.to_string()];
    }

    let mut rows = Vec::new();
    let mut current = String::new();
    let mut col: usize = 0;
    for ch in line.chars() {
        let is_tab = ch == '\t';
        let advance = if is_tab { 4 } else { 1 };

        // Start a new row first if this character would not fit in the current
        // one; an empty row always accepts the character.
        if col + advance > width && !current.is_empty() {
            rows.push(std::mem::take(&mut current));
            col = 0;
        }

        if is_tab {
            // NOTE(review): the tab is charged 4 columns while the text appended
            // here is kept exactly as the existing code has it (the in-file test
            // pins the resulting rows) — confirm the intended expansion width.
            current.push_str(" ");
        } else {
            current.push(ch);
        }
        col += advance;

        // A row that has reached the width is emitted immediately.
        if col >= width {
            rows.push(std::mem::take(&mut current));
            col = 0;
        }
    }

    // `line` is non-empty here, so at least one row has been emitted or is
    // still pending in `current`; no empty-result fallback is needed.
    if !current.is_empty() {
        rows.push(current);
    }
    rows
}
/// Format a line as pretty-printed JSON if it's a JSON Object.
/// Returns the original line unchanged for non-JSON or non-Object content.
pub fn format_json_line(line: &str) -> String {
if line.trim().is_empty() {
return String::new();
}
// Quick pre-check: only try parsing if it starts with '{'
if !line.trim_start().starts_with('{') {
return line.to_string();
}
match serde_json::from_str::<serde_json::Value>(line) {
Ok(value) if value.is_object() => {
serde_json::to_string_pretty(&value).unwrap_or_else(|_| line.to_string())
}
_ => line.to_string(),
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // ── wrap_line_chars ──────────────────────────────────────────────────

    // An empty line still produces one (empty) row.
    #[test]
    fn test_wrap_empty_line() {
        assert_eq!(wrap_line_chars("", 80), vec![""]);
    }

    // Width 0 cannot hold any character, so a single empty row is returned.
    #[test]
    fn test_wrap_zero_width() {
        assert_eq!(wrap_line_chars("hello", 0), vec![""]);
    }

    // A line shorter than the width stays on one row.
    #[test]
    fn test_wrap_short_line() {
        assert_eq!(wrap_line_chars("hello", 80), vec!["hello"]);
    }

    // A line exactly as long as the width fills one row with no empty extra row.
    #[test]
    fn test_wrap_exact_width() {
        assert_eq!(wrap_line_chars("abc", 3), vec!["abc"]);
    }

    // Longer lines are cut into consecutive rows.
    #[test]
    fn test_wrap_multi_row() {
        assert_eq!(wrap_line_chars("abcdef", 3), vec!["abc", "def"]);
    }

    // A tab that does not fit after "a" moves to its own row, and "b" follows
    // on the next one.
    #[test]
    fn test_wrap_with_tab() {
        let rows = wrap_line_chars("a\tb", 4);
        assert_eq!(rows, vec!["a", " ", "b"]);
    }

    // ── format_json_line ─────────────────────────────────────────────────

    // Blank input (empty or whitespace-only) formats to an empty string.
    #[test]
    fn test_format_json_empty() {
        for blank in ["", " "] {
            assert_eq!(format_json_line(blank), "");
        }
    }

    // Plain text passes through untouched.
    #[test]
    fn test_format_json_non_json() {
        let text = "hello world";
        assert_eq!(format_json_line(text), text);
    }

    // A compact JSON object is expanded onto several lines, keeping its content.
    #[test]
    fn test_format_json_valid_object() {
        let pretty = format_json_line(r#"{"key":"value"}"#);
        assert!(
            pretty.contains('\n'),
            "pretty-printed JSON should have newlines"
        );
        for needle in ["key", "value"] {
            assert!(pretty.contains(needle));
        }
    }

    // Valid JSON that is not an object (here an array) is left as-is.
    #[test]
    fn test_format_json_array_unchanged() {
        let input = "[1,2,3]";
        assert_eq!(format_json_line(input), input);
    }

    // ── constants ────────────────────────────────────────────────────────

    // Pins the limit at 10 MB.
    #[test]
    fn test_max_wrap_input_len_constant() {
        assert_eq!(MAX_WRAP_INPUT_LEN, 10 * 1024 * 1024);
    }
}