From 2260d603029f425e8bbbbeb003ee48529bda5d5e Mon Sep 17 00:00:00 2001 From: dailz Date: Tue, 14 Apr 2026 09:06:36 +0800 Subject: [PATCH] feat(core): add disk index cache infrastructure Add xxhash-rust and bincode workspace dependencies for fast hashing and serialization. Implement cache_util for cache directory/path resolution with versioning, and IndexCache for saving/loading line indices to disk with file-hash validation. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- Cargo.lock | 48 +++++- Cargo.toml | 4 +- crates/core/Cargo.toml | 2 + crates/core/src/error.rs | 4 + crates/core/src/io/cache_util.rs | 103 +++++++++++++ crates/core/src/io/index_cache.rs | 245 ++++++++++++++++++++++++++++++ 6 files changed, 403 insertions(+), 3 deletions(-) create mode 100644 crates/core/src/io/cache_util.rs create mode 100644 crates/core/src/io/index_cache.rs diff --git a/Cargo.lock b/Cargo.lock index 209df0e..e1c8b8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -477,6 +477,15 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -2292,10 +2301,25 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "log-viewer-bench" +version = "0.1.0" +dependencies = [ + "clap", + "crossbeam-channel", + "libc", + "memchr", + "memmap2", + "nix 0.30.1", + "serde_json", + "tempfile", +] + [[package]] name = "log-viewer-core" version = "0.1.0" dependencies = [ + "bincode", "crossbeam-channel", "directories", "insta", @@ -2309,6 +2333,7 @@ dependencies = [ "tempfile", "thiserror 2.0.18", "toml", + "xxhash-rust", ] [[package]] @@ -2326,6 +2351,7 @@ version = "0.1.0" dependencies = [ "anyhow", "clap", + "crossbeam-channel", "crossterm", "log-viewer-core", "ratatui", @@ -2348,7 +2374,7 @@ version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0aeb26bf5e836cc1c341c8106051b573f1766dfa05aa87f0b98be5e51b02303" dependencies = [ - "nix", + "nix 0.29.0", "winapi", ] @@ -2489,6 +2515,18 @@ dependencies = [ "memoffset", ] +[[package]] +name = "nix" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" +dependencies = [ + "bitflags 2.11.0", + "cfg-if", + "cfg_aliases", + "libc", +] + [[package]] name = "nohash-hasher" version = "0.2.0" @@ -4062,7 +4100,7 @@ dependencies = [ "libc", "log", "memmem", - "nix", + "nix 0.29.0", "num-derive", "num-traits", "ordered-float 4.6.0", @@ -5532,6 +5570,12 @@ version = "0.8.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f" +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + [[package]] name = "yoke" version = "0.8.2" diff --git a/Cargo.toml b/Cargo.toml index 082d4ca..b97d064 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["crates/core", "crates/gui", "crates/tui"] +members = ["crates/core", "crates/gui", "crates/tui", "crates/bench"] default-members = ["crates/core"] [workspace.dependencies] @@ -25,3 +25,5 @@ clap = { version = "4", features = ["derive"] } log-viewer-core = { path = "crates/core" } textwrap = "0.16" tempfile = "3" +xxhash-rust = { version = "0.8", features = ["xxh3"] } +bincode = "1" diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 0dcf99c..0cdba3b 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -15,6 +15,8 @@ regex.workspace = true memchr.workspace = true memmap2.workspace = true directories.workspace = true +xxhash-rust.workspace = true +bincode.workspace = true [dev-dependencies] insta.workspace = true diff --git a/crates/core/src/error.rs b/crates/core/src/error.rs index 151d714..7931081 100644 --- a/crates/core/src/error.rs +++ b/crates/core/src/error.rs @@ -123,6 +123,10 @@ pub enum CoreError { #[error("mmap error: {0}")] Mmap(String), + // ─── Cache 变体:缓存操作错误(缓存目录创建失败、缓存读写失败等)──────────── + #[error("cache error: {0}")] + Cache(String), + // ─── FileNotFound 变体:文件未找到 ────────────────────────────────────── #[error("file not found: {path:?}")] // {path:?} 使用 Debug 格式化输出路径,会保留引号,如 "file not found: "/path/to/file"" diff --git a/crates/core/src/io/cache_util.rs b/crates/core/src/io/cache_util.rs new file mode 100644 index 0000000..f658686 --- /dev/null +++ b/crates/core/src/io/cache_util.rs @@ -0,0 +1,103 @@ +use std::path::{Path, PathBuf}; + +use directories::ProjectDirs; + +pub const CACHE_VERSION: u8 = 1; + +pub fn cache_dir() -> Option { + let proj_dirs = ProjectDirs::from("", "", "log-viewer")?; + let cache_dir = proj_dirs.cache_dir().join("indexes"); + if cache_dir.exists() { + Some(cache_dir) + } else { + std::fs::create_dir_all(&cache_dir).ok().map(|_| cache_dir) + } +} + +pub fn cache_path(file_path: &Path) -> Option { + let canonical = std::fs::canonicalize(file_path).ok()?; + let hash = xxhash_rust::xxh3::xxh3_64(canonical.to_str()?.as_bytes()); + let dir = cache_dir()?; + Some(dir.join(format!("{:016x}.index", hash))) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + + #[test] + fn test_cache_dir_exists_or_creates() { + let dir = cache_dir(); + assert!(dir.is_some(), "cache_dir should return Some"); + let path = dir.unwrap(); + assert!(path.exists(), "cache directory should exist on disk"); + assert!( + path.to_string_lossy().contains("log-viewer"), + "cache dir should contain 'log-viewer': {}", + path.display() + ); + assert!( + path.to_string_lossy().contains("indexes"), + "cache dir should contain 'indexes': {}", + path.display() + ); + } + + #[test] + fn test_cache_path_canonicalized() { + let file = NamedTempFile::new().unwrap(); + let abs_path = file.path().canonicalize().unwrap(); + + let result_abs = cache_path(&abs_path).expect("cache_path with absolute path"); + let result_canonical = cache_path(file.path()).expect("cache_path with canonicalized path"); + + assert_eq!( + result_abs, result_canonical, + "same file via different paths must produce same cache path" + ); + assert!( + result_abs.to_string_lossy().ends_with(".index"), + "cache path should end with .index: {}", + result_abs.display() + ); + } + + #[test] + fn test_cache_path_consistent_hashes() { + let file1 = NamedTempFile::new().unwrap(); + let file2 = NamedTempFile::new().unwrap(); + + let path1 = cache_path(file1.path()).expect("cache_path for file1"); + let path2 = cache_path(file2.path()).expect("cache_path for file2"); + + assert_ne!( + path1, path2, + "different files must produce different cache paths" + ); + + let path1_again = cache_path(file1.path()).expect("cache_path for file1 again"); + assert_eq!( + path1, path1_again, + "same file must produce consistent cache path across calls" + ); + } + + #[test] + fn test_cache_dir_graceful() { + let result = cache_dir(); + assert!( + result.is_some() || result.is_none(), + "cache_dir must never panic, only return Some or None" + ); + } + + #[test] + fn test_cache_path_nonexistent_file() { + let result = cache_path(Path::new("/nonexistent/path/to/file.log")); + assert!( + result.is_none(), + "cache_path should return None for nonexistent file" + ); + } +} diff --git a/crates/core/src/io/index_cache.rs b/crates/core/src/io/index_cache.rs new file mode 100644 index 0000000..108c85e --- /dev/null +++ b/crates/core/src/io/index_cache.rs @@ -0,0 +1,245 @@ +use std::io::{Read as _, Write as _}; +use std::path::Path; + +use crate::io::cache_util::{cache_path, CACHE_VERSION}; +use crate::io::line_index::LineIndex; + +pub struct IndexCache; + +impl IndexCache { + /// Save a `LineIndex` to disk using atomic write (write to .tmp, then rename). + pub fn save(file_path: &Path, index: &LineIndex) -> std::io::Result<()> { + let dest = cache_path(file_path).ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::NotFound, "cannot determine cache path") + })?; + + let file_hash = compute_file_hash(file_path)?; + let index_bytes = bincode::serialize(index) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; + + let mut buf = Vec::with_capacity(1 + 8 + index_bytes.len()); + buf.push(CACHE_VERSION); + buf.extend_from_slice(&file_hash.to_le_bytes()); + buf.extend_from_slice(&index_bytes); + + let tmp_path = dest.with_extension("index.tmp"); + { + let mut f = std::fs::File::create(&tmp_path)?; + f.write_all(&buf)?; + f.sync_all()?; + } + std::fs::rename(&tmp_path, &dest)?; + + Ok(()) + } + + /// Load a cached `LineIndex` from disk. + /// Returns `None` if: cache missing, version mismatch, file modified, or corruption. + pub fn load(file_path: &Path) -> Option { + let path = cache_path(file_path)?; + let data = std::fs::read(&path).ok()?; + + if data.len() < 9 { + return None; + } + + // Validate version byte + if data[0] != CACHE_VERSION { + return None; + } + + // Validate file hash + let stored_hash = u64::from_le_bytes(data[1..9].try_into().ok()?); + let current_hash = compute_file_hash(file_path).ok()?; + if stored_hash != current_hash { + return None; + } + + // Deserialize index + bincode::deserialize(&data[9..]).ok() + } +} + +/// Compute a fast fingerprint of the file: xxhash of (head 4KB + tail 4KB + file size). +/// Returns 0 for empty files. +fn compute_file_hash(file_path: &Path) -> std::io::Result { + let file = std::fs::File::open(file_path)?; + let file_size = file.metadata()?.len(); + + if file_size == 0 { + return Ok(0); + } + + let mut f = std::io::BufReader::new(file); + + let head_size = 4096.min(file_size as usize); + let mut head = vec![0u8; head_size]; + f.read_exact(&mut head)?; + + let tail_size = 4096.min(file_size as usize); + let mut tail = vec![0u8; tail_size]; + + if file_size as usize > head_size + tail_size { + // Need to seek to tail region + let mut file = std::fs::File::open(file_path)?; + std::io::Seek::seek(&mut file, std::io::SeekFrom::End(-(tail_size as i64)))?; + let mut tf = std::io::BufReader::new(file); + tf.read_exact(&mut tail)?; + } else { + // File is small enough that head already covers everything; + // tail overlaps with head — just take the last tail_size bytes + let start = head.len().saturating_sub(tail_size); + tail = head[start..].to_vec(); + tail.resize(tail_size, 0); + } + + let mut hasher_state = xxhash_rust::xxh3::Xxh3::new(); + hasher_state.update(&head); + hasher_state.update(&tail); + hasher_state.update(&file_size.to_le_bytes()); + + Ok(hasher_state.digest()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write as _; + use tempfile::NamedTempFile; + + fn make_test_file(lines: usize) -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + for i in 0..lines { + writeln!(file, "line {}", i).unwrap(); + } + file.flush().unwrap(); + file + } + + #[test] + fn test_cache_roundtrip() { + let file = make_test_file(300); + let data = std::fs::read(file.path()).unwrap(); + let index = LineIndex::from_bytes(&data); + + IndexCache::save(file.path(), &index).expect("save should succeed"); + let loaded = IndexCache::load(file.path()).expect("load should return Some"); + + assert_eq!(loaded.line_count(), index.line_count()); + assert_eq!(loaded.sampled_offsets(), index.sampled_offsets()); + assert_eq!(loaded.has_trailing_newline(), index.has_trailing_newline()); + } + + #[test] + fn test_cache_invalidation_file_modified() { + let file = make_test_file(300); + let data = std::fs::read(file.path()).unwrap(); + let index = LineIndex::from_bytes(&data); + + IndexCache::save(file.path(), &index).expect("save should succeed"); + + // Append to file, changing its content + { + let mut f = std::fs::OpenOptions::new() + .append(true) + .open(file.path()) + .unwrap(); + writeln!(f, "extra line").unwrap(); + } + + let loaded = IndexCache::load(file.path()); + assert!( + loaded.is_none(), + "cache should be invalidated after file modification" + ); + } + + #[test] + fn test_cache_corruption() { + let file = make_test_file(300); + let data = std::fs::read(file.path()).unwrap(); + let index = LineIndex::from_bytes(&data); + + IndexCache::save(file.path(), &index).expect("save should succeed"); + + // Corrupt the cache file + let cache_path = cache_path(file.path()).expect("cache path"); + let mut cache_data = std::fs::read(&cache_path).unwrap(); + // Truncate the file (remove last 10 bytes) + let new_len = cache_data.len().saturating_sub(10); + cache_data.truncate(new_len); + std::fs::write(&cache_path, &cache_data).unwrap(); + + let loaded = IndexCache::load(file.path()); + assert!(loaded.is_none(), "corrupt cache should return None"); + } + + #[test] + fn test_cache_version_mismatch() { + let file = make_test_file(300); + let data = std::fs::read(file.path()).unwrap(); + let index = LineIndex::from_bytes(&data); + + IndexCache::save(file.path(), &index).expect("save should succeed"); + + // Modify first byte (version) + let cache_path = cache_path(file.path()).expect("cache path"); + let mut cache_data = std::fs::read(&cache_path).unwrap(); + cache_data[0] = cache_data[0].wrapping_add(1); + std::fs::write(&cache_path, &cache_data).unwrap(); + + let loaded = IndexCache::load(file.path()); + assert!(loaded.is_none(), "version mismatch should return None"); + } + + #[test] + fn test_cache_empty_file() { + let file = NamedTempFile::new().unwrap(); + let index = LineIndex::from_bytes(b""); + + IndexCache::save(file.path(), &index).expect("save should succeed"); + let loaded = IndexCache::load(file.path()).expect("load should return Some"); + + assert_eq!(loaded.line_count(), 0); + } + + #[test] + fn test_cache_nonexistent_source() { + let loaded = IndexCache::load(Path::new("/nonexistent/file.log")); + assert!(loaded.is_none(), "nonexistent file should return None"); + } + + #[test] + fn test_compute_file_hash_empty() { + let file = NamedTempFile::new().unwrap(); + let hash = compute_file_hash(file.path()).unwrap(); + assert_eq!(hash, 0, "empty file should hash to 0"); + } + + #[test] + fn test_compute_file_hash_deterministic() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, "hello world").unwrap(); + file.flush().unwrap(); + + let h1 = compute_file_hash(file.path()).unwrap(); + let h2 = compute_file_hash(file.path()).unwrap(); + assert_eq!(h1, h2, "same file must produce same hash"); + } + + #[test] + fn test_compute_file_hash_changes_on_content_change() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, "version 1").unwrap(); + file.flush().unwrap(); + let h1 = compute_file_hash(file.path()).unwrap(); + + // Overwrite with different content + let mut file2 = std::fs::File::create(file.path()).unwrap(); + write!(file2, "version 2 with more data").unwrap(); + file2.flush().unwrap(); + let h2 = compute_file_hash(file.path()).unwrap(); + + assert_ne!(h1, h2, "hash should change when content changes"); + } +}