feat(core): add disk index cache infrastructure

Add xxhash-rust and bincode workspace dependencies for fast hashing and serialization. Implement cache_util for cache directory/path resolution with versioning, and IndexCache for saving/loading line indices to disk with file-hash validation.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
dailz
2026-04-14 09:06:36 +08:00
parent 62a176441e
commit 2260d60302
6 changed files with 403 additions and 3 deletions

View File

@@ -15,6 +15,8 @@ regex.workspace = true
memchr.workspace = true
memmap2.workspace = true
directories.workspace = true
xxhash-rust.workspace = true
bincode.workspace = true
[dev-dependencies]
insta.workspace = true

View File

@@ -123,6 +123,10 @@ pub enum CoreError {
#[error("mmap error: {0}")]
Mmap(String),
// ─── Cache 变体:缓存操作错误(缓存目录创建失败、缓存读写失败等)────────────
#[error("cache error: {0}")]
Cache(String),
// ─── FileNotFound 变体:文件未找到 ──────────────────────────────────────
#[error("file not found: {path:?}")]
// {path:?} 使用 Debug 格式化输出路径,会保留引号,如 "file not found: "/path/to/file""

View File

@@ -0,0 +1,103 @@
use std::path::{Path, PathBuf};
use directories::ProjectDirs;
/// Version byte of the on-disk index cache format.
///
/// `IndexCache::save` writes this value as the first byte of every cache file,
/// and `IndexCache::load` returns `None` when the stored byte differs from it.
pub const CACHE_VERSION: u8 = 1;
/// Returns the directory that holds index cache files, creating it if needed.
///
/// The directory is the `indexes` sub-directory of the cache directory that
/// [`ProjectDirs`] resolves for the application name `log-viewer`.
///
/// Returns `None` when `ProjectDirs::from` yields no project directories or
/// when the directory cannot be created (for example because a regular file
/// already occupies that path). Never panics.
pub fn cache_dir() -> Option<PathBuf> {
    let proj_dirs = ProjectDirs::from("", "", "log-viewer")?;
    let cache_dir = proj_dirs.cache_dir().join("indexes");
    // `create_dir_all` already succeeds when the directory exists, so a prior
    // `exists()` probe is unnecessary. Dropping the probe removes the
    // check-then-create race and stops a non-directory at this path from
    // being reported as a usable cache directory.
    std::fs::create_dir_all(&cache_dir).ok()?;
    Some(cache_dir)
}
/// Maps a source file to the path of its index cache file.
///
/// The cache file name is the xxh3-64 hash of the canonicalized source path,
/// rendered as 16 zero-padded lowercase hex digits with an `.index` suffix,
/// and it lives inside [`cache_dir`].
///
/// Returns `None` when the path cannot be canonicalized (e.g. the file does
/// not exist), when the canonical path is not valid UTF-8, or when the cache
/// directory is unavailable.
pub fn cache_path(file_path: &Path) -> Option<PathBuf> {
    let resolved = std::fs::canonicalize(file_path).ok()?;
    let key = resolved
        .to_str()
        .map(|text| xxhash_rust::xxh3::xxh3_64(text.as_bytes()))?;
    let file_name = format!("{:016x}.index", key);
    cache_dir().map(|dir| dir.join(file_name))
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// `cache_dir` returns an existing directory whose path mentions both the
    /// application name and the `indexes` sub-directory.
    #[test]
    fn test_cache_dir_exists_or_creates() {
        let dir = cache_dir();
        assert!(dir.is_some(), "cache_dir should return Some");
        let path = dir.unwrap();
        assert!(path.is_dir(), "cache directory should exist on disk");
        assert!(
            path.to_string_lossy().contains("log-viewer"),
            "cache dir should contain 'log-viewer': {}",
            path.display()
        );
        assert!(
            path.to_string_lossy().contains("indexes"),
            "cache dir should contain 'indexes': {}",
            path.display()
        );
    }

    /// Different spellings of the same file resolve to one cache path.
    #[test]
    fn test_cache_path_canonicalized() {
        let file = NamedTempFile::new().unwrap();
        let canonical = file.path().canonicalize().unwrap();
        // Build a spelling that is guaranteed to be non-canonical on every
        // platform by inserting a redundant `.` component.
        let parent = file
            .path()
            .parent()
            .expect("temp file has a parent directory");
        let name = file.path().file_name().expect("temp file has a file name");
        let dotted = parent.join(".").join(name);

        let via_canonical = cache_path(&canonical).expect("cache_path with canonicalized path");
        let via_original = cache_path(file.path()).expect("cache_path with original path");
        let via_dotted = cache_path(&dotted).expect("cache_path with redundant '.' component");

        assert_eq!(
            via_canonical, via_original,
            "same file via different paths must produce same cache path"
        );
        assert_eq!(
            via_canonical, via_dotted,
            "same file via different paths must produce same cache path"
        );
        assert!(
            via_canonical.to_string_lossy().ends_with(".index"),
            "cache path should end with .index: {}",
            via_canonical.display()
        );
    }

    /// Distinct files get distinct cache paths; repeated calls are stable.
    #[test]
    fn test_cache_path_consistent_hashes() {
        let file1 = NamedTempFile::new().unwrap();
        let file2 = NamedTempFile::new().unwrap();
        let path1 = cache_path(file1.path()).expect("cache_path for file1");
        let path2 = cache_path(file2.path()).expect("cache_path for file2");
        assert_ne!(
            path1, path2,
            "different files must produce different cache paths"
        );
        let path1_again = cache_path(file1.path()).expect("cache_path for file1 again");
        assert_eq!(
            path1, path1_again,
            "same file must produce consistent cache path across calls"
        );
    }

    /// `cache_dir` must not panic; both `Some` and `None` are acceptable.
    #[test]
    fn test_cache_dir_graceful() {
        // The previous `is_some() || is_none()` assertion could never fail.
        // Reaching the end of this test is the no-panic check; when a
        // directory is reported it must really be one.
        if let Some(dir) = cache_dir() {
            assert!(
                dir.is_dir(),
                "cache_dir returned a path that is not a directory: {}",
                dir.display()
            );
        }
    }

    /// A source file that does not exist has no cache path.
    #[test]
    fn test_cache_path_nonexistent_file() {
        let result = cache_path(Path::new("/nonexistent/path/to/file.log"));
        assert!(
            result.is_none(),
            "cache_path should return None for nonexistent file"
        );
    }
}

View File

@@ -0,0 +1,245 @@
use std::io::{Read as _, Write as _};
use std::path::Path;
use crate::io::cache_util::{cache_path, CACHE_VERSION};
use crate::io::line_index::LineIndex;
/// Namespace for saving and loading [`LineIndex`] values in the disk cache.
///
/// Cache file layout:
/// 1. one version byte (`CACHE_VERSION`),
/// 2. eight bytes holding the little-endian source-file fingerprint,
/// 3. the bincode-encoded `LineIndex`.
pub struct IndexCache;

impl IndexCache {
    /// Size of the fixed header: version byte plus 8-byte fingerprint.
    const HEADER_LEN: usize = 1 + std::mem::size_of::<u64>();

    /// Save a `LineIndex` to disk using atomic write (write to .tmp, then rename).
    ///
    /// # Errors
    ///
    /// * `ErrorKind::NotFound` when no cache path can be determined for
    ///   `file_path`.
    /// * `ErrorKind::InvalidData` when the index cannot be serialized.
    /// * Any I/O error from fingerprinting the source file or from writing,
    ///   syncing, or renaming the temporary file. In the latter case the
    ///   temporary file is removed on a best-effort basis.
    pub fn save(file_path: &Path, index: &LineIndex) -> std::io::Result<()> {
        let dest = cache_path(file_path).ok_or_else(|| {
            std::io::Error::new(std::io::ErrorKind::NotFound, "cannot determine cache path")
        })?;
        let file_hash = compute_file_hash(file_path)?;
        let index_bytes = bincode::serialize(index)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;

        let mut buf = Vec::with_capacity(Self::HEADER_LEN + index_bytes.len());
        buf.push(CACHE_VERSION);
        buf.extend_from_slice(&file_hash.to_le_bytes());
        buf.extend_from_slice(&index_bytes);

        let tmp_path = dest.with_extension("index.tmp");
        let publish = || -> std::io::Result<()> {
            let mut tmp_file = std::fs::File::create(&tmp_path)?;
            tmp_file.write_all(&buf)?;
            tmp_file.sync_all()?;
            // Close the handle before renaming; some platforms refuse to
            // rename a file that is still open.
            drop(tmp_file);
            std::fs::rename(&tmp_path, &dest)
        };
        publish().map_err(|err| {
            // Do not leave a partial temporary file behind. The original
            // error is what the caller needs, so a cleanup failure is ignored.
            let _ = std::fs::remove_file(&tmp_path);
            err
        })
    }

    /// Load a cached `LineIndex` from disk.
    /// Returns `None` if: cache missing, version mismatch, file modified, or corruption.
    pub fn load(file_path: &Path) -> Option<LineIndex> {
        let path = cache_path(file_path)?;
        let data = std::fs::read(&path).ok()?;
        if data.len() < Self::HEADER_LEN {
            return None;
        }
        let (header, payload) = data.split_at(Self::HEADER_LEN);

        // Validate version byte
        if header[0] != CACHE_VERSION {
            return None;
        }

        // Validate file hash
        let stored_hash = u64::from_le_bytes(header[1..].try_into().ok()?);
        let current_hash = compute_file_hash(file_path).ok()?;
        if stored_hash != current_hash {
            return None;
        }

        // Deserialize index
        bincode::deserialize(payload).ok()
    }
}
/// Compute a fast fingerprint of the file: xxhash of (head 4KB + tail 4KB + file size).
/// Returns 0 for empty files.
///
/// "Head" is the first `min(4096, size)` bytes and "tail" is the last
/// `min(4096, size)` bytes; the two overlap for files shorter than 8 KiB.
///
/// This is a cheap change detector, not a content hash: an edit that keeps the
/// file size and touches neither window is not detected.
///
/// # Errors
///
/// Returns any I/O error from opening, inspecting, seeking, or reading the
/// file, including `UnexpectedEof` if the file shrinks while being read.
fn compute_file_hash(file_path: &Path) -> std::io::Result<u64> {
    const WINDOW: u64 = 4096;

    let mut file = std::fs::File::open(file_path)?;
    let file_size = file.metadata()?.len();
    if file_size == 0 {
        return Ok(0);
    }

    // Clamp while still in u64 so the narrowing to usize can never truncate.
    let window = file_size.min(WINDOW) as usize;
    let mut chunk = vec![0u8; window];
    let mut hasher = xxhash_rust::xxh3::Xxh3::new();

    // Head window.
    file.read_exact(&mut chunk)?;
    hasher.update(&chunk);

    // Tail window. Whenever the file is longer than one window the real last
    // bytes are read, even if they overlap the head. Previously files of
    // 4097..=8192 bytes reused the head as the tail, so everything past the
    // first 4 KiB was ignored. For files that fit in one window, `chunk`
    // already holds the whole file and is fed in again unchanged.
    if file_size > WINDOW {
        std::io::Seek::seek(&mut file, std::io::SeekFrom::Start(file_size - WINDOW))?;
        file.read_exact(&mut chunk)?;
    }
    hasher.update(&chunk);

    hasher.update(&file_size.to_le_bytes());
    Ok(hasher.digest())
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write as _;
    use tempfile::NamedTempFile;

    /// Removes the cache entry belonging to a test's source file on drop, so
    /// tests do not leave stale files in the real user cache directory even
    /// when an assertion panics.
    struct CacheCleanup(Option<std::path::PathBuf>);

    impl CacheCleanup {
        /// Must be called while `source` exists, because `cache_path`
        /// canonicalizes it.
        fn for_source(source: &Path) -> Self {
            Self(cache_path(source))
        }
    }

    impl Drop for CacheCleanup {
        fn drop(&mut self) {
            if let Some(path) = self.0.take() {
                let _ = std::fs::remove_file(path);
            }
        }
    }

    /// Creates a temp file containing `lines` lines of the form `line <i>`.
    fn make_test_file(lines: usize) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        for i in 0..lines {
            writeln!(file, "line {}", i).unwrap();
        }
        file.flush().unwrap();
        file
    }

    /// A saved index loads back with the same observable properties.
    #[test]
    fn test_cache_roundtrip() {
        let file = make_test_file(300);
        let _cleanup = CacheCleanup::for_source(file.path());
        let data = std::fs::read(file.path()).unwrap();
        let index = LineIndex::from_bytes(&data);
        IndexCache::save(file.path(), &index).expect("save should succeed");
        let loaded = IndexCache::load(file.path()).expect("load should return Some");
        assert_eq!(loaded.line_count(), index.line_count());
        assert_eq!(loaded.sampled_offsets(), index.sampled_offsets());
        assert_eq!(loaded.has_trailing_newline(), index.has_trailing_newline());
    }

    /// Appending to the source file invalidates its cache entry.
    #[test]
    fn test_cache_invalidation_file_modified() {
        let file = make_test_file(300);
        let _cleanup = CacheCleanup::for_source(file.path());
        let data = std::fs::read(file.path()).unwrap();
        let index = LineIndex::from_bytes(&data);
        IndexCache::save(file.path(), &index).expect("save should succeed");
        // Append to file, changing its content
        {
            let mut f = std::fs::OpenOptions::new()
                .append(true)
                .open(file.path())
                .unwrap();
            writeln!(f, "extra line").unwrap();
        }
        let loaded = IndexCache::load(file.path());
        assert!(
            loaded.is_none(),
            "cache should be invalidated after file modification"
        );
    }

    /// A truncated cache file is rejected instead of being deserialized.
    #[test]
    fn test_cache_corruption() {
        let file = make_test_file(300);
        let _cleanup = CacheCleanup::for_source(file.path());
        let data = std::fs::read(file.path()).unwrap();
        let index = LineIndex::from_bytes(&data);
        IndexCache::save(file.path(), &index).expect("save should succeed");
        // Corrupt the cache file
        let cache_file = cache_path(file.path()).expect("cache path");
        let mut cache_data = std::fs::read(&cache_file).unwrap();
        // Truncate the file (remove last 10 bytes)
        let new_len = cache_data.len().saturating_sub(10);
        cache_data.truncate(new_len);
        std::fs::write(&cache_file, &cache_data).unwrap();
        let loaded = IndexCache::load(file.path());
        assert!(loaded.is_none(), "corrupt cache should return None");
    }

    /// A cache file whose version byte differs is rejected.
    #[test]
    fn test_cache_version_mismatch() {
        let file = make_test_file(300);
        let _cleanup = CacheCleanup::for_source(file.path());
        let data = std::fs::read(file.path()).unwrap();
        let index = LineIndex::from_bytes(&data);
        IndexCache::save(file.path(), &index).expect("save should succeed");
        // Modify first byte (version)
        let cache_file = cache_path(file.path()).expect("cache path");
        let mut cache_data = std::fs::read(&cache_file).unwrap();
        cache_data[0] = cache_data[0].wrapping_add(1);
        std::fs::write(&cache_file, &cache_data).unwrap();
        let loaded = IndexCache::load(file.path());
        assert!(loaded.is_none(), "version mismatch should return None");
    }

    /// An empty source file round-trips to an index with zero lines.
    #[test]
    fn test_cache_empty_file() {
        let file = NamedTempFile::new().unwrap();
        let _cleanup = CacheCleanup::for_source(file.path());
        let index = LineIndex::from_bytes(b"");
        IndexCache::save(file.path(), &index).expect("save should succeed");
        let loaded = IndexCache::load(file.path()).expect("load should return Some");
        assert_eq!(loaded.line_count(), 0);
    }

    /// Loading for a source file that does not exist yields `None`.
    #[test]
    fn test_cache_nonexistent_source() {
        let loaded = IndexCache::load(Path::new("/nonexistent/file.log"));
        assert!(loaded.is_none(), "nonexistent file should return None");
    }

    /// Empty files use the fixed fingerprint 0.
    #[test]
    fn test_compute_file_hash_empty() {
        let file = NamedTempFile::new().unwrap();
        let hash = compute_file_hash(file.path()).unwrap();
        assert_eq!(hash, 0, "empty file should hash to 0");
    }

    /// Hashing the same unchanged file twice gives the same value.
    #[test]
    fn test_compute_file_hash_deterministic() {
        let mut file = NamedTempFile::new().unwrap();
        write!(file, "hello world").unwrap();
        file.flush().unwrap();
        let h1 = compute_file_hash(file.path()).unwrap();
        let h2 = compute_file_hash(file.path()).unwrap();
        assert_eq!(h1, h2, "same file must produce same hash");
    }

    /// Replacing the content (and size) of a file changes its fingerprint.
    #[test]
    fn test_compute_file_hash_changes_on_content_change() {
        let mut file = NamedTempFile::new().unwrap();
        write!(file, "version 1").unwrap();
        file.flush().unwrap();
        let h1 = compute_file_hash(file.path()).unwrap();
        // Overwrite with different content
        let mut file2 = std::fs::File::create(file.path()).unwrap();
        write!(file2, "version 2 with more data").unwrap();
        file2.flush().unwrap();
        let h2 = compute_file_hash(file.path()).unwrap();
        assert_ne!(h1, h2, "hash should change when content changes");
    }

    /// A change confined to the last byte is detected even though the file
    /// size stays the same, i.e. the tail window really is read.
    #[test]
    fn test_compute_file_hash_detects_same_size_tail_change() {
        let mut file = NamedTempFile::new().unwrap();
        let mut content = vec![b'a'; 10_000];
        file.write_all(&content).unwrap();
        file.flush().unwrap();
        let before = compute_file_hash(file.path()).unwrap();

        *content.last_mut().unwrap() = b'b';
        std::fs::write(file.path(), &content).unwrap();
        let after = compute_file_hash(file.path()).unwrap();

        assert_ne!(
            before, after,
            "hash should change when only the tail changes"
        );
    }
}