use std::io::{Read as _, Write as _}; use std::path::Path; use crate::io::cache_util::{cache_path, CACHE_VERSION}; use crate::io::line_index::LineIndex; pub struct IndexCache; /// Write `buf` to `dest` atomically using a unique temporary file in the same directory. /// /// Each call creates its own temp file via `tempfile::Builder`, eliminating collisions /// when multiple threads (or processes) save to the same cache path concurrently. /// The temp file is created in `dest.parent()` so the final `rename` stays on the /// same filesystem and remains atomic. fn write_cache_atomically(dest: &Path, buf: &[u8]) -> std::io::Result<()> { let dir = dest.parent().ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidInput, "cache destination has no parent directory", ) })?; let mut tmp = tempfile::Builder::new() .prefix("index-cache-") .suffix(".tmp") .tempfile_in(dir)?; tmp.as_file_mut().write_all(buf)?; tmp.as_file_mut().sync_all()?; tmp.persist(dest).map(|_| ()).map_err(|e| e.error) } fn encode_cache(file_hash: u64, index: &LineIndex) -> std::io::Result> { let index_bytes = bincode::serialize(index) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?; let mut buf = Vec::with_capacity(1 + 8 + index_bytes.len()); buf.push(CACHE_VERSION); buf.extend_from_slice(&file_hash.to_le_bytes()); buf.extend_from_slice(&index_bytes); Ok(buf) } impl IndexCache { /// Save a `LineIndex` to disk using atomic write (unique temp file, then rename). /// /// The file hash is derived from `data` (the same byte slice used to build the index), /// avoiding TOCTOU issues from re-reading the file from disk. pub fn save_with_hash( file_path: &Path, index: &LineIndex, data: &[u8], ) -> std::io::Result<()> { let dest = cache_path(file_path).ok_or_else(|| { std::io::Error::new(std::io::ErrorKind::NotFound, "cannot determine cache path") })?; let file_hash = compute_data_hash(data); let buf = encode_cache(file_hash, index)?; write_cache_atomically(&dest, &buf) } /// Save a `LineIndex` to disk, computing the hash by re-reading the file. /// /// Prefer [`save_with_hash`] when the source data is already in memory /// to avoid a TOCTOU race between indexing and hash computation. pub fn save(file_path: &Path, index: &LineIndex) -> std::io::Result<()> { let dest = cache_path(file_path).ok_or_else(|| { std::io::Error::new(std::io::ErrorKind::NotFound, "cannot determine cache path") })?; let file_hash = compute_file_hash(file_path)?; let buf = encode_cache(file_hash, index)?; write_cache_atomically(&dest, &buf) } /// Load a cached `LineIndex` from disk. /// Returns `None` if: cache missing, version mismatch, file modified, or corruption. pub fn load(file_path: &Path) -> Option { let path = cache_path(file_path)?; let data = std::fs::read(&path).ok()?; if data.len() < 9 { return None; } // Validate version byte if data[0] != CACHE_VERSION { return None; } // Validate file hash let stored_hash = u64::from_le_bytes(data[1..9].try_into().ok()?); let current_hash = compute_file_hash(file_path).ok()?; if stored_hash != current_hash { return None; } // Deserialize index bincode::deserialize(&data[9..]).ok() } } /// Compute a fingerprint from in-memory data, using the same algorithm as `compute_file_hash`. /// Returns 0 for empty data. pub fn compute_data_hash(data: &[u8]) -> u64 { let len = data.len(); if len == 0 { return 0; } let head_size = 4096.min(len); let tail_size = 4096.min(len); let mut hasher_state = xxhash_rust::xxh3::Xxh3::new(); hasher_state.update(&data[..head_size]); if len > head_size + tail_size { hasher_state.update(&data[len - tail_size..]); } else { // For small files the tail overlaps with the head; hash from the real tail start. let tail_start = len.saturating_sub(tail_size); hasher_state.update(&data[tail_start..]); } hasher_state.update(&(len as u64).to_le_bytes()); hasher_state.digest() } /// Compute a fast fingerprint of the file: xxhash of (head 4KB + tail 4KB + file size). /// Returns 0 for empty files. fn compute_file_hash(file_path: &Path) -> std::io::Result { let file = std::fs::File::open(file_path)?; let file_size = file.metadata()?.len(); if file_size == 0 { return Ok(0); } let mut f = std::io::BufReader::new(file); let head_size = 4096.min(file_size as usize); let mut head = vec![0u8; head_size]; f.read_exact(&mut head)?; let tail_size = 4096.min(file_size as usize); let mut tail = vec![0u8; tail_size]; if file_size as usize > head_size + tail_size { // Need to seek to tail region let mut file = std::fs::File::open(file_path)?; std::io::Seek::seek(&mut file, std::io::SeekFrom::End(-(tail_size as i64)))?; let mut tf = std::io::BufReader::new(file); tf.read_exact(&mut tail)?; } else { // File fits within head+tail window. Seek to read the real tail // for correctness — approximating from head misses bytes beyond head_size. let tail_start = file_size.saturating_sub(tail_size as u64); let mut file = std::fs::File::open(file_path)?; std::io::Seek::seek(&mut file, std::io::SeekFrom::Start(tail_start))?; file.read_exact(&mut tail)?; } let mut hasher_state = xxhash_rust::xxh3::Xxh3::new(); hasher_state.update(&head); hasher_state.update(&tail); hasher_state.update(&file_size.to_le_bytes()); Ok(hasher_state.digest()) } #[cfg(test)] mod tests { use super::*; use std::io::Write as _; use tempfile::NamedTempFile; fn make_test_file(lines: usize) -> NamedTempFile { let mut file = NamedTempFile::new().unwrap(); for i in 0..lines { writeln!(file, "line {}", i).unwrap(); } file.flush().unwrap(); file } #[test] fn test_cache_roundtrip() { let file = make_test_file(300); let data = std::fs::read(file.path()).unwrap(); let index = LineIndex::from_bytes(&data); IndexCache::save(file.path(), &index).expect("save should succeed"); let loaded = IndexCache::load(file.path()).expect("load should return Some"); assert_eq!(loaded.line_count(), index.line_count()); assert_eq!(loaded.sampled_offsets(), index.sampled_offsets()); assert_eq!(loaded.has_trailing_newline(), index.has_trailing_newline()); } #[test] fn test_cache_invalidation_file_modified() { let file = make_test_file(300); let data = std::fs::read(file.path()).unwrap(); let index = LineIndex::from_bytes(&data); IndexCache::save(file.path(), &index).expect("save should succeed"); // Append to file, changing its content { let mut f = std::fs::OpenOptions::new() .append(true) .open(file.path()) .unwrap(); writeln!(f, "extra line").unwrap(); } let loaded = IndexCache::load(file.path()); assert!( loaded.is_none(), "cache should be invalidated after file modification" ); } #[test] fn test_cache_corruption() { let file = make_test_file(300); let data = std::fs::read(file.path()).unwrap(); let index = LineIndex::from_bytes(&data); IndexCache::save(file.path(), &index).expect("save should succeed"); // Corrupt the cache file let cache_path = cache_path(file.path()).expect("cache path"); let mut cache_data = std::fs::read(&cache_path).unwrap(); // Truncate the file (remove last 10 bytes) let new_len = cache_data.len().saturating_sub(10); cache_data.truncate(new_len); std::fs::write(&cache_path, &cache_data).unwrap(); let loaded = IndexCache::load(file.path()); assert!(loaded.is_none(), "corrupt cache should return None"); } #[test] fn test_cache_version_mismatch() { let file = make_test_file(300); let data = std::fs::read(file.path()).unwrap(); let index = LineIndex::from_bytes(&data); IndexCache::save(file.path(), &index).expect("save should succeed"); // Modify first byte (version) let cache_path = cache_path(file.path()).expect("cache path"); let mut cache_data = std::fs::read(&cache_path).unwrap(); cache_data[0] = cache_data[0].wrapping_add(1); std::fs::write(&cache_path, &cache_data).unwrap(); let loaded = IndexCache::load(file.path()); assert!(loaded.is_none(), "version mismatch should return None"); } #[test] fn test_cache_empty_file() { let file = NamedTempFile::new().unwrap(); let index = LineIndex::from_bytes(b""); IndexCache::save(file.path(), &index).expect("save should succeed"); let loaded = IndexCache::load(file.path()).expect("load should return Some"); assert_eq!(loaded.line_count(), 0); } #[test] fn test_cache_nonexistent_source() { let loaded = IndexCache::load(Path::new("/nonexistent/file.log")); assert!(loaded.is_none(), "nonexistent file should return None"); } #[test] fn test_compute_file_hash_empty() { let file = NamedTempFile::new().unwrap(); let hash = compute_file_hash(file.path()).unwrap(); assert_eq!(hash, 0, "empty file should hash to 0"); } #[test] fn test_compute_file_hash_deterministic() { let mut file = NamedTempFile::new().unwrap(); write!(file, "hello world").unwrap(); file.flush().unwrap(); let h1 = compute_file_hash(file.path()).unwrap(); let h2 = compute_file_hash(file.path()).unwrap(); assert_eq!(h1, h2, "same file must produce same hash"); } #[test] fn test_compute_file_hash_changes_on_content_change() { let mut file = NamedTempFile::new().unwrap(); write!(file, "version 1").unwrap(); file.flush().unwrap(); let h1 = compute_file_hash(file.path()).unwrap(); // Overwrite with different content let mut file2 = std::fs::File::create(file.path()).unwrap(); write!(file2, "version 2 with more data").unwrap(); file2.flush().unwrap(); let h2 = compute_file_hash(file.path()).unwrap(); assert_ne!(h1, h2, "hash should change when content changes"); } #[test] fn test_concurrent_save_with_hash_no_corruption() { use std::sync::{Arc, Barrier}; let file = make_test_file(300); let data = std::fs::read(file.path()).unwrap(); let num_threads = 8; let iterations = 50; let barrier = Arc::new(Barrier::new(num_threads)); let path = file.path().to_path_buf(); let handles: Vec<_> = (0..num_threads) .map(|_| { let barrier = Arc::clone(&barrier); let path = path.clone(); let data = data.clone(); std::thread::spawn(move || { let index = LineIndex::from_bytes(&data); barrier.wait(); for _ in 0..iterations { IndexCache::save_with_hash(&path, &index, &data) .expect("concurrent save_with_hash should succeed"); } }) }) .collect(); for h in handles { h.join().expect("thread should not panic"); } let loaded = IndexCache::load(file.path()).expect("final cache should load successfully"); let expected = LineIndex::from_bytes(&data); assert_eq!(loaded.line_count(), expected.line_count()); assert_eq!( loaded.sampled_offsets(), expected.sampled_offsets(), "concurrent writes must not produce interleaved data" ); } #[test] fn test_concurrent_save_same_dest_all_succeed() { use std::sync::{Arc, Barrier}; let dir = tempfile::tempdir().unwrap(); let dest = dir.path().join("test.index"); let payloads: Vec> = (0..8).map(|i| vec![i; 64 * 1024]).collect(); let barrier = Arc::new(Barrier::new(8)); let handles: Vec<_> = payloads .into_iter() .map(|payload| { let barrier = Arc::clone(&barrier); let dest = dest.clone(); std::thread::spawn(move || { barrier.wait(); write_cache_atomically(&dest, &payload) .expect("concurrent atomic write should succeed"); }) }) .collect(); for h in handles { h.join().expect("thread should not panic"); } let final_data = std::fs::read(&dest).expect("dest file should exist"); assert_eq!(final_data.len(), 64 * 1024, "final file must be exactly one payload"); } }