Files
logViewer/crates/core/src/io/read_cache.rs
dailz d37ed6df68 fix(io): harden read_cache against zero-length false hits and overflow (closes #13)
- Add early return for len==0 (Ok(&[])) matching std::io semantics
- Add slot.len > 0 guard to cache hit predicate to prevent empty-slot
  false matches
- Replace unchecked arithmetic with checked_add/saturating_add for
  request_end, block_end, and post-read coverage check
- Fix misleading comment about get(file,0,0) behavior on miss path
- Strengthen clear() to fully reset block_offset and last_access
- Register read_cache module in io/mod.rs
- Add 4 regression tests: zero-len on fresh/populated cache,
  zero-len at u64::MAX, overflow error on nonzero read at u64::MAX
2026-06-09 10:48:34 +08:00

495 lines
17 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// NOTE: This module is implemented and tested but not yet integrated into
// the production read path (FileReader uses mmap for line access).
// It is kept as a building block for a future pread-based reader that
// would avoid the SIGBUS risk of mmap.
// ─── read_cache.rs ─────────────────────────────────────────────────────────
// 16-slot LRU read cache with 4KB page-aligned keys.
// Reduces syscalls by caching recently-read 4KB blocks.
// Cross-block reads are handled via a spill buffer (not cached).
// ──────────────────────────────────────────────────────────────────────────
use std::fs::File;
use std::io;
use std::os::unix::fs::FileExt;
/// Number of block slots the cache holds.
const LRU_SLOTS: usize = 16;

/// Size in bytes of one cached block. Cache keys are file offsets rounded
/// down to a multiple of this value, so it must remain a power of two (the
/// alignment in [`LruReadCache::get`] is computed with a bit mask).
pub const BLOCK_ALIGN: usize = 4096;

/// One cached block of file data.
struct CacheSlot {
    /// Backing storage; allocated with `BLOCK_ALIGN` bytes and never resized.
    buf: Vec<u8>,
    /// File offset of `buf[0]`. Only meaningful while `len > 0`.
    block_offset: u64,
    /// Number of valid bytes at the start of `buf`; `0` marks the slot empty.
    len: usize,
    /// Value of the cache tick when the slot was last filled or hit.
    last_access: u64,
}

/// Fixed-size LRU cache of `BLOCK_ALIGN`-byte file blocks.
///
/// Slots are keyed by block offset only; the cache does not record which
/// file a block was read from. Use one cache per file, or call
/// [`LruReadCache::clear`] before reading from a different file.
pub struct LruReadCache {
    /// The cached blocks.
    slots: [CacheSlot; LRU_SLOTS],
    /// Scratch buffer for reads that cross a block boundary.
    spill_buf: Vec<u8>,
    /// Length of the last successful spill read (`0` after `clear`).
    spill_len: usize,
    /// Monotonic access counter used to order slots by recency.
    tick: u64,
}

/// Short alias for [`LruReadCache`].
pub type ReadCache = LruReadCache;

impl Default for LruReadCache {
    /// Build a cache whose slots are all empty, with buffers pre-allocated.
    fn default() -> Self {
        Self {
            slots: std::array::from_fn(|_| CacheSlot {
                buf: vec![0u8; BLOCK_ALIGN],
                block_offset: 0,
                len: 0,
                last_access: 0,
            }),
            spill_buf: Vec::new(),
            spill_len: 0,
            tick: 0,
        }
    }
}

impl LruReadCache {
    /// Create an empty cache (same as [`Default::default`]).
    pub fn new() -> Self {
        Self::default()
    }

    /// Fill `buf` from `file` starting at `offset`, looping until the buffer
    /// is full or end-of-file is reached.
    ///
    /// A single `read_at` call may return fewer bytes than requested or fail
    /// with `Interrupted`; neither means the data is unavailable, so both are
    /// retried here. Returns the number of bytes actually placed in `buf`.
    fn read_full_at(file: &File, buf: &mut [u8], offset: u64) -> io::Result<usize> {
        let mut filled = 0usize;
        while filled < buf.len() {
            // A position past u64::MAX cannot hold data; treat it like EOF.
            let pos = match offset.checked_add(filled as u64) {
                Some(pos) => pos,
                None => break,
            };
            match file.read_at(&mut buf[filled..], pos) {
                Ok(0) => break,
                Ok(n) => filled += n,
                Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }
        Ok(filled)
    }

    /// Read `len` bytes starting at `offset`.
    ///
    /// * `len == 0` returns an empty slice without touching the file or the
    ///   cache state.
    /// * A request that lies inside one `BLOCK_ALIGN`-aligned block is served
    ///   from a cache slot, filling one on a miss.
    /// * A request that crosses a block boundary is read into the spill
    ///   buffer and is not cached.
    ///
    /// The returned slice borrows from the cache.
    ///
    /// # Errors
    ///
    /// * `InvalidInput` if `offset + len` overflows `u64`.
    /// * `UnexpectedEof` if the file ends before `offset + len`.
    /// * Any other error reported by the underlying positional read.
    pub fn get(&mut self, file: &File, offset: u64, len: usize) -> io::Result<&[u8]> {
        if len == 0 {
            return Ok(&[]);
        }

        let aligned_key = offset & !(BLOCK_ALIGN as u64 - 1);
        let request_end = offset.checked_add(len as u64).ok_or_else(|| {
            io::Error::new(io::ErrorKind::InvalidInput, "read range overflows u64")
        })?;
        let block_end = aligned_key.saturating_add(BLOCK_ALIGN as u64);

        // Cross-block request: bypass the slots and use the spill buffer.
        if request_end > block_end {
            self.spill_buf.resize(len, 0);
            let bytes_read = Self::read_full_at(file, &mut self.spill_buf[..len], offset)?;
            if bytes_read == 0 {
                return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "read 0 bytes"));
            }
            if bytes_read < len {
                return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "short read"));
            }
            self.spill_len = len;
            return Ok(&self.spill_buf[..len]);
        }

        // Look for a non-empty slot that already holds this block.
        let same_block = self
            .slots
            .iter()
            .position(|slot| slot.len > 0 && slot.block_offset == aligned_key);

        if let Some(idx) = same_block {
            let slot_end = aligned_key.saturating_add(self.slots[idx].len as u64);
            if request_end <= slot_end {
                self.slots[idx].last_access = self.tick;
                self.tick += 1;
                let start = (offset - aligned_key) as usize;
                return Ok(&self.slots[idx].buf[start..start + len]);
            }
            // The slot holds this block but too few bytes (short tail block
            // or the file has grown): fall through and refill it in place.
        }

        // Victim choice: the slot already holding this block, else an empty
        // slot, else the least recently used one. Ordering by
        // `(occupied, last_access)` puts empty slots first; `min_by_key`
        // returns the first minimum, so ties resolve to the lowest index.
        let victim = same_block.unwrap_or_else(|| {
            self.slots
                .iter()
                .enumerate()
                .min_by_key(|(_, slot)| (slot.len > 0, slot.last_access))
                .map(|(idx, _)| idx)
                .unwrap_or(0)
        });

        let slot = &mut self.slots[victim];
        // Invalidate first: if the read fails part-way the buffer no longer
        // matches the old metadata and must not be served as a hit.
        slot.len = 0;
        let bytes_read = Self::read_full_at(file, &mut slot.buf, aligned_key)?;
        // Non-empty reads that return 0 are EOF. Zero-length requests were
        // already handled above as a successful no-op.
        if bytes_read == 0 {
            return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "read 0 bytes"));
        }
        slot.block_offset = aligned_key;
        slot.len = bytes_read;
        slot.last_access = self.tick;
        self.tick += 1;

        // The block stays cached even when it is too short for this request.
        let bytes_end = aligned_key.saturating_add(bytes_read as u64);
        if request_end > bytes_end {
            return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "short read"));
        }
        let start = (offset - aligned_key) as usize;
        Ok(&slot.buf[start..start + len])
    }

    /// Invalidate all cache slots and the spill buffer.
    ///
    /// Slot buffers keep their allocation; only the metadata is reset.
    pub fn clear(&mut self) {
        for slot in &mut self.slots {
            slot.block_offset = 0;
            slot.len = 0;
            slot.last_access = 0;
        }
        self.spill_len = 0;
    }
}
#[cfg(test)]
mod tests {
// Unit tests for the read cache. Each test writes its fixture to a temporary
// file via `make_file` and reads it back through `ReadCache::get`.
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
// Create a temporary file containing `data` and flush it so that a separately
// opened read handle sees the bytes.
fn make_file(data: &[u8]) -> NamedTempFile {
let mut f = NamedTempFile::new().unwrap();
f.write_all(data).unwrap();
f.flush().unwrap();
f
}
#[test]
fn cache_hit_returns_same_data() {
// Read the same range twice — second read should be a cache hit.
let f = make_file(b"Hello, World! This is a test of the cache.");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let first = cache.get(&file, 0, 13).unwrap().to_vec();
let second = cache.get(&file, 0, 13).unwrap().to_vec();
assert_eq!(first, second);
assert_eq!(&first, b"Hello, World!");
}
#[test]
fn cache_miss_reads_correct_data() {
// Two non-overlapping ranges inside the same 4 KiB block (block 0): the first
// read fills the block, the second falls within the bytes already cached.
// Both must return the correct data.
let data = b"0123456789ABCDEFGHIJ";
let f = make_file(data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let a = cache.get(&file, 0, 10).unwrap().to_vec();
assert_eq!(&a, b"0123456789");
let b = cache.get(&file, 10, 10).unwrap().to_vec();
assert_eq!(&b, b"ABCDEFGHIJ");
}
#[test]
fn cross_block_read_uses_spill_buffer() {
// Read spanning a 4KB block boundary uses spill buffer, not cache.
let data = vec![0xABu8; 8192];
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
// First, cache block 0.
let a = cache.get(&file, 0, 100).unwrap().to_vec();
assert_eq!(a, vec![0xABu8; 100]);
// Now read spanning the boundary: offset=4000, len=200 spans [0,4096) and [4096,8192).
let b = cache.get(&file, 4000, 200).unwrap().to_vec();
assert_eq!(b, vec![0xABu8; 200]);
}
#[test]
fn empty_file_read_fails() {
let f = make_file(b"");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
// Reading 1 byte from empty file should fail.
let result = cache.get(&file, 0, 1);
assert!(result.is_err());
}
#[test]
fn clear_invalidates_cache() {
let data = b"original data here";
let f = make_file(data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
// Populate cache.
let first = cache.get(&file, 0, 10).unwrap().to_vec();
assert_eq!(&first, b"original d");
// Invalidate.
cache.clear();
// After clear, reading same range should still work (re-reads from file).
let after = cache.get(&file, 0, 10).unwrap().to_vec();
assert_eq!(&after, b"original d");
}
// ─── New LRU-specific tests ───────────────────────────────────────────
#[test]
fn lru_multi_block_hit() {
// Read 3 different aligned blocks, then re-read each and check the data matches.
// NOTE(review): only the returned bytes are compared; the test does not itself
// observe whether the re-reads were served from a slot or re-read from the file.
let data = vec![0u8; BLOCK_ALIGN * 4];
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
// Write distinct patterns to each block.
// The read handle is dropped first and reopened after the rewrite below.
drop(file);
{
use std::io::Seek;
let mut f2 = std::fs::OpenOptions::new()
.write(true)
.open(f.path())
.unwrap();
f2.seek(std::io::SeekFrom::Start(0)).unwrap();
f2.write_all(&[1u8; BLOCK_ALIGN]).unwrap();
f2.seek(std::io::SeekFrom::Start(BLOCK_ALIGN as u64))
.unwrap();
f2.write_all(&[2u8; BLOCK_ALIGN]).unwrap();
f2.seek(std::io::SeekFrom::Start((BLOCK_ALIGN * 2) as u64))
.unwrap();
f2.write_all(&[3u8; BLOCK_ALIGN]).unwrap();
}
let file = File::open(f.path()).unwrap();
let block0 = cache.get(&file, 0, 16).unwrap().to_vec();
let block1 = cache.get(&file, BLOCK_ALIGN as u64, 16).unwrap().to_vec();
let block2 = cache
.get(&file, (BLOCK_ALIGN * 2) as u64, 16)
.unwrap()
.to_vec();
assert_eq!(block0, vec![1u8; 16]);
assert_eq!(block1, vec![2u8; 16]);
assert_eq!(block2, vec![3u8; 16]);
// Re-read — must return the same data as the first pass.
let block0_again = cache.get(&file, 0, 16).unwrap().to_vec();
let block1_again = cache.get(&file, BLOCK_ALIGN as u64, 16).unwrap().to_vec();
let block2_again = cache
.get(&file, (BLOCK_ALIGN * 2) as u64, 16)
.unwrap()
.to_vec();
assert_eq!(block0_again, block0);
assert_eq!(block1_again, block1);
assert_eq!(block2_again, block2);
}
#[test]
fn lru_eviction_order() {
// Fill all 16 slots, re-access slot 0, add 17th block,
// verify slot at offset 4096 (slot 1) evicted, not slot 0.
let data = vec![0u8; BLOCK_ALIGN * 20];
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
// Fill all 16 slots with blocks 0..16.
for i in 0..16u64 {
cache.get(&file, i * BLOCK_ALIGN as u64, 1).unwrap();
}
// Re-access block 0 so it's not the LRU.
cache.get(&file, 0, 1).unwrap();
// Add 17th block — should evict block 1 (offset 4096), which is the oldest
// since block 0 was re-accessed.
cache.get(&file, 16 * BLOCK_ALIGN as u64, 1).unwrap();
// Reading block 1 (offset 4096) should be a miss (evicted).
// We verify by checking the cache slots: block 1 should not be cached.
let has_block1 = cache
.slots
.iter()
.any(|s| s.block_offset == BLOCK_ALIGN as u64 && s.len > 0);
assert!(!has_block1, "block 1 should have been evicted");
// Block 0 should still be cached.
let has_block0 = cache.slots.iter().any(|s| s.block_offset == 0 && s.len > 0);
assert!(has_block0, "block 0 should still be cached");
}
#[test]
fn lru_clear_all_slots() {
// Fill 3+ slots, call clear(), verify every slot and the spill length are reset.
let data = vec![0x42u8; BLOCK_ALIGN * 4];
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
// Fill 3 slots.
cache.get(&file, 0, 1).unwrap();
cache.get(&file, BLOCK_ALIGN as u64, 1).unwrap();
cache.get(&file, (BLOCK_ALIGN * 2) as u64, 1).unwrap();
cache.clear();
// All slots should be fully reset.
for slot in &cache.slots {
assert_eq!(slot.block_offset, 0);
assert_eq!(slot.len, 0);
assert_eq!(slot.last_access, 0);
}
assert_eq!(cache.spill_len, 0);
// Re-read should still work (reads from file).
let val = cache.get(&file, 0, 1).unwrap();
assert_eq!(val[0], 0x42);
}
#[test]
fn lru_aligned_keys() {
// offset=100 and offset=200 both align to block 0 — should hit same slot.
let data = vec![0xEEu8; BLOCK_ALIGN];
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let a = cache.get(&file, 100, 10).unwrap().to_vec();
assert_eq!(a, vec![0xEEu8; 10]);
let b = cache.get(&file, 200, 10).unwrap().to_vec();
assert_eq!(b, vec![0xEEu8; 10]);
// Both should be served from the same cache slot (block 0).
let slot_count = cache
.slots
.iter()
.filter(|s| s.block_offset == 0 && s.len > 0)
.count();
assert_eq!(slot_count, 1, "only one slot should hold block 0");
}
#[test]
fn lru_cross_block_uses_spill_buffer() {
// File [0xAA×4096, 0xBB×4096], get(file, 4090, 20) → [0xAA×6, 0xBB×14].
let mut data = vec![0xAAu8; BLOCK_ALIGN];
data.extend_from_slice(&vec![0xBBu8; BLOCK_ALIGN]);
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let result = cache.get(&file, 4090, 20).unwrap().to_vec();
assert_eq!(&result[..6], &[0xAAu8; 6]);
assert_eq!(&result[6..], &[0xBBu8; 14]);
// Verify no cache slot holds block 0 or block 1.
for slot in &cache.slots {
assert!(
slot.len == 0,
"cross-block data should not be cached in slots"
);
}
}
#[test]
fn lru_partial_last_block() {
// 5000 byte file, get(file, 4096, 904) reads last 904 bytes,
// then get(file, 4096, 100) hits cache.
let data = vec![0x77u8; 5000];
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let first = cache.get(&file, 4096, 904).unwrap().to_vec();
assert_eq!(first.len(), 904);
assert_eq!(first, vec![0x77u8; 904]);
// Second read of 100 bytes from same block should hit cache.
let second = cache.get(&file, 4096, 100).unwrap().to_vec();
assert_eq!(second.len(), 100);
assert_eq!(second, vec![0x77u8; 100]);
}
#[test]
fn lru_short_file_overread() {
// 1 byte file, get(file, 0, 100) → Err (not panic).
let f = make_file(b"X");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let result = cache.get(&file, 0, 100);
assert!(result.is_err());
}
#[test]
fn lru_empty_file_returns_error() {
// Empty file, get(file, 0, 1) → Err.
let f = make_file(b"");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let result = cache.get(&file, 0, 1);
assert!(result.is_err());
}
#[test]
fn long_line_spans_multiple_blocks() {
// Create a file with two 4091-byte lines: b'A'×4090 + "\n" + b'B'×4090 + "\n".
// Each line is shorter than one 4096-byte block, but the second line starts at
// offset 4091 and therefore straddles the block boundary.
let mut data = vec![b'A'; 4090];
data.push(b'\n');
data.extend_from_slice(&vec![b'B'; 4090]);
data.push(b'\n');
let f = make_file(&data);
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
// Read the first line: offset 0, len 4091 (fits in block 0 since 4091 <= 4096).
let line1 = cache.get(&file, 0, 4091).unwrap().to_vec();
assert_eq!(&line1[..4090], &[b'A'; 4090]);
assert_eq!(line1[4090], b'\n');
// Read the second line: offset 4091, len 4091 (starts in block 0, ends in block 1 — cross-block).
let line2 = cache.get(&file, 4091, 4091).unwrap().to_vec();
assert_eq!(&line2[..4090], &[b'B'; 4090]);
assert_eq!(line2[4090], b'\n');
}
#[test]
fn zero_len_read_is_noop_on_fresh_cache() {
// A zero-length read succeeds even on an empty file and leaves tick and slots untouched.
let f = make_file(b"");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let result = cache.get(&file, 0, 0).unwrap();
assert!(result.is_empty());
assert_eq!(cache.tick, 0);
assert!(cache.slots.iter().all(|s| s.len == 0));
}
#[test]
fn zero_len_read_is_noop_on_populated_cache() {
// A zero-length read must not advance the access tick of a populated cache.
let f = make_file(b"abc");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
cache.get(&file, 0, 1).unwrap();
let tick_before = cache.tick;
let result = cache.get(&file, 0, 0).unwrap();
assert!(result.is_empty());
assert_eq!(cache.tick, tick_before);
}
#[test]
fn zero_len_read_at_max_offset_is_ok() {
// The zero-length early return happens before any offset arithmetic, so
// offset == u64::MAX is accepted.
let f = make_file(b"");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let result = cache.get(&file, u64::MAX, 0).unwrap();
assert!(result.is_empty());
assert_eq!(cache.tick, 0);
assert!(cache.slots.iter().all(|s| s.len == 0));
}
#[test]
fn nonzero_read_range_overflow_returns_invalid_input() {
// offset + len overflows u64, which must surface as InvalidInput rather than wrap.
let f = make_file(b"abc");
let file = File::open(f.path()).unwrap();
let mut cache = ReadCache::new();
let err = cache.get(&file, u64::MAX, 1).unwrap_err();
assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
}
}