- Add early return for len==0 (Ok(&[])) matching std::io semantics
- Add slot.len > 0 guard to cache hit predicate to prevent empty-slot false matches
- Replace unchecked arithmetic with checked_add/saturating_add for request_end, block_end, and post-read coverage check
- Fix misleading comment about get(file,0,0) behavior on miss path
- Strengthen clear() to fully reset block_offset and last_access
- Register read_cache module in io/mod.rs
- Add 4 regression tests: zero-len on fresh/populated cache, zero-len at u64::MAX, overflow error on nonzero read at u64::MAX
495 lines
17 KiB
Rust
495 lines
17 KiB
Rust
// NOTE: This module is implemented and tested but not yet integrated into
// the production read path (FileReader uses mmap for line access).
// It is kept as a building block for a future pread-based reader that
// would avoid the SIGBUS risk of mmap.
// ─── read_cache.rs ─────────────────────────────────────────────────────────
// 16-slot LRU read cache keyed by 4 KiB-aligned block offsets.
// Reduces syscalls by caching recently read 4 KiB blocks.
// Reads that cross a block boundary are served from a spill buffer and are
// not cached.
// ──────────────────────────────────────────────────────────────────────────
|
||
|
||
use std::fs::File;
|
||
use std::io;
|
||
use std::os::unix::fs::FileExt;
|
||
|
||
/// Number of block slots held by [`LruReadCache`].
const LRU_SLOTS: usize = 16;

/// Size in bytes of one cached block. Cache keys are file offsets rounded
/// down to a multiple of this value, so it must be a power of two (the key
/// mask in [`LruReadCache::get`] relies on that).
pub const BLOCK_ALIGN: usize = 4096;

/// One cached block of file data.
struct CacheSlot {
    /// Backing storage; allocated once with `BLOCK_ALIGN` bytes.
    buf: Vec<u8>,
    /// File offset of `buf[0]` (a multiple of `BLOCK_ALIGN`).
    block_offset: u64,
    /// Number of valid bytes in `buf`; `0` marks the slot as empty.
    len: usize,
    /// Value of the cache tick at the most recent hit or fill.
    last_access: u64,
}

/// Small LRU cache of `BLOCK_ALIGN`-sized file blocks, filled with positional
/// reads (`read_at`).
pub struct LruReadCache {
    /// The cached blocks.
    slots: [CacheSlot; LRU_SLOTS],
    /// Scratch buffer for requests that span more than one block.
    spill_buf: Vec<u8>,
    /// Number of valid bytes in `spill_buf` after the last successful
    /// cross-block read; `0` otherwise.
    spill_len: usize,
    /// Logical clock used for LRU ordering; bumped on every hit and fill.
    tick: u64,
}

/// Short alias for [`LruReadCache`].
pub type ReadCache = LruReadCache;

impl Default for LruReadCache {
    /// Creates an empty cache with every slot buffer pre-allocated.
    fn default() -> Self {
        Self {
            slots: std::array::from_fn(|_| CacheSlot {
                buf: vec![0u8; BLOCK_ALIGN],
                block_offset: 0,
                len: 0,
                last_access: 0,
            }),
            spill_buf: Vec::new(),
            spill_len: 0,
            tick: 0,
        }
    }
}

impl LruReadCache {
    /// Creates an empty cache; equivalent to [`LruReadCache::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `len` bytes of `file` starting at `offset`.
    ///
    /// * A zero-length request succeeds with an empty slice and touches
    ///   neither the file nor the cache state.
    /// * A request that fits inside one `BLOCK_ALIGN`-aligned block is served
    ///   from a cache slot, loading the block first on a miss.
    /// * A request that crosses a block boundary is read directly into the
    ///   spill buffer and is not cached.
    ///
    /// The returned slice borrows from the cache and is only valid until the
    /// next mutable use of `self`.
    ///
    /// # Errors
    ///
    /// * `InvalidInput` if `offset + len` overflows `u64`.
    /// * `UnexpectedEof` if the file ends before the requested range does.
    /// * Any other error reported by the underlying `read_at` call.
    pub fn get(&mut self, file: &File, offset: u64, len: usize) -> io::Result<&[u8]> {
        if len == 0 {
            return Ok(&[]);
        }

        let aligned_key = offset & !(BLOCK_ALIGN as u64 - 1);
        let request_end = offset.checked_add(len as u64).ok_or_else(|| {
            io::Error::new(io::ErrorKind::InvalidInput, "read range overflows u64")
        })?;
        // Saturation only happens for the very last block of the u64 range,
        // where `request_end <= u64::MAX` still classifies correctly.
        let block_end = aligned_key.saturating_add(BLOCK_ALIGN as u64);

        // Cross-block request: bypass the slots entirely.
        if request_end > block_end {
            // Do not leave a stale length behind if the read below fails.
            self.spill_len = 0;
            self.spill_buf.resize(len, 0);
            let bytes_read = Self::read_full_at(file, &mut self.spill_buf[..len], offset)?;
            if bytes_read == 0 {
                return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "read 0 bytes"));
            }
            if bytes_read < len {
                return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "short read"));
            }
            self.spill_len = len;
            return Ok(&self.spill_buf[..len]);
        }

        // Cache hit: a non-empty slot holding this block that covers the range.
        let hit_idx = self.slots.iter().position(|slot| {
            slot.len > 0
                && slot.block_offset == aligned_key
                && request_end <= slot.block_offset.saturating_add(slot.len as u64)
        });

        if let Some(idx) = hit_idx {
            let slot = &mut self.slots[idx];
            slot.last_access = self.tick;
            self.tick += 1;
            let start = (offset - aligned_key) as usize;
            return Ok(&slot.buf[start..start + len]);
        }

        // Cache miss. Pick the slot to (re)fill, in order of preference:
        //   1. a slot already holding this block but too short for the request
        //      (avoids two slots describing the same block),
        //   2. an empty slot,
        //   3. the least recently used slot.
        // Ranking empty slots ahead of populated ones matters on a fresh
        // cache, where a just-filled block stamped with tick 0 would otherwise
        // tie with the empty slots and be evicted immediately.
        let victim_idx = self
            .slots
            .iter()
            .position(|slot| slot.len > 0 && slot.block_offset == aligned_key)
            .or_else(|| {
                self.slots
                    .iter()
                    .enumerate()
                    .min_by_key(|(_, slot)| (slot.len > 0, slot.last_access))
                    .map(|(idx, _)| idx)
            })
            .unwrap_or(0);

        let slot = &mut self.slots[victim_idx];
        // Invalidate before overwriting: if the read fails part-way the buffer
        // no longer matches the old metadata and must not be served as a hit.
        slot.len = 0;
        let bytes_read = Self::read_full_at(file, &mut slot.buf, aligned_key)?;

        // Non-empty reads that return 0 are EOF. Zero-length reads are handled
        // above as a successful no-op.
        if bytes_read == 0 {
            return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "read 0 bytes"));
        }

        slot.block_offset = aligned_key;
        slot.len = bytes_read;
        slot.last_access = self.tick;
        self.tick += 1;

        // The block may be a partial one at the end of the file; it stays
        // cached, but a request reaching past it is an error.
        let bytes_end = aligned_key.saturating_add(bytes_read as u64);
        if request_end > bytes_end {
            return Err(io::Error::new(io::ErrorKind::UnexpectedEof, "short read"));
        }

        let start = (offset - aligned_key) as usize;
        Ok(&slot.buf[start..start + len])
    }

    /// Invalidate all cache slots and the spill buffer.
    ///
    /// Slot metadata is reset to its initial values; buffers keep their
    /// allocations and the tick counter keeps running.
    pub fn clear(&mut self) {
        for slot in &mut self.slots {
            slot.block_offset = 0;
            slot.len = 0;
            slot.last_access = 0;
        }
        self.spill_len = 0;
    }

    /// Reads into `buf` from `file` at `offset` until `buf` is full or the
    /// file ends, and returns the number of bytes read.
    ///
    /// A single `read_at` call is allowed to return fewer bytes than asked
    /// for, so the call is repeated; `Interrupted` errors are retried.
    fn read_full_at(file: &File, buf: &mut [u8], offset: u64) -> io::Result<usize> {
        let mut filled = 0usize;
        while filled < buf.len() {
            let pos = offset.checked_add(filled as u64).ok_or_else(|| {
                io::Error::new(io::ErrorKind::InvalidInput, "read range overflows u64")
            })?;
            match file.read_at(&mut buf[filled..], pos) {
                Ok(0) => break,
                Ok(n) => filled += n,
                Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }
        Ok(filled)
    }
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Block size as a `u64`, for building file offsets.
    const BLOCK: u64 = BLOCK_ALIGN as u64;

    /// Creates a temporary file whose contents are exactly `data`.
    fn make_file(data: &[u8]) -> NamedTempFile {
        let mut tmp = NamedTempFile::new().unwrap();
        tmp.write_all(data).unwrap();
        tmp.flush().unwrap();
        tmp
    }

    /// Builds a temp file holding `data`, a read handle onto it and a fresh
    /// cache. The `NamedTempFile` is returned so the file outlives the test.
    fn fixture(data: &[u8]) -> (NamedTempFile, File, ReadCache) {
        let tmp = make_file(data);
        let handle = File::open(tmp.path()).unwrap();
        (tmp, handle, ReadCache::new())
    }

    /// True when some non-empty slot is keyed on `block_offset`.
    fn is_cached(cache: &ReadCache, block_offset: u64) -> bool {
        cache
            .slots
            .iter()
            .any(|slot| slot.block_offset == block_offset && slot.len > 0)
    }

    #[test]
    fn cache_hit_returns_same_data() {
        // The same range is requested twice; both reads must agree.
        let (_tmp, file, mut cache) = fixture(b"Hello, World! This is a test of the cache.");

        let reads: Vec<Vec<u8>> = (0..2)
            .map(|_| cache.get(&file, 0, 13).unwrap().to_vec())
            .collect();

        assert_eq!(reads[0], reads[1]);
        assert_eq!(reads[0].as_slice(), &b"Hello, World!"[..]);
    }

    #[test]
    fn cache_miss_reads_correct_data() {
        // Two disjoint ranges, each checked against the file contents.
        let (_tmp, file, mut cache) = fixture(b"0123456789ABCDEFGHIJ");

        let expectations = [(0u64, &b"0123456789"[..]), (10u64, &b"ABCDEFGHIJ"[..])];
        for &(offset, expected) in &expectations {
            let got = cache.get(&file, offset, 10).unwrap().to_vec();
            assert_eq!(got.as_slice(), expected);
        }
    }

    #[test]
    fn cross_block_read_uses_spill_buffer() {
        // A read straddling the 4096-byte boundary must still return the
        // right bytes, even after block 0 has been cached.
        let (_tmp, file, mut cache) = fixture(&[0xABu8; 8192]);

        let within_block = cache.get(&file, 0, 100).unwrap().to_vec();
        assert_eq!(within_block, vec![0xABu8; 100]);

        // offset=4000, len=200 covers bytes of both [0,4096) and [4096,8192).
        let straddling = cache.get(&file, 4000, 200).unwrap().to_vec();
        assert_eq!(straddling, vec![0xABu8; 200]);
    }

    #[test]
    fn empty_file_read_fails() {
        // Asking for a byte that does not exist is an error.
        let (_tmp, file, mut cache) = fixture(b"");

        assert!(cache.get(&file, 0, 1).is_err());
    }

    #[test]
    fn clear_invalidates_cache() {
        let (_tmp, file, mut cache) = fixture(b"original data here");

        // Populate, wipe, then read again: the data must come back unchanged.
        let before = cache.get(&file, 0, 10).unwrap().to_vec();
        assert_eq!(before.as_slice(), &b"original d"[..]);

        cache.clear();

        let after = cache.get(&file, 0, 10).unwrap().to_vec();
        assert_eq!(after.as_slice(), &b"original d"[..]);
    }

    // ─── New LRU-specific tests ───────────────────────────────────────────

    #[test]
    fn lru_multi_block_hit() {
        use std::io::{Seek, SeekFrom};

        // Start from four zeroed blocks, then stamp blocks 0..3 with 1, 2, 3.
        let tmp = make_file(&vec![0u8; BLOCK_ALIGN * 4]);
        let mut cache = ReadCache::new();
        {
            let mut writer = std::fs::OpenOptions::new()
                .write(true)
                .open(tmp.path())
                .unwrap();
            for (block, fill) in (1u8..=3).enumerate() {
                writer.seek(SeekFrom::Start(block as u64 * BLOCK)).unwrap();
                writer.write_all(&[fill; BLOCK_ALIGN]).unwrap();
            }
        }
        let file = File::open(tmp.path()).unwrap();

        // First pass loads each block.
        let first_pass: Vec<Vec<u8>> = (0..3u64)
            .map(|block| cache.get(&file, block * BLOCK, 16).unwrap().to_vec())
            .collect();
        for (fill, got) in (1u8..=3).zip(&first_pass) {
            assert_eq!(*got, vec![fill; 16]);
        }

        // Second pass must return exactly the same bytes.
        let second_pass: Vec<Vec<u8>> = (0..3u64)
            .map(|block| cache.get(&file, block * BLOCK, 16).unwrap().to_vec())
            .collect();
        for (again, original) in second_pass.iter().zip(&first_pass) {
            assert_eq!(again, original);
        }
    }

    #[test]
    fn lru_eviction_order() {
        // Load blocks 0..16, touch block 0 again, then load a 17th block.
        // Afterwards block 1 must no longer be cached while block 0 still is.
        let (_tmp, file, mut cache) = fixture(&vec![0u8; BLOCK_ALIGN * 20]);

        for block in 0..16u64 {
            cache.get(&file, block * BLOCK, 1).unwrap();
        }
        cache.get(&file, 0, 1).unwrap();
        cache.get(&file, 16 * BLOCK, 1).unwrap();

        assert!(!is_cached(&cache, BLOCK), "block 1 should have been evicted");
        assert!(is_cached(&cache, 0), "block 0 should still be cached");
    }

    #[test]
    fn lru_clear_all_slots() {
        // Populate three slots, clear, and check every slot is back to zero.
        let (_tmp, file, mut cache) = fixture(&vec![0x42u8; BLOCK_ALIGN * 4]);

        for block in 0..3u64 {
            cache.get(&file, block * BLOCK, 1).unwrap();
        }

        cache.clear();

        for slot in &cache.slots {
            assert_eq!(
                (slot.block_offset, slot.len, slot.last_access),
                (0u64, 0usize, 0u64)
            );
        }
        assert_eq!(cache.spill_len, 0);

        // The cache must remain usable after being cleared.
        let byte = cache.get(&file, 0, 1).unwrap();
        assert_eq!(byte[0], 0x42);
    }

    #[test]
    fn lru_aligned_keys() {
        // Offsets 100 and 200 both fall in block 0 and must share one slot.
        let (_tmp, file, mut cache) = fixture(&vec![0xEEu8; BLOCK_ALIGN]);

        for &offset in &[100u64, 200] {
            let got = cache.get(&file, offset, 10).unwrap().to_vec();
            assert_eq!(got, vec![0xEEu8; 10]);
        }

        let holders = cache
            .slots
            .iter()
            .filter(|slot| slot.block_offset == 0 && slot.len > 0)
            .count();
        assert_eq!(holders, 1, "only one slot should hold block 0");
    }

    #[test]
    fn lru_cross_block_uses_spill_buffer() {
        // File is 0xAA×4096 followed by 0xBB×4096; reading 20 bytes at 4090
        // yields six 0xAA bytes and fourteen 0xBB bytes.
        let mut data = vec![0xAAu8; BLOCK_ALIGN];
        data.resize(2 * BLOCK_ALIGN, 0xBB);
        let (_tmp, file, mut cache) = fixture(&data);

        let bytes = cache.get(&file, 4090, 20).unwrap().to_vec();
        assert_eq!(&bytes[..6], &[0xAAu8; 6]);
        assert_eq!(&bytes[6..], &[0xBBu8; 14]);

        // The cross-block read must not have populated any slot.
        for slot in &cache.slots {
            assert!(
                slot.len == 0,
                "cross-block data should not be cached in slots"
            );
        }
    }

    #[test]
    fn lru_partial_last_block() {
        // 5000-byte file: the second block holds only 904 bytes. Read all of
        // them, then a 100-byte prefix of the same block.
        let (_tmp, file, mut cache) = fixture(&vec![0x77u8; 5000]);

        for &len in &[904usize, 100] {
            let got = cache.get(&file, 4096, len).unwrap().to_vec();
            assert_eq!(got.len(), len);
            assert_eq!(got, vec![0x77u8; len]);
        }
    }

    #[test]
    fn lru_short_file_overread() {
        // Requesting 100 bytes from a 1-byte file must error, not panic.
        let (_tmp, file, mut cache) = fixture(b"X");

        assert!(cache.get(&file, 0, 100).is_err());
    }

    #[test]
    fn lru_empty_file_returns_error() {
        // A non-empty read from an empty file is an error.
        let (_tmp, file, mut cache) = fixture(b"");

        assert!(cache.get(&file, 0, 1).is_err());
    }

    #[test]
    fn long_line_spans_multiple_blocks() {
        // Two lines of 4090 bytes plus '\n' each: 'A'… then 'B'….
        let mut data = Vec::with_capacity(2 * 4091);
        for &fill in b"AB" {
            data.resize(data.len() + 4090, fill);
            data.push(b'\n');
        }
        let (_tmp, file, mut cache) = fixture(&data);

        // Line 1 occupies [0, 4091), which fits inside block 0.
        let line1 = cache.get(&file, 0, 4091).unwrap().to_vec();
        assert_eq!(&line1[..4090], &[b'A'; 4090]);
        assert_eq!(line1[4090], b'\n');

        // Line 2 occupies [4091, 8182): it starts in block 0 and ends in
        // block 1, so it is a cross-block read.
        let line2 = cache.get(&file, 4091, 4091).unwrap().to_vec();
        assert_eq!(&line2[..4090], &[b'B'; 4090]);
        assert_eq!(line2[4090], b'\n');
    }

    #[test]
    fn zero_len_read_is_noop_on_fresh_cache() {
        let (_tmp, file, mut cache) = fixture(b"");

        let bytes = cache.get(&file, 0, 0).unwrap();
        assert!(bytes.is_empty());

        // Neither the clock nor any slot may have been touched.
        assert_eq!(cache.tick, 0);
        assert!(cache.slots.iter().all(|slot| slot.len == 0));
    }

    #[test]
    fn zero_len_read_is_noop_on_populated_cache() {
        let (_tmp, file, mut cache) = fixture(b"abc");

        cache.get(&file, 0, 1).unwrap();
        let tick_before = cache.tick;

        let bytes = cache.get(&file, 0, 0).unwrap();
        assert!(bytes.is_empty());
        assert_eq!(cache.tick, tick_before);
    }

    #[test]
    fn zero_len_read_at_max_offset_is_ok() {
        let (_tmp, file, mut cache) = fixture(b"");

        let bytes = cache.get(&file, u64::MAX, 0).unwrap();
        assert!(bytes.is_empty());
        assert_eq!(cache.tick, 0);
        assert!(cache.slots.iter().all(|slot| slot.len == 0));
    }

    #[test]
    fn nonzero_read_range_overflow_returns_invalid_input() {
        let (_tmp, file, mut cache) = fixture(b"abc");

        let failure = cache.get(&file, u64::MAX, 1).unwrap_err();
        assert_eq!(failure.kind(), io::ErrorKind::InvalidInput);
    }
}