Replace `static mut OLD_SIGBUS_HANDLER` with AtomicU8 + AtomicPtr to remove data race UB when concurrent benchmarks call open() from multiple threads. Key changes: - Use `Once::call_once` to guarantee single handler installation - Publish old handler to atomics BEFORE installing new handler (closes the handler-active-but-state-unpublished race window) - Read atomics with Acquire in signal handler (async-signal-safe) - Align si_addr to page boundary before mmap(MAP_FIXED) - Add concurrent test: 8 threads open all 5 variants simultaneously
111 lines
3.9 KiB
Rust
111 lines
3.9 KiB
Rust
// ─── line_index.rs ───────────────────────────────────────────────────────────
// Vendored from crates/core/src/io/line_index.rs
// Sparse line index: stores the byte offset of every 256th line to reduce memory usage.
// ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
/// Sampling stride of the sparse index, in lines: the byte offset of every
/// `BLOCK_SIZE`-th line (0, 256, 512, ...) is recorded, and a lookup scans
/// forward at most `BLOCK_SIZE - 1` newlines from the nearest recorded offset.
const BLOCK_SIZE: usize = 256;
|
|
|
|
/// Sparse index over the lines of a byte stream.
///
/// Only the start offset of every 256th line is stored, so the index stays
/// small even for very large inputs; the remaining lines are located by
/// scanning forward from the nearest stored offset.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LineIndex {
    /// Byte offset at which each sampled line (line 0, 256, 512, ...) starts.
    /// Empty when the indexed input contained no bytes.
    pub(crate) sampled_offsets: Vec<u64>,
    /// Total number of lines in the indexed input.
    pub(crate) total_lines: u64,
    /// Whether the last byte of the indexed input was `\n`.
    // Recorded while building the index but not read by the methods in this
    // file, hence the lint suppression.
    #[allow(dead_code)]
    pub(crate) has_trailing_newline: bool,
}
|
|
|
|
impl LineIndex {
|
|
/// Build sparse line index from a streaming reader.
|
|
/// Uses fill_buf()/consume() to avoid loading the entire file into memory.
|
|
/// RSS stays at ~64KB (BufReader buffer size), independent of file size.
|
|
pub fn from_reader(reader: &mut impl std::io::BufRead) -> std::io::Result<Self> {
|
|
let mut sampled_offsets: Vec<u64> = vec![0]; // line 0 starts at offset 0
|
|
let mut next_line_idx: usize = 1;
|
|
let mut newline_count: usize = 0;
|
|
let mut chunk_offset: u64 = 0;
|
|
let mut last_byte: Option<u8> = None;
|
|
|
|
loop {
|
|
let buf = reader.fill_buf()?;
|
|
if buf.is_empty() {
|
|
break;
|
|
}
|
|
|
|
if let Some(&b) = buf.last() {
|
|
last_byte = Some(b);
|
|
}
|
|
|
|
for pos in memchr::memchr_iter(b'\n', buf) {
|
|
newline_count += 1;
|
|
if next_line_idx.is_multiple_of(BLOCK_SIZE) {
|
|
sampled_offsets.push(chunk_offset + pos as u64 + 1);
|
|
}
|
|
next_line_idx += 1;
|
|
}
|
|
|
|
let consumed = buf.len();
|
|
chunk_offset += consumed as u64;
|
|
reader.consume(consumed);
|
|
}
|
|
|
|
// Empty file: no data at all
|
|
if chunk_offset == 0 {
|
|
return Ok(LineIndex {
|
|
sampled_offsets: vec![],
|
|
total_lines: 0,
|
|
has_trailing_newline: false,
|
|
});
|
|
}
|
|
|
|
let has_trailing_newline = last_byte == Some(b'\n') && newline_count > 0;
|
|
|
|
let total_lines: u64 = if has_trailing_newline && newline_count > 0 {
|
|
newline_count as u64
|
|
} else {
|
|
(1 + newline_count) as u64
|
|
};
|
|
|
|
// Trailing \n pop logic
|
|
if has_trailing_newline && newline_count > 0 {
|
|
let trailing_line_idx = newline_count;
|
|
if trailing_line_idx.is_multiple_of(BLOCK_SIZE) {
|
|
sampled_offsets.pop();
|
|
}
|
|
}
|
|
|
|
Ok(LineIndex {
|
|
sampled_offsets,
|
|
total_lines,
|
|
has_trailing_newline,
|
|
})
|
|
}
|
|
|
|
/// Return total line count.
|
|
pub fn line_count(&self) -> usize {
|
|
self.total_lines as usize
|
|
}
|
|
|
|
/// Retrieve the content of line `idx` from the given data slice.
|
|
/// Uses sparse index to locate the block start, then scans forward
|
|
/// a small number of newlines to find the target line.
|
|
pub fn get_line<'a>(&self, data: &'a [u8], idx: usize) -> Option<&'a str> {
|
|
if idx >= self.total_lines as usize || data.is_empty() {
|
|
return None;
|
|
}
|
|
let block = idx / BLOCK_SIZE;
|
|
let offset_in_block = idx % BLOCK_SIZE;
|
|
let mut pos = self.sampled_offsets[block] as usize;
|
|
for _ in 0..offset_in_block {
|
|
match memchr::memchr(b'\n', &data[pos..]) {
|
|
Some(rel) => pos = pos + rel + 1,
|
|
None => return None,
|
|
}
|
|
}
|
|
let end = memchr::memchr(b'\n', &data[pos..])
|
|
.map(|rel| pos + rel)
|
|
.unwrap_or(data.len());
|
|
let line_bytes = &data[pos..end];
|
|
std::str::from_utf8(line_bytes)
|
|
.map(|s| s.trim_end_matches(['\r', '\n']))
|
|
.ok()
|
|
}
|
|
}
|