// ─── line_index.rs ─────────────────────────────────────────────────────────── // Vendored from crates/core/src/io/line_index.rs // Sparse line index: sample every 256 lines to reduce memory usage. // ────────────────────────────────────────────────────────────────────────────── const BLOCK_SIZE: usize = 256; pub struct LineIndex { pub(crate) sampled_offsets: Vec, pub(crate) total_lines: u64, #[allow(dead_code)] pub(crate) has_trailing_newline: bool, } impl LineIndex { /// Build sparse line index from a streaming reader. /// Uses fill_buf()/consume() to avoid loading the entire file into memory. /// RSS stays at ~64KB (BufReader buffer size), independent of file size. pub fn from_reader(reader: &mut impl std::io::BufRead) -> std::io::Result { let mut sampled_offsets: Vec = vec![0]; // line 0 starts at offset 0 let mut next_line_idx: usize = 1; let mut newline_count: usize = 0; let mut chunk_offset: u64 = 0; let mut last_byte: Option = None; loop { let buf = reader.fill_buf()?; if buf.is_empty() { break; } if let Some(&b) = buf.last() { last_byte = Some(b); } for pos in memchr::memchr_iter(b'\n', buf) { newline_count += 1; if next_line_idx.is_multiple_of(BLOCK_SIZE) { sampled_offsets.push(chunk_offset + pos as u64 + 1); } next_line_idx += 1; } let consumed = buf.len(); chunk_offset += consumed as u64; reader.consume(consumed); } // Empty file: no data at all if chunk_offset == 0 { return Ok(LineIndex { sampled_offsets: vec![], total_lines: 0, has_trailing_newline: false, }); } let has_trailing_newline = last_byte == Some(b'\n') && newline_count > 0; let total_lines: u64 = if has_trailing_newline && newline_count > 0 { newline_count as u64 } else { (1 + newline_count) as u64 }; // Trailing \n pop logic if has_trailing_newline && newline_count > 0 { let trailing_line_idx = newline_count; if trailing_line_idx.is_multiple_of(BLOCK_SIZE) { sampled_offsets.pop(); } } Ok(LineIndex { sampled_offsets, total_lines, has_trailing_newline, }) } /// Return total line count. pub fn line_count(&self) -> usize { self.total_lines as usize } /// Retrieve the content of line `idx` from the given data slice. /// Uses sparse index to locate the block start, then scans forward /// a small number of newlines to find the target line. pub fn get_line<'a>(&self, data: &'a [u8], idx: usize) -> Option<&'a str> { if idx >= self.total_lines as usize || data.is_empty() { return None; } let block = idx / BLOCK_SIZE; let offset_in_block = idx % BLOCK_SIZE; let mut pos = self.sampled_offsets[block] as usize; for _ in 0..offset_in_block { match memchr::memchr(b'\n', &data[pos..]) { Some(rel) => pos = pos + rel + 1, None => return None, } } let end = memchr::memchr(b'\n', &data[pos..]) .map(|rel| pos + rel) .unwrap_or(data.len()); let line_bytes = &data[pos..end]; std::str::from_utf8(line_bytes) .map(|s| s.trim_end_matches(['\r', '\n'])) .ok() } }