feat(core): add LineSampler for fast line count estimation

Implement head/tail 64KB sampling to estimate total line count without scanning the entire file. Also provide read_first_lines() for reading the first N lines for immediate display during progressive loading.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
dailz
2026-04-14 09:06:42 +08:00
parent 2260d60302
commit cfbe4900a5

View File

@@ -0,0 +1,337 @@
// ─── line_sampler.rs ─────────────────────────────────────────────────────────
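// Fast line-count sampling: read only the first and last 64KB of the file and
// extrapolate the total line count from the newline density of those samples.
// Small files (<= 128KB) get an exact count; for large files total I/O stays < 1MB.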
// ──────────────────────────────────────────────────────────────────────────────
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
use std::path::Path;
/// Number of bytes read from each end of the file when sampling (64KB).
///
/// Files no larger than twice this size are read completely and counted
/// exactly instead of being sampled.
const SAMPLE_SIZE: usize = 64 * 1024;
// ─── LineSample ──────────────────────────────────────────────────────────────
/// Result of a quick line-count estimation, as produced by [`sample_line_count`].
///
/// Implements `PartialEq` so that callers and tests can compare samples directly.
#[derive(Debug, Clone, PartialEq)]
pub struct LineSample {
    /// Estimated total number of lines in the file (exact when `confidence` is 1.0).
    pub estimated_lines: u64,
    /// Average bytes per line (including the newline); 0.0 for an empty file.
    pub avg_line_length: f64,
    /// Confidence in the estimate, ranging from 0.0 (low) to 1.0 (exact).
    pub confidence: f64,
}
// ─── count_newlines ──────────────────────────────────────────────────────────
/// Returns how many `\n` bytes occur in `data`.
///
/// The scan is delegated to the `memchr` crate's byte-search iterator.
fn count_newlines(data: &[u8]) -> usize {
    let newline_hits = memchr::memchr_iter(b'\n', data);
    newline_hits.count()
}
// ─── count_lines_exact ──────────────────────────────────────────────────────
/// Count lines in a complete byte buffer, matching the semantics of LineIndex:
/// - Empty data → 0 lines
/// - Trailing `\n` does not create an extra empty line
/// - No trailing `\n` → last line still counts
fn count_lines_exact(data: &[u8]) -> u64 {
if data.is_empty() {
return 0;
}
let newline_count = count_newlines(data);
let has_trailing_newline = data.last() == Some(&b'\n');
if has_trailing_newline {
newline_count as u64
} else {
(newline_count + 1) as u64
}
}
// ─── sample_line_count ──────────────────────────────────────────────────────
/// Quickly estimate the total line count of a file by sampling.
///
/// - Empty file: 0 lines, confidence 1.0.
/// - Files of at most `2 * SAMPLE_SIZE` bytes (128KB): the whole file is read and
///   counted exactly, confidence 1.0.
/// - Larger files: only the first and last `SAMPLE_SIZE` bytes are read (128KB of
///   I/O in total). The mean newline density of the two samples is extrapolated
///   to the full file size, and the confidence is the sampled fraction of the
///   file (`2 * SAMPLE_SIZE / file_size`).
///
/// # Errors
///
/// Returns any I/O error raised while opening the file, querying its metadata,
/// seeking, or reading. In particular, `read_exact` fails with `UnexpectedEof`
/// if the file shrinks between the metadata query and the sample reads.
pub fn sample_line_count(path: &Path) -> std::io::Result<LineSample> {
    let mut file = std::fs::File::open(path)?;
    let file_size = file.metadata()?.len();
    if file_size == 0 {
        return Ok(LineSample {
            estimated_lines: 0,
            avg_line_length: 0.0,
            confidence: 1.0,
        });
    }

    // Compare in u64. Narrowing `file_size` to `usize` would wrap on 32-bit
    // targets for files of 4GiB or more and could send a huge file down the
    // read-everything path below.
    let sample_len = SAMPLE_SIZE as u64;
    if file_size <= 2 * sample_len {
        // The cast is lossless here: file_size was just bounded by 2 * SAMPLE_SIZE.
        let mut buf = Vec::with_capacity(file_size as usize);
        file.read_to_end(&mut buf)?;
        let lines = count_lines_exact(&buf);
        let avg = if lines > 0 {
            file_size as f64 / lines as f64
        } else {
            0.0
        };
        return Ok(LineSample {
            estimated_lines: lines,
            avg_line_length: avg,
            confidence: 1.0,
        });
    }

    // Large file: sample the first and last SAMPLE_SIZE bytes, reusing one buffer.
    // A freshly opened file is already positioned at offset 0, so the head can
    // be read without seeking.
    let mut sample = vec![0u8; SAMPLE_SIZE];
    file.read_exact(&mut sample)?;
    let head_newlines = count_newlines(&sample);

    file.seek(SeekFrom::Start(file_size - sample_len))?;
    file.read_exact(&mut sample)?;
    let tail_newlines = count_newlines(&sample);

    let head_density = head_newlines as f64 / SAMPLE_SIZE as f64;
    let tail_density = tail_newlines as f64 / SAMPLE_SIZE as f64;
    let avg_density = (head_density + tail_density) / 2.0;

    // A non-empty file always holds at least one line, even when neither sample
    // contains a newline. When the file ends with `\n` the tail sample already
    // contributes at least one newline, so the extrapolation is >= 1 on its own
    // and this clamp only matters for an unterminated, newline-free sample pair.
    let estimated = ((avg_density * file_size as f64).round() as u64).max(1);
    let avg_line_length = file_size as f64 / estimated as f64;
    let confidence = (2.0 * SAMPLE_SIZE as f64 / file_size as f64).min(1.0);

    Ok(LineSample {
        estimated_lines: estimated,
        avg_line_length,
        confidence,
    })
}
// ─── read_first_lines ───────────────────────────────────────────────────────
/// Read up to the first `max_lines` lines of a file without building an index.
///
/// Useful for showing a quick preview before the full index is built. Trailing
/// `\n` and `\r` characters are stripped from every returned line. Fewer than
/// `max_lines` lines are returned when the file is shorter, and an empty vector
/// is returned when `max_lines` is 0 or the file is empty.
///
/// `max_lines` may be arbitrarily large (for example `usize::MAX` to mean "all
/// lines"): the up-front allocation is capped and the vector grows on demand.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read, or if a line is not
/// valid UTF-8 (`std::io::ErrorKind::InvalidData` from `read_line`).
pub fn read_first_lines(path: &Path, max_lines: usize) -> std::io::Result<Vec<String>> {
    /// Upper bound on the initial reservation, so that a huge `max_lines`
    /// cannot trigger a capacity-overflow panic or a wasteful allocation.
    const MAX_PREALLOCATED_LINES: usize = 1024;

    let file = std::fs::File::open(path)?;
    let mut reader = BufReader::new(file);
    let mut lines = Vec::with_capacity(max_lines.min(MAX_PREALLOCATED_LINES));

    for _ in 0..max_lines {
        let mut line = String::new();
        if reader.read_line(&mut line)? == 0 {
            // End of file reached before `max_lines` lines were collected.
            break;
        }
        // Strip the line terminator in place instead of copying the string.
        let content_len = line.trim_end_matches(['\n', '\r']).len();
        line.truncate(content_len);
        lines.push(line);
    }

    Ok(lines)
}
// ─── Unit tests ──────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{BufWriter, Write};

    /// Temporary file that is removed when the guard is dropped.
    ///
    /// Every test creates its fixture through this guard so the file is cleaned
    /// up even when an assertion fails and the test unwinds.
    struct TempFile {
        path: std::path::PathBuf,
    }

    impl TempFile {
        /// Reserve a path in the system temp directory without creating the file.
        /// The process id keeps concurrently running test binaries from colliding.
        fn reserve(name: &str) -> Self {
            let file_name = format!("log_viewer_test_{}_{}", std::process::id(), name);
            TempFile {
                path: std::env::temp_dir().join(file_name),
            }
        }

        /// Create a temp file holding exactly `content`.
        fn new(name: &str, content: &[u8]) -> Self {
            let tmp = Self::reserve(name);
            std::fs::write(&tmp.path, content).unwrap();
            tmp
        }

        /// Create a temp file consisting of `line` followed by `\n`, repeated
        /// `count` times. Writes are buffered to avoid one syscall per line.
        fn with_repeated_line(name: &str, line: &str, count: usize) -> Self {
            let tmp = Self::reserve(name);
            let mut out = BufWriter::new(std::fs::File::create(&tmp.path).unwrap());
            for _ in 0..count {
                writeln!(out, "{}", line).unwrap();
            }
            out.flush().unwrap();
            tmp
        }

        fn path(&self) -> &Path {
            &self.path
        }
    }

    impl Drop for TempFile {
        fn drop(&mut self) {
            let _ = std::fs::remove_file(&self.path);
        }
    }

    #[test]
    fn test_sample_empty_file() {
        let tmp = TempFile::new("empty", b"");
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 0);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_sample_single_line() {
        let tmp = TempFile::new("single", b"hello world");
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 1);
        // Small files are counted exactly, so the confidence is exactly 1.0.
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_sample_small_file() {
        let content = b"line1\nline2\nline3\nline4\nline5\n";
        let tmp = TempFile::new("small", content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 5);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_sample_small_file_no_trailing_newline() {
        let content = b"aaa\nbbb\nccc";
        let tmp = TempFile::new("small_no_nl", content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 3);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_sample_large_file_accuracy() {
        let line = "a".repeat(80);
        let lines_per_chunk = 1000;
        let num_chunks = 65;
        let total_lines = lines_per_chunk * num_chunks;
        let tmp = TempFile::with_repeated_line("large_accuracy", &line, total_lines);

        let actual_lines = total_lines as u64;
        let result = sample_line_count(tmp.path()).unwrap();
        let lower = (actual_lines as f64 * 0.85).floor() as u64;
        let upper = (actual_lines as f64 * 1.15).ceil() as u64;
        assert!(
            result.estimated_lines >= lower && result.estimated_lines <= upper,
            "estimated={}, actual={}, bounds=[{}, {}]",
            result.estimated_lines,
            actual_lines,
            lower,
            upper
        );
    }

    #[test]
    fn test_sample_performance() {
        let line = "x".repeat(100);
        let target_bytes = 50 * 1024 * 1024;
        let lines_needed = target_bytes / (line.len() + 1);
        let tmp = TempFile::with_repeated_line("perf_50mb", &line, lines_needed);

        let start = std::time::Instant::now();
        let result = sample_line_count(tmp.path()).unwrap();
        let elapsed = start.elapsed();
        assert!(
            elapsed.as_millis() < 50,
            "sampling took {:?}, expected < 50ms",
            elapsed
        );
        assert!(result.estimated_lines > 0);
    }

    #[test]
    fn test_read_first_lines_basic() {
        let content = b"first\nsecond\nthird\nfourth\n";
        let tmp = TempFile::new("first_lines", content);
        let lines = read_first_lines(tmp.path(), 3).unwrap();
        assert_eq!(lines, vec!["first", "second", "third"]);
    }

    #[test]
    fn test_read_first_lines_more_than_file() {
        let content = b"only\n";
        let tmp = TempFile::new("first_lines_short", content);
        let lines = read_first_lines(tmp.path(), 10).unwrap();
        assert_eq!(lines, vec!["only"]);
    }

    #[test]
    fn test_read_first_lines_empty() {
        let tmp = TempFile::new("first_lines_empty", b"");
        let lines = read_first_lines(tmp.path(), 5).unwrap();
        assert!(lines.is_empty());
    }

    #[test]
    fn test_exact_boundary_size() {
        // Build a file of exactly 2 * SAMPLE_SIZE bytes, the largest size that
        // is still counted exactly: full 65-byte lines plus a shorter,
        // unterminated last line that absorbs the remainder.
        let line_len = 64;
        let total_bytes = 2 * SAMPLE_SIZE;
        let full_lines = total_bytes / (line_len + 1);
        let tail_len = total_bytes % (line_len + 1);

        let mut content = Vec::with_capacity(total_bytes);
        for _ in 0..full_lines {
            content.extend(std::iter::repeat(b'a').take(line_len));
            content.push(b'\n');
        }
        content.extend(std::iter::repeat(b'a').take(tail_len));
        assert_eq!(content.len(), total_bytes);

        let expected_lines = (full_lines + usize::from(tail_len > 0)) as u64;
        let tmp = TempFile::new("boundary", &content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, expected_lines);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_just_above_boundary_is_sampled() {
        // One byte past the exact-count threshold must switch to sampling.
        let mut content = b"0123456\n".repeat(2 * SAMPLE_SIZE / 8);
        content.push(b'x');
        assert_eq!(content.len(), 2 * SAMPLE_SIZE + 1);

        let tmp = TempFile::new("above_boundary", &content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert!(result.confidence < 1.0);
        assert!(result.estimated_lines > 0);
    }

    #[test]
    fn test_avg_line_length() {
        // 3 lines: "aa\n" (3B), "bb\n" (3B), "cc" (2B) = 8 bytes total
        let content = b"aa\nbb\ncc";
        let tmp = TempFile::new("avg_len", content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 3);
        let expected_avg = 8.0 / 3.0;
        assert!((result.avg_line_length - expected_avg).abs() < 0.01);
    }
}