feat(core): add LineSampler for fast line count estimation
Implement head/tail 64KB sampling to estimate total line count without scanning the entire file. Also provide read_first_lines() for reading the first N lines for immediate display during progressive loading. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
337
crates/core/src/io/line_sampler.rs
Normal file
337
crates/core/src/io/line_sampler.rs
Normal file
@@ -0,0 +1,337 @@
|
|||||||
|
// ─── line_sampler.rs ─────────────────────────────────────────────────────────
|
||||||
|
// Fast line-count sampling: read only the first and last 64KB of the file and extrapolate the total line count from the newline density.
|
||||||
|
// Small files (≤128KB) get an exact count; for large files the total I/O always stays below 1MB.
|
||||||
|
// ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
/// Number of bytes read from each end of the file when sampling (64 KiB).
///
/// Files no larger than twice this size are read in full and counted exactly.
const SAMPLE_SIZE: usize = 64 << 10;
|
||||||
|
|
||||||
|
// ─── LineSample ──────────────────────────────────────────────────────────────
|
||||||
|
/// Result of a quick line-count estimation.
///
/// `PartialEq` is derived so that two samples can be compared directly
/// (for example in assertions). `Eq` is not available because of the
/// floating-point fields.
#[derive(Debug, Clone, PartialEq)]
pub struct LineSample {
    /// Estimated total number of lines in the file.
    pub estimated_lines: u64,
    /// Average bytes per line (including the newline).
    pub avg_line_length: f64,
    /// Confidence in the estimate: 0.0 (low) – 1.0 (exact).
    pub confidence: f64,
}
|
||||||
|
|
||||||
|
// ─── count_newlines ──────────────────────────────────────────────────────────
|
||||||
|
/// Count `\n` occurrences in a byte slice using SIMD-accelerated memchr.
|
||||||
|
fn count_newlines(data: &[u8]) -> usize {
|
||||||
|
memchr::memchr_iter(b'\n', data).count()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── count_lines_exact ──────────────────────────────────────────────────────
|
||||||
|
/// Count lines in a complete byte buffer, matching the semantics of LineIndex:
|
||||||
|
/// - Empty data → 0 lines
|
||||||
|
/// - Trailing `\n` does not create an extra empty line
|
||||||
|
/// - No trailing `\n` → last line still counts
|
||||||
|
fn count_lines_exact(data: &[u8]) -> u64 {
|
||||||
|
if data.is_empty() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
let newline_count = count_newlines(data);
|
||||||
|
let has_trailing_newline = data.last() == Some(&b'\n');
|
||||||
|
if has_trailing_newline {
|
||||||
|
newline_count as u64
|
||||||
|
} else {
|
||||||
|
(newline_count + 1) as u64
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── sample_line_count ──────────────────────────────────────────────────────
|
||||||
|
/// Quickly estimate the total line count of a file by sampling.
|
||||||
|
///
|
||||||
|
/// - Files ≤ 128KB: exact count, confidence = 1.0
|
||||||
|
/// - Larger files: reads only head 64KB + tail 64KB, extrapolates, IO < 1MB
|
||||||
|
pub fn sample_line_count(path: &Path) -> std::io::Result<LineSample> {
|
||||||
|
let mut file = std::fs::File::open(path)?;
|
||||||
|
let file_size = file.metadata()?.len();
|
||||||
|
|
||||||
|
if file_size == 0 {
|
||||||
|
return Ok(LineSample {
|
||||||
|
estimated_lines: 0,
|
||||||
|
avg_line_length: 0.0,
|
||||||
|
confidence: 1.0,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if file_size as usize <= 2 * SAMPLE_SIZE {
|
||||||
|
let mut buf = Vec::with_capacity(file_size as usize);
|
||||||
|
file.read_to_end(&mut buf)?;
|
||||||
|
let lines = count_lines_exact(&buf);
|
||||||
|
let avg = if lines > 0 {
|
||||||
|
file_size as f64 / lines as f64
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
return Ok(LineSample {
|
||||||
|
estimated_lines: lines,
|
||||||
|
avg_line_length: avg,
|
||||||
|
confidence: 1.0,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Large file: sample head 64KB + tail 64KB
|
||||||
|
let mut head_buf = vec![0u8; SAMPLE_SIZE];
|
||||||
|
file.seek(SeekFrom::Start(0))?;
|
||||||
|
file.read_exact(&mut head_buf)?;
|
||||||
|
|
||||||
|
let tail_start = file_size - SAMPLE_SIZE as u64;
|
||||||
|
let mut tail_buf = vec![0u8; SAMPLE_SIZE];
|
||||||
|
file.seek(SeekFrom::Start(tail_start))?;
|
||||||
|
file.read_exact(&mut tail_buf)?;
|
||||||
|
|
||||||
|
let head_newlines = count_newlines(&head_buf);
|
||||||
|
let tail_newlines = count_newlines(&tail_buf);
|
||||||
|
|
||||||
|
let head_density = head_newlines as f64 / SAMPLE_SIZE as f64;
|
||||||
|
let tail_density = tail_newlines as f64 / SAMPLE_SIZE as f64;
|
||||||
|
let avg_density = (head_density + tail_density) / 2.0;
|
||||||
|
|
||||||
|
let estimated = (avg_density * file_size as f64).round() as u64;
|
||||||
|
let estimated = if tail_buf.last() == Some(&b'\n') {
|
||||||
|
estimated
|
||||||
|
} else {
|
||||||
|
estimated.max(1)
|
||||||
|
};
|
||||||
|
|
||||||
|
let avg_line_length = if estimated > 0 {
|
||||||
|
file_size as f64 / estimated as f64
|
||||||
|
} else {
|
||||||
|
file_size as f64
|
||||||
|
};
|
||||||
|
|
||||||
|
let confidence = (2.0 * SAMPLE_SIZE as f64 / file_size as f64).min(1.0);
|
||||||
|
|
||||||
|
Ok(LineSample {
|
||||||
|
estimated_lines: estimated,
|
||||||
|
avg_line_length,
|
||||||
|
confidence,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── read_first_lines ───────────────────────────────────────────────────────
|
||||||
|
/// Read the first `max_lines` lines from a file without building an index.
/// Useful for showing a quick preview before the full index is built.
///
/// Each returned line has all trailing `\n` and `\r` characters removed. Fewer
/// than `max_lines` lines are returned when the file ends first.
///
/// `max_lines` may be arbitrarily large (for example `usize::MAX` to mean
/// "everything"): the up-front allocation is capped and the vector grows on
/// demand beyond that.
///
/// # Errors
///
/// Returns any I/O error from opening or reading the file. `read_line` also
/// fails with `InvalidData` when a line is not valid UTF-8.
pub fn read_first_lines(path: &Path, max_lines: usize) -> std::io::Result<Vec<String>> {
    /// Upper bound on the speculative preallocation, so a huge `max_lines`
    /// cannot trigger a capacity-overflow panic or a giant allocation.
    const MAX_PREALLOC_LINES: usize = 1024;

    let file = std::fs::File::open(path)?;
    let mut reader = BufReader::new(file);
    let mut lines = Vec::with_capacity(max_lines.min(MAX_PREALLOC_LINES));

    // One scratch buffer reused for every line; only the trimmed text is copied out.
    let mut buf = String::new();
    while lines.len() < max_lines {
        buf.clear();
        if reader.read_line(&mut buf)? == 0 {
            // End of file.
            break;
        }
        lines.push(buf.trim_end_matches(['\n', '\r']).to_owned());
    }

    Ok(lines)
}
|
||||||
|
|
||||||
|
// ─── Unit tests ──────────────────────────────────────────────────────────────
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{BufWriter, Write};
    use std::path::PathBuf;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Counter that keeps temp-file names unique within this test process.
    static NEXT_ID: AtomicUsize = AtomicUsize::new(0);

    /// Temporary file that is removed when the guard is dropped, including
    /// while unwinding from a failed assertion.
    struct TempFile {
        path: PathBuf,
    }

    impl TempFile {
        /// Creates a temp file holding exactly `content`.
        fn new(name: &str, content: &[u8]) -> Self {
            Self::with_writer(name, |out| out.write_all(content))
        }

        /// Creates a temp file whose content is streamed by `fill` through a
        /// buffered writer, so large fixtures do not cost one syscall per line.
        fn with_writer<F>(name: &str, fill: F) -> Self
        where
            F: FnOnce(&mut BufWriter<std::fs::File>) -> std::io::Result<()>,
        {
            // Process id + counter: concurrent test runs and parallel test
            // threads never share a path in the common temp directory.
            let id = NEXT_ID.fetch_add(1, Ordering::Relaxed);
            let path = std::env::temp_dir().join(format!(
                "log_viewer_test_{}_{}_{}",
                name,
                std::process::id(),
                id
            ));
            // Build the guard first so the file is cleaned up even if writing panics.
            let guard = TempFile { path };
            let file = std::fs::File::create(&guard.path).unwrap();
            let mut out = BufWriter::new(file);
            fill(&mut out).unwrap();
            out.flush().unwrap();
            guard
        }

        /// Path of the underlying file.
        fn path(&self) -> &Path {
            &self.path
        }
    }

    impl Drop for TempFile {
        fn drop(&mut self) {
            // Best-effort cleanup; a missing file is not an error here.
            let _ = std::fs::remove_file(&self.path);
        }
    }

    #[test]
    fn test_sample_empty_file() {
        let tmp = TempFile::new("empty", b"");
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 0);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_sample_single_line() {
        let tmp = TempFile::new("single", b"hello world");
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 1);
        assert!(result.confidence > 0.0);
    }

    #[test]
    fn test_sample_small_file() {
        let content = b"line1\nline2\nline3\nline4\nline5\n";
        let tmp = TempFile::new("small", content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 5);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_sample_small_file_no_trailing_newline() {
        let content = b"aaa\nbbb\nccc";
        let tmp = TempFile::new("small_no_nl", content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 3);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_sample_large_file_accuracy() {
        // 65 000 uniform lines of 81 bytes each (~5 MB): well past the
        // exact-count threshold, so the sampling path is exercised.
        let line = "a".repeat(80);
        let lines_per_chunk = 1000;
        let num_chunks = 65;
        let total_lines = lines_per_chunk * num_chunks;

        let tmp = TempFile::with_writer("large_accuracy", |out| {
            for _ in 0..total_lines {
                writeln!(out, "{}", line)?;
            }
            Ok(())
        });

        let actual_lines = total_lines as u64;
        let result = sample_line_count(tmp.path()).unwrap();

        // Accept a ±15% estimation error.
        let lower = (actual_lines as f64 * 0.85).floor() as u64;
        let upper = (actual_lines as f64 * 1.15).ceil() as u64;

        assert!(
            result.estimated_lines >= lower && result.estimated_lines <= upper,
            "estimated={}, actual={}, bounds=[{}, {}]",
            result.estimated_lines,
            actual_lines,
            lower,
            upper
        );
    }

    #[test]
    fn test_sample_performance() {
        let line = "x".repeat(100);
        let target_bytes = 50 * 1024 * 1024;
        let lines_needed = target_bytes / (line.len() + 1);

        // Streamed through the buffered writer: no large in-memory fixture.
        let tmp = TempFile::with_writer("perf_50mb", |out| {
            for _ in 0..lines_needed {
                writeln!(out, "{}", line)?;
            }
            Ok(())
        });

        let start = std::time::Instant::now();
        let result = sample_line_count(tmp.path()).unwrap();
        let elapsed = start.elapsed();

        assert!(
            elapsed.as_millis() < 50,
            "sampling took {:?}, expected < 50ms",
            elapsed
        );
        assert!(result.estimated_lines > 0);
    }

    #[test]
    fn test_read_first_lines_basic() {
        let content = b"first\nsecond\nthird\nfourth\n";
        let tmp = TempFile::new("first_lines", content);
        let lines = read_first_lines(tmp.path(), 3).unwrap();
        assert_eq!(lines, vec!["first", "second", "third"]);
    }

    #[test]
    fn test_read_first_lines_more_than_file() {
        let content = b"only\n";
        let tmp = TempFile::new("first_lines_short", content);
        let lines = read_first_lines(tmp.path(), 10).unwrap();
        assert_eq!(lines, vec!["only"]);
    }

    #[test]
    fn test_read_first_lines_empty() {
        let tmp = TempFile::new("first_lines_empty", b"");
        let lines = read_first_lines(tmp.path(), 5).unwrap();
        assert!(lines.is_empty());
    }

    #[test]
    fn test_exact_boundary_size() {
        // Largest whole number of 65-byte lines that fits in 2 * SAMPLE_SIZE;
        // the file must still take the exact-count path.
        let line_len = 64;
        let total_bytes = 2 * SAMPLE_SIZE;
        let num_lines = total_bytes / (line_len + 1);
        let line = "a".repeat(line_len);

        let tmp = TempFile::with_writer("boundary", |out| {
            // Every line but the last is newline-terminated.
            for _ in 1..num_lines {
                writeln!(out, "{}", line)?;
            }
            write!(out, "{}", line)
        });

        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, num_lines as u64);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_avg_line_length() {
        // 3 lines: "aa\n" (3B), "bb\n" (3B), "cc" (2B) = 8 bytes total
        let content = b"aa\nbb\ncc";
        let tmp = TempFile::new("avg_len", content);
        let result = sample_line_count(tmp.path()).unwrap();
        assert_eq!(result.estimated_lines, 3);
        let expected_avg = 8.0 / 3.0;
        assert!((result.avg_line_length - expected_avg).abs() < 0.01);
    }
}
|
||||||
Reference in New Issue
Block a user