From cfbe4900a50d7e34ec2975c670fdadc34d9796cf Mon Sep 17 00:00:00 2001 From: dailz Date: Tue, 14 Apr 2026 09:06:42 +0800 Subject: [PATCH] feat(core): add LineSampler for fast line count estimation Implement head/tail 64KB sampling to estimate total line count without scanning the entire file. Also provide read_first_lines() for reading the first N lines for immediate display during progressive loading. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- crates/core/src/io/line_sampler.rs | 337 +++++++++++++++++++++++++++++ 1 file changed, 337 insertions(+) create mode 100644 crates/core/src/io/line_sampler.rs diff --git a/crates/core/src/io/line_sampler.rs b/crates/core/src/io/line_sampler.rs new file mode 100644 index 0000000..fb21c1d --- /dev/null +++ b/crates/core/src/io/line_sampler.rs @@ -0,0 +1,337 @@ +// ─── line_sampler.rs ───────────────────────────────────────────────────────── +// 快速行数采样:仅读取文件头部和尾部各 64KB,通过换行符密度外推总行数。 +// 对于小文件(≤128KB)返回精确计数;对于大文件,IO 总量始终 < 1MB。 +// ────────────────────────────────────────────────────────────────────────────── + +use std::io::{BufRead, BufReader, Read, Seek, SeekFrom}; +use std::path::Path; + +/// Each sample reads at most 64KB from head and tail. +const SAMPLE_SIZE: usize = 64 * 1024; + +// ─── LineSample ────────────────────────────────────────────────────────────── +/// Result of a quick line-count estimation. +#[derive(Debug, Clone)] +pub struct LineSample { + /// Estimated total number of lines in the file. + pub estimated_lines: u64, + /// Average bytes per line (including the newline). + pub avg_line_length: f64, + /// Confidence in the estimate: 0.0 (low) – 1.0 (exact). + pub confidence: f64, +} + +// ─── count_newlines ────────────────────────────────────────────────────────── +/// Count `\n` occurrences in a byte slice using SIMD-accelerated memchr. +fn count_newlines(data: &[u8]) -> usize { + memchr::memchr_iter(b'\n', data).count() +} + +// ─── count_lines_exact ────────────────────────────────────────────────────── +/// Count lines in a complete byte buffer, matching the semantics of LineIndex: +/// - Empty data → 0 lines +/// - Trailing `\n` does not create an extra empty line +/// - No trailing `\n` → last line still counts +fn count_lines_exact(data: &[u8]) -> u64 { + if data.is_empty() { + return 0; + } + let newline_count = count_newlines(data); + let has_trailing_newline = data.last() == Some(&b'\n'); + if has_trailing_newline { + newline_count as u64 + } else { + (newline_count + 1) as u64 + } +} + +// ─── sample_line_count ────────────────────────────────────────────────────── +/// Quickly estimate the total line count of a file by sampling. +/// +/// - Files ≤ 128KB: exact count, confidence = 1.0 +/// - Larger files: reads only head 64KB + tail 64KB, extrapolates, IO < 1MB +pub fn sample_line_count(path: &Path) -> std::io::Result { + let mut file = std::fs::File::open(path)?; + let file_size = file.metadata()?.len(); + + if file_size == 0 { + return Ok(LineSample { + estimated_lines: 0, + avg_line_length: 0.0, + confidence: 1.0, + }); + } + + if file_size as usize <= 2 * SAMPLE_SIZE { + let mut buf = Vec::with_capacity(file_size as usize); + file.read_to_end(&mut buf)?; + let lines = count_lines_exact(&buf); + let avg = if lines > 0 { + file_size as f64 / lines as f64 + } else { + 0.0 + }; + return Ok(LineSample { + estimated_lines: lines, + avg_line_length: avg, + confidence: 1.0, + }); + } + + // Large file: sample head 64KB + tail 64KB + let mut head_buf = vec![0u8; SAMPLE_SIZE]; + file.seek(SeekFrom::Start(0))?; + file.read_exact(&mut head_buf)?; + + let tail_start = file_size - SAMPLE_SIZE as u64; + let mut tail_buf = vec![0u8; SAMPLE_SIZE]; + file.seek(SeekFrom::Start(tail_start))?; + file.read_exact(&mut tail_buf)?; + + let head_newlines = count_newlines(&head_buf); + let tail_newlines = count_newlines(&tail_buf); + + let head_density = head_newlines as f64 / SAMPLE_SIZE as f64; + let tail_density = tail_newlines as f64 / SAMPLE_SIZE as f64; + let avg_density = (head_density + tail_density) / 2.0; + + let estimated = (avg_density * file_size as f64).round() as u64; + let estimated = if tail_buf.last() == Some(&b'\n') { + estimated + } else { + estimated.max(1) + }; + + let avg_line_length = if estimated > 0 { + file_size as f64 / estimated as f64 + } else { + file_size as f64 + }; + + let confidence = (2.0 * SAMPLE_SIZE as f64 / file_size as f64).min(1.0); + + Ok(LineSample { + estimated_lines: estimated, + avg_line_length, + confidence, + }) +} + +// ─── read_first_lines ─────────────────────────────────────────────────────── +/// Read the first `max_lines` lines from a file without building an index. +/// Useful for showing a quick preview before the full index is built. +pub fn read_first_lines(path: &Path, max_lines: usize) -> std::io::Result> { + let file = std::fs::File::open(path)?; + let mut reader = BufReader::new(file); + let mut lines = Vec::with_capacity(max_lines); + + for _ in 0..max_lines { + let mut line = String::new(); + let bytes_read = reader.read_line(&mut line)?; + if bytes_read == 0 { + break; + } + let trimmed = line.trim_end_matches(['\n', '\r']).to_owned(); + lines.push(trimmed); + } + + Ok(lines) +} + +// ─── 单元测试 ──────────────────────────────────────────────────────────────── +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + /// Helper: create a temp file with given content, return its path. + /// The file is cleaned up when TempFile is dropped. + struct TempFile { + path: std::path::PathBuf, + } + + impl TempFile { + fn new(name: &str, content: &[u8]) -> Self { + let dir = std::env::temp_dir(); + let path = dir.join(format!("log_viewer_test_{}", name)); + let mut f = std::fs::File::create(&path).unwrap(); + f.write_all(content).unwrap(); + TempFile { path } + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TempFile { + fn drop(&mut self) { + let _ = std::fs::remove_file(&self.path); + } + } + + #[test] + fn test_sample_empty_file() { + let tmp = TempFile::new("empty", b""); + let result = sample_line_count(tmp.path()).unwrap(); + assert_eq!(result.estimated_lines, 0); + assert_eq!(result.confidence, 1.0); + } + + #[test] + fn test_sample_single_line() { + let tmp = TempFile::new("single", b"hello world"); + let result = sample_line_count(tmp.path()).unwrap(); + assert_eq!(result.estimated_lines, 1); + assert!(result.confidence > 0.0); + } + + #[test] + fn test_sample_small_file() { + let content = b"line1\nline2\nline3\nline4\nline5\n"; + let tmp = TempFile::new("small", content); + let result = sample_line_count(tmp.path()).unwrap(); + assert_eq!(result.estimated_lines, 5); + assert_eq!(result.confidence, 1.0); + } + + #[test] + fn test_sample_small_file_no_trailing_newline() { + let content = b"aaa\nbbb\nccc"; + let tmp = TempFile::new("small_no_nl", content); + let result = sample_line_count(tmp.path()).unwrap(); + assert_eq!(result.estimated_lines, 3); + assert_eq!(result.confidence, 1.0); + } + + #[test] + fn test_sample_large_file_accuracy() { + let line = "a".repeat(80); + let lines_per_chunk = 1000; + let num_chunks = 65; + + let dir = std::env::temp_dir(); + let path = dir.join("log_viewer_test_large_accuracy"); + let mut f = std::fs::File::create(&path).unwrap(); + + for _ in 0..num_chunks { + for _ in 0..lines_per_chunk { + writeln!(f, "{}", line).unwrap(); + } + } + drop(f); + + let actual_lines = (lines_per_chunk * num_chunks) as u64; + let result = sample_line_count(&path).unwrap(); + + let lower = (actual_lines as f64 * 0.85).floor() as u64; + let upper = (actual_lines as f64 * 1.15).ceil() as u64; + + assert!( + result.estimated_lines >= lower && result.estimated_lines <= upper, + "estimated={}, actual={}, bounds=[{}, {}]", + result.estimated_lines, + actual_lines, + lower, + upper + ); + + let _ = std::fs::remove_file(&path); + } + + #[test] + fn test_sample_performance() { + let line = "x".repeat(100); + let target_bytes = 50 * 1024 * 1024; + let lines_needed = target_bytes / (line.len() + 1); + + let dir = std::env::temp_dir(); + let path = dir.join("log_viewer_test_perf_50mb"); + let mut f = std::fs::File::create(&path).unwrap(); + + // Write in chunks to avoid excessive memory + let chunk_size = 10_000; + let mut written = 0; + while written < lines_needed { + let batch = chunk_size.min(lines_needed - written); + for _ in 0..batch { + writeln!(f, "{}", line).unwrap(); + } + written += batch; + } + drop(f); + + let start = std::time::Instant::now(); + let result = sample_line_count(&path).unwrap(); + let elapsed = start.elapsed(); + + assert!( + elapsed.as_millis() < 50, + "sampling took {:?}, expected < 50ms", + elapsed + ); + assert!(result.estimated_lines > 0); + + let _ = std::fs::remove_file(&path); + } + + #[test] + fn test_read_first_lines_basic() { + let content = b"first\nsecond\nthird\nfourth\n"; + let tmp = TempFile::new("first_lines", content); + let lines = read_first_lines(tmp.path(), 3).unwrap(); + assert_eq!(lines, vec!["first", "second", "third"]); + } + + #[test] + fn test_read_first_lines_more_than_file() { + let content = b"only\n"; + let tmp = TempFile::new("first_lines_short", content); + let lines = read_first_lines(tmp.path(), 10).unwrap(); + assert_eq!(lines, vec!["only"]); + } + + #[test] + fn test_read_first_lines_empty() { + let tmp = TempFile::new("first_lines_empty", b""); + let lines = read_first_lines(tmp.path(), 5).unwrap(); + assert!(lines.is_empty()); + } + + #[test] + fn test_exact_boundary_size() { + let line_len = 64; + let total_bytes = 2 * SAMPLE_SIZE; + let num_lines = total_bytes / (line_len + 1); + let line = "a".repeat(line_len); + + let dir = std::env::temp_dir(); + let path = dir.join("log_viewer_test_boundary"); + let mut f = std::fs::File::create(&path).unwrap(); + for i in 0..num_lines { + if i < num_lines - 1 { + writeln!(f, "{}", line).unwrap(); + } else { + write!(f, "{}", line).unwrap(); + } + } + drop(f); + + let result = sample_line_count(&path).unwrap(); + assert_eq!(result.estimated_lines, num_lines as u64); + assert_eq!(result.confidence, 1.0); + + let _ = std::fs::remove_file(&path); + } + + #[test] + fn test_avg_line_length() { + // 3 lines: "aa\n" (3B), "bb\n" (3B), "cc" (2B) = 8 bytes total + let content = b"aa\nbb\ncc"; + let tmp = TempFile::new("avg_len", content); + let result = sample_line_count(tmp.path()).unwrap(); + assert_eq!(result.estimated_lines, 3); + let expected_avg = 8.0 / 3.0; + assert!((result.avg_line_length - expected_avg).abs() < 0.01); + } +}