From 62a176441eb105dae19cc6a9c7cec53bd37e261c Mon Sep 17 00:00:00 2001
From: dailz <dailz@example.com>
Date: Sun, 12 Apr 2026 20:50:58 +0800
Subject: [PATCH] feat(core): refactor FileReader to use mmap with low-RSS
 index building

Replace Vec<u8> with memmap2::Mmap for file content. Build line index via BufReader streaming (from_reader) instead of scanning mmap'd data, keeping RSS at ~3MB for 5GB files instead of ~5GB.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 crates/core/src/io/file_reader.rs | 341 +++++++++++-------------------
 1 file changed, 121 insertions(+), 220 deletions(-)
diff --git a/crates/core/src/io/file_reader.rs b/crates/core/src/io/file_reader.rs
index 3a0c2ab..f623172 100644
--- a/crates/core/src/io/file_reader.rs
+++ b/crates/core/src/io/file_reader.rs
@@ -1,294 +1,195 @@
 // ─── file_reader.rs ─────────────────────────────────────────────────────────
-// 这个文件定义了一个"文件读取器"（FileReader）结构体，用于将日志文件一次性
-// 读入内存，并支持按行号快速访问文件中的任意一行内容。
+// 文件读取器：使用 mmap（内存映射）将文件内容映射到虚拟地址空间，
+// 避免将整个文件拷贝到用户态缓冲区。配合稀疏行索引 LineIndex，
+// 支持按行号快速访问文件中的任意一行内容。
 // ──────────────────────────────────────────────────────────────────────────────
 
-// `use` 语句用于引入其他模块中定义的类型，类似于 Python 的 import 或 C++ 的 #include。
-// 这里从 crate（当前项目）的 error 模块中引入了 CoreError 和 Result 两个类型。
-//   - CoreError: 项目自定义的错误类型，用于表示各种可能的错误情况。
-//   - Result: 项目自定义的 Result 类型（注意：这不是 Rust 标准库的 Result，而是项目自己定义的别名）。
 use crate::error::{CoreError, Result};
-
-// 从同目录下的 line_index 模块引入 LineIndex 类型。
-// LineIndex 用于记录文件中每一行的起始字节位置，从而支持快速定位某一行。
 use crate::io::line_index::LineIndex;
-
-// 从 Rust 标准库的 std::path 模块引入 Path 和 PathBuf 两个类型。
-//   - Path: 文件路径的不可变引用类型（类似于 &str 之于 String）。
-//   - PathBuf: 拥有所有权的文件路径类型（类似于 String 之于 &str），可以修改和存储。
 use std::path::{Path, PathBuf};
 
-// ─── FileReader 结构体定义 ────────────────────────────────────────────────────
-// `pub struct` 定义一个公开的结构体（struct），类似于其他语言中的 class。
-// 结构体是数据的容器，可以包含多个字段（field）。
-// `pub` 表示这个结构体可以被其他模块访问，如果不加 pub 则只能在当前模块内使用。
 pub struct FileReader {
-    // `path: PathBuf` — 文件路径，存储被打开的文件的完整路径。
-    // PathBuf 是一个堆分配的、可增长的路径类型（类似于 String）。
     path: PathBuf,
-
-    // `data: Vec<u8>` — 文件的原始字节数据。
-    // Vec<u8> 是一个动态数组（向量），存储的是 u8（无符号 8 位整数，即一个字节）。
-    // 整个文件的内容被读取后以字节形式存储在这里。
-    // Vec 是 Rust 中最常用的集合类型，类似于 C++ 的 std::vector 或 Python 的 list。
-    data: Vec<u8>,
-
-    // `line_index: LineIndex` — 行索引，记录每行在 data 中的起始位置。
-    // 通过这个索引可以快速找到第 N 行在 data 字节数组中的位置。
+    mmap: Option<memmap2::Mmap>,
     line_index: LineIndex,
 }
 
-// ─── FileReader 的实现块 ─────────────────────────────────────────────────────
-// `impl FileReader` 块用来给 FileReader 结构体添加方法（method）。
-// 类似于其他语言中给 class 添加方法。Rust 中方法分为两种：
-//   - 关联函数（类似静态方法）：没有 &self 参数，用 `FileReader::open(...)` 调用。
-//   - 实例方法：第一个参数是 &self（不可变引用）或 &mut self（可变引用），
-//     用 `reader.line_count()` 这样的方式调用。
 impl FileReader {
-    // ─── open 方法（关联函数，即"构造函数"）──────────────────────────────────
-    // `pub fn open(path: &Path) -> Result<Self>` 的含义：
-    //   - pub fn: 公开的函数（function）。
-    //   - open: 函数名。
-    //   - path: &Path: 参数名为 path，类型是 &Path（Path 的不可变引用）。
-    //     & 表示借用（borrow），即我们不获取所有权，只是借用一下这个路径来读取。
-    //   - -> Result<Self>: 返回值类型。Result 是项目自定义的结果类型，
-    //     Self 指代当前类型（即 FileReader 本身）。
-    //     Result<Self> 实际上意味着"要么成功返回一个 FileReader，要么返回一个错误"。
     pub fn open(path: &Path) -> Result<Self> {
-        // `std::fs::read(path)?` — 读取文件的全部内容到内存中。
-        // std::fs 是 Rust 标准库中的文件系统模块。
-        // read() 函数接受一个文件路径，将整个文件内容读取为 Vec<u8>（字节数组）。
-        // 末尾的 `?` 是 Rust 的错误传播操作符：如果 read() 返回错误，
-        // 这个错误会自动从当前函数中返回（即提前退出函数）。
-        // 这比手写 match/if 来处理错误要简洁得多。
-        let data = std::fs::read(path)?;
+        let file = std::fs::File::open(path)?;
+        let file_size = file.metadata()?.len();
 
-        // ─── UTF-8 编码检查 ──────────────────────────────────────────────────
-        // `std::str::from_utf8(&data)` 尝试将字节数组解释为 UTF-8 编码的字符串。
-        // 如果字节数组不是有效的 UTF-8 编码，会返回 Err。
-        // `.is_err()` 检查结果是否为错误（即不是有效的 UTF-8）。
-        if std::str::from_utf8(&data).is_err() {
-            // 如果不是有效的 UTF-8，返回一个自定义的编码错误。
-            // `Err(...)` 创建一个包含错误值的 Result::Err 变体。
-            // `CoreError::Encoding { ... }` 是 CoreError 枚举的一个变体（variant），
-            // 这里使用了结构体风格的枚举变体，包含两个字段：
-            return Err(CoreError::Encoding {
-                // `line: 0` — 错误发生在第 0 行（此处只表示"文件开头附近"，
-                // 因为这里还没有逐行解析，所以总是 0）。
-                line: 0,
-                // `bytes: data.iter().take(64).copied().collect()` — 这是一段链式调用：
-                //   1. data.iter() — 创建一个迭代器，遍历 data 中的每个 &u8（字节的引用）。
-                //   2. .take(64) — 只取前 64 个元素（如果文件不满 64 字节则取全部）。
-                //   3. .copied() — 将 &u8（引用）转换为 u8（值），即复制一份。
-                //   4. .collect() — 将迭代器收集为一个新的集合（这里推断为 Vec<u8>）。
-                // 这行代码的作用是：取文件前 64 字节内容，放在错误信息中，方便调试。
-                bytes: data.iter().take(64).copied().collect(),
-            });
-        }
+        let mmap = if file_size == 0 {
+            None
+        } else {
+            // SAFETY: read-only mapping (Mmap, not MmapMut) of a file opened read-only.
+            // Per the memmap2 docs the mapping stays valid even after `file` is dropped.
+            //
+            // ⚠️ Known limitation (Phase 5): 如果文件在 mmap 期间被外部进程截断，
+            // 访问截断区域的内存会触发 SIGBUS（致命信号，无法恢复）。
+            // FileWatcher Phase 将添加文件修改检测和 re-mmap 机制来处理此情况。
+            // 在 Phase 5 中，假设打开的文件不会被外部修改。
+            Some(unsafe { memmap2::Mmap::map(&file) }.map_err(|e| CoreError::Mmap(e.to_string()))?)
+        };
 
-        // ─── 构建行索引 ──────────────────────────────────────────────────────
-        // 调用 LineIndex::from_bytes() 静态方法，传入文件字节数据的引用。
-        // 这个方法会扫描整个字节数组，找到所有换行符的位置，
-        // 构建一个索引，记录每一行的起始字节偏移量。
-        // &data 中的 & 表示传递引用（不转移所有权）。
-        let line_index = LineIndex::from_bytes(&data);
+        let line_index = {
+            let mut reader = std::io::BufReader::new(&file);
+            LineIndex::from_reader(&mut reader).map_err(|e| CoreError::Io {
+                source: e,
+                context: "building line index".into(),
+            })?
+        };
 
-        // ─── 构造并返回 FileReader 实例 ─────────────────────────────────────
-        // Ok(...) 创建一个成功的 Result 值。
-        // FileReader { ... } 使用结构体字面量语法创建实例：
-        //   - 字段名和变量名相同时，可以用简写语法（如 path, data, line_index），
-        //     而不需要写成 path: path, data: data, ...
-        // path.to_path_buf() 将 &Path（引用）转换为 PathBuf（拥有所有权的路径类型）。
-        Ok(FileReader {
+        Ok(Self {
             path: path.to_path_buf(),
-            data,
+            mmap,
             line_index,
         })
     }
 
-    // ─── data 方法 ──────────────────────────────────────────────────────────
-    // `&self` 表示这是一个实例方法，通过不可变引用访问自身。
-    // 返回类型 `&[u8]` 是一个"字节切片引用"（即对字节数组的只读视图）。
-    // &[u8] 类似于其他语言中"只读字节数组"的概念，它不拥有数据，只是指向数据。
     pub fn data(&self) -> &[u8] {
-        // `&self.data` 获取 self.data（即 Vec<u8>）的引用。
-        // Rust 会自动将 &Vec<u8> 转换为 &[u8]（这叫做 Deref 强制转换）。
-        &self.data
+        self.mmap.as_deref().unwrap_or(&[])
     }
 
-    // ─── line_count 方法 ────────────────────────────────────────────────────
-    // 返回文件的总行数。usize 是 Rust 中表示大小/索引的无符号整数类型
-    // （类似 C 的 size_t），在 64 位系统上占 8 字节。
     pub fn line_count(&self) -> usize {
-        // 委托给 line_index 的 line_count() 方法。
-        // 即实际上是由 LineIndex 来计算行数的。
         self.line_index.line_count()
     }
 
-    // ─── get_line 方法 ──────────────────────────────────────────────────────
-    // 根据行号获取某一行内容。
-    // `idx: usize` — 行号索引，从 0 开始（第 0 行、第 1 行……）。
-    // `-> Option<&str>` — 返回类型：
-    //   - Option 是 Rust 的可选类型，要么是 Some(值) 要么是 None。
-    //   - &str 是字符串切片的引用（只读字符串视图）。
-    //   - 如果行号有效，返回 Some("该行内容")；如果行号越界，返回 None。
     pub fn get_line(&self, idx: usize) -> Option<&str> {
-        // 委托给 line_index 的 get_line() 方法。
-        // 传入文件的字节数据和行号索引。
-        self.line_index.get_line(&self.data, idx)
+        self.line_index.get_line(self.data(), idx)
     }
 
-    // ─── file_size 方法 ─────────────────────────────────────────────────────
-    // 返回文件大小（以字节为单位）。
-    // u64 是无符号 64 位整数类型。
     pub fn file_size(&self) -> u64 {
-        // self.data.len() 获取 Vec<u8> 的长度（字节数），返回 usize 类型。
-        // `as u64` 是类型转换（cast），将 usize 转换为 u64。
-        self.data.len() as u64
+        self.mmap.as_ref().map_or(0, |m| m.len() as u64)
     }
 
-    // ─── path 方法 ──────────────────────────────────────────────────────────
-    // 返回文件路径的引用。
-    // &Path 是不可变的路径引用（不拥有所有权）。
     pub fn path(&self) -> &Path {
-        // &self.path 获取 PathBuf 的引用，Rust 会自动转换为 &Path。
         &self.path
     }
 }
 
-// ─── 单元测试 ────────────────────────────────────────────────────────────────
-// `#[cfg(test)]` 是一个条件编译属性（attribute），表示以下代码只在运行测试时编译。
-// 普通的 `cargo build` 不会编译这部分代码，只有 `cargo test` 才会。
-// 这是 Rust 中编写单元测试的标准方式——测试代码和业务代码放在同一个文件中。
+const _: () = {
+    #[allow(dead_code)]
+    fn assert_send_sync<T: Send + Sync>() {}
+    fn _check() {
+        assert_send_sync::<FileReader>();
+    }
+};
+
 #[cfg(test)]
 mod tests {
-    // `use super::*;` — 将父模块（即外面的 FileReader 等）的所有公开内容引入当前作用域。
-    // super 指代父模块，* 是通配符，表示"所有内容"。
     use super::*;
-    // 引入标准库中的临时目录函数。
-    use std::env::temp_dir;
+    use tempfile::NamedTempFile;
 
-    // 辅助函数：生成临时目录下的文件路径。
-    // `-> PathBuf` 表示返回一个 PathBuf（拥有所有权的路径类型）。
-    fn temp_path(name: &str) -> PathBuf {
-        // temp_dir() 返回系统临时目录（如 /tmp）。
-        // .join(name) 将文件名拼接到临时目录路径后面，形成完整路径。
-        temp_dir().join(name)
+    fn create_temp_file(content: &[u8]) -> NamedTempFile {
+        use std::io::Write;
+        let mut f = NamedTempFile::new().unwrap();
+        f.write_all(content).unwrap();
+        f
     }
 
-    // `#[test]` 属性标记这是一个测试函数。cargo test 会自动发现并运行它。
     #[test]
-    // 测试：空文件应该有 0 行。
     fn test_empty_file() {
-        // 生成临时文件路径。
-        let path = temp_path("file_reader_test_empty");
-        // 将空字节数组 b"" 写入文件。b"" 是字节字符串字面量语法。
-        // .unwrap() 的含义是"我确信这不会失败；如果失败了就直接 panic（崩溃）"。
-        // 在测试代码中常用 .unwrap() 来简化错误处理。
-        std::fs::write(&path, b"").unwrap();
-        // 打开文件创建 FileReader。unwrap() 断言操作成功。
-        let reader = FileReader::open(&path).unwrap();
-        // assert_eq! 宏断言两个值相等。这里断言行数为 0。
+        let f = create_temp_file(b"");
+        let reader = FileReader::open(f.path()).unwrap();
         assert_eq!(reader.line_count(), 0);
-        // 清理：删除临时文件。let _ = 表示忽略返回值（不关心删除是否成功）。
-        let _ = std::fs::remove_file(&path);
+        assert_eq!(reader.get_line(0), None);
+        assert_eq!(reader.file_size(), 0);
+        assert_eq!(reader.data(), b"");
     }
 
     #[test]
-    // 测试：多行文件（带换行符和不带末尾换行符的情况）。
     fn test_multi_line_file() {
-        let path = temp_path("file_reader_test_multi");
-        // b"hello\nworld\nfoo" — 三行内容：hello、world、foo。
-        // 注意最后一行 foo 后面没有换行符。
-        std::fs::write(&path, b"hello\nworld\nfoo").unwrap();
-        let reader = FileReader::open(&path).unwrap();
-        // 应该识别为 3 行。
+        let f = create_temp_file(b"hello\nworld\nfoo");
+        let reader = FileReader::open(f.path()).unwrap();
         assert_eq!(reader.line_count(), 3);
-        // 逐行验证内容。Some("hello") 表示第 0 行是 "hello"。
         assert_eq!(reader.get_line(0), Some("hello"));
         assert_eq!(reader.get_line(1), Some("world"));
         assert_eq!(reader.get_line(2), Some("foo"));
-        let _ = std::fs::remove_file(&path);
     }
 
     #[test]
-    // 测试：打开不存在的文件应该返回 IO 错误。
     fn test_nonexistent_file() {
-        let path = temp_path("file_reader_test_nonexistent_xyzzy");
-        // 先删除文件确保它不存在。
+        let path = std::env::temp_dir().join("file_reader_test_nonexistent_xyzzy_12345");
         let _ = std::fs::remove_file(&path);
-        // 尝试打开不存在的文件。
         let result = FileReader::open(&path);
-        // 使用 match 进行模式匹配（类似于 switch-case，但更强大）。
         match result {
-            // 期望得到 Io 类型的错误（文件不存在的 IO 错误）。
-            // `{ .. }` 表示忽略 CoreError::Io 中的字段细节。
             Err(CoreError::Io { .. }) => {}
-            // 如果是其他类型的错误，说明出问题了，panic 并打印实际收到的错误类型。
             Err(other) => panic!("expected Io variant, got {other:?}"),
-            // 如果竟然成功了，也 panic。
-            // {other:?} 中的 :? 是 Debug 格式化，打印详细调试信息。
             Ok(_) => panic!("expected error, got success"),
         }
     }
 
     #[test]
-    // 测试：非 UTF-8 编码的文件应该返回 Encoding 错误。
-    fn test_non_utf8_file() {
-        let path = temp_path("file_reader_test_nonutf8");
-        // [0xFF, 0xFE] 是无效的 UTF-8 字节序列（这是 UTF-16 LE 的 BOM 头）。
-        // &[] 创建一个数组的引用（在这里是 &[u8; 2] 类型）。
-        std::fs::write(&path, &[0xFF, 0xFE]).unwrap();
-        let result = FileReader::open(&path);
-        match result {
-            // 期望得到 Encoding 错误，并验证其中的字段。
-            Err(CoreError::Encoding { line, bytes }) => {
-                // 错误行号应该是 0（文件开头）。
-                assert_eq!(line, 0);
-                // bytes 应该包含我们写入的那两个无效字节。
-                assert_eq!(bytes, vec![0xFF, 0xFE]);
-            }
-            Err(other) => panic!("expected Encoding variant, got {other:?}"),
-            Ok(_) => panic!("expected error, got success"),
-        }
-        let _ = std::fs::remove_file(&path);
-    }
-
-    #[test]
-    // 测试：file_size() 返回的文件大小是否正确。
-    fn test_file_size() {
-        let path = temp_path("file_reader_test_size");
-        // b"hello world" — 11 个字节。
-        let content = b"hello world";
-        std::fs::write(&path, content).unwrap();
-        let reader = FileReader::open(&path).unwrap();
-        // 文件大小应该等于内容的字节长度。
-        assert_eq!(reader.file_size(), content.len() as u64);
-        let _ = std::fs::remove_file(&path);
-    }
-
-    #[test]
-    // 测试：path() 返回的路径是否与传入的路径一致。
-    fn test_path() {
-        let path = temp_path("file_reader_test_path");
-        std::fs::write(&path, b"data").unwrap();
-        let reader = FileReader::open(&path).unwrap();
-        // 验证存储的路径与原始路径相同。
-        assert_eq!(reader.path(), path);
-        let _ = std::fs::remove_file(&path);
-    }
-
-    #[test]
-    // 测试：只有一行且没有末尾换行符的情况。
-    fn test_single_line_no_newline() {
-        let path = temp_path("file_reader_test_single");
-        // b"hello" — 只有一行，没有换行符。
-        std::fs::write(&path, b"hello").unwrap();
-        let reader = FileReader::open(&path).unwrap();
-        // 应该识别为 1 行。
+    fn test_non_utf8_file_get_line_returns_none() {
+        let f = create_temp_file(&[0xFF, 0xFE]);
+        let reader = FileReader::open(f.path()).unwrap();
         assert_eq!(reader.line_count(), 1);
-        // 第 0 行的内容应该是 "hello"。
-        assert_eq!(reader.get_line(0), Some("hello"));
-        let _ = std::fs::remove_file(&path);
+        assert_eq!(reader.get_line(0), None);
+    }
+
+    #[test]
+    fn test_file_size() {
+        let content = b"hello world";
+        let f = create_temp_file(content);
+        let reader = FileReader::open(f.path()).unwrap();
+        assert_eq!(reader.file_size(), content.len() as u64);
+
+        let f_empty = create_temp_file(b"");
+        let reader_empty = FileReader::open(f_empty.path()).unwrap();
+        assert_eq!(reader_empty.file_size(), 0);
+    }
+
+    #[test]
+    fn test_path() {
+        let f = create_temp_file(b"data");
+        let reader = FileReader::open(f.path()).unwrap();
+        assert_eq!(reader.path(), f.path());
+    }
+
+    #[test]
+    fn test_data_returns_correct_bytes() {
+        let content = b"line1\nline2\nline3\n";
+        let f = create_temp_file(content);
+        let reader = FileReader::open(f.path()).unwrap();
+        let expected = std::fs::read(f.path()).unwrap();
+        assert_eq!(reader.data(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_large_file() {
+        let mut content = Vec::with_capacity(1024 * 1024);
+        let num_lines = 25000;
+        for i in 0..num_lines {
+            let line = format!("This is line number {:05} with some padding data\n", i);
+            content.extend_from_slice(line.as_bytes());
+        }
+        assert!(content.len() > 1024 * 1024, "test data should exceed 1MB");
+
+        let f = create_temp_file(&content);
+        let reader = FileReader::open(f.path()).unwrap();
+
+        assert_eq!(reader.line_count(), num_lines);
+        assert_eq!(reader.file_size(), content.len() as u64);
+        assert_eq!(
+            reader.get_line(0),
+            Some("This is line number 00000 with some padding data")
+        );
+        assert_eq!(
+            reader.get_line(num_lines - 1),
+            Some("This is line number 24999 with some padding data")
+        );
+        assert_eq!(
+            reader.get_line(8000),
+            Some("This is line number 08000 with some padding data")
+        );
+    }
+
+    #[test]
+    fn test_data_empty_file() {
+        let f = create_temp_file(b"");
+        let reader = FileReader::open(f.path()).unwrap();
+        assert!(reader.data().is_empty());
+        assert_eq!(reader.data(), b"");
     }
 }