feat: GPU-downscale + software H.264 encode pipeline (WIP)

Add SwEncState in avhw.rs: GPU pipeline using scale_vaapi to downscale
4K BGRA -> 2K NV12 on AMD iGPU, then software encode with libopenh264.

- import_dma_buf_to_vaapi: av_hwframe_map based DMA-BUF import
- SwEncState: GPU filter graph (scale_vaapi) + NV12->YUV420P + libopenh264
- state_portal.rs: integrated SwEncState, auto DRM device detection
- vaapi_import_bench.rs: CPU vs GPU pipeline benchmark
- sw_encode_bench.rs: software encode benchmark

Benchmark results: GPU pipeline ~91 FPS theoretical (10.95ms/frame)
vs CPU pipeline ~33 FPS (30.21ms/frame).

Known issue: only 1 frame is encoded in production recording;
diagnostic STATS logging was added to debug the frame flow.
This commit is contained in:
dailz
2026-05-29 22:04:12 +08:00
parent 55abb5e56d
commit d80b34f44f
9 changed files with 2416 additions and 305 deletions

545
src/bin/sw_encode_bench.rs Normal file
View File

@@ -0,0 +1,545 @@
// sw_encode_bench.rs — Software encoding pipeline benchmark for screen capture
//
// Benchmarks: Portal capture -> mmap DMA-BUF -> sws_scale BGR0->YUV420P -> libx264 encode (libopenh264 as fallback)
//
// Usage: cargo run --bin sw_encode_bench -- --output /tmp/bench_test.mp4
use std::ffi::CString;
use std::os::fd::AsRawFd;
use std::path::Path;
use std::ptr;
use std::time::Instant;
use anyhow::{bail, Result};
use clap::Parser;
use ffmpeg_next as ff;
use ffmpeg_next::ffi;
use ffmpeg_next::packet::Mut;
use wl_webrtc::args::Args;
use wl_webrtc::cap_portal::{CapPortal, PwCtrlEvent};
// Command-line options of the `sw_encode_bench` binary, parsed with clap's derive API.
// NOTE: plain `//` comments are used on purpose — clap's derive macro turns `///`
// doc comments into --help text, which would change the program's output.
#[derive(Parser, Debug)]
#[command(
name = "sw_encode_bench",
about = "Software encoding pipeline benchmark"
)]
struct BenchArgs {
// Path of the encoded output file (required; `-o` / `--output`).
#[arg(short, long)]
output: String,
// Number of frames to encode before the benchmark stops (`--frames`, default 120).
#[arg(long, default_value_t = 120)]
frames: u32,
// Width of the encoded video in pixels (`--enc-width`, default 2560).
#[arg(long, default_value_t = 2560)]
enc_width: u32,
// Height of the encoded video in pixels (`--enc-height`, default 1440).
#[arg(long, default_value_t = 1440)]
enc_height: u32,
}
/// Per-frame timing samples collected by the benchmark loop.
///
/// Every `*_us` vector holds one duration in microseconds per processed frame;
/// `mmap_failures` counts frames whose DMA-BUF could not be mapped.
#[derive(Default)]
struct FrameStats {
    mmap_us: Vec<u64>,
    scale_us: Vec<u64>,
    encode_us: Vec<u64>,
    total_us: Vec<u64>,
    mmap_failures: u32,
}

impl FrameStats {
    /// Returns the arithmetic mean of `data` (microsecond samples) expressed in
    /// milliseconds, or `0.0` when there are no samples.
    fn avg_ms(data: &[u64]) -> f64 {
        match data.len() {
            0 => 0.0,
            sample_count => {
                let total_us: u64 = data.iter().sum();
                total_us as f64 / sample_count as f64 / 1000.0
            }
        }
    }
}
/// Converts an `ffmpeg_next` pixel-format value into the raw FFI
/// `AVPixelFormat` expected by the C-level libav* functions.
fn pix_fmt(p: ff::format::Pixel) -> ffi::AVPixelFormat {
    ffi::AVPixelFormat::from(p)
}
fn receive_first_frame(cap: &CapPortal) -> Result<wl_webrtc::cap_portal::PwDmaBufFrame> {
loop {
if let Ok(ctrl) = cap.event_receiver().try_recv() {
match ctrl {
PwCtrlEvent::StreamEnded => bail!("PipeWire stream ended before first frame"),
PwCtrlEvent::Error(e) => bail!("PipeWire error: {e}"),
}
}
match cap
.frame_receiver()
.recv_timeout(std::time::Duration::from_secs(10))
{
Ok(frame) => return Ok(frame),
Err(crossbeam_channel::RecvTimeoutError::Timeout) => {
bail!("Timeout waiting for first frame (10s)");
}
Err(crossbeam_channel::RecvTimeoutError::Disconnected) => {
bail!("PipeWire frame channel disconnected");
}
}
}
}
/// Runs the software-encode benchmark end to end.
///
/// Steps:
/// 1. Start a portal screen capture and wait for the first frame.
/// 2. Verify that the frame's DMA-BUF can be `mmap`ed for CPU reads.
/// 3. Set up an H.264 software encoder (libx264, else libopenh264), an output
///    file, a BGR0 -> YUV420P scaler and one reusable YUV frame.
/// 4. For each captured frame: mmap, scale/convert, encode, write packets,
///    recording per-stage timings.
/// 5. Flush the encoder, write the trailer and print a timing report.
///
/// # Errors
///
/// Returns an error when capture setup, the initial mmap probe, encoder or
/// muxer setup, packet writing, or trailer writing fails. Failures inside the
/// per-frame loop are logged and either skip the frame or stop the loop, so
/// the output file is still finalized.
fn main() -> Result<()> {
    // Numeric value of libswscale's `SWS_BILINEAR` scaler flag.
    const SWS_BILINEAR: i32 = 2;

    let bench_args = BenchArgs::parse();
    println!("=== Software Encode Benchmark ===");
    println!("Output: {}", bench_args.output);
    println!("Target frames: {}", bench_args.frames);
    println!(
        "Encode resolution: {}x{}",
        bench_args.enc_width, bench_args.enc_height
    );
    println!();
    ff::init()?;

    // ---- [1/4] Capture setup -------------------------------------------------
    println!("[1/4] Requesting screen capture via XDG Portal...");
    println!(" (Select a screen to share in the portal dialog)");
    let portal_args = Args {
        output: bench_args.output.clone(),
        output_name: None,
        fps: 60,
        codec: "h264".to_string(),
        hw_accel: "vaapi".to_string(),
        drm_device: None,
        bitrate: None,
        gop_size: None,
        verbose: false,
        backend: Some("portal".to_string()),
        port: 0,
    };
    let cap = CapPortal::new(&portal_args)?;
    println!("[1/4] Portal connected, PipeWire stream active\n");

    // ---- [2/4] First frame: learn the capture geometry ------------------------
    println!("[2/4] Waiting for first frame from PipeWire...");
    let first_frame = receive_first_frame(&cap)?;
    let src_width = first_frame.width;
    let src_height = first_frame.height;
    let src_stride = first_frame.stride;
    let enc_width = bench_args.enc_width;
    let enc_height = bench_args.enc_height;
    println!(
        "[2/4] First frame: {}x{}, stride={}, format=0x{:08X}",
        src_width, src_height, src_stride, first_frame.format
    );
    println!(
        " Capture: {}x{} Encode: {}x{}\n",
        src_width, src_height, enc_width, enc_height
    );

    // ---- [3/4] Probe: can the CPU map the DMA-BUF at all? ---------------------
    println!("[3/4] Testing mmap on DMA-BUF...");
    let mmap_size = (src_stride as usize) * (src_height as usize);
    // SAFETY: plain read-only shared mapping of the frame's fd; the result is
    // checked against MAP_FAILED before use.
    let mmap_ptr = unsafe {
        libc::mmap(
            ptr::null_mut(),
            mmap_size,
            libc::PROT_READ,
            libc::MAP_SHARED,
            first_frame.fd.as_raw_fd(),
            first_frame.offset as i64,
        )
    };
    if mmap_ptr == libc::MAP_FAILED {
        let errno = std::io::Error::last_os_error();
        bail!(
            "mmap on DMA-BUF fd FAILED — AMD driver may not support \
             CPU read of screen capture DMA-BUF buffers.\n\
             Error: {} (errno={})\n\
             \n\
             Workarounds:\n\
             1. Use VAAPI hardware import (av_hwframe_map) instead of mmap\n\
             2. Use wlroots compositor with wlr-screencopy (SHM-based)\n\
             3. Use a virtual display or software renderer",
            errno,
            errno.raw_os_error().unwrap_or(-1)
        );
    }
    println!(
        "[3/4] mmap SUCCESS — CPU can read DMA-BUF ({:.1} MB)\n",
        mmap_size as f64 / 1024.0 / 1024.0
    );
    // SAFETY: `mmap_ptr`/`mmap_size` describe the mapping created just above.
    unsafe {
        libc::munmap(mmap_ptr, mmap_size);
    }
    drop(first_frame);

    // ---- [4/4] Encoder, muxer, scaler -----------------------------------------
    // Set up libx264 encoder via FFI (same pattern as avhw.rs)
    println!("[4/4] Setting up libx264 encoder...");
    let output_path = Path::new(&bench_args.output);
    // `output` is already a `String`, so no lossy/panicking path conversion is needed.
    let output_cstr = CString::new(bench_args.output.as_str())?;
    // Try libx264 first (best quality/speed), fall back to openh264
    let codec = ff::encoder::find_by_name("libx264")
        .or_else(|| ff::encoder::find_by_name("libopenh264"))
        .ok_or_else(|| {
            anyhow::anyhow!("No H.264 software encoder found (tried libx264, libopenh264)")
        })?;
    println!("[4/4] Using encoder: {}\n", codec.name());
    let mut enc = {
        let ctx = ff::codec::Context::new_with_codec(codec);
        ctx.encoder().video()?
    };
    enc.set_width(enc_width);
    enc.set_height(enc_height);
    enc.set_format(ff::format::Pixel::YUV420P);
    enc.set_time_base(ff::Rational::new(1, 60));
    enc.set_max_b_frames(0);
    enc.set_gop(60);
    let codec_name = codec.name();
    if codec_name == "libx264" {
        // Low-latency private options; failing to set them is not fatal.
        for (key, val) in [("preset", "veryfast"), ("tune", "zerolatency")] {
            let key_c = CString::new(key)?;
            let val_c = CString::new(val)?;
            // SAFETY: `enc` owns a valid codec context; both strings are
            // NUL-terminated and outlive the call.
            let ret = unsafe {
                ffi::av_opt_set(
                    (*enc.as_mut_ptr()).priv_data,
                    key_c.as_ptr(),
                    val_c.as_ptr(),
                    0,
                )
            };
            if ret < 0 {
                eprintln!("Failed to set encoder option {key}={val}: error {ret}");
            }
        }
    }
    let opened = enc.open()?;
    let mut enc_video = opened.0;

    // Create output format context via FFI
    let mut fmt_ctx_ptr: *mut ffi::AVFormatContext = ptr::null_mut();
    // SAFETY: out-pointer and file name are valid for the duration of the call.
    let ret = unsafe {
        ffi::avformat_alloc_output_context2(
            &mut fmt_ctx_ptr,
            ptr::null_mut(),
            ptr::null(),
            output_cstr.as_ptr(),
        )
    };
    if ret < 0 || fmt_ctx_ptr.is_null() {
        bail!("Failed to allocate output format context: error {ret}");
    }
    // SAFETY: `fmt_ctx_ptr` was checked non-null above.
    let stream_ptr = unsafe { ffi::avformat_new_stream(fmt_ctx_ptr, ptr::null()) };
    if stream_ptr.is_null() {
        bail!("Failed to create new stream");
    }
    // SAFETY: `stream_ptr` is non-null and the encoder context is open.
    let ret =
        unsafe { ffi::avcodec_parameters_from_context((*stream_ptr).codecpar, enc_video.as_ptr()) };
    if ret < 0 {
        bail!("Failed to copy encoder parameters: error {ret}");
    }
    // SAFETY: both pointers are valid (see above).
    unsafe {
        (*stream_ptr).time_base = (*enc_video.as_ptr()).time_base;
    }
    // SAFETY: `fmt_ctx_ptr` is valid; `pb` is filled in by avio_open on success.
    let ret = unsafe {
        ffi::avio_open(
            &mut (*fmt_ctx_ptr).pb,
            output_cstr.as_ptr(),
            ffi::AVIO_FLAG_WRITE,
        )
    };
    if ret < 0 {
        bail!(
            "Failed to open output file '{}': error {ret}",
            output_path.display()
        );
    }
    // SAFETY: the context has a stream and an opened I/O handle.
    let ret = unsafe { ffi::avformat_write_header(fmt_ctx_ptr, ptr::null_mut()) };
    if ret < 0 {
        bail!("Failed to write header: error {ret}");
    }
    // SAFETY: hands the raw context to the ffmpeg_next wrapper used for writing
    // packets and the trailer from here on.
    let mut octx = unsafe { ff::format::context::Output::wrap(fmt_ctx_ptr) };

    // Create sws_scale context: BGRZ (BGR0) -> YUV420P
    let bgr0_fmt = pix_fmt(ff::format::Pixel::BGRZ);
    let yuv420p_fmt = pix_fmt(ff::format::Pixel::YUV420P);
    // SAFETY: all arguments are plain values; the result is null-checked.
    let sws_ctx = unsafe {
        ffi::sws_getContext(
            src_width as i32,
            src_height as i32,
            bgr0_fmt,
            enc_width as i32,
            enc_height as i32,
            yuv420p_fmt,
            SWS_BILINEAR,
            ptr::null_mut(),
            ptr::null_mut(),
            ptr::null_mut(),
        )
    };
    if sws_ctx.is_null() {
        bail!("Failed to create sws_scale context");
    }

    // Allocate reusable YUV frame
    // SAFETY: the frame pointer is null-checked before any field is written and
    // freed again if buffer allocation fails.
    let mut yuv_frame = unsafe {
        let mut f = ffi::av_frame_alloc();
        if f.is_null() {
            bail!("av_frame_alloc failed");
        }
        (*f).width = enc_width as i32;
        (*f).height = enc_height as i32;
        (*f).format = yuv420p_fmt as i32;
        let ret = ffi::av_frame_get_buffer(f, 0);
        if ret < 0 {
            ffi::av_frame_free(&mut f);
            bail!("av_frame_get_buffer failed: {ret}");
        }
        f
    };
    println!(
        "[4/4] Encoder ready: {}, {}x{}\n",
        codec_name, enc_width, enc_height
    );

    // ---- Benchmark loop -------------------------------------------------------
    println!("=== Encoding {} frames ===\n", bench_args.frames);
    let mut stats = FrameStats::default();
    let total_start = Instant::now();
    let mut frames_encoded: u32 = 0;
    let mut pts: i64 = 0;
    while frames_encoded < bench_args.frames {
        if let Ok(ctrl) = cap.event_receiver().try_recv() {
            match ctrl {
                PwCtrlEvent::StreamEnded => {
                    eprintln!("PipeWire stream ended after {} frames", frames_encoded);
                    break;
                }
                PwCtrlEvent::Error(e) => {
                    eprintln!("PipeWire error after {} frames: {}", frames_encoded, e);
                    break;
                }
            }
        }
        let frame = match cap
            .frame_receiver()
            .recv_timeout(std::time::Duration::from_secs(5))
        {
            Ok(f) => f,
            Err(_) => {
                eprintln!("Frame timeout/disconnect after {} frames", frames_encoded);
                break;
            }
        };

        // The scaler context was built for the first frame's size; feeding it a
        // differently sized buffer would make it read outside the mapping.
        if frame.width != src_width || frame.height != src_height {
            eprintln!(
                "Capture size changed to {}x{} after {} frames; stopping",
                frame.width, frame.height, frames_encoded
            );
            break;
        }

        let frame_start = Instant::now();

        // Stage 1: map the DMA-BUF for CPU reads.
        let mmap_start = Instant::now();
        let frame_size = (frame.stride as usize) * (frame.height as usize);
        // SAFETY: read-only shared mapping of the frame's fd; checked below.
        let mmap_ptr = unsafe {
            libc::mmap(
                ptr::null_mut(),
                frame_size,
                libc::PROT_READ,
                libc::MAP_SHARED,
                frame.fd.as_raw_fd(),
                frame.offset as i64,
            )
        };
        if mmap_ptr == libc::MAP_FAILED {
            stats.mmap_failures += 1;
            eprintln!("mmap failed on frame {}", frames_encoded);
            drop(frame);
            continue;
        }
        stats.mmap_us.push(mmap_start.elapsed().as_micros() as u64);

        // Stage 2: scale + convert BGR0 -> YUV420P into the reusable frame.
        let scale_start = Instant::now();
        // sws_scale takes per-plane *arrays*; pass full 4-entry arrays even
        // though the packed BGR0 source only uses plane 0.
        let src_planes: [*const u8; 4] = [
            mmap_ptr as *const u8,
            ptr::null(),
            ptr::null(),
            ptr::null(),
        ];
        let src_strides: [i32; 4] = [frame.stride as i32, 0, 0, 0];
        // SAFETY: the mapping covers `stride * height` bytes, the geometry was
        // verified against the scaler context above, and `yuv_frame` owns
        // buffers of the encoder size.
        let scale_ret = unsafe {
            let writable = ffi::av_frame_make_writable(yuv_frame);
            if writable < 0 {
                writable
            } else {
                ffi::sws_scale(
                    sws_ctx,
                    src_planes.as_ptr(),
                    src_strides.as_ptr(),
                    0,
                    frame.height as i32,
                    (*yuv_frame).data.as_ptr() as *mut *mut u8,
                    (*yuv_frame).linesize.as_ptr() as *mut i32,
                )
            }
        };
        let scale_elapsed_us = scale_start.elapsed().as_micros() as u64;
        // SAFETY: unmaps exactly the mapping created above; nothing references
        // it any more.
        unsafe {
            libc::munmap(mmap_ptr, frame_size);
        }
        drop(frame);
        if scale_ret < 0 {
            // Encoding stale frame contents would falsify the benchmark, and the
            // failure is unlikely to be transient, so stop and finalize the file.
            eprintln!(
                "Pixel conversion failed on frame {}: error {scale_ret}",
                frames_encoded
            );
            break;
        }
        stats.scale_us.push(scale_elapsed_us);

        // Stage 3: encode and write out whatever packets are ready.
        let encode_start = Instant::now();
        // SAFETY: `yuv_frame` is a valid, writable frame and the encoder is open.
        let send_ret = unsafe {
            (*yuv_frame).pts = pts;
            ffi::avcodec_send_frame(enc_video.as_mut_ptr(), yuv_frame)
        };
        pts += 1;
        if send_ret < 0 {
            eprintln!("avcodec_send_frame failed: {send_ret}");
            continue;
        }
        drain_encoder(&mut enc_video, &mut octx)?;
        stats
            .encode_us
            .push(encode_start.elapsed().as_micros() as u64);
        stats
            .total_us
            .push(frame_start.elapsed().as_micros() as u64);
        frames_encoded += 1;
        if frames_encoded % 30 == 0 {
            let fps = frames_encoded as f64 / total_start.elapsed().as_secs_f64();
            println!(
                " [{}/{}] {:.1} FPS",
                frames_encoded, bench_args.frames, fps
            );
        }
    }
    let total_elapsed = total_start.elapsed();

    // ---- Flush + finalize -----------------------------------------------------
    println!("\nFlushing encoder...");
    // SAFETY: a null frame is the documented way to signal end of stream.
    let flush_ret = unsafe { ffi::avcodec_send_frame(enc_video.as_mut_ptr(), ptr::null()) };
    if flush_ret < 0 {
        eprintln!("Failed to signal end of stream to encoder: error {flush_ret}");
    }
    drain_encoder(&mut enc_video, &mut octx)?;
    octx.write_trailer()
        .map_err(|e| anyhow::anyhow!("Failed to write trailer: {e}"))?;

    // Cleanup
    // SAFETY: both objects were created above and are not used afterwards.
    unsafe {
        ffi::av_frame_free(&mut yuv_frame as *mut _);
        ffi::sws_freeContext(sws_ctx);
    }
    drop(cap);

    // ---- Report ---------------------------------------------------------------
    // Print results
    let mmap_count = stats.mmap_us.len() as u32;
    let mmap_attempts = mmap_count + stats.mmap_failures;
    let mmap_success_rate = if mmap_attempts > 0 {
        mmap_count as f64 / mmap_attempts as f64 * 100.0
    } else {
        0.0
    };
    let total_fps = frames_encoded as f64 / total_elapsed.as_secs_f64();
    let avg_total_ms = FrameStats::avg_ms(&stats.total_us);
    let max_fps = if avg_total_ms > 0.0 {
        1000.0 / avg_total_ms
    } else {
        0.0
    };
    println!();
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║ Software Encode Benchmark Results ║");
    println!("╚══════════════════════════════════════════════════════════════╝");
    println!();
    println!("Capture resolution: {}x{}", src_width, src_height);
    println!("Encode resolution: {}x{}", enc_width, enc_height);
    println!("Frames encoded: {}", frames_encoded);
    println!("Total time: {:.2}s", total_elapsed.as_secs_f64());
    println!();
    println!("mmap (DMA-BUF -> CPU):");
    println!(
        " avg: {:.2} ms/frame",
        FrameStats::avg_ms(&stats.mmap_us)
    );
    println!(
        " success rate: {:.1}% ({}/{})",
        mmap_success_rate, mmap_count, mmap_attempts
    );
    println!();
    println!("scale (BGR0 -> YUV420P via sws_scale):");
    println!(
        " avg: {:.2} ms/frame",
        FrameStats::avg_ms(&stats.scale_us)
    );
    println!();
    println!("encode ({}):", codec_name);
    println!(
        " avg: {:.2} ms/frame",
        FrameStats::avg_ms(&stats.encode_us)
    );
    println!();
    println!("total pipeline:");
    println!(" avg: {:.2} ms/frame", avg_total_ms);
    println!(" achieved FPS: {:.1}", total_fps);
    println!(" max theoretical: {:.1} FPS", max_fps);
    println!();
    // Warn only when a mapping actually failed; a run with zero attempts has
    // nothing to warn about.
    if stats.mmap_failures > 0 {
        println!(
            "WARNING: Some mmap operations failed ({}/{})",
            stats.mmap_failures, mmap_attempts
        );
    }
    if total_fps < 30.0 {
        println!(
            "NOTE: Achieved FPS ({:.1}) is below 30 FPS target.",
            total_fps
        );
    }
    println!("Output written to: {}", bench_args.output);
    Ok(())
}
/// Receives every packet the encoder currently has ready and writes each one
/// to stream 0 of `octx`.
///
/// Packet timestamps are rescaled from the encoder time base to the time base
/// of output stream 0 before writing.
///
/// Returns `Ok(())` once the encoder reports that it needs more input
/// (`EAGAIN`) or is fully flushed (`EOF`). Any other receive error is logged
/// to stderr and also ends the drain with `Ok(())`.
///
/// # Errors
///
/// Fails when `octx` has no stream 0 or when writing a packet fails.
fn drain_encoder(
    enc_video: &mut ff::encoder::video::Video,
    octx: &mut ff::format::context::Output,
) -> Result<()> {
    loop {
        let mut pkt = ff::Packet::empty();
        // SAFETY: both pointers come from live wrapper objects owned by the
        // caller / this iteration.
        let ret = unsafe { ffi::avcodec_receive_packet(enc_video.as_mut_ptr(), pkt.as_mut_ptr()) };
        if ret < 0 {
            let drained = ret == ffi::AVERROR(ffi::EAGAIN) || ret == ffi::AVERROR_EOF;
            if !drained {
                eprintln!("avcodec_receive_packet failed: {ret}");
            }
            return Ok(());
        }

        let enc_tb = enc_video.time_base();
        // Checked lookup instead of indexing the raw `streams` array.
        let stream_tb = octx
            .stream(0)
            .ok_or_else(|| anyhow::anyhow!("output context has no stream 0"))?
            .time_base();
        pkt.rescale_ts(enc_tb, stream_tb);
        pkt.set_stream(0);
        pkt.write_interleaved(octx)
            .map_err(|e| anyhow::anyhow!("write packet failed: {e}"))?;
    }
}

File diff suppressed because it is too large Load Diff