listener: implement rms (voice activity) detection

This commit is contained in:
2026-05-10 19:20:01 -04:00
parent 37bc07f667
commit a06ebf722f
2 changed files with 122 additions and 23 deletions
+78 -11
View File
@@ -1,5 +1,14 @@
use std::collections::VecDeque;
use std::io::Read;
use std::process::{Child, ChildStdout, Command, Stdio};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
const FRAME_SAMPLES: usize = 1600; // 100 ms at 16 kHz
const PRE_BUFFER_FRAMES: usize = 10; // 1.0 s pre-roll captured before speech onset
pub const SILENCE_THRESHOLD: f32 = 0.02; // RMS energy: tune up for noisy environments
const HANGOVER_FRAMES: usize = 20; // 2.0 s trailing silence before segment closes
const MAX_SEGMENT_SAMPLES: usize = 16000 * 120; // 2-minute hard cap per segment
pub fn decode(input: &str) -> Result<Vec<f32>, Box<dyn std::error::Error>> {
let output = Command::new("ffmpeg")
@@ -33,18 +42,12 @@ impl LiveStream {
Ok(LiveStream { child, stdout })
}
// Reads exactly `secs` seconds of audio. Returns None when the stream ends.
pub fn next_chunk(&mut self, secs: u32) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
let num_bytes = secs as usize * 16000 * 4;
let mut buf = vec![0u8; num_bytes];
fn next_frame(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
let mut buf = vec![0u8; FRAME_SAMPLES * 4];
match self.stdout.read_exact(&mut buf) {
Ok(()) => {
let samples = buf.chunks_exact(4)
.map(|b| f32::from_le_bytes(b.try_into().unwrap()))
.collect();
Ok(Some(samples))
}
Ok(()) => Ok(Some(buf.chunks_exact(4)
.map(|b| f32::from_le_bytes(b.try_into().unwrap()))
.collect())),
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
Err(e) => Err(e.into()),
}
@@ -56,3 +59,67 @@ impl Drop for LiveStream {
let _ = self.child.kill();
}
}
fn rms(samples: &[f32]) -> f32 {
(samples.iter().map(|&s| s * s).sum::<f32>() / samples.len() as f32).sqrt()
}
pub struct VadStream {
inner: LiveStream,
pre_buffer: VecDeque<Vec<f32>>,
running: Arc<AtomicBool>,
}
impl VadStream {
pub fn open(source: &str, running: Arc<AtomicBool>) -> Result<Self, Box<dyn std::error::Error>> {
Ok(VadStream {
inner: LiveStream::open(source)?,
pre_buffer: VecDeque::with_capacity(PRE_BUFFER_FRAMES + 1),
running,
})
}
/// Function itself blocks until a complete speech segment is captured, then returns it.
/// returns None when the underlying stream ends or running is set to false.
pub fn next_segment(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
let mut speech: Vec<f32> = Vec::new();
let mut speech_active = false;
let mut hangover = 0usize;
loop {
if !self.running.load(Ordering::SeqCst) {
return if speech.is_empty() { Ok(None) } else { Ok(Some(speech)) };
}
let frame = match self.inner.next_frame()? {
Some(f) => f,
None => return if speech.is_empty() { Ok(None) } else { Ok(Some(speech)) },
};
let energy = rms(&frame);
if energy > SILENCE_THRESHOLD {
if !speech_active {
speech_active = true;
eprintln!(" [recording]");
for pre in self.pre_buffer.drain(..) {
speech.extend(pre);
}
}
speech.extend(&frame);
hangover = 0;
} else if speech_active {
speech.extend(&frame);
hangover += 1;
if hangover >= HANGOVER_FRAMES || speech.len() >= MAX_SEGMENT_SAMPLES {
return Ok(Some(speech));
}
} else {
if self.pre_buffer.len() >= PRE_BUFFER_FRAMES {
self.pre_buffer.pop_front();
}
self.pre_buffer.push_back(frame);
}
}
}
}
+44 -12
View File
@@ -1,7 +1,7 @@
use std::fs::{self, OpenOptions};
use std::io::Write;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::sync::{mpsc, Arc};
use chrono::{DateTime, Local};
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
@@ -67,24 +67,56 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
r.store(false, Ordering::SeqCst);
})?;
let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
.expect("failed to load model");
let mut state = ctx.create_state().expect("failed to create state");
let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
let mut counter: u32 = 0;
eprintln!("Transcribing {}{} (clips → {}) (Ctrl+C to stop)", audio_arg, output_path, CLIP_DIR);
if let Some(source) = audio_arg.strip_prefix("pulse:") {
let mut stream = audio::LiveStream::open(source)?;
while running.load(Ordering::SeqCst) {
match stream.next_chunk(CHUNK_SECS)? {
Some(chunk) => transcribe_chunk(&mut state, &chunk, &mut out, &mut counter)?,
eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", audio::SILENCE_THRESHOLD);
let (tx, rx) = mpsc::channel::<Vec<f32>>();
// transcription run goes to a background thread so capture is never blocked.
let model_path_t = model_path.clone();
let output_path_t = output_path.clone();
let transcription_thread = std::thread::spawn(move || {
let ctx = WhisperContext::new_with_params(&model_path_t, WhisperContextParameters::default())
.expect("failed to load model");
let mut state = ctx.create_state().expect("failed to create state");
let mut out = OpenOptions::new().create(true).append(true).open(&output_path_t)
.expect("failed to open output file");
let mut counter: u32 = 0;
for segment in rx {
let secs = segment.len() as f32 / 16000.0;
eprintln!(" [transcribing {:.1}s segment…]", secs);
if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter) {
eprintln!("Transcription error: {e}");
}
}
});
// capture loop.. never pauses for transcription.
let mut stream = audio::VadStream::open(source, running.clone())?;
loop {
match stream.next_segment()? {
Some(segment) => {
let secs = segment.len() as f32 / 16000.0;
eprintln!(" [captured {:.1}s, queued for transcription]", secs);
if tx.send(segment).is_err() {
break; // transcription thread died
}
}
None => break,
}
}
drop(tx); // closing the channel signals the transcription thread to finish
transcription_thread.join().expect("transcription thread panicked");
} else {
let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
.expect("failed to load model");
let mut state = ctx.create_state().expect("failed to create state");
let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
let mut counter: u32 = 0;
while running.load(Ordering::SeqCst) {
let audio = audio::decode(&audio_arg)?;
for chunk in audio.chunks(CHUNK_SAMPLES) {