listener: implement rms (voice activity) detection

listener: introduce clips feature
2026-05-10 19:20:01 -04:00 · 2026-05-10 19:02:54 -04:00
5 changed files with 164 additions and 28 deletions
@@ -27,4 +27,7 @@ rustc-ice-*.txt
 models/
 # Transcription text file
-transcription.txt
+transcription.txt
 # Clips dir
 audio_clips/
@@ -208,6 +208,12 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
 [[package]]
 name = "hound"
 version = "3.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
 [[package]]
 name = "iana-time-zone"
 version = "0.1.65"
@@ -420,6 +426,7 @@ version = "0.1.0"
 dependencies = [
 "chrono",
 "ctrlc",
 "hound",
 "whisper-rs",
 ]
@@ -6,4 +6,5 @@ edition = "2024"
 [dependencies]
 chrono = "0.4.44"
 ctrlc = "3.5.2"
 hound = "3.5.1"
 whisper-rs = "0.16.0"
@@ -1,5 +1,14 @@
 use std::collections::VecDeque;
 use std::io::Read;
 use std::process::{Child, ChildStdout, Command, Stdio};
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 const FRAME_SAMPLES: usize = 1600;             // samples per analysis frame: 100 ms at 16 kHz
 const PRE_BUFFER_FRAMES: usize = 10;           // 1.0 s of pre-roll (10 frames) kept from before speech onset
 pub const SILENCE_THRESHOLD: f32 = 0.02;      // frame RMS above this counts as speech; raise for noisy environments
 const HANGOVER_FRAMES: usize = 20;             // 2.0 s (20 frames) of trailing silence closes a segment
 const MAX_SEGMENT_SAMPLES: usize = 16000 * 120; // hard cap per segment: 2 minutes at 16 kHz
 pub fn decode(input: &str) -> Result<Vec<f32>, Box<dyn std::error::Error>> {
    let output = Command::new("ffmpeg")
@@ -33,18 +42,12 @@ impl LiveStream {
        Ok(LiveStream { child, stdout })
    }
-    // Reads exactly `secs` seconds of audio. Returns None when the stream ends.
+    fn next_frame(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
-    pub fn next_chunk(&mut self, secs: u32) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
+        let mut buf = vec![0u8; FRAME_SAMPLES * 4];
        let num_bytes = secs as usize * 16000 * 4;
        let mut buf = vec![0u8; num_bytes];
        match self.stdout.read_exact(&mut buf) {
-            Ok(()) => {
+            Ok(()) => Ok(Some(buf.chunks_exact(4)
-                let samples = buf.chunks_exact(4)
+                .map(|b| f32::from_le_bytes(b.try_into().unwrap()))
-                    .map(|b| f32::from_le_bytes(b.try_into().unwrap()))
+                .collect())),
                    .collect();
                Ok(Some(samples))
            }
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
            Err(e) => Err(e.into()),
        }
@@ -56,3 +59,67 @@ impl Drop for LiveStream {
        let _ = self.child.kill();
    }
 }
 /// Root-mean-square energy of `samples`.
 ///
 /// Returns `0.0` for an empty slice; dividing by a zero length would
 /// otherwise produce NaN, which compares false against any threshold.
 fn rms(samples: &[f32]) -> f32 {
    if samples.is_empty() {
        return 0.0;
    }
    (samples.iter().map(|&s| s * s).sum::<f32>() / samples.len() as f32).sqrt()
 }
 /// Energy-gated wrapper around a [`LiveStream`] that yields whole speech
 /// segments instead of fixed-length chunks.
 pub struct VadStream {
    // Underlying audio source, consumed one frame at a time.
    inner: LiveStream,
    // Rolling window of the most recent silent frames (at most
    // PRE_BUFFER_FRAMES); prepended to a segment when speech starts.
    pre_buffer: VecDeque<Vec<f32>>,
    // Shared flag; capture stops once it reads false.
    running: Arc<AtomicBool>,
 }
 impl VadStream {
    /// Opens a live stream on `source` and wraps it with voice-activity gating.
    ///
    /// `running` is polled before every frame read; once it is `false`,
    /// [`VadStream::next_segment`] stops capturing.
    ///
    /// # Errors
    /// Propagates any error from `LiveStream::open`.
    pub fn open(source: &str, running: Arc<AtomicBool>) -> Result<Self, Box<dyn std::error::Error>> {
        Ok(VadStream {
            inner: LiveStream::open(source)?,
            pre_buffer: VecDeque::with_capacity(PRE_BUFFER_FRAMES + 1),
            running,
        })
    }

    /// Blocks until a complete speech segment is captured, then returns it.
    ///
    /// A segment begins with the buffered pre-roll frames followed by the
    /// first frame whose RMS exceeds `SILENCE_THRESHOLD`. It ends after
    /// `HANGOVER_FRAMES` consecutive silent frames, or as soon as it reaches
    /// `MAX_SEGMENT_SAMPLES`.
    ///
    /// If the underlying stream ends or `running` becomes `false`, any
    /// partially captured segment is returned; `Ok(None)` is returned only
    /// when nothing has been captured.
    ///
    /// # Errors
    /// Propagates read errors from the underlying stream.
    pub fn next_segment(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
        let mut speech: Vec<f32> = Vec::new();
        let mut speech_active = false;
        let mut hangover = 0usize;

        loop {
            if !self.running.load(Ordering::SeqCst) {
                return Ok((!speech.is_empty()).then_some(speech));
            }

            let Some(frame) = self.inner.next_frame()? else {
                return Ok((!speech.is_empty()).then_some(speech));
            };

            let is_loud = rms(&frame) > SILENCE_THRESHOLD;

            if !speech_active && !is_loud {
                // Idle: keep only the most recent PRE_BUFFER_FRAMES frames as pre-roll.
                if self.pre_buffer.len() >= PRE_BUFFER_FRAMES {
                    self.pre_buffer.pop_front();
                }
                self.pre_buffer.push_back(frame);
                continue;
            }

            if !speech_active {
                // Speech onset: start the segment with the buffered pre-roll.
                speech_active = true;
                eprintln!("  [recording]");
                for pre in self.pre_buffer.drain(..) {
                    speech.extend(pre);
                }
            }

            speech.extend(&frame);
            hangover = if is_loud { 0 } else { hangover + 1 };

            // The size cap is checked after every appended frame, loud or
            // silent, so uninterrupted speech cannot grow a segment without
            // bound.
            if hangover >= HANGOVER_FRAMES || speech.len() >= MAX_SEGMENT_SAMPLES {
                return Ok(Some(speech));
            }
        }
    }
 }
@@ -1,7 +1,7 @@
-use std::fs::OpenOptions;
+use std::fs::{self, OpenOptions};
 use std::io::Write;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
+use std::sync::{mpsc, Arc};
 use chrono::{DateTime, Local};
 use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
@@ -9,18 +9,41 @@ mod audio;
 // Length, in seconds, of each chunk when transcribing decoded file audio.
 const CHUNK_SECS: u32 = 30;
 // CHUNK_SECS expressed in samples at 16 kHz.
 const CHUNK_SAMPLES: usize = 16000 * CHUNK_SECS as usize;
 // Directory where per-chunk WAV clips are written.
 const CLIP_DIR: &str = "./audio_clips";
 /// Writes `samples` to `path` as a mono, 16 kHz, 16-bit integer PCM WAV file.
 ///
 /// Each sample is clamped to `[-1.0, 1.0]` and scaled by `i16::MAX` before
 /// being written.
 ///
 /// # Errors
 /// Returns any error raised while creating, writing, or finalizing the file.
 fn save_clip(samples: &[f32], path: &str) -> Result<(), Box<dyn std::error::Error>> {
    let mut wav = hound::WavWriter::create(
        path,
        hound::WavSpec {
            channels: 1,
            sample_rate: 16000,
            bits_per_sample: 16,
            sample_format: hound::SampleFormat::Int,
        },
    )?;

    let full_scale = f32::from(i16::MAX);
    samples
        .iter()
        .try_for_each(|&sample| wav.write_sample((sample.clamp(-1.0, 1.0) * full_scale) as i16))?;

    wav.finalize()?;
    Ok(())
 }
 fn transcribe_chunk(
    state: &mut whisper_rs::WhisperState,
    chunk: &[f32],
    out: &mut impl Write,
    counter: &mut u32,
 ) -> Result<(), Box<dyn std::error::Error>> {
    let time: DateTime<Local> = Local::now();
    let id = format!("{}_{:04}", time.format("%Y%m%d_%H%M%S"), counter);
    *counter += 1;
    let clip_path = format!("{}/{}.wav", CLIP_DIR, id);
    save_clip(chunk, &clip_path)?;
    let params = FullParams::new(SamplingStrategy::BeamSearch {
        beam_size: 5,
        patience: -1.0,
    });
-    let time: DateTime<Local> = Local::now();
+    out.write_all(format!("[{}] [{}]: ", time, id).as_bytes())?;
    out.write_all(format!("[{}]: ", time.to_string()).as_bytes())?;
    state.full(params, chunk)?;
    for segment in state.as_iter() {
        let line = format!("{}\n", segment);
@@ -36,36 +59,71 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
    let audio_arg  = std::env::args().nth(2).expect("usage: listener <model> <file.wav|pulse:SOURCE> [output.txt]");
    let output_path = std::env::args().nth(3).unwrap_or_else(|| "transcription.txt".to_string());
    fs::create_dir_all(CLIP_DIR)?;
    let running = Arc::new(AtomicBool::new(true));
    let r = running.clone();
    ctrlc::set_handler(move || {
        r.store(false, Ordering::SeqCst);
    })?;
-    let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
+    eprintln!("Transcribing {} → {}  (clips → {})  (Ctrl+C to stop)", audio_arg, output_path, CLIP_DIR);
        .expect("failed to load model");
    let mut state = ctx.create_state().expect("failed to create state");
    let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
    eprintln!("Transcribing {} → {}  (Ctrl+C to stop)", audio_arg, output_path);
    if let Some(source) = audio_arg.strip_prefix("pulse:") {
-        let mut stream = audio::LiveStream::open(source)?;
+        eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", audio::SILENCE_THRESHOLD);
-        while running.load(Ordering::SeqCst) {
+
-            match stream.next_chunk(CHUNK_SECS)? {
+        let (tx, rx) = mpsc::channel::<Vec<f32>>();
-                Some(chunk) => transcribe_chunk(&mut state, &chunk, &mut out)?,
+
        // transcription run goes to a background thread so capture is never blocked.
        let model_path_t = model_path.clone();
        let output_path_t = output_path.clone();
        let transcription_thread = std::thread::spawn(move || {
            let ctx = WhisperContext::new_with_params(&model_path_t, WhisperContextParameters::default())
                .expect("failed to load model");
            let mut state = ctx.create_state().expect("failed to create state");
            let mut out = OpenOptions::new().create(true).append(true).open(&output_path_t)
                .expect("failed to open output file");
            let mut counter: u32 = 0;
            for segment in rx {
                let secs = segment.len() as f32 / 16000.0;
                eprintln!("  [transcribing {:.1}s segment…]", secs);
                if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter) {
                    eprintln!("Transcription error: {e}");
                }
            }
        });
        // capture loop.. never pauses for transcription.
        let mut stream = audio::VadStream::open(source, running.clone())?;
        loop {
            match stream.next_segment()? {
                Some(segment) => {
                    let secs = segment.len() as f32 / 16000.0;
                    eprintln!("  [captured {:.1}s, queued for transcription]", secs);
                    if tx.send(segment).is_err() {
                        break; // transcription thread died
                    }
                }
                None => break,
            }
        }
        drop(tx); // closing the channel signals the transcription thread to finish
        transcription_thread.join().expect("transcription thread panicked");
    } else {
        let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
            .expect("failed to load model");
        let mut state = ctx.create_state().expect("failed to create state");
        let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
        let mut counter: u32 = 0;
        while running.load(Ordering::SeqCst) {
            let audio = audio::decode(&audio_arg)?;
            for chunk in audio.chunks(CHUNK_SAMPLES) {
                if !running.load(Ordering::SeqCst) {
                    break;
                }
-                transcribe_chunk(&mut state, chunk, &mut out)?;
+                transcribe_chunk(&mut state, chunk, &mut out, &mut counter)?;
            }
        }
    }
Author	SHA1	Message	Date
williamp	a06ebf722f	listener: implement rms (voice activity) detection	2026-05-10 19:20:01 -04:00
williamp	37bc07f667	listener: introduce clips feature	2026-05-10 19:02:54 -04:00