listener: implement rms (voice activity) detection

listener: introduce clips feature
2026-05-10 19:20:01 -04:00 · 2026-05-10 19:02:54 -04:00
5 changed files with 164 additions and 28 deletions
@@ -27,4 +27,7 @@ rustc-ice-*.txt
 models/

 # Transcription text file
-transcription.txt
+transcription.txt
+
+# Clips dir
+audio_clips/
@@ -208,6 +208,12 @@ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"

+[[package]]
+name = "hound"
+version = "3.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.65"
@@ -420,6 +426,7 @@ version = "0.1.0"
 dependencies = [
 "chrono",
 "ctrlc",
+ "hound",
 "whisper-rs",
 ]

@@ -6,4 +6,5 @@ edition = "2024"
 [dependencies]
 chrono = "0.4.44"
 ctrlc = "3.5.2"
+hound = "3.5.1"
 whisper-rs = "0.16.0"
@@ -1,5 +1,14 @@
+use std::collections::VecDeque;
 use std::io::Read;
 use std::process::{Child, ChildStdout, Command, Stdio};
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
+// Tuning parameters for the RMS-energy voice-activity detector (`VadStream`).
+// Durations quoted below follow from the frame length: 1600 samples = 100 ms at 16 kHz.
+// A frame counts as speech only when its RMS is strictly greater than SILENCE_THRESHOLD.
+const FRAME_SAMPLES: usize = 1600;             // 100 ms at 16 kHz
+const PRE_BUFFER_FRAMES: usize = 10;           // 1.0 s pre-roll captured before speech onset
+pub const SILENCE_THRESHOLD: f32 = 0.02;      // RMS energy: tune up for noisy environments
+const HANGOVER_FRAMES: usize = 20;             // 2.0 s trailing silence before segment closes
+const MAX_SEGMENT_SAMPLES: usize = 16000 * 120; // 2-minute hard cap per segment

 pub fn decode(input: &str) -> Result<Vec<f32>, Box<dyn std::error::Error>> {
    let output = Command::new("ffmpeg")
@@ -33,18 +42,12 @@ impl LiveStream {
        Ok(LiveStream { child, stdout })
    }

-    // Reads exactly `secs` seconds of audio. Returns None when the stream ends.
-    pub fn next_chunk(&mut self, secs: u32) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
-        let num_bytes = secs as usize * 16000 * 4;
-        let mut buf = vec![0u8; num_bytes];
-
+    fn next_frame(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
+        let mut buf = vec![0u8; FRAME_SAMPLES * 4];
        match self.stdout.read_exact(&mut buf) {
-            Ok(()) => {
-                let samples = buf.chunks_exact(4)
-                    .map(|b| f32::from_le_bytes(b.try_into().unwrap()))
-                    .collect();
-                Ok(Some(samples))
-            }
+            Ok(()) => Ok(Some(buf.chunks_exact(4)
+                .map(|b| f32::from_le_bytes(b.try_into().unwrap()))
+                .collect())),
            Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
            Err(e) => Err(e.into()),
        }
@@ -56,3 +59,67 @@ impl Drop for LiveStream {
        let _ = self.child.kill();
    }
 }
+
+/// Returns the root-mean-square amplitude of `samples`.
+///
+/// An empty slice yields `0.0` (i.e. silence) rather than the `NaN` that dividing the
+/// zero sum by a zero length would produce.
+fn rms(samples: &[f32]) -> f32 {
+    if samples.is_empty() {
+        return 0.0;
+    }
+    let sum_of_squares: f32 = samples.iter().map(|&s| s * s).sum();
+    (sum_of_squares / samples.len() as f32).sqrt()
+}
+
+/// Voice-activity-gated wrapper around a `LiveStream`.
+///
+/// Frames read from `inner` are classified by RMS energy; `next_segment` groups them
+/// into speech segments.
+pub struct VadStream {
+    /// Underlying audio source that fixed-size frames are read from.
+    inner: LiveStream,
+    /// Most recent below-threshold frames seen while idle (at most `PRE_BUFFER_FRAMES`);
+    /// they are prepended to a segment when speech starts.
+    pre_buffer: VecDeque<Vec<f32>>,
+    /// Shared run flag; `next_segment` stops once it reads `false`.
+    running: Arc<AtomicBool>,
+}
+
+impl VadStream {
+    /// Opens the live audio `source` and wraps it in an energy-based voice-activity detector.
+    ///
+    /// `running` is a shared flag that is polled before every frame read; once it holds
+    /// `false`, `next_segment` returns at the next frame boundary.
+    ///
+    /// # Errors
+    /// Propagates any error returned by `LiveStream::open`.
+    pub fn open(source: &str, running: Arc<AtomicBool>) -> Result<Self, Box<dyn std::error::Error>> {
+        Ok(VadStream {
+            inner: LiveStream::open(source)?,
+            pre_buffer: VecDeque::with_capacity(PRE_BUFFER_FRAMES + 1),
+            running,
+        })
+    }
+
+    /// Blocks until a complete speech segment has been captured, then returns it.
+    ///
+    /// A segment starts at the first frame whose RMS exceeds `SILENCE_THRESHOLD`, is
+    /// prefixed with the buffered pre-roll frames, and ends after `HANGOVER_FRAMES`
+    /// consecutive quiet frames or once it holds at least `MAX_SEGMENT_SAMPLES` samples,
+    /// whichever comes first. The cap is checked after every appended frame, so it also
+    /// bounds uninterrupted speech; a capped segment may exceed the limit by less than
+    /// one frame.
+    ///
+    /// Returns `Ok(None)` when the underlying stream ends or `running` is set to `false`
+    /// before any speech was captured; if speech was already being captured, the partial
+    /// segment is returned instead.
+    ///
+    /// # Errors
+    /// Propagates read errors from the underlying stream.
+    pub fn next_segment(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
+        let mut speech: Vec<f32> = Vec::new();
+        let mut speech_active = false;
+        let mut hangover = 0usize;
+
+        loop {
+            if !self.running.load(Ordering::SeqCst) {
+                return Ok((!speech.is_empty()).then_some(speech));
+            }
+
+            let Some(frame) = self.inner.next_frame()? else {
+                return Ok((!speech.is_empty()).then_some(speech));
+            };
+
+            let loud = rms(&frame) > SILENCE_THRESHOLD;
+
+            if !speech_active && !loud {
+                // Idle: keep only the most recent PRE_BUFFER_FRAMES frames as pre-roll.
+                if self.pre_buffer.len() >= PRE_BUFFER_FRAMES {
+                    self.pre_buffer.pop_front();
+                }
+                self.pre_buffer.push_back(frame);
+                continue;
+            }
+
+            if !speech_active {
+                // Speech onset: flush the pre-roll so the start of the utterance is kept.
+                speech_active = true;
+                eprintln!("  [recording]");
+                for pre in self.pre_buffer.drain(..) {
+                    speech.extend(pre);
+                }
+            }
+
+            speech.extend(&frame);
+            // Any loud frame restarts the trailing-silence countdown.
+            hangover = if loud { 0 } else { hangover + 1 };
+
+            // Checked for loud and quiet frames alike so continuous speech cannot grow
+            // the segment without bound.
+            if hangover >= HANGOVER_FRAMES || speech.len() >= MAX_SEGMENT_SAMPLES {
+                return Ok(Some(speech));
+            }
+        }
+    }
+}
@@ -1,7 +1,7 @@
-use std::fs::OpenOptions;
+use std::fs::{self, OpenOptions};
 use std::io::Write;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
+use std::sync::{mpsc, Arc};
 use chrono::{DateTime, Local};
 use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};

@@ -9,18 +9,41 @@ mod audio;

 const CHUNK_SECS: u32 = 30;
 const CHUNK_SAMPLES: usize = 16000 * CHUNK_SECS as usize;
+const CLIP_DIR: &str = "./audio_clips";
+
+/// Writes `samples` to `path` as a mono, 16 kHz, 16-bit signed-integer PCM WAV file.
+///
+/// Each sample is clamped to `[-1.0, 1.0]` and scaled by `i16::MAX` before being cast,
+/// so out-of-range input is limited to full scale.
+///
+/// # Errors
+/// Returns any error reported by `hound` while creating, writing or finalizing the file.
+fn save_clip(samples: &[f32], path: &str) -> Result<(), Box<dyn std::error::Error>> {
+    const FULL_SCALE: f32 = i16::MAX as f32;
+
+    let mut wav = hound::WavWriter::create(
+        path,
+        hound::WavSpec {
+            channels: 1,
+            sample_rate: 16000,
+            bits_per_sample: 16,
+            sample_format: hound::SampleFormat::Int,
+        },
+    )?;
+    samples
+        .iter()
+        .map(|&sample| (sample.clamp(-1.0, 1.0) * FULL_SCALE) as i16)
+        .try_for_each(|pcm| wav.write_sample(pcm))?;
+    wav.finalize()?;
+    Ok(())
+}

 fn transcribe_chunk(
    state: &mut whisper_rs::WhisperState,
    chunk: &[f32],
    out: &mut impl Write,
+    counter: &mut u32,
 ) -> Result<(), Box<dyn std::error::Error>> {
+    let time: DateTime<Local> = Local::now();
+    let id = format!("{}_{:04}", time.format("%Y%m%d_%H%M%S"), counter);
+    *counter += 1;
+
+    let clip_path = format!("{}/{}.wav", CLIP_DIR, id);
+    save_clip(chunk, &clip_path)?;
+
    let params = FullParams::new(SamplingStrategy::BeamSearch {
        beam_size: 5,
        patience: -1.0,
    });
-    let time: DateTime<Local> = Local::now();
-    out.write_all(format!("[{}]: ", time.to_string()).as_bytes())?;
+    out.write_all(format!("[{}] [{}]: ", time, id).as_bytes())?;
    state.full(params, chunk)?;
    for segment in state.as_iter() {
        let line = format!("{}\n", segment);
@@ -36,36 +59,71 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
    let audio_arg  = std::env::args().nth(2).expect("usage: listener <model> <file.wav|pulse:SOURCE> [output.txt]");
    let output_path = std::env::args().nth(3).unwrap_or_else(|| "transcription.txt".to_string());

+    fs::create_dir_all(CLIP_DIR)?;
+
    let running = Arc::new(AtomicBool::new(true));
    let r = running.clone();
    ctrlc::set_handler(move || {
        r.store(false, Ordering::SeqCst);
    })?;

-    let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
-        .expect("failed to load model");
-    let mut state = ctx.create_state().expect("failed to create state");
-
-    let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
-
-    eprintln!("Transcribing {} → {}  (Ctrl+C to stop)", audio_arg, output_path);
+    eprintln!("Transcribing {} → {}  (clips → {})  (Ctrl+C to stop)", audio_arg, output_path, CLIP_DIR);

    if let Some(source) = audio_arg.strip_prefix("pulse:") {
-        let mut stream = audio::LiveStream::open(source)?;
-        while running.load(Ordering::SeqCst) {
-            match stream.next_chunk(CHUNK_SECS)? {
-                Some(chunk) => transcribe_chunk(&mut state, &chunk, &mut out)?,
+        eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", audio::SILENCE_THRESHOLD);
+
+        let (tx, rx) = mpsc::channel::<Vec<f32>>();
+
+        // Transcription runs on a background thread so capture is never blocked.
+        let model_path_t = model_path.clone();
+        let output_path_t = output_path.clone();
+        let transcription_thread = std::thread::spawn(move || {
+            let ctx = WhisperContext::new_with_params(&model_path_t, WhisperContextParameters::default())
+                .expect("failed to load model");
+            let mut state = ctx.create_state().expect("failed to create state");
+            let mut out = OpenOptions::new().create(true).append(true).open(&output_path_t)
+                .expect("failed to open output file");
+            let mut counter: u32 = 0;
+            for segment in rx {
+                let secs = segment.len() as f32 / 16000.0;
+                eprintln!("  [transcribing {:.1}s segment…]", secs);
+                if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter) {
+                    eprintln!("Transcription error: {e}");
+                }
+            }
+        });
+
+        // Capture loop: never pauses for transcription.
+        let mut stream = audio::VadStream::open(source, running.clone())?;
+        loop {
+            match stream.next_segment()? {
+                Some(segment) => {
+                    let secs = segment.len() as f32 / 16000.0;
+                    eprintln!("  [captured {:.1}s, queued for transcription]", secs);
+                    if tx.send(segment).is_err() {
+                        break; // transcription thread died
+                    }
+                }
                None => break,
            }
        }
+
+        drop(tx); // closing the channel signals the transcription thread to finish
+        transcription_thread.join().expect("transcription thread panicked");
    } else {
+        let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
+            .expect("failed to load model");
+        let mut state = ctx.create_state().expect("failed to create state");
+        let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
+        let mut counter: u32 = 0;
+
        while running.load(Ordering::SeqCst) {
            let audio = audio::decode(&audio_arg)?;
            for chunk in audio.chunks(CHUNK_SAMPLES) {
                if !running.load(Ordering::SeqCst) {
                    break;
                }
-                transcribe_chunk(&mut state, chunk, &mut out)?;
+                transcribe_chunk(&mut state, chunk, &mut out, &mut counter)?;
            }
        }
    }
Author	SHA1	Message	Date
williamp	a06ebf722f	listener: implement rms (voice activity) detection	2026-05-10 19:20:01 -04:00
williamp	37bc07f667	listener: introduce clips feature	2026-05-10 19:02:54 -04:00