From 8a190496caa7b82d134c74f59fb4e7b2fc8085b8 Mon Sep 17 00:00:00 2001 From: William P Date: Mon, 25 May 2026 01:21:24 +0000 Subject: [PATCH] listener: use toml configuration instead of command args --- listener/.gitignore | 5 +- listener/Cargo.lock | 108 +++++++++++++++++++++++++++++++++++ listener/Cargo.toml | 2 + listener/config.toml.example | 23 ++++++++ listener/src/audio.rs | 29 ++++++---- listener/src/main.rs | 70 ++++++++++++++++++----- 6 files changed, 210 insertions(+), 27 deletions(-) create mode 100644 listener/config.toml.example diff --git a/listener/.gitignore b/listener/.gitignore index 6576326..5879aa7 100644 --- a/listener/.gitignore +++ b/listener/.gitignore @@ -30,4 +30,7 @@ models/ transcription.txt # Clips dir -audio_clips/ \ No newline at end of file +audio_clips/ + +# Config file +config.toml \ No newline at end of file diff --git a/listener/Cargo.lock b/listener/Cargo.lock index 0e2304e..1c3fa06 100644 --- a/listener/Cargo.lock +++ b/listener/Cargo.lock @@ -166,6 +166,12 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -208,6 +214,12 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + [[package]] name = "hound" version = "3.5.1" @@ -238,6 +250,16 @@ dependencies = [ "cc", ] +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "itertools" version = "0.13.0" @@ -427,6 +449,8 @@ dependencies = [ "chrono", "ctrlc", "hound", + "serde", + "toml", "whisper-rs", ] @@ -436,6 +460,45 @@ version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_spanned" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" +dependencies = [ + "serde_core", +] + [[package]] name = "shlex" version = "1.3.0" @@ -459,6 +522,45 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "toml" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee" +dependencies = [ + "indexmap", + "serde_core", + "serde_spanned", + "toml_datetime", + "toml_parser", + "toml_writer", + "winnow", +] + +[[package]] +name = "toml_datetime" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_parser" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" +dependencies = [ + "winnow", +] + +[[package]] +name = "toml_writer" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -599,3 +701,9 @@ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ "windows-link", ] + +[[package]] +name = "winnow" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1" diff --git a/listener/Cargo.toml b/listener/Cargo.toml index 98e03b2..c1a99cb 100644 --- a/listener/Cargo.toml +++ b/listener/Cargo.toml @@ -7,4 +7,6 @@ edition = "2024" chrono = "0.4.44" ctrlc = "3.5.2" hound = "3.5.1" +serde = { version = "1.0.228", features = ["derive"] } +toml = "1.1.2" whisper-rs = "0.16.0" diff --git a/listener/config.toml.example b/listener/config.toml.example new file mode 100644 index 0000000..21ab54e --- /dev/null +++ b/listener/config.toml.example @@ -0,0 +1,23 @@ +# Path to the Whisper GGML model file +model = "models/ggml-base.en.bin" + +# Audio source: a file path, or "pulse:" for live PulseAudio capture +# Examples: +# audio = "recording.wav" +# audio = "pulse:default" +audio = "pulse:default" + +# Where to write the transcript +output = "transcription.txt" + +# Directory to store per-segment WAV clips +clip_dir = "./audio_clips" + +# File-mode only: how many seconds of audio to transcribe per chunk +chunk_secs = 30 + +# VAD settings (pulse mode only) +silence_threshold = 0.02 # RMS energy cutoff; raise for noisy environments +pre_buffer_secs = 1.0 # seconds of audio kept before speech onset +hangover_secs = 2.0 # trailing silence before a segment is closed +max_segment_secs = 120.0 # hard cap per segment diff --git a/listener/src/audio.rs b/listener/src/audio.rs index cbde5f1..7008221 100644 --- a/listener/src/audio.rs +++ b/listener/src/audio.rs @@ -4,11 +4,14 @@ use std::process::{Child, ChildStdout, Command, Stdio}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; -const FRAME_SAMPLES: usize = 1600; // 100 ms at 16 kHz -const PRE_BUFFER_FRAMES: usize = 10; // 1.0 s pre-roll captured before speech onset -pub const SILENCE_THRESHOLD: f32 = 0.02; // RMS energy: tune up for noisy environments -const HANGOVER_FRAMES: usize = 20; // 2.0 s trailing silence before segment closes -const MAX_SEGMENT_SAMPLES: usize = 16000 * 120; // 2-minute hard cap per segment +const FRAME_SAMPLES: usize = 1600; // 100 ms at 16 kHz + +pub struct VadConfig { + pub silence_threshold: f32, + pub pre_buffer_frames: usize, + pub hangover_frames: usize, + pub max_segment_samples: usize, +} pub fn decode(input: &str) -> Result, Box> { let output = Command::new("ffmpeg") @@ -68,19 +71,21 @@ pub struct VadStream { inner: LiveStream, pre_buffer: VecDeque>, running: Arc, + cfg: VadConfig, } impl VadStream { - pub fn open(source: &str, running: Arc) -> Result> { + pub fn open(source: &str, running: Arc, cfg: VadConfig) -> Result> { Ok(VadStream { inner: LiveStream::open(source)?, - pre_buffer: VecDeque::with_capacity(PRE_BUFFER_FRAMES + 1), + pre_buffer: VecDeque::with_capacity(cfg.pre_buffer_frames + 1), running, + cfg, }) } - /// Function itself blocks until a complete speech segment is captured, then returns it. - /// returns None when the underlying stream ends or running is set to false. + /// Blocks until a complete speech segment is captured, then returns it. + /// Returns None when the underlying stream ends or running is set to false. pub fn next_segment(&mut self) -> Result>, Box> { let mut speech: Vec = Vec::new(); let mut speech_active = false; @@ -98,7 +103,7 @@ impl VadStream { let energy = rms(&frame); - if energy > SILENCE_THRESHOLD { + if energy > self.cfg.silence_threshold { if !speech_active { speech_active = true; eprintln!(" [recording]"); @@ -111,11 +116,11 @@ impl VadStream { } else if speech_active { speech.extend(&frame); hangover += 1; - if hangover >= HANGOVER_FRAMES || speech.len() >= MAX_SEGMENT_SAMPLES { + if hangover >= self.cfg.hangover_frames || speech.len() >= self.cfg.max_segment_samples { return Ok(Some(speech)); } } else { - if self.pre_buffer.len() >= PRE_BUFFER_FRAMES { + if self.pre_buffer.len() >= self.cfg.pre_buffer_frames { self.pre_buffer.pop_front(); } self.pre_buffer.push_back(frame); diff --git a/listener/src/main.rs b/listener/src/main.rs index 0049362..9e3bed8 100644 --- a/listener/src/main.rs +++ b/listener/src/main.rs @@ -3,13 +3,38 @@ use std::io::Write; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{mpsc, Arc}; use chrono::{DateTime, Local}; +use serde::Deserialize; use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; mod audio; -const CHUNK_SECS: u32 = 30; -const CHUNK_SAMPLES: usize = 16000 * CHUNK_SECS as usize; -const CLIP_DIR: &str = "./audio_clips"; +#[derive(Deserialize)] +struct Config { + model: String, + audio: String, + #[serde(default = "default_output")] + output: String, + #[serde(default = "default_clip_dir")] + clip_dir: String, + #[serde(default = "default_chunk_secs")] + chunk_secs: u32, + #[serde(default = "default_silence_threshold")] + silence_threshold: f32, + #[serde(default = "default_pre_buffer_secs")] + pre_buffer_secs: f32, + #[serde(default = "default_hangover_secs")] + hangover_secs: f32, + #[serde(default = "default_max_segment_secs")] + max_segment_secs: f32, +} + +fn default_output() -> String { "transcription.txt".into() } +fn default_clip_dir() -> String { "./audio_clips".into() } +fn default_chunk_secs() -> u32 { 30 } +fn default_silence_threshold() -> f32 { 0.02 } +fn default_pre_buffer_secs() -> f32 { 1.0 } +fn default_hangover_secs() -> f32 { 2.0 } +fn default_max_segment_secs() -> f32 { 120.0 } fn save_clip(samples: &[f32], path: &str) -> Result<(), Box> { let spec = hound::WavSpec { @@ -31,12 +56,13 @@ fn transcribe_chunk( chunk: &[f32], out: &mut impl Write, counter: &mut u32, + clip_dir: &str, ) -> Result<(), Box> { let time: DateTime = Local::now(); let id = format!("{}_{:04}", time.format("%Y%m%d_%H%M%S"), counter); *counter += 1; - let clip_path = format!("{}/{}.wav", CLIP_DIR, id); + let clip_path = format!("{}/{}.wav", clip_dir, id); save_clip(chunk, &clip_path)?; let params = FullParams::new(SamplingStrategy::BeamSearch { @@ -55,11 +81,26 @@ fn transcribe_chunk( } fn main() -> Result<(), Box> { - let model_path = std::env::args().nth(1).expect("usage: listener [output.txt]"); - let audio_arg = std::env::args().nth(2).expect("usage: listener [output.txt]"); - let output_path = std::env::args().nth(3).unwrap_or_else(|| "transcription.txt".to_string()); + let config_path = std::env::args().nth(1).unwrap_or_else(|| "config.toml".to_string()); + let config_str = fs::read_to_string(&config_path) + .unwrap_or_else(|_| panic!("failed to read config file: {config_path}")); + let cfg: Config = toml::from_str(&config_str) + .unwrap_or_else(|e| panic!("invalid config: {e}")); - fs::create_dir_all(CLIP_DIR)?; + let model_path = cfg.model.clone(); + let audio_arg = cfg.audio.clone(); + let output_path = cfg.output.clone(); + let clip_dir = cfg.clip_dir.clone(); + let chunk_samples: usize = 16000 * cfg.chunk_secs as usize; + + let vad_cfg = audio::VadConfig { + silence_threshold: cfg.silence_threshold, + pre_buffer_frames: (cfg.pre_buffer_secs * 10.0) as usize, + hangover_frames: (cfg.hangover_secs * 10.0) as usize, + max_segment_samples: (cfg.max_segment_secs * 16000.0) as usize, + }; + + fs::create_dir_all(&clip_dir)?; let running = Arc::new(AtomicBool::new(true)); let r = running.clone(); @@ -67,16 +108,17 @@ fn main() -> Result<(), Box> { r.store(false, Ordering::SeqCst); })?; - eprintln!("Transcribing {} → {} (clips → {}) (Ctrl+C to stop)", audio_arg, output_path, CLIP_DIR); + eprintln!("Transcribing {} → {} (clips → {}) (Ctrl+C to stop)", audio_arg, output_path, clip_dir); if let Some(source) = audio_arg.strip_prefix("pulse:") { - eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", audio::SILENCE_THRESHOLD); + eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", cfg.silence_threshold); let (tx, rx) = mpsc::channel::>(); // transcription run goes to a background thread so capture is never blocked. let model_path_t = model_path.clone(); let output_path_t = output_path.clone(); + let clip_dir_t = clip_dir.clone(); let transcription_thread = std::thread::spawn(move || { let ctx = WhisperContext::new_with_params(&model_path_t, WhisperContextParameters::default()) .expect("failed to load model"); @@ -87,14 +129,14 @@ fn main() -> Result<(), Box> { for segment in rx { let secs = segment.len() as f32 / 16000.0; eprintln!(" [transcribing {:.1}s segment…]", secs); - if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter) { + if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter, &clip_dir_t) { eprintln!("Transcription error: {e}"); } } }); // capture loop.. never pauses for transcription. - let mut stream = audio::VadStream::open(source, running.clone())?; + let mut stream = audio::VadStream::open(source, running.clone(), vad_cfg)?; loop { match stream.next_segment()? { Some(segment) => { @@ -119,11 +161,11 @@ fn main() -> Result<(), Box> { while running.load(Ordering::SeqCst) { let audio = audio::decode(&audio_arg)?; - for chunk in audio.chunks(CHUNK_SAMPLES) { + for chunk in audio.chunks(chunk_samples) { if !running.load(Ordering::SeqCst) { break; } - transcribe_chunk(&mut state, chunk, &mut out, &mut counter)?; + transcribe_chunk(&mut state, chunk, &mut out, &mut counter, &clip_dir)?; } } }