listener: use TOML configuration instead of command-line arguments
This commit is contained in:
@@ -31,3 +31,6 @@ transcription.txt
|
||||
|
||||
# Clips dir
|
||||
audio_clips/
|
||||
|
||||
# Config file
|
||||
config.toml
|
||||
Generated
+108
@@ -166,6 +166,12 @@ version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
@@ -208,6 +214,12 @@ version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.17.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
|
||||
|
||||
[[package]]
|
||||
name = "hound"
|
||||
version = "3.5.1"
|
||||
@@ -238,6 +250,16 @@ dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.13.0"
|
||||
@@ -427,6 +449,8 @@ dependencies = [
|
||||
"chrono",
|
||||
"ctrlc",
|
||||
"hound",
|
||||
"serde",
|
||||
"toml",
|
||||
"whisper-rs",
|
||||
]
|
||||
|
||||
@@ -436,6 +460,45 @@ version = "1.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.3.0"
|
||||
@@ -459,6 +522,45 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "1.1.2+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"serde_core",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_parser",
|
||||
"toml_writer",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "1.1.1+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_parser"
|
||||
version = "1.1.2+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
|
||||
dependencies = [
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_writer"
|
||||
version = "1.1.1+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
@@ -599,3 +701,9 @@ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1"
|
||||
|
||||
@@ -7,4 +7,6 @@ edition = "2024"
|
||||
chrono = "0.4.44"
|
||||
ctrlc = "3.5.2"
|
||||
hound = "3.5.1"
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
toml = "1.1.2"
|
||||
whisper-rs = "0.16.0"
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
# Path to the Whisper GGML model file
|
||||
model = "models/ggml-base.en.bin"
|
||||
|
||||
# Audio source: a file path, or "pulse:<source>" for live PulseAudio capture
|
||||
# Examples:
|
||||
# audio = "recording.wav"
|
||||
# audio = "pulse:default"
|
||||
audio = "pulse:default"
|
||||
|
||||
# Where to write the transcript
|
||||
output = "transcription.txt"
|
||||
|
||||
# Directory to store per-segment WAV clips
|
||||
clip_dir = "./audio_clips"
|
||||
|
||||
# File-mode only: how many seconds of audio to transcribe per chunk
|
||||
chunk_secs = 30
|
||||
|
||||
# VAD settings (pulse mode only)
|
||||
silence_threshold = 0.02 # RMS energy cutoff; raise for noisy environments
|
||||
pre_buffer_secs = 1.0 # seconds of audio kept before speech onset
|
||||
hangover_secs = 2.0 # trailing silence before a segment is closed
|
||||
max_segment_secs = 120.0 # hard cap per segment
|
||||
+16
-11
@@ -5,10 +5,13 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
const FRAME_SAMPLES: usize = 1600; // 100 ms at 16 kHz
|
||||
const PRE_BUFFER_FRAMES: usize = 10; // 1.0 s pre-roll captured before speech onset
|
||||
pub const SILENCE_THRESHOLD: f32 = 0.02; // RMS energy: tune up for noisy environments
|
||||
const HANGOVER_FRAMES: usize = 20; // 2.0 s trailing silence before segment closes
|
||||
const MAX_SEGMENT_SAMPLES: usize = 16000 * 120; // 2-minute hard cap per segment
|
||||
|
||||
/// Voice-activity-detection parameters handed to `VadStream::open` and consulted
/// by the stream while it assembles speech segments.
pub struct VadConfig {
|
||||
/// RMS energy a frame must exceed to be treated as speech.
pub silence_threshold: f32,
|
||||
/// Pre-roll buffer length (in frames) at which the oldest frame is dropped
/// before a new below-threshold frame is pushed.
pub pre_buffer_frames: usize,
|
||||
/// Value the hangover counter (incremented on each below-threshold frame while
/// speech is active) must reach for the segment to be returned.
pub hangover_frames: usize,
|
||||
/// Segment length, in samples, at which the segment is returned regardless of
/// the hangover counter.
pub max_segment_samples: usize,
|
||||
}
|
||||
|
||||
pub fn decode(input: &str) -> Result<Vec<f32>, Box<dyn std::error::Error>> {
|
||||
let output = Command::new("ffmpeg")
|
||||
@@ -68,19 +71,21 @@ pub struct VadStream {
|
||||
inner: LiveStream,
|
||||
pre_buffer: VecDeque<Vec<f32>>,
|
||||
running: Arc<AtomicBool>,
|
||||
cfg: VadConfig,
|
||||
}
|
||||
|
||||
impl VadStream {
|
||||
pub fn open(source: &str, running: Arc<AtomicBool>) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
pub fn open(source: &str, running: Arc<AtomicBool>, cfg: VadConfig) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
Ok(VadStream {
|
||||
inner: LiveStream::open(source)?,
|
||||
pre_buffer: VecDeque::with_capacity(PRE_BUFFER_FRAMES + 1),
|
||||
pre_buffer: VecDeque::with_capacity(cfg.pre_buffer_frames + 1),
|
||||
running,
|
||||
cfg,
|
||||
})
|
||||
}
|
||||
|
||||
/// Function itself blocks until a complete speech segment is captured, then returns it.
|
||||
/// returns None when the underlying stream ends or running is set to false.
|
||||
/// Blocks until a complete speech segment is captured, then returns it.
|
||||
/// Returns None when the underlying stream ends or running is set to false.
|
||||
pub fn next_segment(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
|
||||
let mut speech: Vec<f32> = Vec::new();
|
||||
let mut speech_active = false;
|
||||
@@ -98,7 +103,7 @@ impl VadStream {
|
||||
|
||||
let energy = rms(&frame);
|
||||
|
||||
if energy > SILENCE_THRESHOLD {
|
||||
if energy > self.cfg.silence_threshold {
|
||||
if !speech_active {
|
||||
speech_active = true;
|
||||
eprintln!(" [recording]");
|
||||
@@ -111,11 +116,11 @@ impl VadStream {
|
||||
} else if speech_active {
|
||||
speech.extend(&frame);
|
||||
hangover += 1;
|
||||
if hangover >= HANGOVER_FRAMES || speech.len() >= MAX_SEGMENT_SAMPLES {
|
||||
if hangover >= self.cfg.hangover_frames || speech.len() >= self.cfg.max_segment_samples {
|
||||
return Ok(Some(speech));
|
||||
}
|
||||
} else {
|
||||
if self.pre_buffer.len() >= PRE_BUFFER_FRAMES {
|
||||
if self.pre_buffer.len() >= self.cfg.pre_buffer_frames {
|
||||
self.pre_buffer.pop_front();
|
||||
}
|
||||
self.pre_buffer.push_back(frame);
|
||||
|
||||
+56
-14
@@ -3,13 +3,38 @@ use std::io::Write;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{mpsc, Arc};
|
||||
use chrono::{DateTime, Local};
|
||||
use serde::Deserialize;
|
||||
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
|
||||
|
||||
mod audio;
|
||||
|
||||
const CHUNK_SECS: u32 = 30;
|
||||
const CHUNK_SAMPLES: usize = 16000 * CHUNK_SECS as usize;
|
||||
const CLIP_DIR: &str = "./audio_clips";
|
||||
/// Runtime settings deserialized from the TOML config file.
///
/// `model` and `audio` are required; every other key falls back to the
/// `default_*` function named in its `serde(default = ...)` attribute.
#[derive(Deserialize)]
|
||||
struct Config {
|
||||
/// Path of the model file passed to `WhisperContext::new_with_params`.
model: String,
|
||||
/// Audio source: a `pulse:` prefix selects live capture from the named source;
/// anything else is passed to `audio::decode`.
audio: String,
|
||||
/// Transcript output path.
#[serde(default = "default_output")]
|
||||
output: String,
|
||||
/// Directory (created at startup) that receives one WAV clip per transcribed chunk.
#[serde(default = "default_clip_dir")]
|
||||
clip_dir: String,
|
||||
/// Chunk length in seconds for decoded (non-pulse) input; multiplied by 16000
/// to get the chunk size in samples.
#[serde(default = "default_chunk_secs")]
|
||||
chunk_secs: u32,
|
||||
/// RMS threshold copied unchanged into `VadConfig::silence_threshold`.
#[serde(default = "default_silence_threshold")]
|
||||
silence_threshold: f32,
|
||||
/// Pre-roll length in seconds; multiplied by 10.0 to get `pre_buffer_frames`.
#[serde(default = "default_pre_buffer_secs")]
|
||||
pre_buffer_secs: f32,
|
||||
/// Trailing-silence length in seconds; multiplied by 10.0 to get `hangover_frames`.
#[serde(default = "default_hangover_secs")]
|
||||
hangover_secs: f32,
|
||||
/// Per-segment cap in seconds; multiplied by 16000.0 to get `max_segment_samples`.
#[serde(default = "default_max_segment_secs")]
|
||||
max_segment_secs: f32,
|
||||
}
|
||||
|
||||
// Fallback values used by serde when the corresponding key is absent from the config file.

/// Default for `Config::output`.
fn default_output() -> String { "transcription.txt".into() }
|
||||
/// Default for `Config::clip_dir`.
fn default_clip_dir() -> String { "./audio_clips".into() }
|
||||
/// Default for `Config::chunk_secs`.
fn default_chunk_secs() -> u32 { 30 }
|
||||
/// Default for `Config::silence_threshold`.
fn default_silence_threshold() -> f32 { 0.02 }
|
||||
/// Default for `Config::pre_buffer_secs`.
fn default_pre_buffer_secs() -> f32 { 1.0 }
|
||||
/// Default for `Config::hangover_secs`.
fn default_hangover_secs() -> f32 { 2.0 }
|
||||
/// Default for `Config::max_segment_secs`.
fn default_max_segment_secs() -> f32 { 120.0 }
|
||||
|
||||
fn save_clip(samples: &[f32], path: &str) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let spec = hound::WavSpec {
|
||||
@@ -31,12 +56,13 @@ fn transcribe_chunk(
|
||||
chunk: &[f32],
|
||||
out: &mut impl Write,
|
||||
counter: &mut u32,
|
||||
clip_dir: &str,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let time: DateTime<Local> = Local::now();
|
||||
let id = format!("{}_{:04}", time.format("%Y%m%d_%H%M%S"), counter);
|
||||
*counter += 1;
|
||||
|
||||
let clip_path = format!("{}/{}.wav", CLIP_DIR, id);
|
||||
let clip_path = format!("{}/{}.wav", clip_dir, id);
|
||||
save_clip(chunk, &clip_path)?;
|
||||
|
||||
let params = FullParams::new(SamplingStrategy::BeamSearch {
|
||||
@@ -55,11 +81,26 @@ fn transcribe_chunk(
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let model_path = std::env::args().nth(1).expect("usage: listener <model> <file.wav|pulse:SOURCE> [output.txt]");
|
||||
let audio_arg = std::env::args().nth(2).expect("usage: listener <model> <file.wav|pulse:SOURCE> [output.txt]");
|
||||
let output_path = std::env::args().nth(3).unwrap_or_else(|| "transcription.txt".to_string());
|
||||
let config_path = std::env::args().nth(1).unwrap_or_else(|| "config.toml".to_string());
|
||||
let config_str = fs::read_to_string(&config_path)
|
||||
.unwrap_or_else(|_| panic!("failed to read config file: {config_path}"));
|
||||
let cfg: Config = toml::from_str(&config_str)
|
||||
.unwrap_or_else(|e| panic!("invalid config: {e}"));
|
||||
|
||||
fs::create_dir_all(CLIP_DIR)?;
|
||||
let model_path = cfg.model.clone();
|
||||
let audio_arg = cfg.audio.clone();
|
||||
let output_path = cfg.output.clone();
|
||||
let clip_dir = cfg.clip_dir.clone();
|
||||
let chunk_samples: usize = 16000 * cfg.chunk_secs as usize;
|
||||
|
||||
let vad_cfg = audio::VadConfig {
|
||||
silence_threshold: cfg.silence_threshold,
|
||||
pre_buffer_frames: (cfg.pre_buffer_secs * 10.0) as usize,
|
||||
hangover_frames: (cfg.hangover_secs * 10.0) as usize,
|
||||
max_segment_samples: (cfg.max_segment_secs * 16000.0) as usize,
|
||||
};
|
||||
|
||||
fs::create_dir_all(&clip_dir)?;
|
||||
|
||||
let running = Arc::new(AtomicBool::new(true));
|
||||
let r = running.clone();
|
||||
@@ -67,16 +108,17 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
r.store(false, Ordering::SeqCst);
|
||||
})?;
|
||||
|
||||
eprintln!("Transcribing {} → {} (clips → {}) (Ctrl+C to stop)", audio_arg, output_path, CLIP_DIR);
|
||||
eprintln!("Transcribing {} → {} (clips → {}) (Ctrl+C to stop)", audio_arg, output_path, clip_dir);
|
||||
|
||||
if let Some(source) = audio_arg.strip_prefix("pulse:") {
|
||||
eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", audio::SILENCE_THRESHOLD);
|
||||
eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", cfg.silence_threshold);
|
||||
|
||||
let (tx, rx) = mpsc::channel::<Vec<f32>>();
|
||||
|
||||
// Transcription runs on a background thread so capture is never blocked.
|
||||
let model_path_t = model_path.clone();
|
||||
let output_path_t = output_path.clone();
|
||||
let clip_dir_t = clip_dir.clone();
|
||||
let transcription_thread = std::thread::spawn(move || {
|
||||
let ctx = WhisperContext::new_with_params(&model_path_t, WhisperContextParameters::default())
|
||||
.expect("failed to load model");
|
||||
@@ -87,14 +129,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
for segment in rx {
|
||||
let secs = segment.len() as f32 / 16000.0;
|
||||
eprintln!(" [transcribing {:.1}s segment…]", secs);
|
||||
if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter) {
|
||||
if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter, &clip_dir_t) {
|
||||
eprintln!("Transcription error: {e}");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Capture loop: never pauses for transcription.
|
||||
let mut stream = audio::VadStream::open(source, running.clone())?;
|
||||
let mut stream = audio::VadStream::open(source, running.clone(), vad_cfg)?;
|
||||
loop {
|
||||
match stream.next_segment()? {
|
||||
Some(segment) => {
|
||||
@@ -119,11 +161,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
|
||||
while running.load(Ordering::SeqCst) {
|
||||
let audio = audio::decode(&audio_arg)?;
|
||||
for chunk in audio.chunks(CHUNK_SAMPLES) {
|
||||
for chunk in audio.chunks(chunk_samples) {
|
||||
if !running.load(Ordering::SeqCst) {
|
||||
break;
|
||||
}
|
||||
transcribe_chunk(&mut state, chunk, &mut out, &mut counter)?;
|
||||
transcribe_chunk(&mut state, chunk, &mut out, &mut counter, &clip_dir)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user