Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
a06ebf722f
|
|||
|
37bc07f667
|
+4
-1
@@ -27,4 +27,7 @@ rustc-ice-*.txt
|
|||||||
models/
|
models/
|
||||||
|
|
||||||
# Transcription text file
|
# Transcription text file
|
||||||
transcription.txt
|
transcription.txt
|
||||||
|
|
||||||
|
# Clips dir
|
||||||
|
audio_clips/
|
||||||
Generated
+7
@@ -208,6 +208,12 @@ version = "0.3.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hound"
|
||||||
|
version = "3.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "iana-time-zone"
|
name = "iana-time-zone"
|
||||||
version = "0.1.65"
|
version = "0.1.65"
|
||||||
@@ -420,6 +426,7 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"chrono",
|
"chrono",
|
||||||
"ctrlc",
|
"ctrlc",
|
||||||
|
"hound",
|
||||||
"whisper-rs",
|
"whisper-rs",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -6,4 +6,5 @@ edition = "2024"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
chrono = "0.4.44"
|
chrono = "0.4.44"
|
||||||
ctrlc = "3.5.2"
|
ctrlc = "3.5.2"
|
||||||
|
hound = "3.5.1"
|
||||||
whisper-rs = "0.16.0"
|
whisper-rs = "0.16.0"
|
||||||
|
|||||||
+78
-11
@@ -1,5 +1,14 @@
|
|||||||
|
use std::collections::VecDeque;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
use std::process::{Child, ChildStdout, Command, Stdio};
|
use std::process::{Child, ChildStdout, Command, Stdio};
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
/// Samples per analysis frame: 100 ms at 16 kHz.
const FRAME_SAMPLES: usize = 1_600;

/// Number of frames of pre-roll kept before speech onset (1.0 s).
const PRE_BUFFER_FRAMES: usize = 10;

/// RMS energy above which a frame counts as speech.
/// Tune up for noisy environments.
pub const SILENCE_THRESHOLD: f32 = 0.02;

/// Number of trailing silent frames (2.0 s) before a segment closes.
const HANGOVER_FRAMES: usize = 20;

/// Segment size cap in samples: 2 minutes of audio at 16 kHz.
const MAX_SEGMENT_SAMPLES: usize = 16_000 * 120;
|
||||||
|
|
||||||
pub fn decode(input: &str) -> Result<Vec<f32>, Box<dyn std::error::Error>> {
|
pub fn decode(input: &str) -> Result<Vec<f32>, Box<dyn std::error::Error>> {
|
||||||
let output = Command::new("ffmpeg")
|
let output = Command::new("ffmpeg")
|
||||||
@@ -33,18 +42,12 @@ impl LiveStream {
|
|||||||
Ok(LiveStream { child, stdout })
|
Ok(LiveStream { child, stdout })
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reads exactly `secs` seconds of audio. Returns None when the stream ends.
|
fn next_frame(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
|
||||||
pub fn next_chunk(&mut self, secs: u32) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
|
let mut buf = vec![0u8; FRAME_SAMPLES * 4];
|
||||||
let num_bytes = secs as usize * 16000 * 4;
|
|
||||||
let mut buf = vec![0u8; num_bytes];
|
|
||||||
|
|
||||||
match self.stdout.read_exact(&mut buf) {
|
match self.stdout.read_exact(&mut buf) {
|
||||||
Ok(()) => {
|
Ok(()) => Ok(Some(buf.chunks_exact(4)
|
||||||
let samples = buf.chunks_exact(4)
|
.map(|b| f32::from_le_bytes(b.try_into().unwrap()))
|
||||||
.map(|b| f32::from_le_bytes(b.try_into().unwrap()))
|
.collect())),
|
||||||
.collect();
|
|
||||||
Ok(Some(samples))
|
|
||||||
}
|
|
||||||
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
|
Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => Ok(None),
|
||||||
Err(e) => Err(e.into()),
|
Err(e) => Err(e.into()),
|
||||||
}
|
}
|
||||||
@@ -56,3 +59,67 @@ impl Drop for LiveStream {
|
|||||||
let _ = self.child.kill();
|
let _ = self.child.kill();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the root-mean-square energy of `samples`.
///
/// An empty slice yields `0.0` (silence) instead of the `NaN` that the
/// unguarded `0.0 / 0.0` division would produce.
fn rms(samples: &[f32]) -> f32 {
    if samples.is_empty() {
        return 0.0;
    }
    let sum_of_squares: f32 = samples.iter().map(|&s| s * s).sum();
    (sum_of_squares / samples.len() as f32).sqrt()
}
|
||||||
|
|
||||||
|
pub struct VadStream {
|
||||||
|
inner: LiveStream,
|
||||||
|
pre_buffer: VecDeque<Vec<f32>>,
|
||||||
|
running: Arc<AtomicBool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VadStream {
|
||||||
|
pub fn open(source: &str, running: Arc<AtomicBool>) -> Result<Self, Box<dyn std::error::Error>> {
|
||||||
|
Ok(VadStream {
|
||||||
|
inner: LiveStream::open(source)?,
|
||||||
|
pre_buffer: VecDeque::with_capacity(PRE_BUFFER_FRAMES + 1),
|
||||||
|
running,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Function itself blocks until a complete speech segment is captured, then returns it.
|
||||||
|
/// returns None when the underlying stream ends or running is set to false.
|
||||||
|
pub fn next_segment(&mut self) -> Result<Option<Vec<f32>>, Box<dyn std::error::Error>> {
|
||||||
|
let mut speech: Vec<f32> = Vec::new();
|
||||||
|
let mut speech_active = false;
|
||||||
|
let mut hangover = 0usize;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if !self.running.load(Ordering::SeqCst) {
|
||||||
|
return if speech.is_empty() { Ok(None) } else { Ok(Some(speech)) };
|
||||||
|
}
|
||||||
|
|
||||||
|
let frame = match self.inner.next_frame()? {
|
||||||
|
Some(f) => f,
|
||||||
|
None => return if speech.is_empty() { Ok(None) } else { Ok(Some(speech)) },
|
||||||
|
};
|
||||||
|
|
||||||
|
let energy = rms(&frame);
|
||||||
|
|
||||||
|
if energy > SILENCE_THRESHOLD {
|
||||||
|
if !speech_active {
|
||||||
|
speech_active = true;
|
||||||
|
eprintln!(" [recording]");
|
||||||
|
for pre in self.pre_buffer.drain(..) {
|
||||||
|
speech.extend(pre);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
speech.extend(&frame);
|
||||||
|
hangover = 0;
|
||||||
|
} else if speech_active {
|
||||||
|
speech.extend(&frame);
|
||||||
|
hangover += 1;
|
||||||
|
if hangover >= HANGOVER_FRAMES || speech.len() >= MAX_SEGMENT_SAMPLES {
|
||||||
|
return Ok(Some(speech));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if self.pre_buffer.len() >= PRE_BUFFER_FRAMES {
|
||||||
|
self.pre_buffer.pop_front();
|
||||||
|
}
|
||||||
|
self.pre_buffer.push_back(frame);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
+74
-16
@@ -1,7 +1,7 @@
|
|||||||
use std::fs::OpenOptions;
|
use std::fs::{self, OpenOptions};
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
use std::sync::Arc;
|
use std::sync::{mpsc, Arc};
|
||||||
use chrono::{DateTime, Local};
|
use chrono::{DateTime, Local};
|
||||||
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
|
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
|
||||||
|
|
||||||
@@ -9,18 +9,41 @@ mod audio;
|
|||||||
|
|
||||||
const CHUNK_SECS: u32 = 30;
|
const CHUNK_SECS: u32 = 30;
|
||||||
const CHUNK_SAMPLES: usize = 16000 * CHUNK_SECS as usize;
|
const CHUNK_SAMPLES: usize = 16000 * CHUNK_SECS as usize;
|
||||||
|
const CLIP_DIR: &str = "./audio_clips";
|
||||||
|
|
||||||
|
fn save_clip(samples: &[f32], path: &str) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let spec = hound::WavSpec {
|
||||||
|
channels: 1,
|
||||||
|
sample_rate: 16000,
|
||||||
|
bits_per_sample: 16,
|
||||||
|
sample_format: hound::SampleFormat::Int,
|
||||||
|
};
|
||||||
|
let mut writer = hound::WavWriter::create(path, spec)?;
|
||||||
|
for &s in samples {
|
||||||
|
writer.write_sample((s.clamp(-1.0, 1.0) * i16::MAX as f32) as i16)?;
|
||||||
|
}
|
||||||
|
writer.finalize()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn transcribe_chunk(
|
fn transcribe_chunk(
|
||||||
state: &mut whisper_rs::WhisperState,
|
state: &mut whisper_rs::WhisperState,
|
||||||
chunk: &[f32],
|
chunk: &[f32],
|
||||||
out: &mut impl Write,
|
out: &mut impl Write,
|
||||||
|
counter: &mut u32,
|
||||||
) -> Result<(), Box<dyn std::error::Error>> {
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
let time: DateTime<Local> = Local::now();
|
||||||
|
let id = format!("{}_{:04}", time.format("%Y%m%d_%H%M%S"), counter);
|
||||||
|
*counter += 1;
|
||||||
|
|
||||||
|
let clip_path = format!("{}/{}.wav", CLIP_DIR, id);
|
||||||
|
save_clip(chunk, &clip_path)?;
|
||||||
|
|
||||||
let params = FullParams::new(SamplingStrategy::BeamSearch {
|
let params = FullParams::new(SamplingStrategy::BeamSearch {
|
||||||
beam_size: 5,
|
beam_size: 5,
|
||||||
patience: -1.0,
|
patience: -1.0,
|
||||||
});
|
});
|
||||||
let time: DateTime<Local> = Local::now();
|
out.write_all(format!("[{}] [{}]: ", time, id).as_bytes())?;
|
||||||
out.write_all(format!("[{}]: ", time.to_string()).as_bytes())?;
|
|
||||||
state.full(params, chunk)?;
|
state.full(params, chunk)?;
|
||||||
for segment in state.as_iter() {
|
for segment in state.as_iter() {
|
||||||
let line = format!("{}\n", segment);
|
let line = format!("{}\n", segment);
|
||||||
@@ -36,36 +59,71 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||||||
let audio_arg = std::env::args().nth(2).expect("usage: listener <model> <file.wav|pulse:SOURCE> [output.txt]");
|
let audio_arg = std::env::args().nth(2).expect("usage: listener <model> <file.wav|pulse:SOURCE> [output.txt]");
|
||||||
let output_path = std::env::args().nth(3).unwrap_or_else(|| "transcription.txt".to_string());
|
let output_path = std::env::args().nth(3).unwrap_or_else(|| "transcription.txt".to_string());
|
||||||
|
|
||||||
|
fs::create_dir_all(CLIP_DIR)?;
|
||||||
|
|
||||||
let running = Arc::new(AtomicBool::new(true));
|
let running = Arc::new(AtomicBool::new(true));
|
||||||
let r = running.clone();
|
let r = running.clone();
|
||||||
ctrlc::set_handler(move || {
|
ctrlc::set_handler(move || {
|
||||||
r.store(false, Ordering::SeqCst);
|
r.store(false, Ordering::SeqCst);
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
|
eprintln!("Transcribing {} → {} (clips → {}) (Ctrl+C to stop)", audio_arg, output_path, CLIP_DIR);
|
||||||
.expect("failed to load model");
|
|
||||||
let mut state = ctx.create_state().expect("failed to create state");
|
|
||||||
|
|
||||||
let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
|
|
||||||
|
|
||||||
eprintln!("Transcribing {} → {} (Ctrl+C to stop)", audio_arg, output_path);
|
|
||||||
|
|
||||||
if let Some(source) = audio_arg.strip_prefix("pulse:") {
|
if let Some(source) = audio_arg.strip_prefix("pulse:") {
|
||||||
let mut stream = audio::LiveStream::open(source)?;
|
eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", audio::SILENCE_THRESHOLD);
|
||||||
while running.load(Ordering::SeqCst) {
|
|
||||||
match stream.next_chunk(CHUNK_SECS)? {
|
let (tx, rx) = mpsc::channel::<Vec<f32>>();
|
||||||
Some(chunk) => transcribe_chunk(&mut state, &chunk, &mut out)?,
|
|
||||||
|
// Transcription runs on a background thread so that capture is never blocked.
|
||||||
|
let model_path_t = model_path.clone();
|
||||||
|
let output_path_t = output_path.clone();
|
||||||
|
let transcription_thread = std::thread::spawn(move || {
|
||||||
|
let ctx = WhisperContext::new_with_params(&model_path_t, WhisperContextParameters::default())
|
||||||
|
.expect("failed to load model");
|
||||||
|
let mut state = ctx.create_state().expect("failed to create state");
|
||||||
|
let mut out = OpenOptions::new().create(true).append(true).open(&output_path_t)
|
||||||
|
.expect("failed to open output file");
|
||||||
|
let mut counter: u32 = 0;
|
||||||
|
for segment in rx {
|
||||||
|
let secs = segment.len() as f32 / 16000.0;
|
||||||
|
eprintln!(" [transcribing {:.1}s segment…]", secs);
|
||||||
|
if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter) {
|
||||||
|
eprintln!("Transcription error: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Capture loop: never pauses for transcription.
|
||||||
|
let mut stream = audio::VadStream::open(source, running.clone())?;
|
||||||
|
loop {
|
||||||
|
match stream.next_segment()? {
|
||||||
|
Some(segment) => {
|
||||||
|
let secs = segment.len() as f32 / 16000.0;
|
||||||
|
eprintln!(" [captured {:.1}s, queued for transcription]", secs);
|
||||||
|
if tx.send(segment).is_err() {
|
||||||
|
break; // transcription thread died
|
||||||
|
}
|
||||||
|
}
|
||||||
None => break,
|
None => break,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
drop(tx); // closing the channel signals the transcription thread to finish
|
||||||
|
transcription_thread.join().expect("transcription thread panicked");
|
||||||
} else {
|
} else {
|
||||||
|
let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
|
||||||
|
.expect("failed to load model");
|
||||||
|
let mut state = ctx.create_state().expect("failed to create state");
|
||||||
|
let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
|
||||||
|
let mut counter: u32 = 0;
|
||||||
|
|
||||||
while running.load(Ordering::SeqCst) {
|
while running.load(Ordering::SeqCst) {
|
||||||
let audio = audio::decode(&audio_arg)?;
|
let audio = audio::decode(&audio_arg)?;
|
||||||
for chunk in audio.chunks(CHUNK_SAMPLES) {
|
for chunk in audio.chunks(CHUNK_SAMPLES) {
|
||||||
if !running.load(Ordering::SeqCst) {
|
if !running.load(Ordering::SeqCst) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
transcribe_chunk(&mut state, chunk, &mut out)?;
|
transcribe_chunk(&mut state, chunk, &mut out, &mut counter)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user