listener: implement gRPC for outgoing transcriptions and clips
This commit is contained in:
Generated
+936
-1
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,13 @@ edition = "2024"
|
||||
chrono = "0.4.44"
|
||||
ctrlc = "3.5.2"
|
||||
hound = "3.5.1"
|
||||
prost = "0.14.3"
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
toml = "1.1.2"
|
||||
tonic = "0.14.6"
|
||||
tonic-prost = "0.14.6"
|
||||
whisper-rs = "0.16.0"
|
||||
|
||||
[build-dependencies]
|
||||
tonic-prost-build = "0.14.6"
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
tonic_prost_build::configure()
|
||||
.build_server(false)
|
||||
.compile_protos(&["../server/proto/message.proto"], &["../server/proto"])?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -7,11 +7,14 @@ model = "models/ggml-base.en.bin"
|
||||
# audio = "pulse:default"
|
||||
audio = "pulse:default"
|
||||
|
||||
# Where to write the transcript
|
||||
output = "transcription.txt"
|
||||
# gRPC server address
|
||||
server_addr = "http://localhost:3001"
|
||||
|
||||
# Directory to store per-segment WAV clips
|
||||
clip_dir = "./audio_clips"
|
||||
# Channel UUID to post transcriptions to (required by the server)
|
||||
channel_id = "00000000-0000-0000-0000-000000000000"
|
||||
|
||||
# Directory for temporary WAV files (deleted after upload); defaults to /tmp
|
||||
# temp_dir = "/tmp"
|
||||
|
||||
# File-mode only: how many seconds of audio to transcribe per chunk
|
||||
chunk_secs = 30
|
||||
|
||||
+64
-31
@@ -1,5 +1,4 @@
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::fs;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{mpsc, Arc};
|
||||
use chrono::{DateTime, Local};
|
||||
@@ -8,14 +7,19 @@ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextPar
|
||||
|
||||
mod audio;
|
||||
|
||||
pub mod scannerbot {
|
||||
tonic::include_proto!("scannerbot");
|
||||
}
|
||||
|
||||
use scannerbot::message_service_client::MessageServiceClient;
|
||||
use scannerbot::SendMessageRequest;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Config {
|
||||
model: String,
|
||||
audio: String,
|
||||
#[serde(default = "default_output")]
|
||||
output: String,
|
||||
#[serde(default = "default_clip_dir")]
|
||||
clip_dir: String,
|
||||
server_addr: String,
|
||||
channel_id: String,
|
||||
#[serde(default = "default_chunk_secs")]
|
||||
chunk_secs: u32,
|
||||
#[serde(default = "default_silence_threshold")]
|
||||
@@ -26,15 +30,16 @@ struct Config {
|
||||
hangover_secs: f32,
|
||||
#[serde(default = "default_max_segment_secs")]
|
||||
max_segment_secs: f32,
|
||||
#[serde(default = "default_temp_dir")]
|
||||
temp_dir: String,
|
||||
}
|
||||
|
||||
fn default_output() -> String { "transcription.txt".into() }
|
||||
fn default_clip_dir() -> String { "./audio_clips".into() }
|
||||
fn default_chunk_secs() -> u32 { 30 }
|
||||
fn default_silence_threshold() -> f32 { 0.02 }
|
||||
fn default_pre_buffer_secs() -> f32 { 1.0 }
|
||||
fn default_hangover_secs() -> f32 { 2.0 }
|
||||
fn default_max_segment_secs() -> f32 { 120.0 }
|
||||
fn default_temp_dir() -> String { "/tmp".into() }
|
||||
|
||||
fn save_clip(samples: &[f32], path: &str) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let spec = hound::WavSpec {
|
||||
@@ -54,29 +59,54 @@ fn save_clip(samples: &[f32], path: &str) -> Result<(), Box<dyn std::error::Erro
|
||||
fn transcribe_chunk(
|
||||
state: &mut whisper_rs::WhisperState,
|
||||
chunk: &[f32],
|
||||
out: &mut impl Write,
|
||||
counter: &mut u32,
|
||||
clip_dir: &str,
|
||||
channel_id: &str,
|
||||
server_addr: &str,
|
||||
temp_dir: &str,
|
||||
rt: &tokio::runtime::Runtime,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
let time: DateTime<Local> = Local::now();
|
||||
let id = format!("{}_{:04}", time.format("%Y%m%d_%H%M%S"), counter);
|
||||
*counter += 1;
|
||||
|
||||
let clip_path = format!("{}/{}.wav", clip_dir, id);
|
||||
save_clip(chunk, &clip_path)?;
|
||||
let clip_path = std::path::Path::new(temp_dir).join(format!("scannerbot_{}.wav", id));
|
||||
save_clip(chunk, clip_path.to_str().unwrap())?;
|
||||
|
||||
let params = FullParams::new(SamplingStrategy::BeamSearch {
|
||||
beam_size: 5,
|
||||
patience: -1.0,
|
||||
});
|
||||
out.write_all(format!("[{}] [{}]: ", time, id).as_bytes())?;
|
||||
state.full(params, chunk)?;
|
||||
|
||||
let mut content = String::new();
|
||||
for segment in state.as_iter() {
|
||||
let line = format!("{}\n", segment);
|
||||
print!("{}", line);
|
||||
out.write_all(line.as_bytes())?;
|
||||
content.push_str(&line);
|
||||
}
|
||||
out.flush()?;
|
||||
|
||||
let audio_bytes = fs::read(&clip_path)?;
|
||||
let audio_filename = format!("scannerbot_{}.wav", id);
|
||||
let _ = fs::remove_file(&clip_path);
|
||||
|
||||
let channel_id = channel_id.to_string();
|
||||
let server_addr = server_addr.to_string();
|
||||
let result = rt.block_on(async move {
|
||||
let mut client = MessageServiceClient::connect(server_addr).await?;
|
||||
let request = tonic::Request::new(SendMessageRequest {
|
||||
channel_id,
|
||||
content,
|
||||
audio_filename,
|
||||
audio: audio_bytes,
|
||||
});
|
||||
let response = client.send_message(request).await?;
|
||||
eprintln!(" [message sent: {}]", response.into_inner().id);
|
||||
Ok::<(), Box<dyn std::error::Error + Send + Sync>>(())
|
||||
});
|
||||
if let Err(e) = result {
|
||||
return Err(e.to_string().into());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -89,8 +119,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
|
||||
let model_path = cfg.model.clone();
|
||||
let audio_arg = cfg.audio.clone();
|
||||
let output_path = cfg.output.clone();
|
||||
let clip_dir = cfg.clip_dir.clone();
|
||||
let channel_id = cfg.channel_id.clone();
|
||||
let server_addr = cfg.server_addr.clone();
|
||||
let temp_dir = cfg.temp_dir.clone();
|
||||
let chunk_samples: usize = 16000 * cfg.chunk_secs as usize;
|
||||
|
||||
let vad_cfg = audio::VadConfig {
|
||||
@@ -100,42 +131,42 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
max_segment_samples: (cfg.max_segment_secs * 16000.0) as usize,
|
||||
};
|
||||
|
||||
fs::create_dir_all(&clip_dir)?;
|
||||
|
||||
let running = Arc::new(AtomicBool::new(true));
|
||||
let r = running.clone();
|
||||
ctrlc::set_handler(move || {
|
||||
r.store(false, Ordering::SeqCst);
|
||||
})?;
|
||||
|
||||
eprintln!("Transcribing {} → {} (clips → {}) (Ctrl+C to stop)", audio_arg, output_path, clip_dir);
|
||||
eprintln!("Transcribing {} → {} (Ctrl+C to stop)", audio_arg, server_addr);
|
||||
|
||||
if let Some(source) = audio_arg.strip_prefix("pulse:") {
|
||||
eprintln!("Listening for speech (silence threshold: {:.3} RMS) …", cfg.silence_threshold);
|
||||
|
||||
let (tx, rx) = mpsc::channel::<Vec<f32>>();
|
||||
|
||||
// transcription run goes to a background thread so capture is never blocked.
|
||||
let model_path_t = model_path.clone();
|
||||
let output_path_t = output_path.clone();
|
||||
let clip_dir_t = clip_dir.clone();
|
||||
let channel_id_t = channel_id.clone();
|
||||
let server_addr_t = server_addr.clone();
|
||||
let temp_dir_t = temp_dir.clone();
|
||||
let transcription_thread = std::thread::spawn(move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to build tokio runtime");
|
||||
let ctx = WhisperContext::new_with_params(&model_path_t, WhisperContextParameters::default())
|
||||
.expect("failed to load model");
|
||||
let mut state = ctx.create_state().expect("failed to create state");
|
||||
let mut out = OpenOptions::new().create(true).append(true).open(&output_path_t)
|
||||
.expect("failed to open output file");
|
||||
let mut counter: u32 = 0;
|
||||
for segment in rx {
|
||||
let secs = segment.len() as f32 / 16000.0;
|
||||
eprintln!(" [transcribing {:.1}s segment…]", secs);
|
||||
if let Err(e) = transcribe_chunk(&mut state, &segment, &mut out, &mut counter, &clip_dir_t) {
|
||||
if let Err(e) = transcribe_chunk(&mut state, &segment, &mut counter, &channel_id_t, &server_addr_t, &temp_dir_t, &rt) {
|
||||
eprintln!("Transcription error: {e}");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// capture loop.. never pauses for transcription.
|
||||
// capture loop never pauses for transcription
|
||||
let mut stream = audio::VadStream::open(source, running.clone(), vad_cfg)?;
|
||||
loop {
|
||||
match stream.next_segment()? {
|
||||
@@ -143,20 +174,22 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let secs = segment.len() as f32 / 16000.0;
|
||||
eprintln!(" [captured {:.1}s, queued for transcription]", secs);
|
||||
if tx.send(segment).is_err() {
|
||||
break; // transcription thread died
|
||||
break;
|
||||
}
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
drop(tx); // closing the channel signals the transcription thread to finish
|
||||
drop(tx);
|
||||
transcription_thread.join().expect("transcription thread panicked");
|
||||
} else {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
|
||||
.expect("failed to load model");
|
||||
let mut state = ctx.create_state().expect("failed to create state");
|
||||
let mut out = OpenOptions::new().create(true).append(true).open(&output_path)?;
|
||||
let mut counter: u32 = 0;
|
||||
|
||||
while running.load(Ordering::SeqCst) {
|
||||
@@ -165,7 +198,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
if !running.load(Ordering::SeqCst) {
|
||||
break;
|
||||
}
|
||||
transcribe_chunk(&mut state, chunk, &mut out, &mut counter, &clip_dir)?;
|
||||
transcribe_chunk(&mut state, chunk, &mut counter, &channel_id, &server_addr, &temp_dir, &rt)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user