From cf1833523518bcce4edfc3ccadc5206bc058e01e Mon Sep 17 00:00:00 2001 From: Wong Ding Feng Date: Fri, 29 May 2026 02:48:51 +0800 Subject: [PATCH] fix: simplify audio callback, use deque for pre-roll, add worker timeout warning - Remove frame_buf accumulation: blocksize=FRAME_SIZE guarantees indata is exactly FRAME_SIZE samples, so buffering was unnecessary. Use indata[:, 0].copy() to avoid stale references from sounddevice's buffer reuse. - Replace pre_roll list with collections.deque(maxlen=PRE_ROLL_FRAMES) to eliminate manual bounds-checking (pop(0)) on every frame. - Warn to stderr if the transcription worker thread outlives its 30s join timeout. Co-Authored-By: Claude Sonnet 4.6 --- transcribe.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/transcribe.py b/transcribe.py index 2212cc1..4f8c097 100644 --- a/transcribe.py +++ b/transcribe.py @@ -1,5 +1,6 @@ import sys import argparse +import collections import queue import threading import time @@ -92,7 +93,7 @@ class VADStateMachine: self.speaking = False self.speech_frames = 0 self.silence_frames = 0 - self.pre_roll = [] + self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES) self.segment = [] self.segment_start_time = 0.0 @@ -104,8 +105,6 @@ class VADStateMachine: if not self.speaking: self.pre_roll.append(frame) - if len(self.pre_roll) > PRE_ROLL_FRAMES: - self.pre_roll.pop(0) if is_loud: self.speech_frames += 1 @@ -114,7 +113,7 @@ class VADStateMachine: self.silence_frames = 0 self.segment = list(self.pre_roll) self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE) - self.pre_roll = [] + self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES) else: self.speech_frames = 0 return None @@ -134,7 +133,7 @@ class VADStateMachine: self.speech_frames = 0 self.silence_frames = 0 self.segment = [] - self.pre_roll = [] + self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES) return result return None @@ -162,20 +161,13 @@ def stream_transcribe(processor, model, language): worker = threading.Thread(target=transcription_worker, daemon=True) worker.start() - frame_buf = np.empty(0, dtype="float32") - def audio_callback(indata, frames, time_info, status): - nonlocal frame_buf if stop_event.is_set(): return - frame_buf = np.append(frame_buf, indata[:, 0]) - while len(frame_buf) >= FRAME_SIZE: - frame = frame_buf[:FRAME_SIZE] - frame_buf = frame_buf[FRAME_SIZE:] - elapsed = time.monotonic() - start_time - result = vad.process_frame(frame, elapsed) - if result is not None: - seg_queue.put(result) + elapsed = time.monotonic() - start_time + result = vad.process_frame(indata[:, 0].copy(), elapsed) + if result is not None: + seg_queue.put(result) print("Listening... (Ctrl+C to stop)") stream = sd.InputStream( @@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language): seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment))) worker.join(timeout=30) + if worker.is_alive(): + print("Warning: transcription worker did not finish in time.", file=sys.stderr) print("\nDone.")