fix: simplify audio callback, use deque for pre-roll, add worker timeout warning

- Remove frame_buf accumulation: blocksize=FRAME_SIZE guarantees indata is
  exactly FRAME_SIZE samples, so buffering was unnecessary. Pass
  indata[:, 0].copy() to the VAD: it keeps frames in pre_roll/segment after the
  callback returns, and sounddevice may reuse the indata buffer by then.
- Replace pre_roll list with collections.deque(maxlen=PRE_ROLL_FRAMES) so the
  oldest frame is evicted automatically, eliminating the manual length check
  and pop(0) on every frame.
- Warn to stderr if the transcription worker thread outlives its 30s join timeout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 02:48:51 +08:00
parent 747a4772b6
commit cf18335235
+10 -16
View File
@@ -1,5 +1,6 @@
import sys import sys
import argparse import argparse
import collections
import queue import queue
import threading import threading
import time import time
@@ -92,7 +93,7 @@ class VADStateMachine:
self.speaking = False self.speaking = False
self.speech_frames = 0 self.speech_frames = 0
self.silence_frames = 0 self.silence_frames = 0
self.pre_roll = [] self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
self.segment = [] self.segment = []
self.segment_start_time = 0.0 self.segment_start_time = 0.0
@@ -104,8 +105,6 @@ class VADStateMachine:
if not self.speaking: if not self.speaking:
self.pre_roll.append(frame) self.pre_roll.append(frame)
if len(self.pre_roll) > PRE_ROLL_FRAMES:
self.pre_roll.pop(0)
if is_loud: if is_loud:
self.speech_frames += 1 self.speech_frames += 1
@@ -114,7 +113,7 @@ class VADStateMachine:
self.silence_frames = 0 self.silence_frames = 0
self.segment = list(self.pre_roll) self.segment = list(self.pre_roll)
self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE) self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE)
self.pre_roll = [] self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
else: else:
self.speech_frames = 0 self.speech_frames = 0
return None return None
@@ -134,7 +133,7 @@ class VADStateMachine:
self.speech_frames = 0 self.speech_frames = 0
self.silence_frames = 0 self.silence_frames = 0
self.segment = [] self.segment = []
self.pre_roll = [] self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
return result return result
return None return None
@@ -162,20 +161,13 @@ def stream_transcribe(processor, model, language):
worker = threading.Thread(target=transcription_worker, daemon=True) worker = threading.Thread(target=transcription_worker, daemon=True)
worker.start() worker.start()
frame_buf = np.empty(0, dtype="float32")
def audio_callback(indata, frames, time_info, status): def audio_callback(indata, frames, time_info, status):
nonlocal frame_buf
if stop_event.is_set(): if stop_event.is_set():
return return
frame_buf = np.append(frame_buf, indata[:, 0]) elapsed = time.monotonic() - start_time
while len(frame_buf) >= FRAME_SIZE: result = vad.process_frame(indata[:, 0].copy(), elapsed)
frame = frame_buf[:FRAME_SIZE] if result is not None:
frame_buf = frame_buf[FRAME_SIZE:] seg_queue.put(result)
elapsed = time.monotonic() - start_time
result = vad.process_frame(frame, elapsed)
if result is not None:
seg_queue.put(result)
print("Listening... (Ctrl+C to stop)") print("Listening... (Ctrl+C to stop)")
stream = sd.InputStream( stream = sd.InputStream(
@@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language):
seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment))) seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
worker.join(timeout=30) worker.join(timeout=30)
if worker.is_alive():
print("Warning: transcription worker did not finish in time.", file=sys.stderr)
print("\nDone.") print("\nDone.")