fix: simplify audio callback, use deque for pre-roll, add worker timeout warning

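- Remove frame_buf accumulation: blocksize=FRAME_SIZE guarantees indata is
  exactly FRAME_SIZE samples, so buffering was unnecessary. Pass
  indata[:, 0].copy() to the VAD: the old np.append made an implicit copy, but a
  bare indata[:, 0] view would go stale once sounddevice reuses its buffer, and
  frames are retained in pre_roll/segment after the callback returns.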
- Replace pre_roll list with collections.deque(maxlen=PRE_ROLL_FRAMES) so the
  oldest frame is evicted automatically on append, eliminating the manual
  length check and O(n) pop(0) on every frame.
- Warn to stderr if the transcription worker thread outlives its 30s join timeout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 02:48:51 +08:00
parent 747a4772b6
commit cf18335235
+10 -16
View File
@@ -1,5 +1,6 @@
import sys
import argparse
import collections
import queue
import threading
import time
@@ -92,7 +93,7 @@ class VADStateMachine:
self.speaking = False
self.speech_frames = 0
self.silence_frames = 0
self.pre_roll = []
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
self.segment = []
self.segment_start_time = 0.0
@@ -104,8 +105,6 @@ class VADStateMachine:
if not self.speaking:
self.pre_roll.append(frame)
if len(self.pre_roll) > PRE_ROLL_FRAMES:
self.pre_roll.pop(0)
if is_loud:
self.speech_frames += 1
@@ -114,7 +113,7 @@ class VADStateMachine:
self.silence_frames = 0
self.segment = list(self.pre_roll)
self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE)
self.pre_roll = []
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
else:
self.speech_frames = 0
return None
@@ -134,7 +133,7 @@ class VADStateMachine:
self.speech_frames = 0
self.silence_frames = 0
self.segment = []
self.pre_roll = []
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
return result
return None
@@ -162,20 +161,13 @@ def stream_transcribe(processor, model, language):
worker = threading.Thread(target=transcription_worker, daemon=True)
worker.start()
frame_buf = np.empty(0, dtype="float32")
def audio_callback(indata, frames, time_info, status):
nonlocal frame_buf
if stop_event.is_set():
return
frame_buf = np.append(frame_buf, indata[:, 0])
while len(frame_buf) >= FRAME_SIZE:
frame = frame_buf[:FRAME_SIZE]
frame_buf = frame_buf[FRAME_SIZE:]
elapsed = time.monotonic() - start_time
result = vad.process_frame(frame, elapsed)
if result is not None:
seg_queue.put(result)
elapsed = time.monotonic() - start_time
result = vad.process_frame(indata[:, 0].copy(), elapsed)
if result is not None:
seg_queue.put(result)
print("Listening... (Ctrl+C to stop)")
stream = sd.InputStream(
@@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language):
seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
worker.join(timeout=30)
if worker.is_alive():
print("Warning: transcription worker did not finish in time.", file=sys.stderr)
print("\nDone.")