fix: simplify audio callback, use deque for pre-roll, add worker timeout warning

- Remove frame_buf accumulation: blocksize=FRAME_SIZE guarantees indata is exactly FRAME_SIZE samples, so buffering was unnecessary. Use indata[:, 0].copy() to avoid stale references from sounddevice's buffer reuse.
- Replace pre_roll list with collections.deque(maxlen=PRE_ROLL_FRAMES) to eliminate manual bounds-checking (pop(0)) on every frame.
- Warn to stderr if the transcription worker thread outlives its 30s join timeout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 02:48:51 +08:00
parent 747a4772b6
commit cf18335235
1 changed files with 10 additions and 16 deletions
@@ -1,5 +1,6 @@
 import sys
 import argparse
+import collections
 import queue
 import threading
 import time
@@ -92,7 +93,7 @@ class VADStateMachine:
        self.speaking = False
        self.speech_frames = 0
        self.silence_frames = 0
-        self.pre_roll = []
+        self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
        self.segment = []
        self.segment_start_time = 0.0

@@ -104,8 +105,6 @@ class VADStateMachine:

        if not self.speaking:
            self.pre_roll.append(frame)
-            if len(self.pre_roll) > PRE_ROLL_FRAMES:
-                self.pre_roll.pop(0)

            if is_loud:
                self.speech_frames += 1
@@ -114,7 +113,7 @@ class VADStateMachine:
                    self.silence_frames = 0
                    self.segment = list(self.pre_roll)
                    self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE)
-                    self.pre_roll = []
+                    self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
            else:
                self.speech_frames = 0
            return None
@@ -134,7 +133,7 @@ class VADStateMachine:
            self.speech_frames = 0
            self.silence_frames = 0
            self.segment = []
-            self.pre_roll = []
+            self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
            return result

        return None
@@ -162,20 +161,13 @@ def stream_transcribe(processor, model, language):
    worker = threading.Thread(target=transcription_worker, daemon=True)
    worker.start()

-    frame_buf = np.empty(0, dtype="float32")
-
    def audio_callback(indata, frames, time_info, status):
-        nonlocal frame_buf
        if stop_event.is_set():
            return
-        frame_buf = np.append(frame_buf, indata[:, 0])
-        while len(frame_buf) >= FRAME_SIZE:
-            frame = frame_buf[:FRAME_SIZE]
-            frame_buf = frame_buf[FRAME_SIZE:]
-            elapsed = time.monotonic() - start_time
-            result = vad.process_frame(frame, elapsed)
-            if result is not None:
-                seg_queue.put(result)
+        elapsed = time.monotonic() - start_time
+        result = vad.process_frame(indata[:, 0].copy(), elapsed)
+        if result is not None:
+            seg_queue.put(result)

    print("Listening... (Ctrl+C to stop)")
    stream = sd.InputStream(
@@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language):
        seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))

    worker.join(timeout=30)
+    if worker.is_alive():
+        print("Warning: transcription worker did not finish in time.", file=sys.stderr)
    print("\nDone.")