From cf1833523518bcce4edfc3ccadc5206bc058e01e Mon Sep 17 00:00:00 2001
From: Wong Ding Feng <dingfengwong@gmail.com>
Date: Fri, 29 May 2026 02:48:51 +0800
Subject: [PATCH] fix: simplify audio callback, use deque for pre-roll, add
 worker timeout warning

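- Remove frame_buf accumulation: blocksize=FRAME_SIZE guarantees indata is
  exactly FRAME_SIZE samples, so buffering was unnecessary. Use indata[:, 0].copy()
  because indata[:, 0] is only a view into a buffer that sounddevice may reuse
  after the callback returns; the removed np.append() call used to make this
  copy implicitly, so it must now be explicit to avoid stale references.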
- Replace pre_roll list with collections.deque(maxlen=PRE_ROLL_FRAMES) so the
  oldest frame is evicted automatically, eliminating the manual per-frame
  length check and pop(0).
- Warn to stderr if the transcription worker thread outlives its 30s join timeout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 transcribe.py | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/transcribe.py b/transcribe.py
index 2212cc1..4f8c097 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -1,5 +1,6 @@
 import sys
 import argparse
+import collections
 import queue
 import threading
 import time
@@ -92,7 +93,7 @@ class VADStateMachine:
         self.speaking = False
         self.speech_frames = 0
         self.silence_frames = 0
-        self.pre_roll = []
+        self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
         self.segment = []
         self.segment_start_time = 0.0
 
@@ -104,8 +105,6 @@ class VADStateMachine:
 
         if not self.speaking:
             self.pre_roll.append(frame)
-            if len(self.pre_roll) > PRE_ROLL_FRAMES:
-                self.pre_roll.pop(0)
 
             if is_loud:
                 self.speech_frames += 1
@@ -114,7 +113,7 @@ class VADStateMachine:
                     self.silence_frames = 0
                     self.segment = list(self.pre_roll)
                     self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE)
-                    self.pre_roll = []
+                    self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
             else:
                 self.speech_frames = 0
             return None
@@ -134,7 +133,7 @@ class VADStateMachine:
             self.speech_frames = 0
             self.silence_frames = 0
             self.segment = []
-            self.pre_roll = []
+            self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
             return result
 
         return None
@@ -162,20 +161,13 @@ def stream_transcribe(processor, model, language):
     worker = threading.Thread(target=transcription_worker, daemon=True)
     worker.start()
 
-    frame_buf = np.empty(0, dtype="float32")
-
     def audio_callback(indata, frames, time_info, status):
-        nonlocal frame_buf
         if stop_event.is_set():
             return
-        frame_buf = np.append(frame_buf, indata[:, 0])
-        while len(frame_buf) >= FRAME_SIZE:
-            frame = frame_buf[:FRAME_SIZE]
-            frame_buf = frame_buf[FRAME_SIZE:]
-            elapsed = time.monotonic() - start_time
-            result = vad.process_frame(frame, elapsed)
-            if result is not None:
-                seg_queue.put(result)
+        elapsed = time.monotonic() - start_time
+        result = vad.process_frame(indata[:, 0].copy(), elapsed)
+        if result is not None:
+            seg_queue.put(result)
 
     print("Listening... (Ctrl+C to stop)")
     stream = sd.InputStream(
@@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language):
         seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
 
     worker.join(timeout=30)
+    if worker.is_alive():
+        print("Warning: transcription worker did not finish in time.", file=sys.stderr)
     print("\nDone.")