fix: simplify audio callback, use deque for pre-roll, add worker timeout warning
- Remove frame_buf accumulation: blocksize=FRAME_SIZE guarantees indata is exactly FRAME_SIZE samples, so buffering was unnecessary. Use indata[:, 0].copy() to avoid stale references from sounddevice's buffer reuse.
- Replace pre_roll list with collections.deque(maxlen=PRE_ROLL_FRAMES) to eliminate manual bounds-checking (pop(0)) on every frame.
- Warn to stderr if the transcription worker thread outlives its 30s join timeout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+10
-16
@@ -1,5 +1,6 @@
|
||||
import sys
|
||||
import argparse
|
||||
import collections
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
@@ -92,7 +93,7 @@ class VADStateMachine:
|
||||
self.speaking = False
|
||||
self.speech_frames = 0
|
||||
self.silence_frames = 0
|
||||
self.pre_roll = []
|
||||
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
|
||||
self.segment = []
|
||||
self.segment_start_time = 0.0
|
||||
|
||||
@@ -104,8 +105,6 @@ class VADStateMachine:
|
||||
|
||||
if not self.speaking:
|
||||
self.pre_roll.append(frame)
|
||||
if len(self.pre_roll) > PRE_ROLL_FRAMES:
|
||||
self.pre_roll.pop(0)
|
||||
|
||||
if is_loud:
|
||||
self.speech_frames += 1
|
||||
@@ -114,7 +113,7 @@ class VADStateMachine:
|
||||
self.silence_frames = 0
|
||||
self.segment = list(self.pre_roll)
|
||||
self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE)
|
||||
self.pre_roll = []
|
||||
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
|
||||
else:
|
||||
self.speech_frames = 0
|
||||
return None
|
||||
@@ -134,7 +133,7 @@ class VADStateMachine:
|
||||
self.speech_frames = 0
|
||||
self.silence_frames = 0
|
||||
self.segment = []
|
||||
self.pre_roll = []
|
||||
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
|
||||
return result
|
||||
|
||||
return None
|
||||
@@ -162,20 +161,13 @@ def stream_transcribe(processor, model, language):
|
||||
worker = threading.Thread(target=transcription_worker, daemon=True)
|
||||
worker.start()
|
||||
|
||||
frame_buf = np.empty(0, dtype="float32")
|
||||
|
||||
def audio_callback(indata, frames, time_info, status):
|
||||
nonlocal frame_buf
|
||||
if stop_event.is_set():
|
||||
return
|
||||
frame_buf = np.append(frame_buf, indata[:, 0])
|
||||
while len(frame_buf) >= FRAME_SIZE:
|
||||
frame = frame_buf[:FRAME_SIZE]
|
||||
frame_buf = frame_buf[FRAME_SIZE:]
|
||||
elapsed = time.monotonic() - start_time
|
||||
result = vad.process_frame(frame, elapsed)
|
||||
if result is not None:
|
||||
seg_queue.put(result)
|
||||
elapsed = time.monotonic() - start_time
|
||||
result = vad.process_frame(indata[:, 0].copy(), elapsed)
|
||||
if result is not None:
|
||||
seg_queue.put(result)
|
||||
|
||||
print("Listening... (Ctrl+C to stop)")
|
||||
stream = sd.InputStream(
|
||||
@@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language):
|
||||
seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
|
||||
|
||||
worker.join(timeout=30)
|
||||
if worker.is_alive():
|
||||
print("Warning: transcription worker did not finish in time.", file=sys.stderr)
|
||||
print("\nDone.")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user