fix: simplify audio callback, use deque for pre-roll, add worker timeout warning
- Remove frame_buf accumulation: blocksize=FRAME_SIZE guarantees indata is exactly FRAME_SIZE samples, so buffering was unnecessary. Use indata[:, 0].copy() to avoid stale references from sounddevice's buffer reuse.
- Replace pre_roll list with collections.deque(maxlen=PRE_ROLL_FRAMES) to eliminate manual bounds-checking (pop(0)) on every frame.
- Warn to stderr if the transcription worker thread outlives its 30s join timeout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+7
-13
@@ -1,5 +1,6 @@
|
|||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
|
import collections
|
||||||
import queue
|
import queue
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
@@ -92,7 +93,7 @@ class VADStateMachine:
|
|||||||
self.speaking = False
|
self.speaking = False
|
||||||
self.speech_frames = 0
|
self.speech_frames = 0
|
||||||
self.silence_frames = 0
|
self.silence_frames = 0
|
||||||
self.pre_roll = []
|
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
|
||||||
self.segment = []
|
self.segment = []
|
||||||
self.segment_start_time = 0.0
|
self.segment_start_time = 0.0
|
||||||
|
|
||||||
@@ -104,8 +105,6 @@ class VADStateMachine:
|
|||||||
|
|
||||||
if not self.speaking:
|
if not self.speaking:
|
||||||
self.pre_roll.append(frame)
|
self.pre_roll.append(frame)
|
||||||
if len(self.pre_roll) > PRE_ROLL_FRAMES:
|
|
||||||
self.pre_roll.pop(0)
|
|
||||||
|
|
||||||
if is_loud:
|
if is_loud:
|
||||||
self.speech_frames += 1
|
self.speech_frames += 1
|
||||||
@@ -114,7 +113,7 @@ class VADStateMachine:
|
|||||||
self.silence_frames = 0
|
self.silence_frames = 0
|
||||||
self.segment = list(self.pre_roll)
|
self.segment = list(self.pre_roll)
|
||||||
self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE)
|
self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE)
|
||||||
self.pre_roll = []
|
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
|
||||||
else:
|
else:
|
||||||
self.speech_frames = 0
|
self.speech_frames = 0
|
||||||
return None
|
return None
|
||||||
@@ -134,7 +133,7 @@ class VADStateMachine:
|
|||||||
self.speech_frames = 0
|
self.speech_frames = 0
|
||||||
self.silence_frames = 0
|
self.silence_frames = 0
|
||||||
self.segment = []
|
self.segment = []
|
||||||
self.pre_roll = []
|
self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
return None
|
return None
|
||||||
@@ -162,18 +161,11 @@ def stream_transcribe(processor, model, language):
|
|||||||
worker = threading.Thread(target=transcription_worker, daemon=True)
|
worker = threading.Thread(target=transcription_worker, daemon=True)
|
||||||
worker.start()
|
worker.start()
|
||||||
|
|
||||||
frame_buf = np.empty(0, dtype="float32")
|
|
||||||
|
|
||||||
def audio_callback(indata, frames, time_info, status):
|
def audio_callback(indata, frames, time_info, status):
|
||||||
nonlocal frame_buf
|
|
||||||
if stop_event.is_set():
|
if stop_event.is_set():
|
||||||
return
|
return
|
||||||
frame_buf = np.append(frame_buf, indata[:, 0])
|
|
||||||
while len(frame_buf) >= FRAME_SIZE:
|
|
||||||
frame = frame_buf[:FRAME_SIZE]
|
|
||||||
frame_buf = frame_buf[FRAME_SIZE:]
|
|
||||||
elapsed = time.monotonic() - start_time
|
elapsed = time.monotonic() - start_time
|
||||||
result = vad.process_frame(frame, elapsed)
|
result = vad.process_frame(indata[:, 0].copy(), elapsed)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
seg_queue.put(result)
|
seg_queue.put(result)
|
||||||
|
|
||||||
@@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language):
|
|||||||
seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
|
seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
|
||||||
|
|
||||||
worker.join(timeout=30)
|
worker.join(timeout=30)
|
||||||
|
if worker.is_alive():
|
||||||
|
print("Warning: transcription worker did not finish in time.", file=sys.stderr)
|
||||||
print("\nDone.")
|
print("\nDone.")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user