diff --git a/transcribe.py b/transcribe.py index 2212cc1..4f8c097 100644 --- a/transcribe.py +++ b/transcribe.py @@ -1,5 +1,6 @@ import sys import argparse +import collections import queue import threading import time @@ -92,7 +93,7 @@ class VADStateMachine: self.speaking = False self.speech_frames = 0 self.silence_frames = 0 - self.pre_roll = [] + self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES) self.segment = [] self.segment_start_time = 0.0 @@ -104,8 +105,6 @@ class VADStateMachine: if not self.speaking: self.pre_roll.append(frame) - if len(self.pre_roll) > PRE_ROLL_FRAMES: - self.pre_roll.pop(0) if is_loud: self.speech_frames += 1 @@ -114,7 +113,7 @@ class VADStateMachine: self.silence_frames = 0 self.segment = list(self.pre_roll) self.segment_start_time = max(0.0, elapsed_time - len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE) - self.pre_roll = [] + self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES) else: self.speech_frames = 0 return None @@ -134,7 +133,7 @@ class VADStateMachine: self.speech_frames = 0 self.silence_frames = 0 self.segment = [] - self.pre_roll = [] + self.pre_roll = collections.deque(maxlen=PRE_ROLL_FRAMES) return result return None @@ -162,20 +161,13 @@ def stream_transcribe(processor, model, language): worker = threading.Thread(target=transcription_worker, daemon=True) worker.start() - frame_buf = np.empty(0, dtype="float32") - def audio_callback(indata, frames, time_info, status): - nonlocal frame_buf if stop_event.is_set(): return - frame_buf = np.append(frame_buf, indata[:, 0]) - while len(frame_buf) >= FRAME_SIZE: - frame = frame_buf[:FRAME_SIZE] - frame_buf = frame_buf[FRAME_SIZE:] - elapsed = time.monotonic() - start_time - result = vad.process_frame(frame, elapsed) - if result is not None: - seg_queue.put(result) + elapsed = time.monotonic() - start_time + result = vad.process_frame(indata[:, 0].copy(), elapsed) + if result is not None: + seg_queue.put(result) print("Listening... (Ctrl+C to stop)") stream = sd.InputStream( @@ -198,6 +190,8 @@ def stream_transcribe(processor, model, language): seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment))) worker.join(timeout=30) + if worker.is_alive(): + print("Warning: transcription worker did not finish in time.", file=sys.stderr) print("\nDone.")