# Silence calibration and voice-activity detection (VAD) helpers for
# transcribe.py ("feat: add silence calibration and VAD state machine").


def _rms(samples):
    """Return the root-mean-square level of *samples* as a Python float.

    Samples are promoted to float64 before squaring so that integer PCM
    input (for example int16) cannot wrap around. An empty input yields
    0.0 instead of NaN.
    """
    data = np.asarray(samples, dtype=np.float64)
    if data.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(data))))


def calibrate_silence(duration=0.5):
    """Record ambient audio and derive a loudness threshold from it.

    Records ``duration`` seconds of mono float32 audio through ``sd.rec``
    at ``SAMPLE_RATE``, waits for the recording to finish, and measures
    its RMS level.

    Args:
        duration: Length of the calibration recording in seconds.

    Returns:
        The threshold as a float: three times the ambient RMS, but never
        lower than 0.01.
    """
    print("Calibrating silence threshold...")
    audio = sd.rec(
        int(duration * SAMPLE_RATE),
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="float32",
    )
    sd.wait()
    rms = _rms(audio)
    # A NaN/inf level would propagate into the threshold and make every
    # later "rms > threshold" comparison False, silently disabling
    # detection; treat an unusable measurement as silence instead.
    if not np.isfinite(rms):
        rms = 0.0
    threshold = max(rms * 3, 0.01)
    print(f"  Ambient RMS: {rms:.4f}, threshold: {threshold:.4f}")
    return threshold


FRAME_SIZE = 800          # 50ms at 16kHz
PRE_ROLL_FRAMES = 6       # ~0.3s of audio before speech onset
SILENCE_FRAMES = 16       # ~0.8s of silence to end a segment
SPEECH_ONSET_FRAMES = 3   # ~150ms of speech to trigger
MAX_SPEECH_SECONDS = 30   # force chunk boundary


class VADStateMachine:
    """Energy-based voice-activity detector fed one frame at a time.

    While idle, the most recent ``PRE_ROLL_FRAMES`` frames are buffered.
    After ``SPEECH_ONSET_FRAMES`` consecutive loud frames the detector
    starts a segment seeded with that buffer. The segment ends after
    ``SILENCE_FRAMES`` consecutive quiet frames or once it reaches
    ``MAX_SPEECH_SECONDS``.
    """

    def __init__(self, threshold, sample_rate=None):
        """Create an idle detector.

        Args:
            threshold: RMS level above which a frame counts as loud.
            sample_rate: Samples per second used to convert frame counts
                to seconds. Defaults to the module-level ``SAMPLE_RATE``.

        Raises:
            ValueError: If the effective sample rate is not positive.
        """
        if sample_rate is None:
            sample_rate = SAMPLE_RATE
        if sample_rate <= 0:
            raise ValueError("sample_rate must be positive")
        self.threshold = threshold
        self.sample_rate = sample_rate
        self.speaking = False
        self.speech_frames = 0
        self.silence_frames = 0
        self.pre_roll = []
        self.segment = []
        self.segment_start_time = 0.0

    def _reset(self):
        """Return to the idle state, dropping all buffered audio."""
        self.speaking = False
        self.speech_frames = 0
        self.silence_frames = 0
        self.segment = []
        self.pre_roll = []

    def process_frame(self, frame, elapsed_time):
        """Process one 50ms frame.

        Args:
            frame: Array of audio samples for this frame.
            elapsed_time: Stream time in seconds associated with this
                frame; used to estimate where the segment started.

        Returns:
            A ``(start_time, audio_array)`` tuple when a complete speech
            segment is detected, otherwise ``None``.
        """
        is_loud = _rms(frame) > self.threshold

        if not self.speaking:
            # Keep a short rolling history so the start of an utterance
            # is not clipped once onset is confirmed.
            self.pre_roll.append(frame)
            if len(self.pre_roll) > PRE_ROLL_FRAMES:
                self.pre_roll.pop(0)

            if not is_loud:
                # Onset requires consecutive loud frames.
                self.speech_frames = 0
                return None

            self.speech_frames += 1
            if self.speech_frames >= SPEECH_ONSET_FRAMES:
                self.speaking = True
                self.silence_frames = 0
                self.segment = list(self.pre_roll)
                buffered = len(self.pre_roll) * FRAME_SIZE / self.sample_rate
                self.segment_start_time = max(0.0, elapsed_time - buffered)
                self.pre_roll = []
            return None

        # Currently speaking
        self.segment.append(frame)
        self.silence_frames = 0 if is_loud else self.silence_frames + 1

        segment_duration = len(self.segment) * FRAME_SIZE / self.sample_rate
        if (
            self.silence_frames >= SILENCE_FRAMES
            or segment_duration >= MAX_SPEECH_SECONDS
        ):
            result = (self.segment_start_time, np.concatenate(self.segment))
            self._reset()
            return result

        return None


def stream_transcribe(processor, model, language):
    """Placeholder for streaming transcription; only prints a TODO notice."""
    print("TODO: streaming mode")