feat: filter short audio segments (mic bumps) and add debug notebook
Mic bumps produce transient spikes that pass VAD onset detection but contain no real speech — the model hallucinates "thank you" from them. Added a MIN_SPEECH_SECONDS (0.3 s) filter that discards segments whose actual speech portion (segment duration minus the counted silence frames) is too short.

Added a Jupyter notebook (notebooks/audio_debug.ipynb) for real-time audio visualization: it streams RMS and peak amplitude into a live Plotly FigureWidget, then provides post-hoc waveform inspection, segment playback, and side-by-side segment comparison.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -27,3 +27,11 @@ build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/cohere_transcribe"]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"anywidget>=0.11.0",
|
||||
"ipywidgets>=8.1.8",
|
||||
"jupyterlab>=4.5.7",
|
||||
"plotly>=6.7.0",
|
||||
]
|
||||
|
||||
@@ -10,6 +10,7 @@ PRE_ROLL_FRAMES = 6 # ~0.3s of audio before speech onset
|
||||
DEFAULT_SILENCE_FRAMES = 16 # ~0.8s of silence to end a segment
|
||||
SPEECH_ONSET_FRAMES = 3 # ~150ms of speech to trigger
|
||||
MAX_SPEECH_SECONDS = 30 # force chunk boundary
|
||||
MIN_SPEECH_SECONDS = 0.3 # discard segments shorter than this (mic bumps, clicks)
|
||||
|
||||
|
||||
def pause_seconds_to_frames(seconds: float) -> int:
|
||||
@@ -67,7 +68,10 @@ class VADStateMachine:
|
||||
|
||||
segment_duration = len(self.segment) * FRAME_SIZE / SAMPLE_RATE
|
||||
if self.silence_frames >= self.silence_limit or segment_duration >= MAX_SPEECH_SECONDS:
|
||||
result = (self.segment_start_time, np.concatenate(self.segment))
|
||||
speech_duration = segment_duration - self.silence_frames * FRAME_SIZE / SAMPLE_RATE
|
||||
result = None
|
||||
if speech_duration >= MIN_SPEECH_SECONDS:
|
||||
result = (self.segment_start_time, np.concatenate(self.segment))
|
||||
self.speaking = False
|
||||
self.speech_frames = 0
|
||||
self.silence_frames = 0
|
||||
|
||||
Reference in New Issue
Block a user