feat: filter short audio segments (mic bumps) and add debug notebook
Mic bumps produce transient spikes that pass VAD onset detection but contain no real speech, and the model hallucinates "thank you" from them. Added a MIN_SPEECH_SECONDS (0.3 s) filter that discards segments whose actual speech portion is too short. Also added a Jupyter notebook (notebooks/audio_debug.ipynb) for real-time audio visualization: it streams RMS and peak amplitude into a live Plotly FigureWidget, then provides post-hoc waveform inspection, segment playback, and side-by-side segment comparison. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -27,3 +27,11 @@ build-backend = "hatchling.build"
|
|||||||
|
|
||||||
[tool.hatch.build.targets.wheel]
|
[tool.hatch.build.targets.wheel]
|
||||||
packages = ["src/cohere_transcribe"]
|
packages = ["src/cohere_transcribe"]
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"anywidget>=0.11.0",
|
||||||
|
"ipywidgets>=8.1.8",
|
||||||
|
"jupyterlab>=4.5.7",
|
||||||
|
"plotly>=6.7.0",
|
||||||
|
]
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ PRE_ROLL_FRAMES = 6 # ~0.3s of audio before speech onset
|
|||||||
DEFAULT_SILENCE_FRAMES = 16 # ~0.8s of silence to end a segment
|
DEFAULT_SILENCE_FRAMES = 16 # ~0.8s of silence to end a segment
|
||||||
SPEECH_ONSET_FRAMES = 3 # ~150ms of speech to trigger
|
SPEECH_ONSET_FRAMES = 3 # ~150ms of speech to trigger
|
||||||
MAX_SPEECH_SECONDS = 30 # force chunk boundary
|
MAX_SPEECH_SECONDS = 30 # force chunk boundary
|
||||||
|
MIN_SPEECH_SECONDS = 0.3 # discard segments whose speech portion (excluding trailing silence) is shorter than this (mic bumps, clicks)
|
||||||
|
|
||||||
|
|
||||||
def pause_seconds_to_frames(seconds: float) -> int:
|
def pause_seconds_to_frames(seconds: float) -> int:
|
||||||
@@ -67,7 +68,10 @@ class VADStateMachine:
|
|||||||
|
|
||||||
segment_duration = len(self.segment) * FRAME_SIZE / SAMPLE_RATE
|
segment_duration = len(self.segment) * FRAME_SIZE / SAMPLE_RATE
|
||||||
if self.silence_frames >= self.silence_limit or segment_duration >= MAX_SPEECH_SECONDS:
|
if self.silence_frames >= self.silence_limit or segment_duration >= MAX_SPEECH_SECONDS:
|
||||||
result = (self.segment_start_time, np.concatenate(self.segment))
|
speech_duration = segment_duration - self.silence_frames * FRAME_SIZE / SAMPLE_RATE
|
||||||
|
result = None
|
||||||
|
if speech_duration >= MIN_SPEECH_SECONDS:
|
||||||
|
result = (self.segment_start_time, np.concatenate(self.segment))
|
||||||
self.speaking = False
|
self.speaking = False
|
||||||
self.speech_frames = 0
|
self.speech_frames = 0
|
||||||
self.silence_frames = 0
|
self.silence_frames = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user