From 58fa7526fbc221731fed74eab13fed8e14d75195 Mon Sep 17 00:00:00 2001 From: Wong Ding Feng Date: Sat, 6 Jun 2026 22:56:21 +0800 Subject: [PATCH] feat: add --normalize compressor + limiter for input audio Adds a feedforward dynamic range compressor with a brick-wall limiter applied in the audio callback. Quiet speech gets +12 dB makeup gain, loud bursts are attenuated 4:1 above -20 dBFS, and the output is hard-limited at -1 dBFS so nothing clips. Enabled via --normalize/-n on `cohere on` and `cohere transcribe`. Co-Authored-By: Claude Opus 4.7 --- src/cohere_transcribe/cli/cli.py | 10 ++++-- src/cohere_transcribe/compressor.py | 52 ++++++++++++++++++++++++++++ src/cohere_transcribe/daemon.py | 10 ++++-- src/cohere_transcribe/daemon_main.py | 3 +- src/cohere_transcribe/model.py | 8 +++-- src/cohere_transcribe/stream.py | 10 ++++-- src/cohere_transcribe/vad.py | 4 ++- 7 files changed, 86 insertions(+), 11 deletions(-) create mode 100644 src/cohere_transcribe/compressor.py diff --git a/src/cohere_transcribe/cli/cli.py b/src/cohere_transcribe/cli/cli.py index 154a6b5..6b35f3c 100644 --- a/src/cohere_transcribe/cli/cli.py +++ b/src/cohere_transcribe/cli/cli.py @@ -26,6 +26,7 @@ def on( language: str = typer.Option("en", "--lang", "-l", help="Language code"), pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"), device: str = typer.Option(None, "--device", "-d", help="Input device index or name substring (see `cohere devices`)"), + normalize: bool = typer.Option(False, "--normalize", "-n", help="Enable compressor + limiter to even out loudness"), foreground: bool = typer.Option(False, "--fg", help="Run in foreground (don't daemonize)"), ): """Start transcribing and typing into your focused window.""" @@ -36,7 +37,7 @@ def on( if foreground: from ..daemon import run_daemon console.print("[green]Starting cohere (foreground)...[/green]") - run_daemon(language, pause=pause, device=_parse_device(device)) + run_daemon(language, pause=pause, device=_parse_device(device), normalize=normalize) return console.print("[green]Starting cohere daemon...[/green]") @@ -46,6 +47,8 @@ def on( cmd += ["--pause", str(pause)] if device is not None: cmd += ["--device", device] + if normalize: + cmd += ["--normalize"] subprocess.Popen( cmd, start_new_session=True, @@ -103,6 +106,7 @@ def transcribe( language: str = typer.Option("en", "--lang", "-l", help="Language code"), pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"), device: str = typer.Option(None, "--device", "-d", help="Input device index or name substring (see `cohere devices`)"), + normalize: bool = typer.Option(False, "--normalize", "-n", help="Enable compressor + limiter to even out loudness"), ): """One-shot transcription (file, mic, or stream to terminal).""" from ..model import load_model, transcribe_audio @@ -113,12 +117,12 @@ def transcribe( if stream: from ..stream import stream_transcribe processor, model = load_model() - stream_transcribe(processor, model, language, silence_frames=pause_seconds_to_frames(pause), device=dev) + stream_transcribe(processor, model, language, silence_frames=pause_seconds_to_frames(pause), device=dev, normalize=normalize) elif mic is not None: from ..model import record_audio processor, model = load_model() try: - audio = record_audio(mic, device=dev) + audio = record_audio(mic, device=dev, normalize=normalize) console.print("Transcribing...") text = transcribe_audio(processor, model, audio, language) console.print(f"\n{text}\n") diff --git a/src/cohere_transcribe/compressor.py b/src/cohere_transcribe/compressor.py new file mode 100644 index 0000000..bb64860 --- /dev/null +++ b/src/cohere_transcribe/compressor.py @@ -0,0 +1,52 @@ +import math + +import numpy as np + +from .model import SAMPLE_RATE + + +class Compressor: + """Feedforward dynamic range compressor + brick-wall limiter for speech. + + Per sample: track an attack/release-smoothed envelope of |x|, compute gain + reduction above the threshold, apply makeup gain, then hard-limit to the ceiling. + """ + + def __init__( + self, + threshold_db: float = -20.0, + ratio: float = 4.0, + attack_ms: float = 5.0, + release_ms: float = 80.0, + makeup_db: float = 12.0, + ceiling: float = 10 ** (-1.0 / 20), # -1 dBFS + sample_rate: int = SAMPLE_RATE, + ): + self.threshold_db = threshold_db + self.ratio = ratio + self.makeup_gain = 10 ** (makeup_db / 20) + self.ceiling = ceiling + self.knee = 1.0 - 1.0 / ratio + self.a_att = math.exp(-1.0 / (attack_ms * 0.001 * sample_rate)) + self.a_rel = math.exp(-1.0 / (release_ms * 0.001 * sample_rate)) + self.envelope = 0.0 + + def process(self, x: np.ndarray) -> np.ndarray: + abs_x = np.abs(x) + env_out = np.empty_like(x) + e = self.envelope + a_att = self.a_att + a_rel = self.a_rel + for i in range(len(x)): + target = abs_x[i] + coef = a_att if target > e else a_rel + e = coef * e + (1.0 - coef) * target + env_out[i] = e + self.envelope = e + + env_db = 20.0 * np.log10(np.maximum(env_out, 1e-10)) + over = env_db - self.threshold_db + gr_db = np.where(over > 0, over * self.knee, 0.0) + gain = 10 ** (-gr_db / 20.0) * self.makeup_gain + y = x * gain + return np.clip(y, -self.ceiling, self.ceiling).astype(np.float32) diff --git a/src/cohere_transcribe/daemon.py b/src/cohere_transcribe/daemon.py index 82aff5c..8909072 100644 --- a/src/cohere_transcribe/daemon.py +++ b/src/cohere_transcribe/daemon.py @@ -10,6 +10,7 @@ import sounddevice as sd from .backend import WtypeBackend from .commands import process_and_output +from .compressor import Compressor from .model import SAMPLE_RATE, load_model, transcribe_audio from .vad import ( DEFAULT_SILENCE_FRAMES, @@ -80,7 +81,7 @@ def stop_daemon() -> bool: return False -def run_daemon(language: str = "en", pause: float | None = None, device=None): +def run_daemon(language: str = "en", pause: float | None = None, device=None, normalize: bool = False): pid = os.getpid() _write_state(pid, "starting") @@ -92,7 +93,10 @@ def run_daemon(language: str = "en", pause: float | None = None, device=None): silence_frames = pause_seconds_to_frames(pause) if pause else DEFAULT_SILENCE_FRAMES processor, model = load_model() print(f"Using input device: {describe_input_device(device)}") - threshold = calibrate_silence(device=device) + comp = Compressor() if normalize else None + if comp: + print(" Normalization: compressor+limiter enabled") + threshold = calibrate_silence(device=device, compressor=comp) capture_rate = resolve_input_rate(device) capture_blocksize = FRAME_SIZE * capture_rate // SAMPLE_RATE vad = VADStateMachine(threshold, silence_frames=silence_frames) @@ -121,6 +125,8 @@ def run_daemon(language: str = "en", pause: float | None = None, device=None): return elapsed = time.monotonic() - start_time frame = resample_to_target(indata[:, 0].copy(), capture_rate) + if comp is not None: + frame = comp.process(frame) result = vad.process_frame(frame, elapsed) if result is not None: seg_queue.put(result) diff --git a/src/cohere_transcribe/daemon_main.py b/src/cohere_transcribe/daemon_main.py index 7fca2c3..bc01e5a 100644 --- a/src/cohere_transcribe/daemon_main.py +++ b/src/cohere_transcribe/daemon_main.py @@ -16,5 +16,6 @@ parser = argparse.ArgumentParser() parser.add_argument("--lang", default="en") parser.add_argument("--pause", type=float, default=None) parser.add_argument("--device", default=None) +parser.add_argument("--normalize", action="store_true") args = parser.parse_args() -run_daemon(args.lang, pause=args.pause, device=_parse_device(args.device)) +run_daemon(args.lang, pause=args.pause, device=_parse_device(args.device), normalize=args.normalize) diff --git a/src/cohere_transcribe/model.py b/src/cohere_transcribe/model.py index 128c290..5c3a759 100644 --- a/src/cohere_transcribe/model.py +++ b/src/cohere_transcribe/model.py @@ -23,7 +23,7 @@ def transcribe_audio(processor, model, audio, language="en"): return " ".join(texts) if isinstance(texts, list) else texts -def record_audio(duration, device=None): +def record_audio(duration, device=None, normalize=False): import sounddevice as sd from .vad import resolve_input_rate, resample_to_target @@ -31,4 +31,8 @@ def record_audio(duration, device=None): rate = resolve_input_rate(device) audio = sd.rec(int(duration * rate), samplerate=rate, channels=1, dtype="float32", device=device) sd.wait() - return resample_to_target(audio.flatten(), rate) + audio = resample_to_target(audio.flatten(), rate) + if normalize: + from .compressor import Compressor + audio = Compressor().process(audio) + return audio diff --git a/src/cohere_transcribe/stream.py b/src/cohere_transcribe/stream.py index a75fee4..413841e 100644 --- a/src/cohere_transcribe/stream.py +++ b/src/cohere_transcribe/stream.py @@ -6,13 +6,17 @@ import time import numpy as np import sounddevice as sd +from .compressor import Compressor from .model import SAMPLE_RATE, transcribe_audio from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence, describe_input_device, resample_to_target, resolve_input_rate -def stream_transcribe(processor, model, language, silence_frames=DEFAULT_SILENCE_FRAMES, device=None): +def stream_transcribe(processor, model, language, silence_frames=DEFAULT_SILENCE_FRAMES, device=None, normalize=False): print(f"Using input device: {describe_input_device(device)}") - threshold = calibrate_silence(device=device) + comp = Compressor() if normalize else None + if comp: + print(" Normalization: compressor+limiter enabled") + threshold = calibrate_silence(device=device, compressor=comp) capture_rate = resolve_input_rate(device) capture_blocksize = FRAME_SIZE * capture_rate // SAMPLE_RATE vad = VADStateMachine(threshold, silence_frames=silence_frames) @@ -40,6 +44,8 @@ def stream_transcribe(processor, model, language, silence_frames=DEFAULT_SILENCE return elapsed = time.monotonic() - start_time frame = resample_to_target(indata[:, 0].copy(), capture_rate) + if comp is not None: + frame = comp.process(frame) result = vad.process_frame(frame, elapsed) if result is not None: seg_queue.put(result) diff --git a/src/cohere_transcribe/vad.py b/src/cohere_transcribe/vad.py index 0306c31..5cb839f 100644 --- a/src/cohere_transcribe/vad.py +++ b/src/cohere_transcribe/vad.py @@ -55,12 +55,14 @@ def resample_to_target(audio: np.ndarray, src_rate: int) -> np.ndarray: return resample_poly(audio, SAMPLE_RATE // g, src_rate // g).astype(np.float32) -def calibrate_silence(duration=0.5, device=None): +def calibrate_silence(duration=0.5, device=None, compressor=None): print("Calibrating silence threshold...") rate = resolve_input_rate(device) audio = sd.rec(int(duration * rate), samplerate=rate, channels=1, dtype="float32", device=device) sd.wait() audio = resample_to_target(audio.flatten(), rate) + if compressor is not None: + audio = compressor.process(audio) rms = np.sqrt(np.mean(audio ** 2)) threshold = max(rms * 3, 0.01) print(f" Ambient RMS: {rms:.4f}, threshold: {threshold:.4f}")