From 8b88489a53534151cf5d3350b3e397c41ed53429 Mon Sep 17 00:00:00 2001 From: Wong Ding Feng Date: Tue, 26 May 2026 01:49:52 +0800 Subject: [PATCH] Simplify to audio file input (mic requires PortAudio on NixOS) --- pyproject.toml | 1 + transcribe.py | 21 +++++++++++---------- uv.lock | 18 ++++++++++++++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 430a48d..7cd40d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "librosa>=0.11.0", "protobuf>=7.35.0", "sentencepiece>=0.2.1", + "sounddevice>=0.5.5", "soundfile>=0.13.1", "torch>=2.12.0", "transformers>=5.9.0", diff --git a/transcribe.py b/transcribe.py index 5595342..3df8246 100644 --- a/transcribe.py +++ b/transcribe.py @@ -10,21 +10,22 @@ model = CohereAsrForConditionalGeneration.from_pretrained( device_map="auto" ) -# Download demo audio from Hugging Face +def transcribe_audio(audio, language="en"): + inputs = processor(audio, sampling_rate=16000, return_tensors="pt", language=language) + inputs.to(model.device, dtype=model.dtype) + + outputs = model.generate(**inputs, max_new_tokens=256) + text = processor.decode(outputs, skip_special_tokens=True) + return text + +# Use demo audio file from Hugging Face +print("Loading demo audio...") audio_file = hf_hub_download( repo_id="CohereLabs/cohere-transcribe-03-2026", filename="demo/voxpopuli_test_en_demo.wav", ) - -# Load audio audio = load_audio(audio_file, sampling_rate=16000) -# Process and transcribe print("Transcribing...") -inputs = processor(audio, sampling_rate=16000, return_tensors="pt", language="en") -inputs.to(model.device, dtype=model.dtype) - -outputs = model.generate(**inputs, max_new_tokens=256) -text = processor.decode(outputs, skip_special_tokens=True) - +text = transcribe_audio(audio) print(f"\nTranscription:\n{text}\n") diff --git a/uv.lock b/uv.lock index d02a8ff..223d174 100644 --- a/uv.lock +++ b/uv.lock @@ -199,6 +199,7 @@ dependencies = [ { name = "librosa" }, { name = "protobuf" }, { name = "sentencepiece" }, + { name = "sounddevice" }, { name = "soundfile" }, { name = "torch" }, { name = "transformers" }, @@ -211,6 +212,7 @@ requires-dist = [ { name = "librosa", specifier = ">=0.11.0" }, { name = "protobuf", specifier = ">=7.35.0" }, { name = "sentencepiece", specifier = ">=0.2.1" }, + { name = "sounddevice", specifier = ">=0.5.5" }, { name = "soundfile", specifier = ">=0.13.1" }, { name = "torch", specifier = ">=2.12.0" }, { name = "transformers", specifier = ">=5.9.0" }, @@ -1077,6 +1079,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, ] +[[package]] +name = "sounddevice" +version = "0.5.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/f9/2592608737553638fca98e21e54bfec40bf577bb98a61b2770c912aab25e/sounddevice-0.5.5.tar.gz", hash = "sha256:22487b65198cb5bf2208755105b524f78ad173e5ab6b445bdab1c989f6698df3", size = 143191, upload-time = "2026-01-23T18:36:43.529Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/0a/478e441fd049002cf308520c0d62dd8333e7c6cc8d997f0dda07b9fbcc46/sounddevice-0.5.5-py3-none-any.whl", hash = "sha256:30ff99f6c107f49d25ad16a45cacd8d91c25a1bcdd3e81a206b921a3a6405b1f", size = 32807, upload-time = "2026-01-23T18:36:35.649Z" }, + { url = "https://files.pythonhosted.org/packages/56/f9/c037c35f6d0b6bc3bc7bfb314f1d6f1f9a341328ef47cd63fc4f850a7b27/sounddevice-0.5.5-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl", hash = "sha256:05eb9fd6c54c38d67741441c19164c0dae8ce80453af2d8c4ad2e7823d15b722", size = 108557, upload-time = "2026-01-23T18:36:37.41Z" }, + { url = "https://files.pythonhosted.org/packages/88/a1/d19dd9889cd4bce2e233c4fac007cd8daaf5b9fe6e6a5d432cf17be0b807/sounddevice-0.5.5-py3-none-win32.whl", hash = "sha256:1234cc9b4c9df97b6cbe748146ae0ec64dd7d6e44739e8e42eaa5b595313a103", size = 317765, upload-time = "2026-01-23T18:36:39.047Z" }, + { url = "https://files.pythonhosted.org/packages/c3/0e/002ed7c4c1c2ab69031f78989d3b789fee3a7fba9e586eb2b81688bf4961/sounddevice-0.5.5-py3-none-win_amd64.whl", hash = "sha256:cfc6b2c49fb7f555591c78cb8ecf48d6a637fd5b6e1db5fec6ed9365d64b3519", size = 365324, upload-time = "2026-01-23T18:36:40.496Z" }, + { url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" }, +] + [[package]] name = "soundfile" version = "0.13.1"