From 8b88489a53534151cf5d3350b3e397c41ed53429 Mon Sep 17 00:00:00 2001
From: Wong Ding Feng <dingfengwong@gmail.com>
Date: Tue, 26 May 2026 01:49:52 +0800
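Subject: [PATCH] Simplify to audio file input (mic requires PortAudio on
 NixOS)

Wrap the transcription steps in transcribe_audio() and keep transcribing the
demo audio file. Note: this patch still adds sounddevice to pyproject.toml
and uv.lock, although the transcribe.py changes here do not use it.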
---
 pyproject.toml |  1 +
 transcribe.py  | 21 +++++++++++----------
 uv.lock        | 18 ++++++++++++++++++
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 430a48d..7cd40d0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
     "librosa>=0.11.0",
     "protobuf>=7.35.0",
     "sentencepiece>=0.2.1",
+    "sounddevice>=0.5.5",
     "soundfile>=0.13.1",
     "torch>=2.12.0",
     "transformers>=5.9.0",
diff --git a/transcribe.py b/transcribe.py
index 5595342..3df8246 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -10,21 +10,22 @@ model = CohereAsrForConditionalGeneration.from_pretrained(
     device_map="auto"
 )
 
-# Download demo audio from Hugging Face
+def transcribe_audio(audio, language="en"):
+    """Transcribe `audio` (given to the processor as 16 kHz) and return the decoded text."""
+    features = processor(audio, sampling_rate=16000, return_tensors="pt", language=language)
+    # Return value is discarded: presumably .to() updates the batch in place -- TODO confirm.
+    features.to(model.device, dtype=model.dtype)
+    generated = model.generate(**features, max_new_tokens=256)
+    return processor.decode(generated, skip_special_tokens=True)
+
+# Use demo audio file from Hugging Face
+print("Loading demo audio...")
 audio_file = hf_hub_download(
     repo_id="CohereLabs/cohere-transcribe-03-2026",
     filename="demo/voxpopuli_test_en_demo.wav",
 )
-
-# Load audio
 audio = load_audio(audio_file, sampling_rate=16000)
 
-# Process and transcribe
 print("Transcribing...")
-inputs = processor(audio, sampling_rate=16000, return_tensors="pt", language="en")
-inputs.to(model.device, dtype=model.dtype)
-
-outputs = model.generate(**inputs, max_new_tokens=256)
-text = processor.decode(outputs, skip_special_tokens=True)
-
+text = transcribe_audio(audio)
 print(f"\nTranscription:\n{text}\n")
diff --git a/uv.lock b/uv.lock
index d02a8ff..223d174 100644
--- a/uv.lock
+++ b/uv.lock
@@ -199,6 +199,7 @@ dependencies = [
     { name = "librosa" },
     { name = "protobuf" },
     { name = "sentencepiece" },
+    { name = "sounddevice" },
     { name = "soundfile" },
     { name = "torch" },
     { name = "transformers" },
@@ -211,6 +212,7 @@ requires-dist = [
     { name = "librosa", specifier = ">=0.11.0" },
     { name = "protobuf", specifier = ">=7.35.0" },
     { name = "sentencepiece", specifier = ">=0.2.1" },
+    { name = "sounddevice", specifier = ">=0.5.5" },
     { name = "soundfile", specifier = ">=0.13.1" },
     { name = "torch", specifier = ">=2.12.0" },
     { name = "transformers", specifier = ">=5.9.0" },
@@ -1077,6 +1079,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
 ]
 
+[[package]]
+name = "sounddevice"
+version = "0.5.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cffi" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2a/f9/2592608737553638fca98e21e54bfec40bf577bb98a61b2770c912aab25e/sounddevice-0.5.5.tar.gz", hash = "sha256:22487b65198cb5bf2208755105b524f78ad173e5ab6b445bdab1c989f6698df3", size = 143191, upload-time = "2026-01-23T18:36:43.529Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/0a/478e441fd049002cf308520c0d62dd8333e7c6cc8d997f0dda07b9fbcc46/sounddevice-0.5.5-py3-none-any.whl", hash = "sha256:30ff99f6c107f49d25ad16a45cacd8d91c25a1bcdd3e81a206b921a3a6405b1f", size = 32807, upload-time = "2026-01-23T18:36:35.649Z" },
+    { url = "https://files.pythonhosted.org/packages/56/f9/c037c35f6d0b6bc3bc7bfb314f1d6f1f9a341328ef47cd63fc4f850a7b27/sounddevice-0.5.5-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl", hash = "sha256:05eb9fd6c54c38d67741441c19164c0dae8ce80453af2d8c4ad2e7823d15b722", size = 108557, upload-time = "2026-01-23T18:36:37.41Z" },
+    { url = "https://files.pythonhosted.org/packages/88/a1/d19dd9889cd4bce2e233c4fac007cd8daaf5b9fe6e6a5d432cf17be0b807/sounddevice-0.5.5-py3-none-win32.whl", hash = "sha256:1234cc9b4c9df97b6cbe748146ae0ec64dd7d6e44739e8e42eaa5b595313a103", size = 317765, upload-time = "2026-01-23T18:36:39.047Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/0e/002ed7c4c1c2ab69031f78989d3b789fee3a7fba9e586eb2b81688bf4961/sounddevice-0.5.5-py3-none-win_amd64.whl", hash = "sha256:cfc6b2c49fb7f555591c78cb8ecf48d6a637fd5b6e1db5fec6ed9365d64b3519", size = 365324, upload-time = "2026-01-23T18:36:40.496Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" },
+]
+
 [[package]]
 name = "soundfile"
 version = "0.13.1"