Note: hunk line counts below were updated by hand during review; the
post-image blob IDs on the `index` lines were not recomputed and should be
regenerated (e.g. by re-running `git diff`) before relying on them.

diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..d8fcf25
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,28 @@
+{
+  description = "Cohere Transcribe dev environment with microphone support";
+
+  inputs = {
+    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
+  };
+
+  outputs = { nixpkgs, ... }:
+    let
+      # Bound once so both buildInputs and shellHook can refer to the package
+      # set; a `with` on buildInputs alone does not reach into shellHook.
+      pkgs = nixpkgs.legacyPackages.x86_64-linux;
+    in {
+      devShells.x86_64-linux.default = pkgs.mkShell {
+        buildInputs = with pkgs; [
+          portaudio
+          # NOTE(review): cudatoolkit is presumably unfree; evaluation may be
+          # refused unless unfree packages are allowed - confirm.
+          cudaPackages.cudatoolkit
+        ];
+
+        shellHook = ''
+          export LD_LIBRARY_PATH="${pkgs.cudaPackages.cudatoolkit}/lib:$LD_LIBRARY_PATH"
+          echo "Dev shell ready - microphone input enabled"
+        '';
+      };
+    };
+}
diff --git a/transcribe.py b/transcribe.py
index 3df8246..6d7a539 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -1,3 +1,6 @@
+import sys
+import numpy as np
+import sounddevice as sd
 from transformers import AutoProcessor, CohereAsrForConditionalGeneration
 from transformers.audio_utils import load_audio
 from huggingface_hub import hf_hub_download
@@ -18,14 +21,56 @@ def transcribe_audio(audio, language="en"):
     text = processor.decode(outputs, skip_special_tokens=True)
     return text
 
-# Use demo audio file from Hugging Face
-print("Loading demo audio...")
-audio_file = hf_hub_download(
-    repo_id="CohereLabs/cohere-transcribe-03-2026",
-    filename="demo/voxpopuli_test_en_demo.wav",
-)
-audio = load_audio(audio_file, sampling_rate=16000)
+def record_audio(duration, samplerate=16000):
+    """Record mono float32 audio from the default input device.
+
+    Blocks until the recording has finished.
+
+    Args:
+        duration: Recording length in seconds.
+        samplerate: Sampling rate in Hz.
+
+    Returns:
+        1-D float32 NumPy array holding the recorded samples.
+    """
+    print(f"Recording for {duration} seconds...")
+    audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='float32')
+    sd.wait()
+    # sd.rec returns shape (frames, channels); flatten to a 1-D signal.
+    return audio.flatten()
 
-print("Transcribing...")
-text = transcribe_audio(audio)
-print(f"\nTranscription:\n{text}\n")
+# Parse arguments
+if len(sys.argv) > 1 and sys.argv[1] == "--mic":
+    # Optional second argument: recording length in whole seconds (default 5).
+    try:
+        duration = int(sys.argv[2]) if len(sys.argv) > 2 else 5
+    except ValueError:
+        sys.exit(f"Invalid duration {sys.argv[2]!r}: expected a whole number of seconds")
+    if duration <= 0:
+        sys.exit("Duration must be a positive number of seconds")
+
+    # Keep the try body limited to the recording so that failures inside
+    # transcription are not misreported as microphone errors.
+    try:
+        mic_audio = record_audio(duration)
+    except (OSError, sd.PortAudioError) as e:
+        # sounddevice reports device/stream failures as PortAudioError,
+        # which is not an OSError subclass.
+        print(f"Microphone error: {e}")
+        print("Hint: Run inside `nix develop` for PortAudio support")
+        sys.exit(1)
+
+    print("Transcribing...")
+    text = transcribe_audio(mic_audio)
+    print(f"\nTranscription:\n{text}\n")
+else:
+    print("Loading demo audio...")
+    audio_file = hf_hub_download(
+        repo_id="CohereLabs/cohere-transcribe-03-2026",
+        filename="demo/voxpopuli_test_en_demo.wav",
+    )
+    audio = load_audio(audio_file, sampling_rate=16000)
+
+    print("Transcribing...")
+    text = transcribe_audio(audio)
+    print(f"\nTranscription:\n{text}\n")