Initial commit

2026-05-31 01:03:23 +08:00
13 changed files with 21 additions and 1980 deletions
@@ -1 +0,0 @@
 use flake
@@ -1,10 +0,0 @@
 # Python-generated files
 __pycache__/
 *.py[oc]
 build/
 dist/
 wheels/
 *.egg-info
 # Virtual environments
 .venv
@@ -1 +0,0 @@
 3.14
@@ -0,0 +1,18 @@
 MIT License
 Copyright (c) 2026 tomatocream
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
 associated documentation files (the "Software"), to deal in the Software without restriction, including 
 without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
 copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the 
 following conditions:
 The above copyright notice and this permission notice shall be included in all copies or substantial 
 portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT 
 LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO 
 EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 
 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
 USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,3 @@
 # cohere-transcribe
 Live speech-to-text using Cohere ASR model
@@ -1,442 +0,0 @@
 # Live Streaming Transcription Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Add `--stream` mode to `transcribe.py` that captures microphone audio, segments speech using VAD, and transcribes each segment in near-real-time.
 **Architecture:** sounddevice InputStream callback pushes audio into a thread-safe buffer. A VAD state machine (energy-based RMS) detects speech segments. Completed segments are pushed onto a `queue.Queue` and consumed by a transcription thread that runs the Cohere ASR model and prints timestamped output. Ctrl+C triggers clean shutdown.
 **Tech Stack:** Python 3.14, sounddevice, numpy, transformers (CohereAsrForConditionalGeneration), threading, queue
 **Spec:** `docs/superpowers/specs/2026-05-29-live-streaming-transcription-design.md`
 ---
 ## File Structure
 All changes are in a single file:
 - **Modify:** `transcribe.py` — add `--stream` and `--lang` CLI flags, VAD logic, streaming capture loop, transcription consumer thread, clean shutdown handling. Grows from ~52 lines to ~170 lines.
 No new files. No test files (this is a hardware-dependent demo script — verification is manual with a real microphone).
 ---
 ### Task 1: Refactor CLI argument parsing
 **Files:**
 - Modify: `transcribe.py:1-52`
 Currently the script uses raw `sys.argv` checks. Replace with `argparse` to cleanly support `--stream`, `--mic`, `--lang`, and the default demo mode.
 - [ ] **Step 1: Replace sys.argv parsing with argparse**
 Replace the bottom half of `transcribe.py` (lines 30-52) with argparse-based dispatch. Move model loading after argument parsing so `--help` doesn't trigger a slow model load.
 ```python
 import sys
 import argparse
 import numpy as np
 import sounddevice as sd
 from transformers import AutoProcessor, CohereAsrForConditionalGeneration
 from transformers.audio_utils import load_audio
 from huggingface_hub import hf_hub_download
 MODEL_ID = "CohereLabs/cohere-transcribe-03-2026"
 SAMPLE_RATE = 16000
 def load_model():
    """Load the ASR processor and model identified by ``MODEL_ID``.

    Prints a progress message first, because loading happens on every run
    mode and is the slow part of startup.

    Returns:
        A ``(processor, model)`` tuple.
    """
    print("Loading model...")
    asr_processor = AutoProcessor.from_pretrained(MODEL_ID)
    model_kwargs = {"device_map": "auto"}
    asr_model = CohereAsrForConditionalGeneration.from_pretrained(MODEL_ID, **model_kwargs)
    return asr_processor, asr_model
 def transcribe_audio(processor, model, audio, language="en"):
    """Transcribe one audio clip and return the decoded text.

    Args:
        processor: Callable that turns raw audio into model inputs and decodes
            generated token ids back into text.
        model: Generation model exposing ``device``, ``dtype`` and ``generate``.
        audio: Audio samples at ``SAMPLE_RATE`` (callers in this file pass a
            flattened float32 array — TODO confirm other accepted shapes).
        language: Language code forwarded to the processor (default ``"en"``).

    Returns:
        The result of ``processor.decode`` with special tokens skipped.
    """
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt", language=language)
    # NOTE(review): the return value is discarded, so this relies on ``.to()``
    # moving the inputs in place — confirm against the transformers version used.
    inputs.to(model.device, dtype=model.dtype)
    # Generation is capped at 256 new tokens per clip.
    outputs = model.generate(**inputs, max_new_tokens=256)
    return processor.decode(outputs, skip_special_tokens=True)
 def record_audio(duration):
    """Record ``duration`` seconds of mono float32 audio from the microphone.

    Blocks until the recording has finished.

    Args:
        duration: Recording length in seconds.

    Returns:
        The recorded samples flattened to a 1-D array.
    """
    print(f"Recording for {duration} seconds...")
    sample_count = int(duration * SAMPLE_RATE)
    recording = sd.rec(sample_count, samplerate=SAMPLE_RATE, channels=1, dtype="float32")
    # sd.rec returns immediately; wait for the capture to complete.
    sd.wait()
    return recording.flatten()
 def main():
    """Parse the command line and run the selected transcription mode.

    Modes (``--mic`` and ``--stream`` are mutually exclusive):
      * ``--stream``: live streaming transcription.
      * ``--mic [SECONDS]``: record a fixed-length clip (5 s when no value is
        given) and transcribe it.
      * neither: download the demo clip and transcribe it.
    """
    cli = argparse.ArgumentParser(description="Cohere ASR Transcription")
    mode = cli.add_mutually_exclusive_group()
    mode.add_argument(
        "--mic", type=int, nargs="?", const=5, metavar="SECONDS",
        help="Record from microphone for N seconds (default: 5)",
    )
    mode.add_argument(
        "--stream", action="store_true",
        help="Live streaming transcription with VAD",
    )
    cli.add_argument("--lang", default="en", help="Language code (default: en)")
    opts = cli.parse_args()
    # Every mode needs the model; loading only after parsing keeps --help fast.
    processor, model = load_model()
    if opts.stream:
        stream_transcribe(processor, model, opts.lang)
        return
    if opts.mic is not None:
        try:
            clip = record_audio(opts.mic)
            print("Transcribing...")
            transcript = transcribe_audio(processor, model, clip, opts.lang)
            print(f"\nTranscription:\n{transcript}\n")
        except OSError as err:
            print(f"Microphone error: {err}")
            print("Hint: Run with nix-shell for PortAudio support")
        return
    # Default mode: transcribe the demo clip shipped with the model repo.
    print("Loading demo audio...")
    demo_path = hf_hub_download(repo_id=MODEL_ID, filename="demo/voxpopuli_test_en_demo.wav")
    demo_audio = load_audio(demo_path, sampling_rate=SAMPLE_RATE)
    print("Transcribing...")
    transcript = transcribe_audio(processor, model, demo_audio, opts.lang)
    print(f"\nTranscription:\n{transcript}\n")
 def stream_transcribe(processor, model, language):
    """Stub for streaming mode: ignores its arguments and prints a TODO notice."""
    print("TODO: streaming mode")
 if __name__ == "__main__":
    main()
 ```
 - [ ] **Step 2: Verify existing modes still work**
 Run the demo mode to confirm nothing is broken:
 ```bash
 uv run python transcribe.py
 ```
 Expected: loads model, downloads demo audio, prints transcription.
 Run `--mic` mode:
 ```bash
 uv run python transcribe.py --mic 2
 ```
 Expected: records 2 seconds, transcribes, prints result.
 Run `--help`:
 ```bash
 uv run python transcribe.py --help
 ```
 Expected: prints usage without loading the model.
 - [ ] **Step 3: Commit**
 ```bash
 git add transcribe.py
 git commit -m "refactor: switch to argparse, add --stream and --lang flags"
 ```
 ---
 ### Task 2: Implement silence calibration and VAD state machine
 **Files:**
 - Modify: `transcribe.py` — add `calibrate_silence()` and `VADStateMachine` class
 - [ ] **Step 1: Add silence calibration function**
 Add this function above `stream_transcribe`:
 ```python
 def calibrate_silence(duration=0.5):
    """Measure ambient noise and derive the RMS threshold used for speech detection.

    Records ``duration`` seconds from the microphone, computes the RMS of the
    recording, and returns three times that value, floored at 0.01 so a very
    quiet room does not yield a hair-trigger threshold.

    Args:
        duration: Length of the ambient sample in seconds.

    Returns:
        The RMS threshold.
    """
    print("Calibrating silence threshold...")
    sample_count = int(duration * SAMPLE_RATE)
    ambient = sd.rec(sample_count, samplerate=SAMPLE_RATE, channels=1, dtype="float32")
    sd.wait()
    mean_square = np.mean(ambient ** 2)
    rms = np.sqrt(mean_square)
    min_threshold = 0.01
    threshold = max(rms * 3, min_threshold)
    print(f"  Ambient RMS: {rms:.4f}, threshold: {threshold:.4f}")
    return threshold
 ```
 - [ ] **Step 2: Add the VAD state machine**
 Add this class above `stream_transcribe`. The VAD operates on 50ms frames (800 samples at 16kHz). It tracks state transitions between SILENCE and SPEAKING using consecutive frame counts and a configurable silence duration to end a segment.
 ```python
 FRAME_SIZE = 800          # samples per VAD frame (50ms at 16kHz)
 PRE_ROLL_FRAMES = 6       # frames kept from before speech onset (~0.3s)
 SILENCE_FRAMES = 16       # consecutive quiet frames that end a segment (~0.8s)
 SPEECH_ONSET_FRAMES = 3   # consecutive loud frames that start a segment (~150ms)
 MAX_SPEECH_SECONDS = 30   # a segment is force-closed at this length
 class VADStateMachine:
    """Energy-based voice activity detector that cuts audio into speech segments.

    Frames are classified as loud or quiet by comparing their RMS against
    ``threshold``. While idle, the most recent frames are kept as pre-roll so
    the start of a word is not clipped; once enough consecutive loud frames
    arrive a segment is opened, and it is closed after a run of quiet frames
    or when it reaches ``MAX_SPEECH_SECONDS``.
    """
    def __init__(self, threshold):
        self.threshold = threshold
        # Rolling window of the latest idle frames (at most PRE_ROLL_FRAMES).
        self.pre_roll = []
        # Frames of the segment currently being recorded.
        self.segment = []
        self.segment_start_time = 0.0
        self.speaking = False
        # Run lengths of consecutive loud / quiet frames.
        self.speech_frames = 0
        self.silence_frames = 0
    def process_frame(self, frame, elapsed_time):
        """Feed one frame into the detector.

        Args:
            frame: Array of audio samples for this frame.
            elapsed_time: Seconds since capture started, used to timestamp
                the beginning of a segment.

        Returns:
            ``(start_time, audio_array)`` when this frame completes a speech
            segment, otherwise ``None``.
        """
        is_loud = np.sqrt(np.mean(frame ** 2)) > self.threshold
        if self.speaking:
            return self._extend_segment(frame, is_loud)
        self._watch_for_onset(frame, is_loud, elapsed_time)
        return None
    def _watch_for_onset(self, frame, is_loud, elapsed_time):
        """Idle state: update the pre-roll window and open a segment on sustained speech."""
        self.pre_roll.append(frame)
        if len(self.pre_roll) > PRE_ROLL_FRAMES:
            del self.pre_roll[0]
        if not is_loud:
            # A single quiet frame breaks the onset run.
            self.speech_frames = 0
            return
        self.speech_frames += 1
        if self.speech_frames < SPEECH_ONSET_FRAMES:
            return
        # Back-date the start by the pre-roll that becomes part of the segment.
        lead_in = len(self.pre_roll) * FRAME_SIZE / SAMPLE_RATE
        self.segment_start_time = max(0.0, elapsed_time - lead_in)
        self.segment = list(self.pre_roll)
        self.pre_roll = []
        self.silence_frames = 0
        self.speaking = True
    def _extend_segment(self, frame, is_loud):
        """Speaking state: append the frame and close the segment when it is over."""
        self.segment.append(frame)
        self.silence_frames = 0 if is_loud else self.silence_frames + 1
        too_long = len(self.segment) * FRAME_SIZE / SAMPLE_RATE >= MAX_SPEECH_SECONDS
        if self.silence_frames < SILENCE_FRAMES and not too_long:
            return None
        finished = (self.segment_start_time, np.concatenate(self.segment))
        # Return to the idle state with all counters and buffers cleared.
        self.speaking = False
        self.speech_frames = 0
        self.silence_frames = 0
        self.segment = []
        self.pre_roll = []
        return finished
 ```
 - [ ] **Step 3: Verify VAD with a quick smoke test**
 Run a quick inline test to make sure the VAD detects speech:
 ```bash
 uv run python -c "
 import numpy as np
 from transcribe import VADStateMachine, FRAME_SIZE, SAMPLE_RATE
 vad = VADStateMachine(threshold=0.01)
 # Feed 10 silent frames
 for i in range(10):
    frame = np.zeros(FRAME_SIZE, dtype='float32')
    result = vad.process_frame(frame, i * FRAME_SIZE / SAMPLE_RATE)
    assert result is None
 # Feed 5 loud frames (triggers speech after 3)
 for i in range(10, 15):
    frame = np.ones(FRAME_SIZE, dtype='float32') * 0.05
    result = vad.process_frame(frame, i * FRAME_SIZE / SAMPLE_RATE)
    assert result is None  # speaking but not yet ended
 # Feed 20 silent frames (triggers end after 16)
 for i in range(15, 35):
    frame = np.zeros(FRAME_SIZE, dtype='float32')
    result = vad.process_frame(frame, i * FRAME_SIZE / SAMPLE_RATE)
    if result is not None:
        start_time, audio = result
        duration = len(audio) / SAMPLE_RATE
        print(f'Segment detected: start={start_time:.2f}s, duration={duration:.2f}s')
        break
 else:
    raise AssertionError('No segment detected')
 print('VAD smoke test passed')
 "
 ```
 Expected: prints segment info and "VAD smoke test passed".
 - [ ] **Step 4: Commit**
 ```bash
 git add transcribe.py
 git commit -m "feat: add silence calibration and VAD state machine"
 ```
 ---
 ### Task 3: Implement the streaming transcription loop
 **Files:**
 - Modify: `transcribe.py` — replace `stream_transcribe` stub with full implementation
 - [ ] **Step 1: Add imports at the top of the file**
 Add these imports to the top of `transcribe.py` (after `import argparse`):
 ```python
 import queue
 import threading
 import time
 ```
 - [ ] **Step 2: Implement stream_transcribe**
 Replace the `stream_transcribe` stub with the full implementation. This function:
 1. Calibrates silence threshold
 2. Starts a transcription consumer thread
 3. Opens a sounddevice InputStream that feeds frames to the VAD
 4. When VAD emits a segment, pushes it onto the queue
 5. Handles Ctrl+C for clean shutdown
 ```python
 def stream_transcribe(processor, model, language):
    """Capture microphone audio, segment it with the VAD, and print transcriptions live.

    Runs until Ctrl+C. The audio callback feeds fixed-size frames to the VAD
    and queues every completed speech segment; a background worker thread
    transcribes queued segments so that inference never blocks capture.

    Args:
        processor: ASR processor, passed through to ``transcribe_audio``.
        model: ASR model, passed through to ``transcribe_audio``.
        language: Language code, passed through to ``transcribe_audio``.
    """
    threshold = calibrate_silence()
    vad = VADStateMachine(threshold)
    seg_queue = queue.Queue()
    stop_event = threading.Event()
    start_time = time.monotonic()
    def transcription_worker():
        # Keep draining after a stop is requested so queued segments are not lost.
        while not stop_event.is_set() or not seg_queue.empty():
            try:
                seg_start, audio = seg_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            minutes, seconds = divmod(int(seg_start), 60)
            text = transcribe_audio(processor, model, audio, language).strip()
            if text:
                print(f"[{minutes:02d}:{seconds:02d}] {text}")
    worker = threading.Thread(target=transcription_worker, daemon=True)
    worker.start()
    # Samples received from the device that do not yet fill a whole VAD frame.
    frame_buf = np.empty(0, dtype="float32")
    def audio_callback(indata, frames, time_info, status):
        nonlocal frame_buf
        if stop_event.is_set():
            return
        frame_buf = np.append(frame_buf, indata[:, 0])
        while len(frame_buf) >= FRAME_SIZE:
            frame = frame_buf[:FRAME_SIZE]
            frame_buf = frame_buf[FRAME_SIZE:]
            elapsed = time.monotonic() - start_time
            result = vad.process_frame(frame, elapsed)
            if result is not None:
                seg_queue.put(result)
    print("Listening... (Ctrl+C to stop)")
    stream = sd.InputStream(
        samplerate=SAMPLE_RATE, channels=1, dtype="float32",
        callback=audio_callback, blocksize=FRAME_SIZE,
    )
    try:
        with stream:
            while True:
                time.sleep(0.1)
    except KeyboardInterrupt:
        pass
    # The stream is closed here, so the callback no longer touches the VAD.
    # Queue any unfinished segment BEFORE signalling stop: the worker exits as
    # soon as it sees "stop set and queue empty", so setting the flag first
    # could let it quit before the final segment is queued.
    if vad.speaking and vad.segment:
        seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
    stop_event.set()
    worker.join(timeout=30)
    print("\nDone.")
 ```
 - [ ] **Step 3: Verify streaming mode starts and captures speech**
 Run the streaming mode and speak a sentence into the microphone, then press Ctrl+C:
 ```bash
 uv run python transcribe.py --stream
 ```
 Expected output:
 ```
 Loading model...
 Calibrating silence threshold...
  Ambient RMS: 0.00XX, threshold: 0.0XXX
 Listening... (Ctrl+C to stop)
 [00:03] <your spoken words appear here>
 ^C
 Done.
 ```
 - [ ] **Step 4: Verify --lang flag works**
 ```bash
 uv run python transcribe.py --stream --lang en
 ```
 Expected: same as above, English transcription.
 - [ ] **Step 5: Verify existing modes still work**
 ```bash
 uv run python transcribe.py --mic 3
 ```
 Expected: records 3 seconds, transcribes, prints result — same behavior as before.
 - [ ] **Step 6: Commit**
 ```bash
 git add transcribe.py
 git commit -m "feat: implement live streaming transcription with VAD"
 ```
 ---
 ### Task 4: End-to-end verification
 No code changes in this task — just verification that everything works together.
 - [ ] **Step 1: Test continuous conversation**
 Run streaming mode and speak multiple sentences with natural pauses between them:
 ```bash
 uv run python transcribe.py --stream
 ```
 Verify:
 - Each sentence appears as a separate timestamped line
 - Timestamps roughly correspond to when you started speaking
 - No words are cut off at segment boundaries
 - Pauses within a sentence (< 0.8s) don't split the segment
 - [ ] **Step 2: Test long speech (safety cap)**
 Speak continuously for 30+ seconds without pausing. Verify the safety cap forces a chunk boundary and transcription still works.
 - [ ] **Step 3: Test Ctrl+C with buffered speech**
 Start speaking and immediately press Ctrl+C. Verify the buffered speech is flushed and transcribed before exit.
 - [ ] **Step 4: Test quiet environment**
 Run in a quiet room without speaking. Verify no spurious segments are detected.
@@ -1,81 +0,0 @@
 # Live Streaming Microphone Transcription
 ## Summary
 Add a `--stream` mode to `transcribe.py` that continuously captures audio from the microphone, detects speech segments using energy-based VAD, and transcribes each segment in near-real-time using the Cohere ASR model. Output scrolls as timestamped lines in the terminal. Ctrl+C stops the session.
 ## Context
 - **Model**: CohereLabs/cohere-transcribe-03-2026, max 35s audio clips, 5s overlap for auto-chunking
 - **Inference speed**: ~0.4s for 5-10s audio on GPU (0.04-0.08x real-time)
 - **Microphone**: PD200X Podcast Microphone via PipeWire, 16kHz mono
 - **Existing code**: `transcribe.py` has `--mic` (fixed duration) and demo file modes
 ## Architecture
 ### Audio Capture
 `sounddevice.InputStream` with a callback streams 16kHz mono float32 audio into a thread-safe buffer. The callback appends raw samples; a separate consumer reads them.
 ### Voice Activity Detection
 Energy-based VAD using RMS amplitude over 50ms frames (800 samples at 16kHz):
 - **Threshold**: Calibrated from ~0.5s of ambient silence at startup, with a sensible fallback (~-40 dBFS)
 - **State machine**: `SILENCE -> SPEAKING -> SILENCE`
  - SILENCE -> SPEAKING: RMS exceeds threshold for >= 3 consecutive frames (~150ms)
  - SPEAKING -> SILENCE: RMS stays below threshold for >= 0.8s
 - **Pre-roll**: ~0.3s of audio before speech onset is included to avoid clipping word beginnings
 - **Safety cap**: If speech exceeds 30s without a pause, force a chunk boundary (model max is 35s)
 ### Threading Model
 Two threads communicating via `queue.Queue`:
 1. **Audio thread** (sounddevice callback + VAD logic): captures audio, runs VAD state machine, pushes completed speech segments onto the queue
 2. **Transcription thread**: pulls segments from the queue, runs `processor() -> model.generate() -> processor.decode()`, prints results
 No state carried between segments. Each is transcribed independently.
 ### Output
 Timestamped lines printed to stdout as each segment is transcribed:
 ```
 [00:03] Good morning, this is a test of the live captioning system.
 [00:08] The model seems to be picking up my voice pretty well.
 ```
 ### Shutdown
 Ctrl+C sets a stop flag via signal handler. The audio stream stops, any buffered speech is flushed and transcribed, then the program exits cleanly.
 ## CLI Interface
 ```
 uv run python transcribe.py --stream            # stream, default language (en)
 uv run python transcribe.py --stream --lang ja   # stream in Japanese
 uv run python transcribe.py --mic [duration]     # existing fixed-duration mode
 uv run python transcribe.py                      # existing demo file mode
 ```
 ### Startup Sequence
 1. Print "Loading model..." and load model
 2. Record ~0.5s of ambient audio, compute silence threshold
 3. Print threshold info and "Listening... (Ctrl+C to stop)"
 4. Begin streaming
 ## Dependencies
 No new dependencies. Uses: `sounddevice`, `numpy`, `threading`, `queue`, `signal`, `time` (all already available).
 ## Code Organization
 All new logic in `transcribe.py`. File grows from ~50 to ~150-180 lines. No new files.
 ## Constraints
 - Model max input: 35s per chunk (safety cap at 30s)
 - Sampling rate must be 16kHz
 - Single-channel (mono) audio only
@@ -1,31 +0,0 @@
 {
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
  };
  outputs =
    { nixpkgs, ... }:
    let
      system = "x86_64-linux";
      pkgs = import nixpkgs {
        inherit system;
        config.allowUnfree = true;
      };
    in
    {
      devShells.${system}.default = pkgs.mkShell {
        packages = with pkgs; [
          uv
          python314
          portaudio
          cudaPackages.cudatoolkit
        ];
        env = {
          LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
            pkgs.cudaPackages.cudatoolkit
          ];
        };
      };
    };
 }
@@ -1,6 +0,0 @@
 def main():
    """Print a fixed greeting to stdout."""
    print("Hello from cohere!")
 if __name__ == "__main__":
    main()
@@ -1,17 +0,0 @@
 [project]
 name = "cohere"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.14"
 dependencies = [
    "accelerate>=1.13.0",
    "huggingface-hub>=1.16.1",
    "librosa>=0.11.0",
    "protobuf>=7.35.0",
    "sentencepiece>=0.2.1",
    "sounddevice>=0.5.5",
    "soundfile>=0.13.1",
    "torch>=2.12.0",
    "transformers>=5.9.0",
 ]
@@ -1,15 +0,0 @@
 { pkgs ? import <nixpkgs> { config.allowUnfree = true; } }:
 pkgs.mkShell {
  buildInputs = with pkgs; [
    portaudio
    cudaPackages.cudatoolkit
    uv
    python314
  ];
  shellHook = ''
    export LD_LIBRARY_PATH="${pkgs.cudaPackages.cudatoolkit}/lib:$LD_LIBRARY_PATH"
    echo "Dev shell ready - microphone input enabled"
  '';
 }
@@ -1,51 +0,0 @@
 import sys
 import numpy as np
 import sounddevice as sd
 from transformers import AutoProcessor, CohereAsrForConditionalGeneration
 from transformers.audio_utils import load_audio
 from huggingface_hub import hf_hub_download
 # Load model
 print("Loading model...")
 processor = AutoProcessor.from_pretrained("CohereLabs/cohere-transcribe-03-2026")
 model = CohereAsrForConditionalGeneration.from_pretrained(
    "CohereLabs/cohere-transcribe-03-2026",
    device_map="auto"
 )
 def transcribe_audio(audio, language="en"):
    """Transcribe one 16 kHz audio clip with the module-level processor and model.

    Args:
        audio: Audio samples sampled at 16000 Hz.
        language: Language code forwarded to the processor (default ``"en"``).

    Returns:
        The decoded transcription with special tokens skipped.
    """
    features = processor(audio, sampling_rate=16000, return_tensors="pt", language=language)
    features.to(model.device, dtype=model.dtype)
    generated = model.generate(**features, max_new_tokens=256)
    return processor.decode(generated, skip_special_tokens=True)
 def record_audio(duration, samplerate=16000):
    """Record ``duration`` seconds of mono float32 audio from the microphone.

    Blocks until the recording has finished.

    Args:
        duration: Recording length in seconds.
        samplerate: Capture sample rate in Hz.

    Returns:
        The recorded samples flattened to a 1-D array.
    """
    print(f"Recording for {duration} seconds...")
    sample_count = int(duration * samplerate)
    captured = sd.rec(sample_count, samplerate=samplerate, channels=1, dtype="float32")
    # sd.rec returns immediately; wait for the capture to complete.
    sd.wait()
    return captured.flatten()
 # Parse arguments: "--mic [SECONDS]" records from the microphone and
 # transcribes the clip; any other invocation transcribes the demo file.
 if len(sys.argv) > 1 and sys.argv[1] == "--mic":
    # Optional second argument is the recording length in seconds (default 5).
    duration = int(sys.argv[2]) if len(sys.argv) > 2 else 5
    try:
        mic_audio = record_audio(duration)
        print("Transcribing...")
        text = transcribe_audio(mic_audio)
        print(f"\nTranscription:\n{text}\n")
    except OSError as e:
        # An OSError while recording or transcribing is reported with a setup
        # hint instead of a traceback.
        print(f"Microphone error: {e}")
        print("Hint: Run with nix-shell for PortAudio support")
 else:
    # Demo mode: fetch the sample clip from the model repository.
    print("Loading demo audio...")
    audio_file = hf_hub_download(
        repo_id="CohereLabs/cohere-transcribe-03-2026",
        filename="demo/voxpopuli_test_en_demo.wav",
    )
    # Load the clip at 16 kHz, the rate passed to the processor.
    audio = load_audio(audio_file, sampling_rate=16000)
    print("Transcribing...")
    text = transcribe_audio(audio)
    print(f"\nTranscription:\n{text}\n")
		`@@ -0,0 +1,3 @@`
							`# cohere-transcribe`

							`Live speech-to-text using Cohere ASR model`