dotfiles/files/stt-dictate.sh

#!/usr/bin/env bash
# stt-dictate.sh - Push-to-talk speech-to-text using whisper.cpp
#
# Usage:
#   stt-dictate start   # start recording
#   stt-dictate stop    # stop recording and transcribe
#   stt-dictate toggle  # switch between start and stop (default when no argument is given)
#
# Keybinding i3 (mode toggle):
#   bindsym Mod4+space exec stt-dictate toggle
#
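# Available models (STT_MODEL), with approximate download size of the ggml file
# (the figures 39/74/244/769/1550 M often quoted are parameter counts, not sizes):
#   tiny    - ~75 MiB   - fast, basic quality
#   base    - ~142 MiB  - fast, good quality
#   small   - ~466 MiB  - balanced (default)
#   medium  - ~1.5 GiB  - slow, excellent quality
#   large   - ~2.9 GiB  - very slow, best quality
# NOTE(review): confirm that a plain "large" file is still published upstream;
# recent whisper.cpp releases use versioned names such as large-v3.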
#
# Exemple: STT_MODEL=tiny stt-dictate start

# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Whisper model name (overridable through STT_MODEL) and the directory in which
# the model files are cached.
MODEL=${STT_MODEL:-small}
MODEL_DIR="$HOME/.cache/whisper"

# State shared between separate invocations of this script: the captured audio
# and the file holding the PID of the running recorder.
AUDIO_FILE='/tmp/stt-audio.wav'
RECORDING_PID='/tmp/stt-recording.pid'

# Notification helper: shows a desktop notification titled "STT"; when
# notify-send is missing or fails (e.g. no notification daemon), the message is
# printed to stdout instead.
# Arguments:
#   $1 - message body
#   $2 - display time in milliseconds (optional, default 2000)
notify() {
  # Options come first and "--" ends option parsing, so a message that begins
  # with "-" (a transcript such as "- Bonjour") is never read as a flag.
  # stderr is discarded on purpose: failure is reported through the fallback.
  notify-send -t "${2:-2000}" -- "STT" "$1" 2>/dev/null ||
    printf '%s\n' "[STT] $1"
}

# Downloads the selected whisper.cpp model into MODEL_DIR when it is not
# already present.
# Globals:  MODEL_DIR (read), MODEL (read)
# Returns:  0 if the model file exists afterwards, 1 if the download failed
download_model() {
  local model_file="${MODEL_DIR}/ggml-${MODEL}.bin"
  if [[ -f "$model_file" ]]; then
    return 0
  fi

  mkdir -p "$MODEL_DIR"
  notify "Telechargement du modele ${MODEL}..." 5000

  local url="https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-${MODEL}.bin"

  # Download to a temporary sibling file and rename only on success, so that an
  # interrupted or failed transfer never leaves something that the existence
  # check above would later mistake for a valid model.
  local partial
  partial=$(mktemp "${model_file}.XXXXXX") || return 1

  # -f makes curl fail on HTTP errors (e.g. 404 for an unknown model name)
  # instead of saving the server's error page as the model.
  if ! curl -fL "$url" -o "$partial"; then
    rm -f -- "$partial"
    notify "Echec du telechargement du modele ${MODEL}" 5000
    return 1
  fi

  mv -f -- "$partial" "$model_file"
  notify "Modele ${MODEL} pret"
}

# Starts a background arecord capture into AUDIO_FILE and stores its PID in
# RECORDING_PID. Does nothing when the recorded PID is still alive.
# Globals:  RECORDING_PID (read/written as a file), AUDIO_FILE (read)
start_recording() {
  # A PID file whose process still answers signal 0 means a recording is
  # already in progress.
  if [[ -f "$RECORDING_PID" ]]; then
    local running_pid
    running_pid=$(cat "$RECORDING_PID") || running_pid=""
    if kill -0 "$running_pid" 2>/dev/null; then
      return 0
    fi
  fi

  # Make sure the model is available before any audio is captured.
  download_model

  # Record with arecord in a format whisper.cpp accepts
  # (16 kHz, mono, signed 16-bit little-endian WAV).
  arecord -f S16_LE -r 16000 -c 1 -t wav "$AUDIO_FILE" &
  local recorder_pid=$!
  printf '%s\n' "$recorder_pid" >"$RECORDING_PID"

  notify "Enregistrement..." 1000
}

# Stops the running recording, transcribes the captured audio with whisper-cli
# and types the resulting text at the cursor with xdotool.
# Globals:  RECORDING_PID, AUDIO_FILE (both removed), MODEL_DIR, MODEL (read)
# Returns:  0 when no recording was active or the transcription step ran
#           (even if it produced no text); 1 when no audio was captured or
#           whisper-cli failed
stop_and_transcribe() {
  # No PID file means no recording was started: nothing to do.
  if [[ ! -f "$RECORDING_PID" ]]; then
    return 0
  fi

  local pid
  pid=$(cat "$RECORDING_PID" 2>/dev/null) || pid=""
  rm -f "$RECORDING_PID"

  # Only signal a plausible PID (never 0 or a negative value, which would
  # address a whole process group). kill failing just means the recorder is
  # already gone, hence the discarded stderr.
  if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill "$pid" 2>/dev/null; then
    # Give arecord time to finalize the file: poll until it has exited,
    # for at most about one second, instead of sleeping a fixed amount.
    local tries=0
    while kill -0 "$pid" 2>/dev/null && ((tries < 20)); do
      sleep 0.05
      tries=$((tries + 1))
    done
  fi

  if [[ ! -f "$AUDIO_FILE" ]] || [[ ! -s "$AUDIO_FILE" ]]; then
    notify "Pas d'audio enregistre"
    rm -f "$AUDIO_FILE"
    return 1
  fi

  notify "Transcription..." 1000

  local model_file="${MODEL_DIR}/ggml-${MODEL}.bin"

  # Transcribe with whisper.cpp (-l fr: French, -nt: no timestamps,
  # -np: no progress/info prints). Only stdout is captured: diagnostics on
  # stderr must never end up typed into the focused window.
  local raw rc=0
  raw=$(whisper-cli \
    -m "$model_file" \
    -l fr \
    -nt \
    -np \
    "$AUDIO_FILE") || rc=$?

  # The audio is removed whether or not the transcription succeeded.
  rm -f "$AUDIO_FILE"

  if ((rc != 0)); then
    notify "Echec de la transcription"
    return 1
  fi

  # Drop backend-loading lines, join the remaining lines and trim the result.
  # sed (unlike grep -v) does not exit non-zero when nothing is left, so an
  # empty transcript reaches the "no text" branch under pipefail.
  local text
  text=$(printf '%s\n' "$raw" |
    sed '/^load_backend:/d' |
    tr -d '\n' |
    sed 's/^[[:space:]]*//;s/[[:space:]]*$//')

  # Type the text at the cursor.
  if [[ -n "$text" ]]; then
    sleep 0.1 # short delay so the target window has focus
    xdotool type --delay 10 -- "$text"
    notify "$text"
  else
    notify "Aucun texte detecte"
  fi
}

# Entry point: dispatches on the first argument; "toggle" is assumed when no
# argument is given. Any other value prints the usage line and exits with 1.
main() {
  local action="${1:-toggle}"

  # For "toggle", the presence of the PID file decides the direction.
  if [[ "$action" == "toggle" ]]; then
    if [[ -f "$RECORDING_PID" ]]; then
      action="stop"
    else
      action="start"
    fi
  fi

  case "$action" in
  start) start_recording ;;
  stop) stop_and_transcribe ;;
  *)
    echo "Usage: $0 {start|stop|toggle}"
    exit 1
    ;;
  esac
}

main "$@"