/**
 * voice-activation.mjs — Voice activation controller.
 *
 * Top-level orchestrator that ties wake word detection, speech recording,
 * silence detection, STT transcription, and chat injection together.
 *
 * State machine: IDLE → DETECTING → LISTENING → TRANSCRIBING → (INJECTING →) DETECTING
 * (INJECTING is entered only when transcription yields non-empty text.)
 *
 * Follows the file-watcher.mjs lifecycle pattern:
 *   startVoiceActivation(opts) / stopVoiceActivation() / isVoiceActive()
 *
 * Pure ESM. No build step.
 */

import { writeFileSync, unlinkSync, mkdirSync } from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';
import { getConfig, getAgentRoot } from './config.mjs';
import {
  isWakeWordAvailable,
  startDetection,
  stopDetection,
  isDetecting,
  readFrame,
  getFrameLength
} from './multimodal/wake-word.mjs';
import { isSTTAvailable, speechToText } from './multimodal/speech.mjs';

// ── Constants ────────────────────────────────────────────────

/**
 * Lifecycle states reported through the state-change callback and
 * `getVoiceState()`.
 *
 * Frozen because the object is exported from this module: without the freeze
 * any importer could reassign a member and silently break every state
 * comparison made in this file.
 */
const STATES = Object.freeze({
  IDLE: 'idle',
  DETECTING: 'detecting',
  LISTENING: 'listening',
  TRANSCRIBING: 'transcribing',
  INJECTING: 'injecting'
});

// Tuning knobs for end-of-speech detection and trigger handling.
// All *_MS values are compared against Date.now() differences (wall-clock ms).
const SILENCE_THRESHOLD_RMS = 200;    // RMS below this = silence
const SILENCE_DURATION_MS = 1500;     // 1.5 seconds of silence = end of speech
const MAX_LISTEN_MS = 15000;          // Max recording duration after wake word
const DEBOUNCE_MS = 3000;             // Ignore re-triggers within 3 seconds
const SAMPLE_RATE = 16000;            // Default WAV sample rate; also used to derive clip duration

// ── Module-level state ───────────────────────────────────────

let _state = STATES.IDLE;        // Current state; changed only through setState()
let _active = false;             // True between a successful start and the next stop
let _onTranscription = null;     // Callback (text, metadata) supplied to startVoiceActivation
let _onStateChange = null;       // Callback (state) supplied to startVoiceActivation
let _lastDetectionTime = 0;      // Date.now() of the last accepted trigger (wake word or hotkey)

// Post-wake recording buffer
let _postWakeFrames = [];        // Int16Array copies of frames captured after the trigger
let _silenceStartMs = 0;         // When the current run of silence began; 0 = not in silence
let _listenStartMs = 0;          // When the current recording began (for MAX_LISTEN_MS)
let _isRecordingPostWake = false; // Loop-control flag for the recording loop

// ── State management ─────────────────────────────────────────

/**
 * Move the controller to `newState` and notify the state-change listener.
 * A transition to the state we are already in is a no-op (no notification).
 * Exceptions thrown by the listener are deliberately discarded so a faulty
 * observer cannot break the pipeline.
 *
 * @param {string} newState One of the STATES values.
 */
function setState(newState) {
  const isSameState = _state === newState;
  if (isSameState) return;

  _state = newState;

  if (!_onStateChange) return;
  try {
    _onStateChange(newState);
  } catch {
    /* ignore */
  }
}

// ── Audio utilities ──────────────────────────────────────────

/**
 * Root-mean-square amplitude of one audio frame.
 *
 * @param {Int16Array} frame Samples to measure.
 * @returns {number} The RMS value, or 0 when the frame is missing or empty.
 */
function computeRMS(frame) {
  if (!frame || frame.length === 0) return 0;

  const sampleCount = frame.length;
  let sumOfSquares = 0;
  let idx = 0;
  while (idx < sampleCount) {
    const sample = frame[idx];
    sumOfSquares += sample * sample;
    idx += 1;
  }

  const meanSquare = sumOfSquares / sampleCount;
  return Math.sqrt(meanSquare);
}

/**
 * Wrap 16-bit mono PCM samples in a canonical 44-byte RIFF/WAVE header.
 *
 * @param {Int16Array} samples PCM samples at 16kHz mono.
 * @param {number} [sampleRate=16000] Sample rate written into the header.
 * @returns {Buffer} Header followed by the little-endian sample data.
 */
export function pcmToWav(samples, sampleRate = SAMPLE_RATE) {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;
  const CHANNELS = 1;

  const pcmBytes = samples.length * BYTES_PER_SAMPLE;
  const wav = Buffer.alloc(HEADER_BYTES + pcmBytes);

  // Sequential writer: every helper advances the cursor past what it wrote.
  let cursor = 0;
  const tag = (text) => { cursor += wav.write(text, cursor); };
  const u32 = (value) => { cursor = wav.writeUInt32LE(value, cursor); };
  const u16 = (value) => { cursor = wav.writeUInt16LE(value, cursor); };

  // RIFF container: size field counts everything after the first 8 bytes
  tag('RIFF');
  u32(HEADER_BYTES - 8 + pcmBytes);
  tag('WAVE');

  // Format description
  tag('fmt ');
  u32(16);                                        // fmt chunk size
  u16(1);                                         // PCM format
  u16(CHANNELS);                                  // mono
  u32(sampleRate);                                // sample rate
  u32(sampleRate * CHANNELS * BYTES_PER_SAMPLE);  // byte rate
  u16(CHANNELS * BYTES_PER_SAMPLE);               // block align
  u16(BYTES_PER_SAMPLE * 8);                      // bits per sample

  // Sample payload
  tag('data');
  u32(pcmBytes);

  for (let n = 0; n < samples.length; n += 1) {
    cursor = wav.writeInt16LE(samples[n], cursor);
  }

  return wav;
}

// ── Silence detection logic ──────────────────────────────────

/**
 * Decide whether the current recording should end after seeing `frame`.
 *
 * Tracks the start of the current quiet stretch in `_silenceStartMs`
 * (reset to 0 by any frame at or above the threshold) and reports true when
 * either the quiet stretch has lasted SILENCE_DURATION_MS or the recording
 * as a whole has reached MAX_LISTEN_MS.
 *
 * @param {Int16Array} frame Most recently captured frame.
 * @returns {boolean} True if we should stop listening.
 */
function checkSilence(frame) {
  const level = computeRMS(frame);
  const timestamp = Date.now();
  const isQuiet = level < SILENCE_THRESHOLD_RMS;

  if (!isQuiet) {
    _silenceStartMs = 0;
  } else if (_silenceStartMs === 0) {
    _silenceStartMs = timestamp;
  }

  const quietLongEnough = isQuiet && timestamp - _silenceStartMs >= SILENCE_DURATION_MS;
  const listenedTooLong = timestamp - _listenStartMs >= MAX_LISTEN_MS;

  return quietLongEnough || listenedTooLong;
}

// ── Post-wake recording ──────────────────────────────────────

/**
 * Capture speech after a trigger, then hand the audio to the transcription
 * stage.
 *
 * Frames are pulled from `readFrame()` until one of: silence / max-duration
 * (see `checkSilence`), a falsy frame, a read error, or the controller being
 * stopped.
 *
 * The wake-word callback and the hotkey handler both call this without
 * awaiting the result, so this function:
 *   - refuses to start while another capture or transcription is in flight
 *     (a second loop would reset and interleave the shared frame buffer and
 *     silence timers), and
 *   - never rejects: a failure in the transcription stage is logged and the
 *     state is returned to DETECTING instead of surfacing as an unhandled
 *     promise rejection with the state stuck in TRANSCRIBING.
 *
 * @param {Int16Array|null} preWakeAudio Audio buffered before the trigger,
 *   or null when there is none (hotkey activation).
 * @returns {Promise<void>}
 */
async function startPostWakeRecording(preWakeAudio) {
  const captureInFlight =
    _isRecordingPostWake ||
    _state === STATES.LISTENING ||
    _state === STATES.TRANSCRIBING ||
    _state === STATES.INJECTING;
  if (captureInFlight) {
    console.log('[voice-activation] Capture already in progress, ignoring trigger');
    return;
  }

  setState(STATES.LISTENING);
  _postWakeFrames = [];
  _silenceStartMs = 0;
  _listenStartMs = Date.now();
  _isRecordingPostWake = true;

  // Read frames from the wake-word module's mic stream until silence or timeout
  while (_isRecordingPostWake && _active) {
    try {
      const frame = await readFrame();
      if (!frame) break;
      // Copy the frame so later writes to the source buffer cannot alter it
      _postWakeFrames.push(new Int16Array(frame));

      if (checkSilence(frame)) {
        break;
      }
    } catch (err) {
      console.error(`[voice-activation] Recording error: ${err.message}`);
      break;
    }

    // Yield to the event loop between frames so other callbacks can run
    await new Promise(r => setImmediate(r));
  }

  _isRecordingPostWake = false;

  // Stopped while recording: stopVoiceActivation() already moved us to IDLE
  if (!_active) return;

  // Combine pre-wake + post-wake audio and transcribe
  try {
    await transcribeAndInject(preWakeAudio);
  } catch (err) {
    console.error(`[voice-activation] Transcription pipeline error: ${err?.message ?? err}`);
    if (_active) setState(STATES.DETECTING);
  }
}

/**
 * Merge pre-wake context with the captured post-wake frames, transcribe the
 * result, and deliver the text to the registered transcription callback.
 *
 * Flow:
 *   1. Concatenate `preWakeAudio` and `_postWakeFrames` into one Int16Array
 *      (the frame buffer is cleared afterwards).
 *   2. If the STT backend is unavailable, broadcast a raw 'listening' event
 *      to the state-change callback and return to DETECTING.
 *   3. Otherwise write a temporary WAV file, run `speechToText` on it, and
 *      invoke `_onTranscription(text, metadata)` for non-empty results.
 *
 * The temporary file is always removed. If the controller was stopped while
 * transcription was running, the result is discarded and the state is left
 * untouched, so a late transcription cannot overwrite IDLE with DETECTING.
 *
 * @param {Int16Array|null} preWakeAudio Audio captured before the trigger.
 * @returns {Promise<void>}
 */
async function transcribeAndInject(preWakeAudio) {
  setState(STATES.TRANSCRIBING);

  // Combine pre-wake context and post-wake speech into one buffer
  const postWakeSamples = _postWakeFrames.reduce((acc, f) => acc + f.length, 0);
  const preWakeSamples = preWakeAudio ? preWakeAudio.length : 0;
  const totalSamples = preWakeSamples + postWakeSamples;

  if (totalSamples === 0) {
    console.log('[voice-activation] No audio captured, skipping transcription');
    setState(STATES.DETECTING);
    return;
  }

  const combined = new Int16Array(totalSamples);
  let offset = 0;

  // Write pre-wake audio first (provides context for reverse wake words)
  if (preWakeAudio && preWakeAudio.length > 0) {
    combined.set(preWakeAudio, 0);
    offset = preWakeAudio.length;
  }

  // Write post-wake frames
  for (const frame of _postWakeFrames) {
    combined.set(frame, offset);
    offset += frame.length;
  }

  _postWakeFrames = [];

  // Check STT availability — if whisper isn't installed, delegate to browser STT
  if (!isSTTAvailable()) {
    console.log('[voice-activation] STT (whisper) not available, delegating to browser Web Speech API');
    // Broadcast listening state so browser picks up with its own speech recognition.
    // This bypasses setState on purpose: the internal state is not LISTENING here.
    if (_onStateChange) {
      try { _onStateChange('listening'); } catch {}
    }
    setState(STATES.DETECTING);
    return;
  }

  const tempDir = join(tmpdir(), 'gergy-voice');
  try { mkdirSync(tempDir, { recursive: true }); } catch { /* ignore */ }
  const tempPath = join(tempDir, `voice-${Date.now()}.wav`);

  try {
    // WAV encoding lives inside the try so a failure still resets the state below
    writeFileSync(tempPath, pcmToWav(combined));

    const transcript = await speechToText(tempPath);
    const cleanText = transcript ? transcript.trim() : '';

    if (!_active) {
      // stopVoiceActivation() ran while we were awaiting the STT backend
      console.log('[voice-activation] Stopped during transcription, discarding result');
    } else if (cleanText.length > 0) {
      const durationMs = Math.round((totalSamples / SAMPLE_RATE) * 1000);

      console.log(`[voice-activation] Transcribed: "${cleanText.slice(0, 80)}${cleanText.length > 80 ? '...' : ''}"`);

      setState(STATES.INJECTING);

      if (_onTranscription) {
        try {
          _onTranscription(cleanText, {
            source: 'voice',
            duration_ms: durationMs,
            samples: totalSamples,
            preWakeSamples
          });
        } catch (err) {
          console.error(`[voice-activation] Transcription callback error: ${err.message}`);
        }
      }
    } else {
      console.log('[voice-activation] Empty transcription, ignoring');
    }
  } catch (err) {
    console.error(`[voice-activation] Transcription failed: ${err.message}`);
  } finally {
    // Best-effort cleanup on every path; the file may never have been written
    try { unlinkSync(tempPath); } catch { /* ignore */ }
  }

  // Only resume detection if nobody stopped the controller in the meantime
  if (_active) setState(STATES.DETECTING);
}

// ── Public API ───────────────────────────────────────────────

/**
 * Start voice activation system.
 *
 * Restarts cleanly if already active. Returns without starting (and without
 * throwing) when voice is disabled in config or the wake-word backend is not
 * available.
 *
 * @param {object} opts
 * @param {function} opts.onTranscription Called with (text, metadata) when speech is transcribed.
 * @param {function} [opts.onStateChange] Called with (state) on state transitions.
 * @returns {Promise<void>}
 * @throws Rethrows any error raised by `startDetection`, after resetting the
 *   controller to inactive / IDLE.
 */
export async function startVoiceActivation(opts = {}) {
  if (_active) stopVoiceActivation();

  _onTranscription = opts.onTranscription || null;
  _onStateChange = opts.onStateChange || null;

  const voiceConfig = getConfig().voice || {};

  if (!voiceConfig.enabled) {
    console.log('[voice-activation] Voice is disabled in config');
    return;
  }

  if (!isWakeWordAvailable()) {
    console.log('[voice-activation] openWakeWord not available');
    console.log('[voice-activation] Install onnxruntime-node + mic and ensure ONNX model files exist in data/wake-word-models/');
    return;
  }

  // Resolve wake word models (config may hold a single name or a list)
  const wakeWordModel = voiceConfig.wakeWordModel || 'hey_jarvis_v0.1';
  const wakeWordModels = Array.isArray(wakeWordModel) ? wakeWordModel : [wakeWordModel];
  const sensitivity = voiceConfig.detectionThreshold || 0.5;

  _active = true;

  try {
    await startDetection({
      wakeWordModels,
      deviceIndex: -1,
      ringBufferSeconds: voiceConfig.ringBufferSeconds || 15,
      detectionThreshold: sensitivity,
      onWakeWord: (detection) => {
        // Same busy check as the hotkey path: a capture can outlast the
        // debounce window, and a second recording loop would corrupt the
        // shared frame buffer.
        if (_state === STATES.LISTENING || _state === STATES.TRANSCRIBING) {
          console.log('[voice-activation] Already listening/transcribing, ignoring wake word');
          return;
        }

        const now = Date.now();

        // Debounce: ignore re-triggers within DEBOUNCE_MS
        if (now - _lastDetectionTime < DEBOUNCE_MS) {
          console.log('[voice-activation] Wake word debounced');
          return;
        }
        _lastDetectionTime = now;

        console.log(`[voice-activation] Wake word detected: "${detection.keyword}" (score: ${detection.score?.toFixed(3)})`);

        // Start post-wake recording (non-blocking). The promise is not
        // awaited, so attach a handler: otherwise a failure becomes an
        // unhandled rejection and the state stays stuck mid-pipeline.
        startPostWakeRecording(detection.preWakeAudio).catch((err) => {
          console.error(`[voice-activation] Post-wake recording failed: ${err?.message ?? err}`);
          if (_active) setState(STATES.DETECTING);
        });
      },
      onError: (err) => {
        console.error(`[voice-activation] Detection error: ${err.message}`);
      }
    });

    // stopVoiceActivation() may have run while startDetection was pending;
    // in that case release the detector instead of reporting DETECTING.
    if (!_active) {
      try { stopDetection(); } catch { /* ignore */ }
      return;
    }

    setState(STATES.DETECTING);
    console.log(`[voice-activation] Started (wake words: ${wakeWordModels.join(', ')})`);
  } catch (err) {
    _active = false;
    setState(STATES.IDLE);
    throw err;
  }
}

/**
 * Shut the voice pipeline down: clear the run flags, ask the wake-word
 * module to stop detection, drop any buffered audio, and report IDLE.
 * Errors from stopping detection are intentionally ignored.
 */
export function stopVoiceActivation() {
  // Clearing both flags makes an in-flight recording loop exit at its next check
  _isRecordingPostWake = false;
  _active = false;

  try {
    stopDetection();
  } catch {
    /* ignore */
  }

  _postWakeFrames = [];
  setState(STATES.IDLE);
  console.log('[voice-activation] Stopped');
}

/**
 * Report whether the controller is currently running, i.e. whether
 * `startVoiceActivation` got as far as starting detection and
 * `stopVoiceActivation` has not been called since.
 *
 * @returns {boolean} Whether voice activation is active.
 */
export function isVoiceActive() {
  return _active;
}

/**
 * Snapshot of the controller for status displays.
 *
 * `active` and `state` come from the live module state; `wakeWords` and
 * `sensitivity` are re-read from the current config on every call, falling
 * back to 'hey_jarvis_v0.1' and 0.5 when unset.
 *
 * @returns {{ active: boolean, state: string, wakeWords: string[], sensitivity: number }}
 */
export function getVoiceState() {
  const settings = getConfig().voice || {};
  const configuredModel = settings.wakeWordModel || 'hey_jarvis_v0.1';

  // Config may hold either one model name or a list of them
  let wakeWords;
  if (Array.isArray(configuredModel)) {
    wakeWords = configuredModel;
  } else {
    wakeWords = [configuredModel];
  }

  const sensitivity = settings.detectionThreshold || 0.5;

  return { active: _active, state: _state, wakeWords, sensitivity };
}

/**
 * Handle hotkey activation — skip wake word, go straight to LISTENING.
 * Records immediately and transcribes when silence is detected.
 *
 * The request is ignored (with a log line) when a capture is already in
 * progress, when the controller is not active / not detecting, or when it
 * falls inside the debounce window shared with wake-word triggers.
 *
 * The active check runs before the debounce bookkeeping so that a hotkey
 * press while the system is off does not consume the debounce window and
 * suppress the next genuine trigger.
 */
export function handleHotkeyActivation() {
  if (_state === STATES.LISTENING || _state === STATES.TRANSCRIBING) {
    console.log('[voice-activation] Already listening/transcribing, ignoring hotkey');
    return;
  }

  if (!_active || !isDetecting()) {
    console.log('[voice-activation] Not active, cannot start hotkey listen');
    return;
  }

  const now = Date.now();
  if (now - _lastDetectionTime < DEBOUNCE_MS) {
    console.log('[voice-activation] Hotkey debounced');
    return;
  }
  _lastDetectionTime = now;

  console.log('[voice-activation] Hotkey listen activated');

  // Start post-wake recording with no pre-wake audio. The promise is not
  // awaited, so attach a handler to avoid an unhandled rejection and a state
  // stuck mid-pipeline if the recording/transcription stage throws.
  startPostWakeRecording(null).catch((err) => {
    console.error(`[voice-activation] Hotkey recording failed: ${err?.message ?? err}`);
    if (_active) setState(STATES.DETECTING);
  });
}

// Export states for testing
export { STATES };
