/**
 * multimodal/wake-word.mjs — openWakeWord wake word detection via ONNX Runtime.
 *
 * Replaces the former Picovoice Porcupine backend with a fully open-source,
 * offline, API-key-free 3-model ONNX inference pipeline from openWakeWord:
 *   1. melspectrogram.onnx   — raw PCM → mel features
 *   2. embedding_model.onnx  — mel features → 96-dim audio embedding
 *   3. <wake_word>.onnx      — embedding history → confidence score
 *
 * Provides continuous microphone monitoring with wake word detection,
 * a ring buffer for pre-wake audio context, and graceful degradation
 * when dependencies are not available.
 *
 * Follows the file-watcher.mjs lifecycle pattern:
 *   startDetection(opts) / stopDetection() / isDetecting()
 *
 * Pure ESM. No build step. Uses dynamic imports for optional deps.
 */

import { existsSync } from 'fs';
import { join, dirname, isAbsolute } from 'path';
import { fileURLToPath } from 'url';
import { createRequire } from 'module';
import { getConfig } from '../config.mjs';

// CommonJS-style resolver anchored at this module; used to probe whether
// optional packages can be resolved without importing them.
const _require = createRequire(import.meta.url);

// Directory of this file, and its parent directory, which relative model
// paths and the default models directory are joined onto.
const __dirname = dirname(fileURLToPath(import.meta.url));
const AGENT_ROOT = join(__dirname, '..');

// ── Module-level state ───────────────────────────────────────

// Inference sessions and the microphone handle. All are null/empty until
// startDetection() populates them and are cleared again by stopDetection().
let _melSession = null;
let _embeddingSession = null;
let _classifierSessions = [];  // Array of { session, name, threshold, nFrames }
let _micInstance = null;
// Lifecycle flags read by the frame-processing loop.
let _detecting = false;
let _stopRequested = false;

// Ring buffer for pre-wake audio context (Int16 PCM, 16kHz mono)
let _ringBuffer = null;
let _ringPos = 0;   // Index the next sample will be written to
let _ringSize = 0;  // Capacity of the ring buffer in samples
const SAMPLE_RATE = 16000;
const FRAME_SIZE = 1280;  // 80ms at 16kHz — openWakeWord's native frame size

// ONNX inference state buffers
let _melBuffer = null;       // Float32Array-backed [76][32] sliding window
let _melFrameCount = 0;      // Total mel frames written
let _embeddingBuffer = null;  // Float32Array-backed [120][96] circular buffer (≈10s)
let _embeddingWritePos = 0;   // Monotonic write counter; slot = counter % 120
let _embeddingCount = 0;      // Number of embeddings computed since initialisation
// NOTE(review): within this file _rawAccumulator is only ever reset to [],
// never appended to or read — confirm whether it is still needed.
let _rawAccumulator = [];     // Accumulate raw PCM between frame boundaries

// Stats
let _stats = { detections: 0, framesProcessed: 0, startedAt: null };

// Cached availability check (null = not yet computed)
let _available = null;

// Frame reader interface for voice-activation.mjs post-wake recording.
// _frameResolve holds the resolver of at most one pending readFrame() call;
// frames that arrive while nobody is waiting are appended to _frameQueue.
let _frameResolve = null;
let _frameQueue = [];

// ── Availability ─────────────────────────────────────────────

/**
 * Resolve the directory that holds the ONNX model files.
 *
 * Reads `voice.modelsPath` from the config. An absolute path is returned
 * unchanged; a relative path is resolved against AGENT_ROOT. When the option
 * is unset (or empty) the default `data/wake-word-models` under AGENT_ROOT is
 * used.
 *
 * Absolute paths are detected with `path.isAbsolute` rather than a leading
 * `/` check so that Windows paths such as `C:\models` or UNC paths are not
 * mistakenly joined onto AGENT_ROOT.
 * @returns {string}
 */
function getModelsDir() {
  const { modelsPath } = getConfig().voice || {};
  if (!modelsPath) {
    return join(AGENT_ROOT, 'data', 'wake-word-models');
  }
  return isAbsolute(modelsPath) ? modelsPath : join(AGENT_ROOT, modelsPath);
}

/**
 * Report whether wake word detection can run in this environment.
 *
 * Three conditions must all hold:
 *   - the `onnxruntime-node` package can be resolved,
 *   - both shared models (melspectrogram + embedding) exist in the models dir,
 *   - at least one configured wake word classifier file exists there
 *     (`voice.wakeWordModel`, string or array, default `hey_jarvis_v0.1`).
 *
 * The verdict is memoised in `_available`; later calls return the cached value.
 * @returns {boolean}
 */
export function isWakeWordAvailable() {
  if (_available !== null) return _available;

  // Record the verdict in the cache and hand it back in one step.
  const conclude = (verdict) => {
    _available = verdict;
    return verdict;
  };

  let runtimeResolvable = true;
  try {
    _require.resolve('onnxruntime-node');
  } catch {
    runtimeResolvable = false;
  }
  if (!runtimeResolvable) return conclude(false);

  // Shared feature-extraction models must both be on disk.
  const modelsDir = getModelsDir();
  const sharedModelsPresent = ['melspectrogram.onnx', 'embedding_model.onnx']
    .every((fileName) => existsSync(join(modelsDir, fileName)));
  if (!sharedModelsPresent) return conclude(false);

  // One usable classifier is enough; names may be given with or without ".onnx".
  const configured = (getConfig().voice || {}).wakeWordModel || 'hey_jarvis_v0.1';
  const candidates = Array.isArray(configured) ? configured : [configured];
  for (const candidate of candidates) {
    const fileName = candidate.endsWith('.onnx') ? candidate : `${candidate}.onnx`;
    if (existsSync(join(modelsDir, fileName))) return conclude(true);
  }

  return conclude(false);
}

/**
 * Reset availability cache (for testing).
 *
 * Sets the memoised verdict back to `null`, so the next call to
 * `isWakeWordAvailable()` re-runs the package and model-file checks instead
 * of returning the cached value.
 */
export function _resetAvailabilityCache() {
  _available = null;
}

// ── Ring Buffer ──────────────────────────────────────────────

/**
 * Allocate a fresh, zero-filled ring buffer and reset the write position.
 *
 * The capacity is `seconds * SAMPLE_RATE` truncated to a whole number of
 * samples. `_ringSize` is taken from the allocated array's real length so the
 * modulo arithmetic used by the read/write helpers always matches the actual
 * capacity (a fractional product previously left `_ringSize` fractional while
 * the typed array silently truncated its length).
 *
 * @param {number} seconds Duration of audio the buffer should hold.
 * @throws {RangeError} If the resulting sample count is negative or too large
 *   for a typed array; module state is left untouched in that case.
 */
function initRingBuffer(seconds) {
  const capacity = Math.trunc(seconds * SAMPLE_RATE);
  // Allocate first: if the constructor throws, the previous state stays intact.
  const buffer = new Int16Array(capacity);
  _ringBuffer = buffer;
  _ringSize = buffer.length;
  _ringPos = 0;
}

/**
 * Append a frame of samples to the ring buffer, overwriting the oldest data
 * once the buffer wraps around.
 *
 * @param {ArrayLike<number>} frame Samples to append, in chronological order.
 */
function writeToRingBuffer(frame) {
  const sampleCount = frame.length;
  let cursor = _ringPos;

  for (let n = 0; n < sampleCount; n += 1) {
    _ringBuffer[cursor] = frame[n];
    cursor = (cursor + 1) % _ringSize;
  }

  // Publish the advanced write position once the whole frame is stored.
  _ringPos = cursor;
}

/**
 * Extract the last `seconds` of audio from the ring buffer in chronological
 * order (oldest sample first).
 *
 * The request is truncated to a whole number of samples and clamped to the
 * range [0, buffer capacity]; a fractional sample count previously produced a
 * fractional read index and therefore an all-zero result. If the buffer has
 * not been allocated (or has zero capacity) an empty array is returned.
 *
 * Slots that have not been written since the buffer was allocated are
 * returned as they are stored (zero after allocation), so a request longer
 * than the audio captured so far is zero-padded at the front.
 *
 * @param {number} seconds How many seconds of pre-wake audio to extract.
 * @returns {Int16Array} Ordered PCM samples (a copy, safe to retain).
 */
function extractRingBuffer(seconds) {
  const capacity = _ringBuffer ? _ringBuffer.length : 0;
  if (capacity === 0) return new Int16Array(0);

  const requested = Math.trunc(seconds * SAMPLE_RATE);
  const count = Number.isNaN(requested)
    ? 0
    : Math.min(Math.max(requested, 0), capacity);
  const result = new Int16Array(count);
  if (count === 0) return result;

  // The newest sample sits just before the write position, so the window
  // starts `count` slots behind it (wrapping around the end of the buffer).
  const start = (_ringPos - count + capacity) % capacity;
  const tailLength = Math.min(count, capacity - start);

  // Copy in at most two contiguous pieces: up to the end of the buffer, then
  // the wrapped remainder from the beginning.
  result.set(_ringBuffer.subarray(start, start + tailLength), 0);
  if (tailLength < count) {
    result.set(_ringBuffer.subarray(0, count - tailLength), tailLength);
  }

  return result;
}

// Exported for testing: the ring buffer helpers are module-private, so they
// are re-exported under underscore-prefixed aliases.
export { initRingBuffer as _initRingBuffer, writeToRingBuffer as _writeToRingBuffer, extractRingBuffer as _extractRingBuffer };

// ── ONNX Inference ──────────────────────────────────────────

/**
 * Initialize the 3-model ONNX pipeline and the inference buffers.
 *
 * Loads the shared melspectrogram and embedding models, then one classifier
 * session per wake word model that exists on disk (missing files are skipped
 * with a warning). Each classifier entry is `{ session, name, threshold,
 * nFrames }`, where `threshold` comes from `voice.detectionThreshold` in the
 * config (default 0.5; an explicit 0 is honoured) and `nFrames` is the number
 * of embeddings fed to the classifier per inference.
 *
 * If anything fails — including the case where no classifier could be
 * loaded — every session created by this call is released and the session
 * state is reset before the error is rethrown, so a failed start does not
 * leak native resources.
 *
 * @param {string} modelsDir Path to directory containing ONNX model files.
 * @param {string[]} wakeWordModels Wake word model names (with or without the .onnx extension).
 * @returns {Promise<void>}
 * @throws {Error} If a model fails to load or no wake word model is available.
 */
async function initOnnxPipeline(modelsDir, wakeWordModels) {
  const ort = await import('onnxruntime-node');

  const ONNX_SUFFIX = '.onnx';
  // Classifiers in this pipeline are fed a window of 16 embeddings ([1, 16, 96]).
  const CLASSIFIER_WINDOW_FRAMES = 16;

  const sessionOpts = {
    executionProviders: ['cpu'],
    interOpNumThreads: 1,
    intraOpNumThreads: 1
  };

  // Track every session opened here so a partial failure can be rolled back.
  const opened = [];
  const openSession = async (modelPath) => {
    const session = await ort.InferenceSession.create(modelPath, sessionOpts);
    opened.push(session);
    return session;
  };

  try {
    // Load shared models
    _melSession = await openSession(join(modelsDir, 'melspectrogram.onnx'));
    _embeddingSession = await openSession(join(modelsDir, 'embedding_model.onnx'));

    // Load wake word classifier(s)
    _classifierSessions = [];
    const voiceConfig = getConfig().voice || {};
    const defaultThreshold = voiceConfig.detectionThreshold ?? 0.5;

    for (const modelName of wakeWordModels) {
      const modelFile = modelName.endsWith(ONNX_SUFFIX) ? modelName : `${modelName}${ONNX_SUFFIX}`;
      const modelPath = join(modelsDir, modelFile);
      if (!existsSync(modelPath)) {
        console.warn(`[wake-word] Wake word model not found: ${modelPath}`);
        continue;
      }

      const session = await openSession(modelPath);
      _classifierSessions.push({
        session,
        // Strip only the trailing extension; ".onnx" elsewhere in the name is kept.
        name: modelFile.slice(0, -ONNX_SUFFIX.length),
        threshold: defaultThreshold,
        nFrames: CLASSIFIER_WINDOW_FRAMES
      });
    }

    if (_classifierSessions.length === 0) {
      throw new Error('No valid wake word models loaded');
    }
  } catch (err) {
    // Roll back: release whatever was opened and clear the session state.
    for (const session of opened) {
      try {
        await session.release();
      } catch { /* best-effort cleanup; the original error is what matters */ }
    }
    _melSession = null;
    _embeddingSession = null;
    _classifierSessions = [];
    throw err;
  }

  // Initialize inference buffers
  // Mel buffer: sliding window of mel features (76 frames x 32 coefficients)
  _melBuffer = new Float32Array(76 * 32).fill(1.0);  // Match Python's np.ones((76, 32))
  _melFrameCount = 0;

  // Embedding buffer: circular buffer of embeddings (120 x 96 ≈ 10 seconds)
  _embeddingBuffer = new Float32Array(120 * 96).fill(0);
  _embeddingWritePos = 0;
  _embeddingCount = 0;

  // Raw sample accumulator
  _rawAccumulator = [];

  console.log(`[wake-word] ONNX pipeline initialized (${_classifierSessions.map(c => c.name).join(', ')})`);
}

/**
 * Process one frame of 1280 Int16 PCM samples through the 3-model pipeline.
 * Returns array of { name, score, detected } for each wake word model.
 *
 * Stages, in order:
 *   1. The frame is converted to float32 (values are copied as-is, not scaled)
 *      and run through the melspectrogram model; each resulting 32-value mel
 *      frame is normalised and pushed into the 76-frame sliding window.
 *   2. Once at least 76 mel frames have been seen, and a multiple-of-8 frame
 *      count was reached within this call, the whole window is run through
 *      the embedding model and the first 96 output values are stored in the
 *      120-slot circular embedding buffer.
 *   3. Once at least 16 embeddings exist, every classifier is run on its last
 *      `nFrames` embeddings. This happens on every call, whether or not step 2
 *      produced a new embedding during this call.
 *
 * The result is an empty array until 16 embeddings have been collected.
 *
 * NOTE(review): the mel model is fed only the current frame. The module-level
 * `_rawAccumulator` is never used here, so no samples from the previous frame
 * are carried over as context — confirm this matches the upstream openWakeWord
 * streaming preprocessing.
 *
 * @param {ArrayLike<number>} samples One frame of PCM samples.
 * @returns {Promise<Array<{name: string, score: number, detected: boolean}>>}
 */
async function processOnnxFrame(samples) {
  // Dynamic import on every call; presumably served from the module cache
  // after the first load.
  const ort = await import('onnxruntime-node');

  // ── Step 1: Melspectrogram ──
  // Convert Int16 PCM to Float32, shape [1, N]
  const floatSamples = new Float32Array(samples.length);
  for (let i = 0; i < samples.length; i++) {
    floatSamples[i] = samples[i];
  }
  const melInput = new ort.Tensor('float32', floatSamples, [1, samples.length]);
  const melResult = await _melSession.run({ 'input': melInput });

  // Get output tensor — shape varies but typically [1, frames, 32]
  const melOutputName = _melSession.outputNames[0];
  const melOutput = melResult[melOutputName];
  const melData = melOutput.data;

  // Determine number of new mel frames (output shape: [1, nFrames, 32])
  // The flat data length is used rather than the reported dims, so any
  // leading singleton dimensions do not matter.
  const totalMelValues = melData.length;
  const nNewFrames = Math.floor(totalMelValues / 32);

  // Apply normalization transform: (value / 10.0) + 2.0
  // and append to sliding mel buffer
  for (let f = 0; f < nNewFrames; f++) {
    // Shift mel buffer left by 1 frame (drop oldest)
    _melBuffer.copyWithin(0, 32);
    // Write new frame at the end
    for (let b = 0; b < 32; b++) {
      const rawVal = melData[f * 32 + b];
      _melBuffer[(76 - 1) * 32 + b] = (rawVal / 10.0) + 2.0;
    }
    _melFrameCount++;
  }

  // ── Step 2: Embedding (every 8 mel frames) ──
  // Only compute embedding when we have enough frames and on 8-frame boundaries
  // `_melFrameCount % 8 < nNewFrames` is true exactly when one of the frames
  // appended above landed on a multiple of 8. At most one embedding is
  // computed per call, from the window as it stands after all new frames.
  if (_melFrameCount >= 76 && _melFrameCount % 8 < nNewFrames) {
    // Input: [1, 76, 32, 1]
    // The window is copied so the tensor does not alias the live buffer.
    const embInput = new ort.Tensor('float32', new Float32Array(_melBuffer), [1, 76, 32, 1]);
    const embResult = await _embeddingSession.run({ 'input_1': embInput });
    const embOutputName = _embeddingSession.outputNames[0];
    const embData = embResult[embOutputName].data;

    // Store embedding (96 dims) in circular buffer
    const offset = (_embeddingWritePos % 120) * 96;
    for (let i = 0; i < 96 && i < embData.length; i++) {
      _embeddingBuffer[offset + i] = embData[i];
    }
    _embeddingWritePos++;
    _embeddingCount++;
  }

  // ── Step 3: Classification ──
  const results = [];

  // NOTE(review): the gate is the literal 16 rather than each classifier's
  // nFrames; the two agree only while nFrames is 16 — confirm if that changes.
  if (_embeddingCount >= 16) {
    for (const classifier of _classifierSessions) {
      const nFrames = classifier.nFrames;

      // Build input tensor [1, nFrames, 96] from the last nFrames embeddings
      const classInput = new Float32Array(nFrames * 96);
      for (let f = 0; f < nFrames; f++) {
        // Walk from the oldest of the last nFrames slots to the newest;
        // the +120 keeps the index non-negative before the modulo.
        const bufIdx = ((_embeddingWritePos - nFrames + f + 120) % 120) * 96;
        for (let i = 0; i < 96; i++) {
          classInput[f * 96 + i] = _embeddingBuffer[bufIdx + i];
        }
      }

      const inputTensor = new ort.Tensor('float32', classInput, [1, nFrames, 96]);
      const inputName = classifier.session.inputNames[0];
      const classResult = await classifier.session.run({ [inputName]: inputTensor });
      const outputName = classifier.session.outputNames[0];
      // Only the first output value is used as the confidence score.
      const score = classResult[outputName].data[0];

      results.push({
        name: classifier.name,
        score,
        detected: score > classifier.threshold
      });
    }
  }

  return results;
}

// ── Microphone ──────────────────────────────────────────────

/**
 * Launch the microphone capture process and start feeding frames to readers.
 *
 * Capture is configured for 16-bit signed little-endian raw PCM, mono, at
 * SAMPLE_RATE. Incoming bytes are re-chunked into FRAME_SIZE-sample
 * Int16Array frames; each frame is handed to the pending `readFrame()` waiter
 * if there is one, otherwise appended to `_frameQueue`.
 *
 * Uses the `mic` npm package which requires `sox` (macOS/Windows) or
 * `arecord` (Linux).
 *
 * @param {number|string|null} [deviceIndex] Capture device; `undefined`,
 *   `null` or `-1` selects the default device.
 * @returns {Promise<void>}
 */
async function startMicrophone(deviceIndex) {
  const { default: createMic } = await import('mic');

  const bytesPerFrame = FRAME_SIZE * 2;  // 2 bytes per Int16 sample
  const useDefaultDevice =
    deviceIndex === undefined || deviceIndex === null || deviceIndex === -1;

  const captureOptions = {
    rate: String(SAMPLE_RATE),
    channels: '1',
    bitwidth: '16',
    encoding: 'signed-integer',
    endian: 'little',
    fileType: 'raw',
    // Only pass a device through when a specific one was requested.
    ...(useDefaultDevice ? {} : { device: String(deviceIndex) })
  };

  // Decode exactly one frame's worth of little-endian bytes into samples.
  const decodeFrame = (bytes) =>
    Int16Array.from({ length: FRAME_SIZE }, (_unused, n) => bytes.readInt16LE(n * 2));

  // Give the frame to a waiting reader (post-wake recording) or queue it.
  const deliver = (frame) => {
    if (!_frameResolve) {
      _frameQueue.push(frame);
      return;
    }
    const waiter = _frameResolve;
    _frameResolve = null;
    waiter(frame);
  };

  _micInstance = createMic(captureOptions);
  const audioStream = _micInstance.getAudioStream();

  // Bytes received so far that do not yet add up to a complete frame.
  let leftover = Buffer.alloc(0);

  audioStream.on('data', (chunk) => {
    leftover = Buffer.concat([leftover, chunk]);

    while (leftover.length >= bytesPerFrame) {
      const head = leftover.subarray(0, bytesPerFrame);
      leftover = leftover.subarray(bytesPerFrame);
      deliver(decodeFrame(head));
    }
  });

  audioStream.on('error', (err) => {
    console.error(`[wake-word] Microphone error: ${err.message}`);
  });

  _micInstance.start();
  console.log('[wake-word] Microphone started');
}

/**
 * Obtain the next microphone frame (1280 Int16 samples).
 *
 * A frame already waiting in the queue is returned immediately. Otherwise the
 * returned promise stays pending until the microphone handler delivers the
 * next frame, or until detection is stopped, in which case it resolves with
 * `null`.
 *
 * Only one pending waiter is tracked: calling this again while a previous
 * call is still pending replaces the earlier resolver.
 *
 * Used by voice-activation.mjs for post-wake recording.
 * @returns {Promise<Int16Array>}
 */
export async function readFrame() {
  const nothingQueued = _frameQueue.length === 0;

  if (nothingQueued) {
    return new Promise((settle) => {
      _frameResolve = settle;
    });
  }

  return _frameQueue.shift();
}

// ── Detection ────────────────────────────────────────────────

/**
 * Start wake word detection with continuous microphone monitoring.
 *
 * If detection is already running it is stopped first. The ONNX pipeline,
 * ring buffer and microphone are then set up in that order; should any of
 * those steps fail, everything acquired so far is released via
 * `stopDetection()` and the error is rethrown.
 *
 * @param {object} opts Configuration options.
 * @param {Array<string>} [opts.wakeWordModels] Wake word model names (e.g. ['hey_jarvis_v0.1']).
 * @param {number} [opts.deviceIndex=-1] Audio input device index (-1 for default).
 * @param {number} [opts.ringBufferSeconds=15] Seconds of pre-wake audio to buffer.
 * @param {number} [opts.detectionThreshold] Score threshold for detection. When
 *   omitted, the threshold assigned while the models were loaded is kept
 *   (`voice.detectionThreshold` from the config, falling back to 0.5).
 * @param {function} opts.onWakeWord Callback: ({keywordIndex, keyword, score, timestamp, preWakeAudio}).
 * @param {function} [opts.onError] Error callback.
 * @returns {Promise<void>} Resolves once the microphone is running and the
 *   frame loop has been launched (the loop itself keeps running in the background).
 * @throws {Error} If no wake word model or no `onWakeWord` callback is given,
 *   or if pipeline/microphone initialisation fails.
 */
export async function startDetection(opts) {
  if (_detecting) stopDetection();

  const {
    wakeWordModels,
    deviceIndex = -1,
    ringBufferSeconds = 15,
    detectionThreshold,
    onWakeWord,
    onError
  } = opts;

  if (!wakeWordModels || wakeWordModels.length === 0) throw new Error('At least one wake word model is required');
  if (!onWakeWord) throw new Error('onWakeWord callback is required');

  const thresholdProvided = detectionThreshold !== undefined && detectionThreshold !== null;

  try {
    // Initialize the ONNX pipeline
    await initOnnxPipeline(getModelsDir(), wakeWordModels);

    // Override thresholds only if the caller actually provided one, so the
    // configured per-model threshold is not clobbered by a hard-coded default.
    if (thresholdProvided) {
      for (const classifier of _classifierSessions) {
        classifier.threshold = detectionThreshold;
      }
    }

    // Initialize ring buffer
    initRingBuffer(ringBufferSeconds);

    // Start microphone
    await startMicrophone(deviceIndex);
  } catch (err) {
    // Release sessions / microphone acquired before the failure.
    stopDetection();
    throw err;
  }

  _detecting = true;
  _stopRequested = false;
  _frameQueue = [];
  _frameResolve = null;
  _stats = { detections: 0, framesProcessed: 0, startedAt: Date.now() };

  let effectiveThreshold = 0.5;
  if (thresholdProvided) {
    effectiveThreshold = detectionThreshold;
  } else if (_classifierSessions.length > 0) {
    effectiveThreshold = _classifierSessions[0].threshold;
  }
  console.log(`[wake-word] Detection started (models: ${wakeWordModels.join(', ')}, threshold: ${effectiveThreshold})`);

  // Frame processing loop: deliberately not awaited (it runs until detection
  // stops), but a rejection must never go unhandled.
  processFrames(onWakeWord, onError).catch((err) => {
    console.error(`[wake-word] Frame loop terminated: ${err instanceof Error ? err.message : String(err)}`);
    if (onError) {
      try { onError(err); } catch { /* ignore callback errors */ }
    }
  });
}

/**
 * Frame-processing loop: runs until detection is stopped.
 *
 * Each iteration reads one frame, records it in the ring buffer, runs it
 * through the ONNX pipeline and, for the first classifier reporting a
 * detection, invokes `onWakeWord` with the last 10 seconds of buffered audio.
 *
 * Errors raised while processing a frame are logged, reported through
 * `onError`, and followed by a 100 ms pause before the loop continues.
 * Failures of the `onWakeWord` callback itself — thrown synchronously or
 * delivered as a rejected promise from an async callback — are logged and
 * never interrupt the loop.
 *
 * @param {function} onWakeWord Called with {keywordIndex, keyword, score, timestamp, preWakeAudio}.
 * @param {function} [onError] Called with the error of a failed iteration.
 * @returns {Promise<void>} Resolves when the loop exits.
 */
async function processFrames(onWakeWord, onError) {
  // Thrown values are not guaranteed to be Error instances.
  const describe = (err) => (err instanceof Error ? err.message : String(err));
  const reportCallbackFailure = (err) => {
    console.error(`[wake-word] Callback error: ${describe(err)}`);
  };

  while (_detecting && !_stopRequested) {
    try {
      const frame = await readFrame();
      // A null frame is the shutdown signal.
      if (!frame || _stopRequested) break;

      _stats.framesProcessed++;

      // Write to ring buffer for pre-wake context
      writeToRingBuffer(frame);

      // Process frame through ONNX pipeline
      const results = await processOnnxFrame(frame);

      for (let i = 0; i < results.length; i++) {
        if (results[i].detected) {
          _stats.detections++;
          const preWakeAudio = extractRingBuffer(10); // 10 seconds of context

          try {
            const outcome = onWakeWord({
              keywordIndex: i,
              keyword: results[i].name,
              score: results[i].score,
              timestamp: Date.now(),
              preWakeAudio
            });
            // The callback is not awaited (detection keeps running), but an
            // async callback's rejection must not become an unhandled rejection.
            Promise.resolve(outcome).catch(reportCallbackFailure);
          } catch (err) {
            reportCallbackFailure(err);
          }
          break;  // Only fire once per frame (first detection wins)
        }
      }
    } catch (err) {
      if (_stopRequested) break;
      console.error(`[wake-word] Frame processing error: ${describe(err)}`);
      if (onError) {
        try { onError(err); } catch { /* ignore callback errors */ }
      }
      // Brief pause before retrying
      await new Promise(r => setTimeout(r, 100));
    }

    // Yield to event loop to keep CPU usage low
    await new Promise(r => setImmediate(r));
  }
}

/**
 * Stop wake word detection and release all resources.
 *
 * Signals the frame loop to exit, stops the microphone, releases every ONNX
 * session, clears all buffers and resolves a pending `readFrame()` waiter with
 * `null`. Safe to call when detection is not running.
 *
 * Cleanup is best-effort: failures while stopping the microphone or releasing
 * a session are logged and never thrown. Session release is asynchronous, so
 * its rejection is handled explicitly rather than relying on try/catch.
 */
export function stopDetection() {
  _stopRequested = true;
  _detecting = false;

  const logSessionFailure = (err) => {
    console.error(`[wake-word] Session cleanup error: ${err instanceof Error ? err.message : String(err)}`);
  };
  // release() may throw synchronously or return a promise that rejects later;
  // cover both so neither escapes as an exception or unhandled rejection.
  const releaseSession = (session) => {
    try {
      Promise.resolve(session.release()).catch(logSessionFailure);
    } catch (err) {
      logSessionFailure(err);
    }
  };

  // Stop microphone
  if (_micInstance) {
    try {
      _micInstance.stop();
    } catch (err) {
      console.error(`[wake-word] Mic cleanup error: ${err.message}`);
    }
    _micInstance = null;
  }

  // Dispose ONNX sessions
  if (_melSession) {
    releaseSession(_melSession);
    _melSession = null;
  }
  if (_embeddingSession) {
    releaseSession(_embeddingSession);
    _embeddingSession = null;
  }
  for (const c of _classifierSessions) {
    releaseSession(c.session);
  }
  _classifierSessions = [];

  // Clear buffers (size is reset together with the buffer it describes)
  _ringBuffer = null;
  _ringPos = 0;
  _ringSize = 0;
  _melBuffer = null;
  _melFrameCount = 0;
  _embeddingBuffer = null;
  _embeddingWritePos = 0;
  _embeddingCount = 0;
  _rawAccumulator = [];
  _frameQueue = [];

  // Wake a pending reader with null so it can observe the shutdown.
  if (_frameResolve) {
    _frameResolve(null);
    _frameResolve = null;
  }

  console.log('[wake-word] Detection stopped');
}

/**
 * Report the current lifecycle state.
 *
 * The flag is set once `startDetection()` has finished initialising and is
 * cleared by `stopDetection()`.
 * @returns {boolean} Whether detection is currently active.
 */
export function isDetecting() {
  return _detecting;
}

/**
 * Snapshot the detection counters.
 *
 * A shallow copy is returned, so mutating the result does not affect the
 * counters kept by this module.
 * @returns {{ detections: number, framesProcessed: number, startedAt: number|null }}
 */
export function getDetectionStats() {
  const snapshot = Object.assign({}, _stats);
  return snapshot;
}

/**
 * Enumerate audio devices by shelling out to a platform tool.
 *
 * The `mic` package offers no device enumeration, so this parses the output
 * of `system_profiler SPAudioDataType` on macOS (section headings that end
 * with a colon, minus the known structural headings) and `arecord -l` on
 * Linux (lines beginning with "card "). Other platforms, and any failure to
 * run or parse the command, yield an empty list.
 * @returns {Promise<string[]>}
 */
export async function listAudioDevices() {
  try {
    const { execSync } = await import('child_process');
    const capture = (command) => execSync(command, { encoding: 'utf-8' });

    switch (process.platform) {
      case 'darwin':
        // macOS: device names appear as "<name>:" headings in the report.
        return capture('system_profiler SPAudioDataType 2>/dev/null')
          .split('\n')
          .map((line) => line.trim())
          .filter((heading) => heading.endsWith(':')
            && !heading.startsWith('Audio:')
            && !heading.startsWith('Devices:')
            && !heading.includes('Properties:'))
          .map((heading) => heading.replace(/:$/, ''));

      case 'linux':
        // Linux: one "card N: ..." line per capture device.
        return capture('arecord -l 2>/dev/null')
          .split('\n')
          .filter((line) => line.startsWith('card '))
          .map((line) => line.trim());

      default:
        return [];
    }
  } catch {
    return [];
  }
}

/**
 * Get the microphone instance for direct access (used by voice-activation
 * to continue recording after wake word detection).
 *
 * The instance is created when the microphone is started and cleared again by
 * `stopDetection()`, so `null` means no microphone is currently held.
 * @returns {object|null} The active mic instance, or null.
 */
export function getRecorder() {
  return _micInstance;
}

/**
 * Get the frame length (samples per frame).
 * openWakeWord uses 1280 samples (80ms at 16kHz).
 *
 * This is the size of every Int16Array frame the microphone handler produces
 * and that `readFrame()` resolves with.
 * @returns {number}
 */
export function getFrameLength() {
  return FRAME_SIZE;
}
