import { readFile } from 'fs/promises';
import { extname, basename } from 'path';
import { fileURLToPath } from 'url';
import { createRequire } from 'module';
import { fetchUrl, htmlToMarkdown, extractMetadata, stripHtml } from './fetch-utils.mjs';
import { addEntity, addRelation, getEntityByName, getNextIngestion, setIngestionProcessing, completeIngestion, failIngestion } from './db.mjs';
import { askLLM, pickModel } from './llm.mjs';
import { parseLLMJson } from './json-utils.mjs';
import { getConfig } from './config.mjs';

// CommonJS-style `require` bound to this module's URL. getSupportedTypes() uses
// its `resolve()` to probe synchronously for optional packages.
const _require = createRequire(import.meta.url);

/**
 * Shape of a plausible file extension: a dot followed by a short run of
 * alphanumeric characters (".md", ".tar" -> ".gz", ".docx", ...).
 */
const FILE_EXTENSION_PATTERN = /^\.[A-Za-z0-9]{1,10}$/;

/**
 * Heuristically decide whether a trimmed source string is a file path.
 *
 * `path.extname` alone is not a reliable signal: it returns "." for any text
 * that ends in a period and ". Second sentence" for prose containing one, so
 * the extension must also look like a real extension. Multi-line input is
 * never treated as a path.
 *
 * @param {string} candidate - Trimmed source string
 * @returns {boolean} True when the string should be read from disk
 */
function looksLikeFilePath(candidate) {
  if (/[\r\n]/.test(candidate)) return false;
  if (candidate.startsWith('/') || candidate.startsWith('./') || candidate.startsWith('../')) {
    return true;
  }
  return FILE_EXTENSION_PATTERN.test(extname(candidate));
}

/**
 * Auto-detect source type and ingest accordingly.
 *
 * Detection order:
 *   1. `http://` / `https://` prefix  -> ingestUrl
 *   2. looks like a file path          -> ingestFile
 *   3. anything else                   -> ingestText
 *
 * @param {string} source - URL, file path, or raw text
 * @param {object} opts - Options: workspaceId, extractEntities (default true)
 * @returns {Promise<{markdown: string, entities: Array, relations: Array, sourceType: string, metadata?: object}>}
 * @throws {Error} If `source` is empty or not a string
 */
export async function ingest(source, opts = {}) {
  if (!source || typeof source !== 'string') {
    throw new Error('Source must be a non-empty string');
  }

  const trimmed = source.trim();

  // URL detection
  if (/^https?:\/\//i.test(trimmed)) {
    const result = await ingestUrl(trimmed, opts);
    return { ...result, sourceType: 'url' };
  }

  // File path detection: plausible extension, or starts with /, ./ or ../
  if (looksLikeFilePath(trimmed)) {
    const result = await ingestFile(trimmed, opts);
    return { ...result, sourceType: 'file' };
  }

  // Default: treat as raw text
  const result = await ingestText(trimmed, opts);
  return { ...result, sourceType: 'text' };
}

/** Extensions whose parsers need the raw bytes rather than UTF-8 decoded text. */
const BINARY_EXTENSIONS = new Set(['.pdf', '.docx']);

/**
 * Ingest a file by reading it and converting it to markdown based on extension.
 *
 * - `.md` / `.txt` / unknown extensions: content is used verbatim
 * - `.html` / `.htm`: converted via htmlToMarkdown, page metadata merged in
 * - `.json`: pretty-printed inside a json code fence (raw content if invalid)
 * - `.csv`: rendered as a markdown table (first row is the header)
 * - source files (`.js`, `.mjs`, `.ts`, `.tsx`, `.jsx`, `.py`): wrapped in a code fence
 * - `.pdf` / `.docx`: parsed with the optional `pdf-parse` / `mammoth` packages
 *
 * @param {string} filePath - Path to the file
 * @param {object} opts - Options: workspaceId, extractEntities
 * @returns {Promise<{markdown: string, entities: Array, relations: Array, metadata: object}>}
 * @throws {Error} If the file cannot be read, or PDF/DOCX parsing fails
 */
export async function ingestFile(filePath, opts = {}) {
  const ext = extname(filePath).toLowerCase();
  let content;

  try {
    // Binary formats must stay a Buffer: decoding them as UTF-8 corrupts the bytes.
    content = await readFile(filePath, BINARY_EXTENSIONS.has(ext) ? undefined : 'utf-8');
  } catch (err) {
    throw new Error(`Cannot read file ${filePath}: ${err.message}`, { cause: err });
  }

  const fence = '```';
  let markdown = '';
  let metadata = { filename: basename(filePath), extension: ext };

  switch (ext) {
    case '.html':
    case '.htm':
      markdown = htmlToMarkdown(content);
      metadata = { ...metadata, ...extractMetadata(content) };
      break;

    case '.json': {
      let body = content;
      try {
        body = JSON.stringify(JSON.parse(content), null, 2);
      } catch {
        // Invalid JSON: keep the raw content so nothing is lost.
      }
      markdown = `${fence}json\n${body}\n${fence}`;
      break;
    }

    case '.csv':
      markdown = csvToMarkdownTable(content);
      break;

    case '.js':
    case '.mjs':
    case '.ts':
    case '.tsx':
    case '.jsx':
      markdown = `${fence}${ext.slice(1)}\n${content}\n${fence}`;
      break;

    case '.py':
      markdown = `${fence}python\n${content}\n${fence}`;
      break;

    case '.pdf': {
      try {
        const pdfParse = (await import('pdf-parse')).default;
        const data = await pdfParse(content);
        markdown = data.text || '';
        metadata.pages = data.numpages;
      } catch (err) {
        throw new Error(`PDF parsing failed (install pdf-parse): ${err.message}`, { cause: err });
      }
      break;
    }

    case '.docx': {
      try {
        // mammoth is a CommonJS package; depending on interop its API is on
        // the namespace itself or on `.default`.
        const mammothModule = await import('mammoth');
        const mammoth = mammothModule.default ?? mammothModule;
        const result = await mammoth.convertToHtml({ buffer: content });
        markdown = htmlToMarkdown(result.value);
      } catch (err) {
        throw new Error(`DOCX parsing failed (install mammoth): ${err.message}`, { cause: err });
      }
      break;
    }

    case '.md':
    case '.txt':
    default:
      // Plain text (or unknown extension): content was read as UTF-8 text.
      markdown = content;
  }

  // Extract entities via LLM if requested
  let entities = [];
  let relations = [];
  if (opts.extractEntities !== false) {
    try {
      const extracted = await extractEntitiesViaLLM(markdown, opts);
      entities = extracted.entities;
      relations = extracted.relations;
    } catch {
      // LLM extraction is best-effort
    }
  }

  return { markdown, entities, relations, metadata };
}

/**
 * Split one CSV line into trimmed cells, honouring double-quoted fields
 * (embedded commas and `""` escapes). Quoted fields spanning several lines
 * are not supported.
 *
 * @param {string} line - A single CSV record
 * @returns {string[]} Cell values
 */
function splitCsvLine(line) {
  const cells = [];
  let cell = '';
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (inQuotes) {
      if (ch !== '"') {
        cell += ch;
      } else if (line[i + 1] === '"') {
        cell += '"'; // escaped quote inside a quoted field
        i++;
      } else {
        inQuotes = false;
      }
    } else if (ch === '"' && cell.trim() === '') {
      // A quote only opens a quoted field at the start of a cell.
      inQuotes = true;
      cell = '';
    } else if (ch === ',') {
      cells.push(cell.trim());
      cell = '';
    } else {
      cell += ch;
    }
  }

  cells.push(cell.trim());
  return cells;
}

/**
 * Render CSV text as a markdown table; the first non-blank row is the header.
 * Short rows are padded with empty cells and `|` is escaped so cell content
 * cannot break the table layout.
 *
 * @param {string} csv - CSV file content
 * @returns {string} Markdown table, or '' when there are no non-blank rows
 */
function csvToMarkdownTable(csv) {
  const rows = csv
    .split(/\r?\n/)
    .filter((line) => line.trim())
    .map(splitCsvLine);
  if (rows.length === 0) return '';

  const colCount = rows.reduce((max, row) => Math.max(max, row.length), 0);
  const toTableRow = (cells) => `| ${cells.join(' | ')} |`;

  const mdLines = [];
  rows.forEach((row, idx) => {
    const padded = Array.from({ length: colCount }, (_, i) => (row[i] ?? '').replaceAll('|', '\\|'));
    mdLines.push(toTableRow(padded));
    if (idx === 0) {
      mdLines.push(toTableRow(padded.map(() => '---')));
    }
  });
  return mdLines.join('\n');
}

/**
 * Fetch a URL and turn the response into markdown plus page metadata.
 *
 * The request timeout comes from `opts.timeout` (15000 when unset/falsy).
 * Unless `opts.extractEntities` is exactly `false`, the first 10000
 * characters of the markdown are handed to the LLM extractor; extraction
 * failures are ignored and leave the entity/relation lists empty.
 *
 * @param {string} url - URL to fetch
 * @param {object} opts - Options: workspaceId, extractEntities, timeout
 * @returns {Promise<{markdown: string, entities: Array, relations: Array, metadata: object}>}
 */
export async function ingestUrl(url, opts = {}) {
  const timeout = opts.timeout || 15000;
  const { html, statusCode } = await fetchUrl(url, { timeout });

  const markdown = htmlToMarkdown(html);
  const metadata = Object.assign({}, extractMetadata(html), { url, statusCode });

  const output = { markdown, entities: [], relations: [], metadata };
  if (opts.extractEntities === false) {
    return output;
  }

  try {
    const excerpt = markdown.slice(0, 10000);
    const { entities, relations } = await extractEntitiesViaLLM(excerpt, opts);
    output.entities = entities;
    output.relations = relations;
  } catch {
    // LLM extraction is best-effort
  }

  return output;
}

/**
 * Ingest raw text as-is: the text itself becomes the markdown.
 *
 * Entity extraction runs unless `opts.extractEntities` is exactly `false`;
 * any extraction failure is ignored and yields empty lists.
 *
 * @param {string} text - Raw text content
 * @param {object} opts - Options: workspaceId, extractEntities
 * @returns {Promise<{markdown: string, entities: Array, relations: Array}>}
 */
export async function ingestText(text, opts = {}) {
  const output = { markdown: text, entities: [], relations: [] };
  if (opts.extractEntities === false) {
    return output;
  }

  try {
    const { entities, relations } = await extractEntitiesViaLLM(text, opts);
    output.entities = entities;
    output.relations = relations;
  } catch {
    // LLM extraction is best-effort
  }

  return output;
}

/**
 * Process the next batch of items from the ingestion queue.
 *
 * Does nothing unless `config.ingestion.enabled` is truthy. Up to
 * `config.ingestion.maxConcurrent` (default 2) items are requested from the
 * queue and handled one after another. Each item is marked as processing,
 * ingested according to its declared `source_type`, its extracted entities
 * are stored, and it is finally marked completed (or failed with the error
 * message when ingestion throws).
 *
 * The declared `source_type` is trusted: `url` and `file` items go straight
 * to ingestUrl / ingestFile rather than through auto-detection, so that e.g.
 * a queued file named `README` is read from disk instead of being ingested
 * as the literal text "README". Unknown types fall back to auto-detection.
 *
 * @returns {Promise<number>} Number of items processed successfully
 */
export async function processIngestionQueue() {
  const config = getConfig();
  if (!config.ingestion?.enabled) return 0;

  const maxConcurrent = config.ingestion?.maxConcurrent || 2;
  const items = getNextIngestion(maxConcurrent);
  if (!items?.length) return 0;

  let processed = 0;

  for (const item of items) {
    try {
      setIngestionProcessing(item.id);

      const workspaceId = item.workspace_id || 'default';
      const ingestOpts = { workspaceId, extractEntities: true };
      let result;

      switch (item.source_type) {
        case 'url':
          result = await ingestUrl(item.source_path, ingestOpts);
          break;
        case 'file':
          result = await ingestFile(item.source_path, ingestOpts);
          break;
        case 'text':
          result = await ingestText(item.source_path, ingestOpts);
          break;
        default:
          // Unknown or missing type: let ingest() work out what the source is.
          result = await ingest(item.source_path, ingestOpts);
      }

      // Store extracted entities in the knowledge graph
      let primaryEntityId = null;
      if (result.entities.length > 0 || result.relations.length > 0) {
        try {
          const stored = storeExtractedEntities(result.entities, result.relations, workspaceId);
          primaryEntityId = stored.entityIds.length > 0 ? stored.entityIds[0] : null;
        } catch (err) {
          // Storage is best-effort: the ingestion itself still succeeded.
          console.warn(`Ingestion ${item.id}: failed to store extracted entities: ${err?.message ?? err}`);
        }
      }

      completeIngestion(item.id, primaryEntityId);
      processed++;
    } catch (err) {
      failIngestion(item.id, err?.message ?? String(err));
    }
  }

  return processed;
}

/**
 * List the file extensions that ingestion can currently handle.
 *
 * The built-in text/code formats are always present. `.pdf` and `.docx` are
 * appended (in that order) only when their optional parser package can be
 * resolved synchronously through the module-level `_require`.
 *
 * @returns {string[]} Array of supported extensions
 */
export function getSupportedTypes() {
  const supported = [
    '.md', '.txt', '.html', '.htm', '.json', '.csv',
    '.js', '.mjs', '.ts', '.tsx', '.jsx', '.py',
  ];

  // [package that must be installed, extension it unlocks]
  const optionalParsers = [
    ['pdf-parse', '.pdf'],
    ['mammoth', '.docx'],
  ];

  for (const [packageName, extension] of optionalParsers) {
    try {
      _require.resolve(packageName);
      supported.push(extension);
    } catch {
      // Package not installed: extension stays unsupported.
    }
  }

  return supported;
}

// ── Internal helpers ──────────────────────────────────────────

/**
 * Ask the LLM for entities and relations mentioned in a piece of text.
 *
 * Text that is empty or shorter than 20 characters is not sent at all. Only
 * the first 5000 characters are included in the prompt. At most 20 entities
 * are returned; relations are passed through as given. Any failure (LLM
 * call, unparseable reply) results in empty lists rather than an error.
 *
 * @param {string} text - Text to extract from
 * @param {object} opts - Options with workspaceId
 * @returns {Promise<{entities: Array, relations: Array}>}
 */
async function extractEntitiesViaLLM(text, opts = {}) {
  const nothing = () => ({ entities: [], relations: [] });

  if (!text || text.length < 20) {
    return nothing();
  }

  const promptLines = [
    'Extract key entities and their relationships from this text. Return JSON only:',
    '{',
    '  "entities": [{"name": "...", "type": "person|technology|concept|organization|place|event|document|other", "description": "..."}],',
    '  "relations": [{"source": "entity name", "target": "entity name", "relation": "uses|created_by|related_to|part_of|located_in|works_at|authored|depends_on"}]',
    '}',
    '',
    'Rules:',
    '- Maximum 20 entities',
    '- Entity names should be canonical (e.g., "JavaScript" not "JS")',
    '- Only include clearly identified entities',
    '- Relations must reference entities by exact name',
    '',
    'Text:',
    text.slice(0, 5000),
  ];
  const prompt = promptLines.join('\n');

  try {
    const reply = await askLLM(prompt, { model: pickModel('ingest') });
    const { ok, data } = parseLLMJson(reply);
    if (!ok || !data) {
      return nothing();
    }

    const asList = (value) => (Array.isArray(value) ? value : []);
    return {
      entities: asList(data.entities).slice(0, 20),
      relations: asList(data.relations),
    };
  } catch {
    // LLM not available or parse failure
    return nothing();
  }
}

/**
 * Entity types tried, in order, when a relation refers to an entity that was
 * not part of the current batch. Mirrors the types offered to the LLM in the
 * extraction prompt.
 */
const ENTITY_LOOKUP_TYPES = [
  'concept',
  'technology',
  'person',
  'organization',
  'place',
  'event',
  'document',
  'other',
];

/**
 * Find the ID of an already-stored entity by name, trying each known type.
 *
 * @param {string} name - Entity name as referenced by a relation
 * @param {string} workspaceId - Workspace to search in
 * @returns {number|undefined} The entity ID, or undefined when not found
 */
function findExistingEntityId(name, workspaceId) {
  for (const type of ENTITY_LOOKUP_TYPES) {
    const found = getEntityByName(name, type, workspaceId);
    if (found) return found.id;
  }
  return undefined;
}

/**
 * Store extracted entities and relations in the knowledge graph DB.
 *
 * Malformed entries (null, non-objects, missing required fields) are skipped,
 * as are individual inserts that throw. A relation is stored only when both
 * endpoints resolve to an entity ID, either from this batch or from the DB.
 *
 * @param {Array} entities - Entity objects with name, type, description
 * @param {Array} relations - Relation objects with source, target, relation
 * @param {string} workspaceId - Workspace to store in
 * @returns {{entityIds: number[], relationIds: number[]}}
 */
function storeExtractedEntities(entities, relations, workspaceId) {
  const entityIds = [];
  // A Map, not a plain object: names come from LLM output, and names such as
  // "constructor" or "toString" would otherwise resolve to Object.prototype
  // members and be mistaken for entity IDs.
  const idsByName = new Map();

  for (const entity of entities) {
    if (!entity?.name || !entity.type) continue;
    try {
      const metadata = entity.description ? JSON.stringify({ description: entity.description }) : null;
      const id = addEntity(entity.name, entity.type, workspaceId, metadata);
      entityIds.push(id);
      idsByName.set(entity.name, id);
    } catch {
      // Skip entities that fail to insert
    }
  }

  const relationIds = [];
  for (const rel of relations) {
    if (!rel?.source || !rel.target || !rel.relation) continue;
    try {
      // Prefer IDs from the current batch, then fall back to a DB lookup.
      const sourceId = idsByName.get(rel.source) || findExistingEntityId(rel.source, workspaceId);
      const targetId = idsByName.get(rel.target) || findExistingEntityId(rel.target, workspaceId);

      if (sourceId && targetId) {
        relationIds.push(addRelation(sourceId, targetId, rel.relation));
      }
    } catch {
      // Skip relations that fail
    }
  }

  return { entityIds, relationIds };
}
