import { readFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { createRequire } from 'module';

// ES modules expose neither `require` nor `__dirname`; both are rebuilt here
// from this module's own URL so CommonJS-style lookups keep working.
const require = createRequire(import.meta.url);
const __dirname = dirname(fileURLToPath(import.meta.url));

/**
 * Decode map for the named HTML entities this module understands.
 * `&#39;` is kept for backward compatibility even though numeric references
 * are now decoded generically. `&nbsp;` intentionally maps to a plain space.
 */
const ENTITIES = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&apos;': "'",
  '&#39;': "'",
  '&nbsp;': ' ',
};

// Matches the supported named entities plus decimal (&#8217;) and
// hexadecimal (&#x2019; / &#X2019;) numeric character references.
const ENTITY_RE = /&(?:amp|lt|gt|quot|apos|nbsp|#\d+|#[xX][0-9a-fA-F]+);/g;

/**
 * Decode the supported named entities and all numeric character references.
 *
 * Decoding happens in a single pass, so text that was escaped twice
 * (e.g. `&amp;lt;`) is unescaped exactly once (`&lt;`). Unknown named
 * entities are left untouched. Numeric references that do not denote a valid
 * Unicode scalar value (zero, surrogates, above U+10FFFF) become U+FFFD.
 *
 * @param {string} str - Text possibly containing HTML entities
 * @returns {string} Text with entities decoded
 */
function decodeEntities(str) {
  return str.replace(ENTITY_RE, (entity) => {
    const named = ENTITIES[entity];
    if (named !== undefined) return named;

    // Everything else the pattern admits is numeric: "&#" digits ";" or "&#x" hex ";".
    const isHex = entity[2] === 'x' || entity[2] === 'X';
    const digits = entity.slice(isHex ? 3 : 2, -1);
    const codePoint = Number.parseInt(digits, isHex ? 16 : 10);

    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) return '\uFFFD';

    // fromCodePoint (unlike fromCharCode) handles characters beyond U+FFFF.
    return String.fromCodePoint(codePoint);
  });
}

/**
 * Fetch a URL with a timeout, returning HTML, plain text, content type, title, and status code.
 *
 * The timeout covers both the request and reading the response body. The
 * response body is always read as text, whatever the content type or status.
 *
 * @param {string} url - URL to fetch
 * @param {object} opts - Options: timeout (ms, default 15000)
 * @returns {Promise<{html: string, text: string, contentType: string, title: string|null, statusCode: number}>}
 * @throws {Error} "Fetch timed out after …" when the timeout elapses, or
 *   "Fetch failed for …" for any other failure. The original error is
 *   available as `error.cause` in both cases.
 */
export async function fetchUrl(url, opts = {}) {
  // `opts?.` also tolerates an explicit null, which the default parameter does not cover.
  const timeout = opts?.timeout ?? 15000;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Gergy/3.0',
      },
      redirect: 'follow',
    });

    const html = await response.text();
    const contentType = response.headers.get('content-type') || '';
    const statusCode = response.status;

    // Extract <title> from HTML
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const title = titleMatch ? decodeEntities(titleMatch[1].trim()) : null;

    // Strip HTML to get plain text
    const text = stripHtml(html);

    return { html, text, contentType, title, statusCode };
  } catch (err) {
    // Keep the underlying error as `cause` so its stack and type are not lost.
    if (err?.name === 'AbortError') {
      throw new Error(`Fetch timed out after ${timeout}ms: ${url}`, { cause: err });
    }
    throw new Error(`Fetch failed for ${url}: ${err?.message ?? err}`, { cause: err });
  } finally {
    // Always release the timer so it cannot keep the process alive or fire late.
    clearTimeout(timer);
  }
}

/**
 * Convert HTML to Markdown.
 *
 * The optional `turndown` package is preferred (ATX headings, fenced code
 * blocks). If loading or running it throws for any reason, the built-in
 * fallback converter is used instead.
 *
 * @param {string} html - HTML string to convert
 * @returns {string} Markdown string
 */
export function htmlToMarkdown(html) {
  let markdown;
  let usedTurndown = false;

  try {
    const Turndown = require('turndown');
    const service = new Turndown({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
    markdown = service.turndown(html);
    usedTurndown = true;
  } catch {
    // Best effort: turndown is missing or failed, so the fallback below takes over.
  }

  return usedTurndown ? markdown : htmlToMarkdownFallback(html);
}

/**
 * Fallback HTML to Markdown converter handling common elements.
 *
 * This is a regex-based best-effort conversion, not an HTML parser. Nested
 * lists/tables and unquoted attribute values are not specially handled.
 *
 * Pipeline:
 *  1. drop script/style/noscript,
 *  2. convert code and park it behind placeholders so later passes cannot
 *     alter it (tag stripping, entity decoding, whitespace collapsing),
 *  3. convert inline formatting before block elements, because the block
 *     conversions strip any tags left inside their content,
 *  4. convert block elements, strip leftovers, decode entities, tidy whitespace,
 *  5. restore the parked code.
 *
 * @param {string} html - HTML string to convert
 * @returns {string} Markdown string ('' for empty input)
 */
function htmlToMarkdownFallback(html) {
  if (!html) return '';

  const FENCE = '```';
  const parkedCode = [];
  // Placeholders are NUL-delimited: they contain no tags, entities or
  // whitespace, so every later pass leaves them untouched.
  const park = (markdown) => `\u0000CODE${parkedCode.push(markdown) - 1}\u0000`;
  // Single-line text for contexts where a newline would break the Markdown.
  const inlineText = (fragment) => stripTags(fragment).replace(/\s+/g, ' ').trim();

  // NUL is not valid in HTML; removing it means input can never forge a placeholder.
  let md = html.replaceAll('\u0000', '');

  // Remove <script>, <style>, <noscript> blocks entirely
  md = md.replace(/<script[\s\S]*?<\/script>/gi, '');
  md = md.replace(/<style[\s\S]*?<\/style>/gi, '');
  md = md.replace(/<noscript[\s\S]*?<\/noscript>/gi, '');

  // Pre/code blocks (parked). The language is read from the <code> class.
  md = md.replace(/<pre(?:\s[^>]*)?>([\s\S]*?)<\/pre>/gi, (_, inner) => {
    const wrapped = inner.match(/^\s*<code((?:\s[^>]*)?)>([\s\S]*?)<\/code>\s*$/i);
    const lang = wrapped ? fenceLanguage(wrapped[1]) : '';
    const source = decodeEntities(stripTags(wrapped ? wrapped[2] : inner)).trim();
    return `\n\n${park(`${FENCE}${lang}\n${source}\n${FENCE}`)}\n\n`;
  });
  md = md.replace(/<code(?:\s[^>]*)?>([\s\S]*?)<\/code>/gi,
    (_, code) => park('`' + decodeEntities(stripTags(code)) + '`'));

  // Inline formatting and line breaks. `(?:\s[^>]*)?>` requires the tag name
  // to end here, so <b> no longer matches <body>, <i> no longer matches <iframe>, etc.
  md = md.replace(/<(?:strong|b)(?:\s[^>]*)?>([\s\S]*?)<\/(?:strong|b)>/gi, '**$1**');
  md = md.replace(/<(?:em|i)(?:\s[^>]*)?>([\s\S]*?)<\/(?:em|i)>/gi, '*$1*');
  md = md.replace(/<br(?:\s[^>]*)?\/?>/gi, '\n');

  // Headings h1-h6 (kept on one line; a newline would end the heading)
  for (let level = 1; level <= 6; level++) {
    const hashes = '#'.repeat(level);
    const re = new RegExp(`<h${level}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/h${level}>`, 'gi');
    md = md.replace(re, (_, content) => `\n\n${hashes} ${inlineText(content)}\n\n`);
  }

  // Images: <img> -> ![alt](src), attributes in any order
  md = md.replace(/<img\s[^>]*>/gi, (tag) => {
    const src = readQuotedAttribute(tag, 'src');
    // Without a quoted src the tag is left for the final tag strip.
    if (src === null) return tag;
    return `![${readQuotedAttribute(tag, 'alt') ?? ''}](${src})`;
  });

  // Links: <a href="...">text</a> -> [text](href)
  md = md.replace(
    /<a(?:\s[^>]*?)?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi,
    (_, doubleQuoted, singleQuoted, text) =>
      `[${stripTags(text).trim()}](${doubleQuoted ?? singleQuoted})`,
  );

  // Tables: the first row is treated as the header row
  md = md.replace(/<table(?:\s[^>]*)?>([\s\S]*?)<\/table>/gi, (_, tableContent) => {
    const rows = [...tableContent.matchAll(/<tr(?:\s[^>]*)?>([\s\S]*?)<\/tr>/gi)].map(
      ([, rowContent]) =>
        [...rowContent.matchAll(/<(?:td|th)(?:\s[^>]*)?>([\s\S]*?)<\/(?:td|th)>/gi)].map(
          ([, cell]) => inlineText(cell),
        ),
    );
    if (rows.length === 0) return '';

    const colCount = Math.max(...rows.map((row) => row.length));
    const lines = [];
    rows.forEach((row, idx) => {
      // Pad short rows so every line has the same number of columns.
      const padded = Array.from({ length: colCount }, (_unused, i) => row[i] || '');
      lines.push(`| ${padded.join(' | ')} |`);
      if (idx === 0) {
        lines.push(`| ${padded.map(() => '---').join(' | ')} |`);
      }
    });
    return `\n\n${lines.join('\n')}\n\n`;
  });

  // Unordered lists
  md = md.replace(/<ul(?:\s[^>]*)?>([\s\S]*?)<\/ul>/gi, (_, content) => {
    const items = [...content.matchAll(/<li(?:\s[^>]*)?>([\s\S]*?)<\/li>/gi)].map(
      ([, item]) => `- ${stripTags(item).trim()}`,
    );
    return `\n${items.join('\n')}\n`;
  });

  // Ordered lists
  md = md.replace(/<ol(?:\s[^>]*)?>([\s\S]*?)<\/ol>/gi, (_, content) => {
    const items = [...content.matchAll(/<li(?:\s[^>]*)?>([\s\S]*?)<\/li>/gi)].map(
      ([, item], idx) => `${idx + 1}. ${stripTags(item).trim()}`,
    );
    return `\n${items.join('\n')}\n`;
  });

  // Paragraphs
  md = md.replace(/<p(?:\s[^>]*)?>([\s\S]*?)<\/p>/gi,
    (_, content) => `\n\n${stripTags(content).trim()}\n\n`);

  // Blockquotes
  md = md.replace(/<blockquote(?:\s[^>]*)?>([\s\S]*?)<\/blockquote>/gi,
    (_, content) => '\n\n' + stripTags(content).trim().split('\n').map((line) => `> ${line}`).join('\n') + '\n\n');

  // Horizontal rules
  md = md.replace(/<hr(?:\s[^>]*)?\/?>/gi, '\n\n---\n\n');

  // Strip remaining tags
  md = stripTags(md);

  // Decode entities
  md = decodeEntities(md);

  // Collapse excessive whitespace (but preserve double newlines for paragraphs)
  md = md.replace(/\n{3,}/g, '\n\n');
  md = md.replace(/[ \t]+/g, ' ');
  md = md.trim();

  // Restore parked code. A placeholder at the start of a quoted line carries
  // its "> " prefix onto every line of the restored snippet.
  return md.replace(/(^(?:> )+)?\u0000CODE(\d+)\u0000/gm, (_, quotePrefix, index) => {
    const snippet = parkedCode[Number(index)];
    if (!quotePrefix) return snippet;
    return snippet.split('\n').map((line) => `${quotePrefix}${line}`).join('\n');
  });
}

/**
 * Read a quoted attribute value from a tag (or attribute-list) string.
 * The name must be preceded by whitespace, so `src` does not match `data-src`.
 * The value may contain the other quote character, e.g. alt="It's".
 *
 * @param {string} tag - Tag or attribute text such as `<img src="a.png">`
 * @param {string} name - Attribute name (plain identifier, used in a RegExp)
 * @returns {string|null} The attribute value, or null when absent or unquoted
 */
function readQuotedAttribute(tag, name) {
  const found = tag.match(new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'i'));
  if (!found) return null;
  return found[1] ?? found[2];
}

/**
 * Derive the fence language from a <code> element's attribute text.
 * A `language-xxx` / `lang-xxx` class wins; otherwise a lone class name is
 * used as-is (class="js"). Multiple unprefixed classes yield no language.
 *
 * @param {string} attrs - Attribute text of the <code> tag (may be empty)
 * @returns {string} Language identifier, or '' when none can be determined
 */
function fenceLanguage(attrs) {
  const classValue = readQuotedAttribute(attrs, 'class');
  if (!classValue) return '';

  const classes = classValue.trim().split(/\s+/).filter(Boolean);
  const prefixed = classes.find((name) => /^lang(?:uage)?-./i.test(name));
  if (prefixed) return prefixed.replace(/^lang(?:uage)?-/i, '');
  return classes.length === 1 ? classes[0] : '';
}

/**
 * Remove everything that looks like a tag (`<...>`) from a string.
 *
 * @param {string} html - Markup to clean; any falsy value yields ''
 * @returns {string} The input with all `<...>` spans removed
 */
function stripTags(html) {
  const anyTag = /<[^>]*>/g;
  return html ? html.replace(anyTag, '') : '';
}

/**
 * Extract metadata from an HTML document: <title> plus selected <meta> tags.
 *
 * - description: <meta name="description">
 * - author:      <meta name="author">
 * - date:        first available of <meta name="date">,
 *                <meta property="article:published_time">, <meta name="pubdate">
 *
 * Attribute order inside a tag does not matter, attribute names and the
 * name/property values are matched case-insensitively, and values may be
 * double-quoted, single-quoted or unquoted. All returned values are trimmed
 * and entity-decoded. A tag without a `content` attribute is ignored.
 *
 * @param {string} html - Full HTML document
 * @returns {{title: string|null, description: string|null, author: string|null, date: string|null}}
 */
export function extractMetadata(html) {
  if (!html) return { title: null, description: null, author: null, date: null };

  // Title
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch ? decodeEntities(titleMatch[1].trim()) : null;

  const metaTags = parseMetaTags(html);

  // Content of the first <meta> whose `attr` equals `value`, or null.
  const contentOf = (attr, value) => {
    const tag = metaTags.find(
      (attrs) => attrs.get(attr)?.toLowerCase() === value && attrs.has('content'),
    );
    return tag ? decodeEntities(tag.get('content').trim()) : null;
  };

  const description = contentOf('name', 'description');
  const author = contentOf('name', 'author');

  // Date: sources in priority order; the first one present wins.
  const dateSources = [
    ['name', 'date'],
    ['property', 'article:published_time'],
    ['name', 'pubdate'],
  ];
  let date = null;
  for (const [attr, value] of dateSources) {
    date = contentOf(attr, value);
    if (date !== null) break;
  }

  return { title, description, author, date };
}

/**
 * Parse every <meta ...> tag into a Map of lower-cased attribute name -> raw value.
 * Quoted values may contain the other quote character (content="It's great").
 * When an attribute is repeated within a tag, the first occurrence is kept.
 *
 * @param {string} html - HTML to scan
 * @returns {Map<string, string>[]} One Map per <meta> tag, in document order
 */
function parseMetaTags(html) {
  const attributePattern = /([^\s"'=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
  const tags = [];

  for (const [, attrText] of html.matchAll(/<meta\s([^>]*)>/gi)) {
    const attrs = new Map();
    for (const [, name, doubleQuoted, singleQuoted, bare] of attrText.matchAll(attributePattern)) {
      const key = name.toLowerCase();
      if (!attrs.has(key)) attrs.set(key, doubleQuoted ?? singleQuoted ?? bare);
    }
    tags.push(attrs);
  }

  return tags;
}

/**
 * Strip all HTML tags, decode entities, and collapse whitespace.
 *
 * Script/style/noscript blocks are removed with their content. Closing tags
 * of common block-level elements and <br> become newlines. Named entities
 * and decimal/hexadecimal character references are decoded in a single pass,
 * so doubly-escaped text such as `&amp;#60;` is unescaped only once.
 * Numeric references that are not valid Unicode scalar values become U+FFFD.
 *
 * @param {string} html - HTML string
 * @returns {string} Plain text ('' for empty input)
 */
export function stripHtml(html) {
  if (!html) return '';

  let text = html;

  // Remove script/style/noscript blocks
  text = text.replace(/<script[\s\S]*?<\/script>/gi, '');
  text = text.replace(/<style[\s\S]*?<\/style>/gi, '');
  text = text.replace(/<noscript[\s\S]*?<\/noscript>/gi, '');

  // Replace block-level elements with newlines
  text = text.replace(/<\/(?:p|div|h[1-6]|li|tr|blockquote|section|article|header|footer|nav|main)>/gi, '\n');
  text = text.replace(/<br(?:\s[^>]*)?\/?>/gi, '\n');

  // Strip all tags
  text = text.replace(/<[^>]*>/g, '');

  // Decode named and numeric entities together. Two separate passes would
  // decode "&amp;#60;" twice and turn escaped text into "<".
  text = text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|[A-Za-z][A-Za-z0-9]*);/g,
    (entity, decimal, hex) => {
      if (decimal === undefined && hex === undefined) {
        // Named entity: unknown names are returned unchanged by decodeEntities.
        return decodeEntities(entity);
      }
      const codePoint = decimal !== undefined
        ? Number.parseInt(decimal, 10)
        : Number.parseInt(hex, 16);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) return '\uFFFD';
      // fromCodePoint handles characters beyond U+FFFF, which fromCharCode truncates.
      return String.fromCodePoint(codePoint);
    },
  );

  // Collapse whitespace
  text = text.replace(/[ \t]+/g, ' ');
  text = text.replace(/\n[ \t]+/g, '\n');
  text = text.replace(/[ \t]+\n/g, '\n');
  text = text.replace(/\n{3,}/g, '\n\n');
  text = text.trim();

  return text;
}
