<?php
require_once __DIR__ . '/../helpers/http.php';
require_once __DIR__ . '/../helpers/url.php';
require_once __DIR__ . '/../helpers/html.php';
require_once __DIR__ . '/../helpers/logger.php';
require_once __DIR__ . '/../models/UrlModel.php';
require_once __DIR__ . '/Robots.php';
require_once __DIR__ . '/Sitemap.php';

/**
 * Seeds the URL queue of an audit.
 *
 * Sources, in order: sitemaps announced by robots.txt (falling back to
 * /sitemap.xml), then the links found on the start page itself. Only URLs on
 * the start host (optionally its subdomains) are queued, up to `max_urls`.
 */
class Discover {
  /**
   * Discovers URLs for an audit and stores the resulting queue size on the audit row.
   *
   * Settings keys read here: `allow_query_params`, `include_subdomains`,
   * `timeout_sec`, `user_agent`, `max_redirects`, `max_urls`. The whole
   * settings array is also forwarded to UrlModel::enqueue().
   *
   * @param mysqli $db        Open database connection.
   * @param int    $auditId   Audit the discovered URLs belong to.
   * @param string $startUrl  URL the discovery starts from.
   * @param array  $settings  Crawl settings (see above).
   * @return array `['ok' => true, 'discovered' => int]` on success,
   *               `['ok' => false, 'message' => string]` on failure.
   */
  public static function run(mysqli $db, int $auditId, string $startUrl, array $settings): array {
    $urlModel = new UrlModel($db);
    $startNorm = seom_normalize_url($startUrl, ['allow_query_params' => (bool)($settings['allow_query_params'] ?? false)]);
    if (!$startNorm) return ['ok' => false, 'message' => 'Invalid start URL'];

    $base = seom_base_url($startNorm);
    $startHost = parse_url($startNorm, PHP_URL_HOST);
    $host = is_string($startHost) ? strtolower($startHost) : '';
    $allowSub = (bool)($settings['include_subdomains'] ?? false);

    $httpOpts = [
      'timeout_sec' => (int)($settings['timeout_sec'] ?? 15),
      'user_agent' => ($settings['user_agent'] ?? 'SEOMasterBot/1.0'),
      'max_redirects' => (int)($settings['max_redirects'] ?? 5),
    ];

    // 1) Robots
    $robots = Robots::fetch($base, $httpOpts);
    $sitemaps = $robots['sitemaps'] ?? [];

    // 2) Sitemap fallback: try /sitemap.xml if none
    if (empty($sitemaps)) {
      $sitemaps = [rtrim($base, '/') . '/sitemap.xml'];
    }

    $maxUrls = (int)($settings['max_urls'] ?? 300);

    // 3) Enqueue sitemap URLs (limited)
    $added = 0;
    foreach ($sitemaps as $sm) {
      $smRes = Sitemap::fetchUrls($sm, $httpOpts);
      // Skip sitemaps that failed or did not yield a URL list.
      if (empty($smRes['ok']) || !is_array($smRes['urls'] ?? null)) continue;
      foreach ($smRes['urls'] as $u) {
        if ($added >= $maxUrls) break 2;
        if (!is_string($u)) continue;
        // Same-site filter: case-insensitive, honours include_subdomains.
        if (!self::isInternalHost($u, $host, $allowSub)) continue;
        // Only a truthy enqueue() result counts towards the max_urls budget.
        if ($urlModel->enqueue($auditId, $u, 0, 'sitemap', $sm, $settings)) $added++;
      }
    }

    // 4) Enqueue internal links from homepage/start
    $seedRes = seom_http_get($startNorm, $httpOpts);
    if (($seedRes['http_code'] ?? 0) === 200 && is_string($seedRes['body'] ?? null)) {
      // extractInternalLinks() already applies the same-site filter, so the
      // returned links are not filtered a second time here.
      $links = self::extractInternalLinks($seedRes['body'], $startNorm, $settings);
      foreach ($links as $u) {
        if ($added >= $maxUrls) break;
        if ($urlModel->enqueue($auditId, $u, 1, 'internal', $startNorm, $settings)) $added++;
      }
    }

    // Update discovered count
    $count = self::countUrls($db, $auditId);
    if ($count === null) {
      return ['ok' => false, 'message' => 'Failed to count discovered URLs'];
    }

    $stmt = $db->prepare("UPDATE audits SET pages_discovered=?, updated_at=NOW() WHERE id=?");
    if ($stmt === false) {
      return ['ok' => false, 'message' => 'Failed to update discovered count'];
    }
    $stmt->bind_param('ii', $count, $auditId);
    $updated = $stmt->execute();
    $stmt->close();
    if (!$updated) {
      return ['ok' => false, 'message' => 'Failed to update discovered count'];
    }

    return ['ok' => true, 'discovered' => $count];
  }

  /**
   * Collects the normalized, de-duplicated same-site links of an HTML document.
   *
   * Empty hrefs, fragment-only hrefs and mailto:/tel:/javascript: links are
   * ignored. Subdomains of the base host are accepted only when the
   * `include_subdomains` setting is truthy.
   *
   * @param string $html      HTML markup to scan for anchors.
   * @param string $baseUrl   URL used to resolve relative hrefs and to derive the base host.
   * @param array  $settings  Reads `include_subdomains` and `allow_query_params`.
   * @return array List of normalized absolute URLs.
   */
  private static function extractInternalLinks(string $html, string $baseUrl, array $settings): array {
    $dom = seom_load_dom($html);
    if (!$dom) return [];

    $xpath = new DOMXPath($dom);
    $nodes = $xpath->query('//a[@href]');
    if ($nodes === false) return [];

    $parsedHost = parse_url($baseUrl, PHP_URL_HOST);
    $baseHost = is_string($parsedHost) ? strtolower($parsedHost) : '';
    // Loop-invariant settings are resolved once.
    $allowSub = (bool)($settings['include_subdomains'] ?? false);
    $normOpts = [
      'allow_query_params' => (bool)($settings['allow_query_params'] ?? false),
      'strip_trailing_slash' => true,
    ];

    $out = [];
    foreach ($nodes as $a) {
      /** @var DOMElement $a */
      $href = trim($a->getAttribute('href'));
      if ($href === '' || str_starts_with($href, '#')) continue;
      if (preg_match('~^(mailto:|tel:|javascript:)~i', $href)) continue;

      $abs = seom_resolve_url($href, $baseUrl);
      if (!$abs) continue;
      if (!self::isInternalHost($abs, $baseHost, $allowSub)) continue;

      $norm = seom_normalize_url($abs, $normOpts);
      if ($norm) $out[] = $norm;
    }

    return array_values(array_unique($out));
  }

  /**
   * Tells whether a URL's host is the base host or, optionally, one of its subdomains.
   *
   * The comparison is case-insensitive. A URL without a parsable host, or an
   * empty base host, is never considered internal.
   *
   * @param string $url              Absolute URL to test.
   * @param string $baseHost         Host name of the audited site.
   * @param bool   $allowSubdomains  Whether `*.baseHost` also counts as internal.
   * @return bool
   */
  private static function isInternalHost(string $url, string $baseHost, bool $allowSubdomains): bool {
    $host = parse_url($url, PHP_URL_HOST);
    if (!is_string($host) || $host === '' || $baseHost === '') return false;

    $host = strtolower($host);
    $baseHost = strtolower($baseHost);
    if ($host === $baseHost) return true;

    // The leading dot prevents "notexample.com" from matching "example.com".
    return $allowSubdomains && str_ends_with($host, '.' . $baseHost);
  }

  /**
   * Counts the rows of `audit_urls` that belong to the given audit.
   *
   * @param mysqli $db       Open database connection.
   * @param int    $auditId  Audit whose URLs are counted.
   * @return int|null Row count, or null when the query could not be run.
   */
  private static function countUrls(mysqli $db, int $auditId): ?int {
    $stmt = $db->prepare('SELECT COUNT(*) c FROM audit_urls WHERE audit_id=?');
    if ($stmt === false) return null;

    $stmt->bind_param('i', $auditId);
    if (!$stmt->execute()) {
      $stmt->close();
      return null;
    }

    $res = $stmt->get_result();
    $row = $res === false ? null : $res->fetch_assoc();
    $stmt->close();
    if ($res === false) return null;

    return (int)($row['c'] ?? 0);
  }
}
