<?php
require_once __DIR__ . '/../helpers/html.php';
require_once __DIR__ . '/../helpers/text.php';

/**
 * Extracts on-page SEO signals (title, meta tags, canonical, headings,
 * image/link counts and text metrics) from a raw HTML string.
 *
 * Relies on the seom_* helper functions loaded by the surrounding file.
 */
class Parser {
  /** Alphabets fed to XPath translate() for ASCII case-insensitive attribute matching. */
  private const UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
  private const LOWER = 'abcdefghijklmnopqrstuvwxyz';

  /**
   * Parses an HTML document and returns the extracted SEO data.
   *
   * @param string $html Raw HTML markup.
   * @return array{ok: bool, error?: string, data?: array<string, mixed>}
   *   ['ok' => false, 'error' => ...] when seom_load_dom() yields a falsy value,
   *   otherwise ['ok' => true, 'data' => [...]].
   */
  public static function parse(string $html): array {
    // NOTE(review): seom_load_dom() is assumed to return a DOMDocument on
    // success (it is handed straight to DOMXPath) - confirm against the helper.
    $dom = seom_load_dom($html);
    if (!$dom) {
      return ['ok' => false, 'error' => 'DOM parse failed'];
    }

    $xp = new DOMXPath($dom);

    // Inline <svg> elements carry their own <title> children (accessible
    // labels). Skip those so a page without a document title is not reported
    // as being titled after an icon.
    $title = seom_dom_text(self::firstNode($xp, '//title[not(ancestor::svg)]'));

    // Meta description with runs of whitespace collapsed to single spaces.
    // preg_replace() returns null on a PCRE error (e.g. malformed UTF-8 with
    // the /u flag); keep the trimmed value in that case so the result stays a
    // string.
    $metaDesc = self::metaContent($xp, 'description');
    $metaDesc = preg_replace('/\s+/u', ' ', $metaDesc) ?? $metaDesc;

    // rel is a space-separated token list, so match "canonical" as one token
    // (case-insensitively) rather than requiring the whole attribute to equal it.
    $canonicalExpr = sprintf(
      "//link[contains(concat(' ', normalize-space(translate(@rel,'%s','%s')), ' '), ' canonical ')]",
      self::UPPER,
      self::LOWER
    );
    $canonical = self::firstAttr($xp, $canonicalExpr, 'href');

    $robotsMeta = self::metaContent($xp, 'robots');
    $lang = self::firstAttr($xp, '//html', 'lang');
    $viewport = self::metaContent($xp, 'viewport');

    // Heading counts for h1..h6, plus the text of the first <h1>.
    $headCounts = [];
    $h1Text = '';
    for ($level = 1; $level <= 6; $level++) {
      $nodes = $xp->query('//h' . $level);
      $count = $nodes instanceof DOMNodeList ? $nodes->length : 0;
      $headCounts['h' . $level] = $count;
      if ($level === 1 && $count > 0) {
        $h1Text = seom_dom_text($nodes->item(0));
      }
    }

    // Images: an absent, empty or whitespace-only alt all count as "missing".
    $imgTotal = 0;
    $imgMissingAlt = 0;
    $imgNodes = $xp->query('//img');
    if ($imgNodes instanceof DOMNodeList) {
      $imgTotal = $imgNodes->length;
      foreach ($imgNodes as $img) {
        if (!$img instanceof DOMElement || trim($img->getAttribute('alt')) === '') {
          $imgMissingAlt++;
        }
      }
    }

    // Links (counts only for Phase 1): anchors that carry an href attribute.
    $aNodes = $xp->query('//a[@href]');
    $linkTotal = $aNodes instanceof DOMNodeList ? $aNodes->length : 0;

    // Content metrics. The ratio is computed on byte lengths, hence strlen().
    $text = seom_strip_and_normalize_text($html);
    $wordCount = seom_word_count($text);
    $htmlBytes = strlen($html);
    $textBytes = strlen($text);
    $ratio = $htmlBytes > 0 ? round(($textBytes / $htmlBytes) * 100, 2) : null;
    $contentHash = sha1($text);

    return [
      'ok' => true,
      'data' => [
        'title' => $title,
        'title_len' => mb_strlen($title, 'UTF-8'),
        'meta_description' => $metaDesc,
        'meta_description_len' => mb_strlen($metaDesc, 'UTF-8'),
        'canonical' => $canonical,
        'robots_meta' => $robotsMeta,
        'h1' => $h1Text,
        'headings_json' => ['counts' => $headCounts],
        'links_json' => ['total' => $linkTotal],
        'images_json' => ['total' => $imgTotal, 'missing_alt' => $imgMissingAlt],
        'schema_json' => null,
        'og_json' => null,
        'twitter_json' => null,
        'lang' => $lang,
        'viewport_meta' => $viewport,
        'word_count' => $wordCount,
        'text_html_ratio' => $ratio,
        'content_hash' => $contentHash,
      ]
    ];
  }

  /**
   * Returns the first node matched by an XPath expression, or null when
   * nothing matches or the query itself fails.
   */
  private static function firstNode(DOMXPath $xp, string $expr): ?DOMNode {
    $nodes = $xp->query($expr);
    return $nodes instanceof DOMNodeList ? $nodes->item(0) : null;
  }

  /**
   * Returns the trimmed value of $attr on the first element matched by $expr,
   * or '' when there is no such element or attribute.
   */
  private static function firstAttr(DOMXPath $xp, string $expr, string $attr): string {
    $node = self::firstNode($xp, $expr);
    return $node instanceof DOMElement ? trim($node->getAttribute($attr)) : '';
  }

  /**
   * Returns the trimmed content attribute of the first <meta> whose name
   * equals $name, compared ASCII case-insensitively.
   *
   * $name must be lowercase and must not contain quotes; it is only ever
   * called with literals from this class.
   */
  private static function metaContent(DOMXPath $xp, string $name): string {
    $expr = sprintf("//meta[translate(@name,'%s','%s')='%s']", self::UPPER, self::LOWER, $name);
    return self::firstAttr($xp, $expr, 'content');
  }
}
