#!/usr/bin/env python3
"""
AutoPremium Import Analyser — Serveur local
Lancement : python3 serveur_autopremium.py
Puis ouvrir : http://localhost:8080/AUTOPREMIUM_ANALYSEUR_V3.html

Sites supportés :
  ✅ Mobile.de  — extraction depuis window.__INITIAL_STATE__ (structure validée)
  ✅ Blocket.se — scraping HTML + photos blocketcdn
  ✅ AutoScout24, Otomoto, etc. — scraping générique
"""

import asyncio
import gzip
import hashlib
import http.server
import io
import json
import os
import re
import time
import urllib.request
import urllib.error
import zlib
from http.server import SimpleHTTPRequestHandler

try:
    import brotli  # type: ignore
    _HAS_BROTLI = True
except ImportError:
    _HAS_BROTLI = False

try:
    import redis as _redis_lib
    _redis_client = _redis_lib.from_url(os.environ.get("REDIS_URL", "redis://localhost:6379"))
    _redis_client.ping()
    _HAS_REDIS = True
    print("  ✅ Redis connecté")
except Exception as _re:
    _HAS_REDIS = False
    _redis_client = None
    print(f"  ⚠️  Redis non disponible : {_re}")

API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PORT = 8080

import threading
_claude_lock = threading.Lock()
_claude_active = 0
MAX_CONCURRENT_CLAUDE = 2


def cache_get(key: str):
    """Return the JSON-decoded cached value for *key*, or None.

    Every failure mode (Redis down, missing key, undecodable payload)
    is treated as a cache miss so callers never have to handle errors.
    """
    if not _HAS_REDIS:
        return None
    try:
        raw = _redis_client.get(key)
    except Exception:
        return None
    if not raw:
        return None
    try:
        return json.loads(raw)
    except Exception:
        return None


def cache_set(key: str, data, ttl_seconds: int = 3600):
    """Best-effort cache write: store *data* as JSON under *key* with a TTL.

    A no-op when Redis is unavailable; serialization or network errors
    are swallowed deliberately — caching must never break a request.
    """
    if not _HAS_REDIS:
        return
    try:
        payload = json.dumps(data, ensure_ascii=False)
        _redis_client.setex(key, ttl_seconds, payload)
    except Exception:
        pass

class Handler(SimpleHTTPRequestHandler):

    def log_message(self, fmt, *args):
        if "favicon" not in self.path:
            print(f"  {self.command} {self.path} → {args[1]}")

    def end_headers(self):
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        super().end_headers()

    def do_GET(self):
        # Recherche mobile.de via Playwright headless
        if self.path.startswith('/api/search-mobilede'):
            from urllib.parse import parse_qs, urlparse
            qs = parse_qs(urlparse(self.path).query)
            url = qs.get('url', [''])[0]
            if not url:
                self._json_error(400, 'URL manquante')
                return
            self._search_mobilede(url)
            return
        if self.path.startswith('/api/search-autoscout24'):
            from urllib.parse import parse_qs, urlparse
            qs = parse_qs(urlparse(self.path).query)
            url = qs.get('url', [''])[0]
            if not url:
                self._json_error(400, 'URL manquante')
                return
            self._search_autoscout24_server(url)
            return
        # Endpoints scrapers dealers
        if self.path == '/api/scrape-daytona':
            self._scrape_dealer('daytona')
            return
        if self.path == '/api/scrape-matrix':
            self._scrape_dealer('matrix')
            return
        # Fichiers statiques
        super().do_GET()

    def do_OPTIONS(self):
        self.send_response(200)
        self.end_headers()

    def do_POST(self):
        if self.path == "/api/claude":
            self._proxy_claude()
        elif self.path == "/api/fetch-page":
            self._fetch_page()
        else:
            self.send_error(404)

    # ══════════════════════════════════════════════════════════════
    # PROXY CLAUDE — retry 529 + prompt caching
    # ══════════════════════════════════════════════════════════════
    def _proxy_claude(self):
        """Forward the request body to the Anthropic Messages API.

        Adds the prompt-caching beta header, limits concurrency to
        MAX_CONCURRENT_CLAUDE, and retries on HTTP 529 (overloaded) with
        increasing delays.

        Fix: the active-request counter is now decremented in a ``finally``
        block, and network-level failures (``URLError``: DNS, timeout,
        connection refused) get an explicit 502 response.  Previously such
        errors escaped the handler and leaked a concurrency slot, which
        eventually made the proxy reject every request with 429.
        """
        global _claude_active
        if not API_KEY:
            self._json_error(500, "ANTHROPIC_API_KEY non définie. Lance : export ANTHROPIC_API_KEY=sk-ant-...")
            return

        # Admission control: reject when too many Claude calls are in flight.
        with _claude_lock:
            if _claude_active >= MAX_CONCURRENT_CLAUDE:
                print(f"  [{time.strftime('%H:%M:%S')}] REJECT proxy_claude — {_claude_active} déjà en cours (max={MAX_CONCURRENT_CLAUDE})")
                self._json_error(429, f"Trop de requêtes Claude simultanées ({_claude_active}/{MAX_CONCURRENT_CLAUDE})")
                return
            _claude_active += 1

        t0_claude = time.time()
        print(f"  [{time.strftime('%H:%M:%S')}] START proxy_claude ({_claude_active}/{MAX_CONCURRENT_CLAUDE} actives)")
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = self.rfile.read(length)

            req = urllib.request.Request(
                "https://api.anthropic.com/v1/messages",
                data=body,
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": API_KEY,
                    "anthropic-version": "2023-06-01",
                    "anthropic-beta": "prompt-caching-2024-07-31",
                },
                method="POST"
            )

            retry_delays = [30, 60, 120, 180]  # seconds to wait between 529 retries
            max_attempts = len(retry_delays) + 1
            for attempt in range(max_attempts):
                try:
                    with urllib.request.urlopen(req, timeout=300) as resp:
                        result = resp.read()
                    self.send_response(200)
                    self.send_header("Content-Type", "application/json")
                    self.end_headers()
                    self.wfile.write(result)
                    elapsed = time.time() - t0_claude
                    print(f"  [{time.strftime('%H:%M:%S')}] END proxy_claude ({elapsed:.1f}s)" + (f" — après {attempt} retry(s)" if attempt > 0 else ""))
                    return
                except urllib.error.HTTPError as e:
                    # 529 = Anthropic overloaded; retry with backoff while
                    # delays remain, otherwise forward the API error body.
                    if e.code == 529 and attempt < len(retry_delays):
                        wait = retry_delays[attempt]
                        print(f"  ⏳ Serveurs Anthropic surchargés (529) — attente de {wait}s avant retry ({attempt + 1}/{len(retry_delays)})")
                        time.sleep(wait)
                        print(f"  🔄 Nouvelle tentative {attempt + 2}/{max_attempts}")
                        continue
                    if e.code == 529:
                        print(f"  ❌ Toujours surchargé après {max_attempts} tentatives — abandon")
                    err = e.read()
                    self.send_response(e.code)
                    self.send_header("Content-Type", "application/json")
                    self.end_headers()
                    self.wfile.write(err)
                    return
                except urllib.error.URLError as e:
                    # DNS failure, timeout, connection refused… — previously
                    # unhandled, crashing the handler mid-request.
                    print(f"  ❌ Erreur réseau vers Anthropic : {e}")
                    self._json_error(502, f"Erreur réseau vers l'API Anthropic : {e}")
                    return
        finally:
            # Always release the concurrency slot, whatever happened above.
            with _claude_lock:
                _claude_active -= 1

    # ══════════════════════════════════════════════════════════════
    # FETCH PAGE — routeur selon le site
    # ══════════════════════════════════════════════════════════════
    def _fetch_page(self):
        t0 = time.time()
        length = int(self.headers.get("Content-Length", 0))
        body = json.loads(self.rfile.read(length))
        url = body.get("url", "")
        print(f"  [{time.strftime('%H:%M:%S')}] FETCH-PAGE → {url[:80]}")
        if not url:
            self._json_error(400, "URL manquante")
            return

        if "mobile.de" in url:
            result = self._fetch_mobile_de(url)
            if result:
                self._send_json(result)
                return

        if "blocket.se" in url or "biltorget.se" in url or "kvdbil.se" in url:
            content, photos = self._scrape_blocket(url)
            self._send_json({"content": content, "photos": photos, "url": url, "source": "blocket"})
            return

        if "autoscout24" in url:
            content = self._scrape_autoscout24(url)
            if content is None:
                # Page de blocage anti-bot détectée — renvoyer 403 explicite
                self._json_error(403, "AutoScout24 : page de blocage anti-bot")
                return
            self._send_json({"content": content, "url": url, "source": "autoscout24"})
            return

        # Fallback générique (Otomoto, Finn.no, etc.)
        content = self._scrape_generic(url)
        self._send_json({"content": content, "url": url, "source": "generic"})

    # ══════════════════════════════════════════════════════════════
    # MOBILE.DE — extraction depuis window.__INITIAL_STATE__
    #
    # Structure validée sur annonce réelle 445553206 (avril 2026) :
    #   L'objet annonce se trouve dans state.search.*
    #   Il se reconnaît par : price.grossAmount + attributes[]
    #   Tags attrs : mileage, firstRegistration, constructionYear,
    #     power, fuel, transmission, manufacturerColorName, color,
    #     interior, envkv.co2Emissions, emissionClass, sku (VIN),
    #     numberOfPreviousOwners, hu, damageCondition, availability
    #   Photos : galleryImages[].srcSet (contient mo-1600)
    #   Prix HT : price.netAmount / Prix TTC : price.grossAmount
    #   Vendeur : contactInfo.name / contactInfo.city
    #   Type vendeur : onCustomerBehalf (true=particulier)
    # ══════════════════════════════════════════════════════════════
    def _fetch_mobile_de(self, url):
        """Fetch a mobile.de listing page and extract structured data.

        Strategy, in order:
          1. Plain HTTP GET with browser-like headers, then parse the JSON
             blob assigned to ``window.__INITIAL_STATE__`` in the page.
          2. On any fetch failure (anti-bot, network...), retry through the
             local ``mobilede_playwright`` headless-browser helper.
          3. If the state blob is missing or unparseable, fall back to raw
             HTML-to-text scraping of whatever was downloaded.

        Returns a response dict {"content", "photos"?, "url", "source"} on
        success, or None when both the direct fetch and the Playwright
        fallback failed.
        """
        try:
            # Full browser-like header set to reduce anti-bot blocking.
            req = urllib.request.Request(url, headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                "Accept-Language": "fr-FR,fr;q=0.9,de;q=0.8,en;q=0.7",
                "Accept-Encoding": "gzip, deflate, br",
                "Cache-Control": "no-cache",
                "Pragma": "no-cache",
                "Sec-Fetch-Dest": "document",
                "Sec-Fetch-Mode": "navigate",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-User": "?1",
                "Upgrade-Insecure-Requests": "1",
            })
            with urllib.request.urlopen(req, timeout=15) as resp:
                html = resp.read().decode("utf-8", errors="replace")
        except Exception as e:
            # Direct fetch failed — second chance via headless browser.
            print(f"  ⚠️ Mobile.de HTTP error: {e} — tentative Playwright")
            try:
                from mobilede_playwright import extract_mobilede_url
                state = asyncio.run(extract_mobilede_url(url))
                if state:
                    listing = self._find_listing_in_state(state)
                    if listing:
                        content = self._flatten_mobile_de(listing)
                        photos = self._extract_mobile_de_photos(listing)
                        make = listing.get("make", "")
                        model = listing.get("model", "")
                        print(f"  ✅ Mobile.de (Playwright) : {make} {model} — {len(photos)} photos")
                        return {"content": content, "photos": photos, "url": url, "source": "mobile.de-playwright"}
            except Exception as pe:
                print(f"  ❌ Mobile.de Playwright erreur : {pe}")
            return None

        # Extract the window.__INITIAL_STATE__ JSON blob (strict pattern:
        # the assignment terminated by ';' then a newline or 'window.').
        match = re.search(r"window\.__INITIAL_STATE__\s*=\s*(\{.+?\});\s*(?:\n|window\.)", html, re.DOTALL)
        if not match:
            # Looser attempt: locate the opening brace only.
            match = re.search(r"window\.__INITIAL_STATE__\s*=\s*(\{)", html)
            if match:
                # Scan forward, tracking brace nesting, until the outermost
                # object closes.  NOTE(review): a '{'/'}' inside a JSON string
                # literal would unbalance this scan; json.loads below catches
                # that case and routes us to the HTML fallback.
                start = match.start(1)
                depth = 0
                i = start
                for i, c in enumerate(html[start:], start):
                    if c == '{':
                        depth += 1
                    elif c == '}':
                        depth -= 1
                        if depth == 0:
                            break
                try:
                    state = json.loads(html[start:i+1])
                    match = True  # flag so the HTML fallback below is skipped
                except Exception:
                    state = None
                    match = None
            if not match:
                print("  ⚠️ Mobile.de : __INITIAL_STATE__ non trouvé → fallback scraping")
                return {"content": self._html_to_text(html), "url": url, "source": "mobile.de-html"}
        else:
            try:
                state = json.loads(match.group(1))
            except Exception as e:
                print(f"  ⚠️ Mobile.de JSON parse error ({e}) → fallback scraping")
                return {"content": self._html_to_text(html), "url": url, "source": "mobile.de-html"}

        # Locate the listing object somewhere inside the (large) state tree.
        listing = self._find_listing_in_state(state)
        if not listing:
            print("  ⚠️ Mobile.de : annonce non trouvée dans le state → fallback scraping")
            return {"content": self._html_to_text(html), "url": url, "source": "mobile.de-html"}

        content = self._flatten_mobile_de(listing)
        photos = self._extract_mobile_de_photos(listing)
        make = listing.get("make", "")
        model = listing.get("model", "")
        print(f"  ✅ Mobile.de __INITIAL_STATE__ : {make} {model} — {len(photos)} photos")
        return {"content": content, "photos": photos, "url": url, "source": "mobile.de-state"}

    # ══════════════════════════════════════════════════════════════
    # MOBILE.DE SEARCH — résultats de recherche via Playwright
    # ══════════════════════════════════════════════════════════════
    def _search_mobilede(self, url: str):
        """Run a mobile.de search URL through headless Playwright.

        Non-empty result sets are cached in Redis for 30 minutes, keyed by
        the MD5 of the search URL.
        """
        started = time.time()
        print(f"  [{time.strftime('%H:%M:%S')}] SEARCH-MOBILEDE → {url[:80]}")

        key = 'mde_search:' + hashlib.md5(url.encode()).hexdigest()
        hit = cache_get(key)
        if hit is not None:
            print(f"  [cache] SEARCH-MOBILEDE ({len(hit)} annonces)")
            self._send_json({'listings': hit, 'count': len(hit), 'cached': True})
            return

        try:
            from mobilede_playwright import search_mobilede
            listings = asyncio.run(search_mobilede(url))
            duration = time.time() - started
            print(f"  [{time.strftime('%H:%M:%S')}] SEARCH-MOBILEDE END ({duration:.1f}s) → {len(listings)} annonces")
            if listings:
                cache_set(key, listings, 30 * 60)
            self._send_json({'listings': listings, 'count': len(listings), 'cached': False})
        except Exception as e:
            duration = time.time() - started
            print(f"  [{time.strftime('%H:%M:%S')}] SEARCH-MOBILEDE FAIL ({duration:.1f}s) : {e}")
            self._json_error(500, f'Playwright search erreur : {e}')

    # ══════════════════════════════════════════════════════════════
    # AUTOSCOUT24 DE/NL SEARCH — résultats via Playwright + Firecrawl
    # ══════════════════════════════════════════════════════════════
    def _search_autoscout24_server(self, url: str):
        """Run an AutoScout24 search URL through headless Playwright.

        Mirrors _search_mobilede: MD5-keyed Redis cache (30 min TTL) and a
        500 JSON error when the Playwright helper fails.
        """
        started = time.time()
        print(f"  [{time.strftime('%H:%M:%S')}] SEARCH-AS24 → {url[:80]}")

        key = 'as24_search:' + hashlib.md5(url.encode()).hexdigest()
        hit = cache_get(key)
        if hit is not None:
            print(f"  [cache] SEARCH-AS24 ({len(hit)} annonces)")
            self._send_json({'listings': hit, 'count': len(hit), 'cached': True})
            return

        try:
            from autoscout24_playwright import search_autoscout24
            listings = asyncio.run(search_autoscout24(url))
            duration = time.time() - started
            print(f"  [{time.strftime('%H:%M:%S')}] SEARCH-AS24 END ({duration:.1f}s) → {len(listings)} annonces")
            if listings:
                cache_set(key, listings, 30 * 60)
            self._send_json({'listings': listings, 'count': len(listings), 'cached': False})
        except Exception as e:
            duration = time.time() - started
            print(f"  [{time.strftime('%H:%M:%S')}] SEARCH-AS24 FAIL ({duration:.1f}s) : {e}")
            self._json_error(500, f'AS24 search erreur : {e}')

    def _find_listing_in_state(self, obj, depth=0):
        """Depth-first search for the listing object inside __INITIAL_STATE__.

        A listing is recognised by a ``price`` dict containing
        ``grossAmount`` plus either ``attributes`` or ``make``.  Recursion
        stops past depth 10 to guard against pathological state trees.
        """
        if depth > 10 or not obj or not isinstance(obj, dict):
            return None
        price = obj.get("price")
        if (isinstance(price, dict)
                and "grossAmount" in price
                and ("attributes" in obj or "make" in obj)):
            return obj
        for child in obj.values():
            # Treat a lone dict child like a one-element list of candidates.
            candidates = child if isinstance(child, list) else [child]
            for candidate in candidates:
                if isinstance(candidate, dict):
                    found = self._find_listing_in_state(candidate, depth + 1)
                    if found:
                        return found
        return None

    def _flatten_mobile_de(self, L):
        """Render a Mobile.de listing object as labelled text lines for Claude.

        Attribute tags follow the structure validated on a real listing
        (April 2026); unknown tags are simply skipped.
        """
        out = ["=== DONNÉES MOBILE.DE — Annonce réelle (DOM extrait) ==="]

        # Index attributes by tag, falling back to label when tag is absent.
        attrs = {}
        for attribute in L.get("attributes", []):
            key = attribute.get("tag") or attribute.get("label", "")
            value = attribute.get("value", "")
            if key and value:
                attrs[key] = value

        # Vehicle identity.
        out.append(f"MARQUE : {L.get('make', '?')}")
        out.append(f"MODÈLE : {L.get('model', '?')}")
        out.append(f"CATÉGORIE : {L.get('category', '?')}")
        if L.get("title"):
            out.append(f"TITRE ANNONCE : {L['title']}")

        # Known attribute tags, in display order.
        tag_labels = (
            ("mileage",               "KILOMÉTRAGE"),
            ("firstRegistration",     "1ÈRE IMMATRICULATION"),
            ("constructionYear",      "ANNÉE DE CONSTRUCTION"),
            ("power",                 "PUISSANCE"),
            ("fuel",                  "CARBURANT"),
            ("transmission",          "BOÎTE"),
            ("manufacturerColorName", "COULEUR CONSTRUCTEUR"),
            ("color",                 "COULEUR GÉNÉRIQUE"),
            ("interior",              "INTÉRIEUR"),
            ("envkv.co2Emissions",    "CO2 WLTP"),
            ("emissionClass",         "NORME EURO"),
            ("sku",                   "VIN"),
            ("numberOfPreviousOwners","NOMBRE PROPRIÉTAIRES"),
            ("hu",                    "CT ALLEMAGNE valide jusqu'à"),
            ("availability",          "DISPONIBILITÉ"),
            ("damageCondition",       "ÉTAT"),
            ("numSeats",              "NOMBRE DE SIÈGES"),
            ("doorCount",             "NOMBRE DE PORTES"),
        )
        out.extend(f"{label} : {attrs[tag]}" for tag, label in tag_labels if attrs.get(tag))

        # Price: gross (TTC), net (HT), and whether VAT is reclaimable.
        price = L.get("price", {})
        gross = price.get("grossAmount")
        net = price.get("netAmount")
        vat = price.get("vat", "")
        if gross:
            out.append(f"PRIX TTC : {gross} EUR")
        if net and net > 0:
            eco = round(gross - net) if gross else "?"
            out.append(f"PRIX HT : {round(net, 2)} EUR ({vat})")
            out.append(f"TVA RÉCUPÉRABLE : OUI — économie de ~{eco} € si achat via société assujettie à TVA intracommunautaire (SASU NES par exemple)")
        else:
            out.append("TVA RÉCUPÉRABLE : NON — vendeur particulier ou TVA non récupérable")

        # Seller identity and type (onCustomerBehalf=true means private seller).
        contact = L.get("contactInfo", {})
        if contact.get("name"):
            out.append(f"VENDEUR NOM : {contact['name']}")
        if contact.get("city"):
            out.append(f"VILLE VENDEUR : {contact['city']}, Allemagne")
        if L.get("onCustomerBehalf"):
            seller_type = "Particulier"
        else:
            seller_type = "Professionnel / Concessionnaire"
        out.append(f"TYPE VENDEUR : {seller_type}")

        # Equipment list, truncated to the first 50 labels.
        features = L.get("features", [])
        if features:
            labels = [f.get("label", str(f)) if isinstance(f, dict) else str(f) for f in features]
            out.append(f"ÉQUIPEMENTS ({len(labels)}) : {', '.join(labels[:50])}")

        # Seller description: strip HTML, collapse whitespace, cap at 600 chars.
        desc = L.get("htmlDescription", "")
        if desc:
            plain = re.sub(r"<[^>]+>", " ", desc)
            plain = re.sub(r"\s+", " ", plain).strip()[:600]
            out.append(f"DESCRIPTION VENDEUR : {plain}")

        return "\n".join(out)

    def _extract_mobile_de_photos(self, L):
        """Collect up to 7 unique photo URLs from galleryImages[].

        Prefers the 1600px rendition advertised in ``srcSet``; otherwise
        upgrades the 360px ``src`` URL to the 1024px rule.
        """
        urls = []
        for entry in L.get("galleryImages", []):
            if isinstance(entry, str):
                urls.append(entry)
                continue
            if not isinstance(entry, dict):
                continue
            hi_res = re.search(r"(https://\S+rule=mo-1600)", entry.get("srcSet", ""))
            if hi_res:
                urls.append(hi_res.group(1))
            elif entry.get("src"):
                urls.append(entry["src"].replace("mo-360", "mo-1024"))
        # dict.fromkeys deduplicates while preserving first-seen order.
        return list(dict.fromkeys(urls))[:7]

    # ══════════════════════════════════════════════════════════════
    # BLOCKET.SE — scraping HTML + photos CDN
    # ══════════════════════════════════════════════════════════════
    def _scrape_blocket(self, url):
        """Scrape a Blocket-family listing (blocket.se / biltorget / kvdbil).

        Tries the embedded Next.js __NEXT_DATA__ JSON first, then falls
        back to raw text extraction.  Returns (content, photo_urls).
        """
        try:
            request = urllib.request.Request(url, headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
                "Accept-Language": "sv-SE,sv;q=0.9,fr;q=0.8",
            })
            with urllib.request.urlopen(request, timeout=15) as resp:
                page = resp.read().decode("utf-8", errors="replace")

            # Photo URLs come from the CDN regardless of the extraction path.
            photo_pattern = r"https://images\.blocketcdn\.se/dynamic/1600w/[^\s\"']+"

            # Preferred path: structured data from __NEXT_DATA__.
            nd_match = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.+?)</script>', page, re.DOTALL)
            if nd_match:
                try:
                    payload = json.loads(nd_match.group(1))
                    content = self._flatten_blocket_next(payload)
                    photos = re.findall(photo_pattern, page)
                    return content, list(dict.fromkeys(photos))[:7]
                except Exception:
                    pass  # fall through to the raw-text path

            # Fallback: plain-text scrape, prefixed by the mandatory 'mil' warning.
            text = self._html_to_text(page)
            header = ("=== DONNÉES BLOCKET.SE ===\n"
                      "⚠️ RÈGLE CRITIQUE : kilométrage affiché en 'mil' suédois. 1 mil = 10 km. "
                      "Multiplier systématiquement par 10.\n\n")
            photos = re.findall(photo_pattern, page)
            return header + text, list(dict.fromkeys(photos))[:7]
        except Exception as e:
            return f"Erreur scraping Blocket: {e}", []

    def _flatten_blocket_next(self, nd):
        """Flatten Blocket's __NEXT_DATA__ payload into labelled text lines."""
        out = [
            "=== DONNÉES BLOCKET.SE ===",
            "⚠️ RÈGLE CRITIQUE : kilométrage en 'mil' suédois. 1 mil = 10 km. Multiplier par 10."
        ]
        try:
            page_props = nd.get("props", {}).get("pageProps", {})
            # The ad object has gone by several keys across site versions.
            ad = page_props.get("ad", page_props.get("listing", page_props.get("data", {})))
            if ad:
                out.append(f"TITRE : {ad.get('subject', ad.get('title', ''))}")
                price = ad.get("price", {})
                out.append(f"PRIX : {price.get('value', '?')} {price.get('currency', 'SEK')}")
                for param in ad.get("parameters", []):
                    out.append(f"{param.get('label', '?')} : {param.get('value', '?')}")
                desc = ad.get("body", ad.get("description", ""))
                if desc:
                    out.append(f"DESCRIPTION : {str(desc)[:500]}")
        except Exception as e:
            out.append(f"[Extraction partielle: {e}]")
        return "\n".join(out)

    # ══════════════════════════════════════════════════════════════
    # SCRAPERS DEALERS (Daytona, Matrix)
    # ══════════════════════════════════════════════════════════════
    def _scrape_dealer(self, dealer_id):
        """Endpoint générique qui appelle le scraper standalone correspondant."""
        t0 = time.time()
        print(f"  [{time.strftime('%H:%M:%S')}] START scrape_{dealer_id}")
        try:
            if dealer_id == 'daytona':
                import daytona_scraper
                vehicles = daytona_scraper.scrape()
            elif dealer_id == 'matrix':
                import matrix_scraper
                vehicles = matrix_scraper.scrape()
            else:
                self._json_error(400, f"Dealer inconnu : {dealer_id}")
                return
            elapsed = time.time() - t0
            print(f"  [{time.strftime('%H:%M:%S')}] END scrape_{dealer_id} ({elapsed:.1f}s) → {len(vehicles)} véhicule(s)")
            self._send_json({"vehicles": vehicles, "count": len(vehicles), "source": dealer_id})
        except Exception as e:
            elapsed = time.time() - t0
            print(f"  [{time.strftime('%H:%M:%S')}] FAIL scrape_{dealer_id} ({elapsed:.1f}s) : {e}")
            self._json_error(500, f"Erreur scraper {dealer_id} : {str(e)}")

    # ══════════════════════════════════════════════════════════════
    # AUTOSCOUT24 — headers anti-bot + détection de page de blocage
    # ══════════════════════════════════════════════════════════════
    def _scrape_autoscout24(self, url):
        """Scrape an AutoScout24 page with browser-like headers.

        Returns the flattened page text (with any listing-detail URLs
        appended at the end) on success, or None when the response looks
        like an anti-bot block page — the caller turns None into HTTP 403.
        """
        t0 = time.time()
        print(f"  [{time.strftime('%H:%M:%S')}] START scrape_autoscout24")
        # Only advertise brotli when we can actually decompress it locally.
        accept_encoding = "gzip, deflate, br" if _HAS_BROTLI else "gzip, deflate"
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
            "Accept-Encoding": accept_encoding,
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Referer": "https://www.google.de/",
            "sec-ch-ua": '"Chromium";v="120", "Google Chrome";v="120"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"macOS"',
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
        }
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=15) as resp:
                raw = resp.read()
                # urllib does not decompress for us — honour Content-Encoding.
                encoding = resp.headers.get("Content-Encoding", "").lower()
                if encoding == "gzip":
                    raw = gzip.decompress(raw)
                elif encoding == "deflate":
                    try:
                        raw = zlib.decompress(raw)
                    except zlib.error:
                        # Some servers send raw deflate without the zlib header.
                        raw = zlib.decompress(raw, -zlib.MAX_WBITS)
                elif encoding == "br" and _HAS_BROTLI:
                    raw = brotli.decompress(raw)
                # Decode with the charset declared in Content-Type, default UTF-8.
                charset = "utf-8"
                ct = resp.headers.get("Content-Type", "")
                if "charset=" in ct:
                    charset = ct.split("charset=")[-1].split(";")[0].strip()
                html = raw.decode(charset, errors="replace")
        except urllib.error.HTTPError as e:
            print(f"  ⚠️ AutoScout24 HTTP {e.code} → blocage probable")
            return None
        except Exception as e:
            print(f"  ⚠️ AutoScout24 erreur réseau : {e}")
            return None

        # Anti-bot block-page detection: a genuine results page contains at
        # least one of these listing markers and is reasonably long.
        markers = ['<article', '<div class="cldt', 'data-item-name', 'ListItem', 'data-listing-id']
        has_marker = any(m in html for m in markers)
        if len(html) < 500 or not has_marker:
            print(f"  ⚠️ AutoScout24 : contenu suspect ({len(html)} chars) — probablement bloqué")
            return None

        print(f"  [{time.strftime('%H:%M:%S')}] END scrape_autoscout24 ({time.time()-t0:.1f}s) — {len(html)} chars, marqueurs OK")

        # Extract individual listing URLs BEFORE stripping the HTML.
        # Path patterns vary by country domain: /angebote/..., /fr/d/..., /aanbod/...
        listing_urls = re.findall(
            r'href="((?:https?://[^"]*autoscout24\.[^"]+)?/(?:angebote|fr/d|aanbod|offers)/[^"]+)"',
            html
        )
        # Deduplicate and rebuild absolute URLs from the request's origin.
        base_origin = re.match(r'(https?://[^/]+)', url)
        origin = base_origin.group(1) if base_origin else ''
        seen = set()
        unique_urls = []
        for u in listing_urls:
            if not u.startswith('http'):
                u = origin + u
            if u not in seen:
                seen.add(u)
                unique_urls.append(u)

        text = self._html_to_text(html)

        # Append the URLs so Claude can associate each listing with its page.
        if unique_urls:
            text += '\n\n=== URLs FICHES INDIVIDUELLES TROUVÉES ===\n'
            text += '\n'.join(unique_urls[:30])
            print(f"  📎 {len(unique_urls)} URLs de fiches individuelles extraites")

        return text

    # ══════════════════════════════════════════════════════════════
    # UTILITAIRES
    # ══════════════════════════════════════════════════════════════
    def _scrape_generic(self, url):
        """Fallback scraper: fetch any URL and return its visible text."""
        started = time.time()
        print(f"  [{time.strftime('%H:%M:%S')}] START scrape_generic ({url[:60]})")
        try:
            request = urllib.request.Request(url, headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml",
                "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
            })
            with urllib.request.urlopen(request, timeout=15) as resp:
                payload = resp.read()
                # Decode with the charset from Content-Type, default UTF-8.
                charset = "utf-8"
                content_type = resp.headers.get("Content-Type", "")
                if "charset=" in content_type:
                    charset = content_type.split("charset=")[-1].split(";")[0].strip()
                page = payload.decode(charset, errors="replace")
            print(f"  [{time.strftime('%H:%M:%S')}] END scrape_generic ({time.time()-started:.1f}s) — {len(page)} chars")
            return self._html_to_text(page)
        except Exception as e:
            print(f"  [{time.strftime('%H:%M:%S')}] FAIL scrape_generic ({time.time()-started:.1f}s) : {e}")
            return f"Erreur scraping générique: {e}"

    def _html_to_text(self, html):
        """Strip markup from *html*; return collapsed plain text (max 8000 chars)."""
        # Drop script/style bodies entirely — their content is never prose.
        stripped = re.sub(r"<script[^>]*>.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r"<style[^>]*>.*?</style>", " ", stripped, flags=re.DOTALL | re.IGNORECASE)
        # Replace remaining tags with spaces, then collapse whitespace runs.
        stripped = re.sub(r"<[^>]+>", " ", stripped)
        stripped = re.sub(r"\s+", " ", stripped).strip()
        return stripped[:8000]

    def _send_json(self, data):
        result = json.dumps(data, ensure_ascii=False).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.end_headers()
        self.wfile.write(result)

    def _json_error(self, code, msg):
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps({"error": msg}).encode())


def main():
    """Start the local AutoPremium HTTP server on PORT.

    Fix: uses ``ThreadingHTTPServer`` (one thread per request) instead of
    the single-threaded ``HTTPServer``.  The Claude proxy can sleep for
    minutes on 529 retries; with a serial server that blocked every other
    request and made the handler's MAX_CONCURRENT_CLAUDE accounting moot.
    """
    if not API_KEY:
        print()
        print("  ⚠️  ANTHROPIC_API_KEY non définie !")
        print()
        print("  Lance d'abord :")
        print("  export ANTHROPIC_API_KEY=sk-ant-api03-XXXXXXXX")
        print()
        print("  (Clé disponible sur : https://console.anthropic.com/settings/keys)")
        print()
    else:
        print(f"  ✅ Clé API détectée ({API_KEY[:18]}...)")

    print(f"""
  ╔══════════════════════════════════════════════════════════╗
  ║        AUTOPREMIUM — Serveur local actif                 ║
  ╠══════════════════════════════════════════════════════════╣
  ║  Ouvre dans Chrome :                                     ║
  ║  http://localhost:{PORT}/AUTOPREMIUM_ANALYSEUR_V3.html    ║
  ║                                                          ║
  ║  Sites supportés :                                       ║
  ║  ✅ Mobile.de  (extraction DOM structurée)               ║
  ║  ✅ Blocket.se (scraping + photos CDN)                   ║
  ║  ✅ AutoScout24, Otomoto, Finn.no, etc. (générique)      ║
  ║                                                          ║
  ║  Arrêter le serveur : Ctrl + C                           ║
  ╚══════════════════════════════════════════════════════════╝
    """)

    # Serve static files (the analyser HTML) from this script's directory.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    # Threaded server: long-running handlers must not block other clients.
    server = http.server.ThreadingHTTPServer(("0.0.0.0", PORT), Handler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\n  Serveur arrêté.")
    finally:
        # Release the listening socket on shutdown.
        server.server_close()


if __name__ == "__main__":
    main()
