#!/usr/bin/env python3
"""
Matrix Motors Berlin — Scraper standalone
WordPress + WP Car Showroom plugin, HTML statique, 3 pages de ~20 véhicules.
"""

import json
import re
import urllib.request

BASE_URL = "https://matrix-automobile.de/alle-fahrzeuge/"
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}


def fetch_page(page_num):
    """Download one listing page (1-indexed) and return its decoded HTML.

    Page 1 lives at BASE_URL itself; later pages use the WordPress
    ``/page/<n>/`` suffix. Decoding errors are replaced rather than raised.
    """
    if page_num <= 1:
        url = BASE_URL
    else:
        url = f"{BASE_URL}page/{page_num}/"
    request = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read()
    return body.decode("utf-8", errors="replace")


def parse_articles(html):
    """Extract vehicles from the HTML of one listing page.

    Args:
        html: Raw HTML of a listing page.

    Returns:
        List of dicts with keys: source, pays, devise, url, image, marque,
        modele, titre, km, annee, puissance, prix, prix_ht. Prices are
        initialized to None — they live in separate ``car_price`` blocks
        and are matched up later by the caller (see extract_prices).
    """
    vehicles = []
    articles = re.findall(r"<article class=\"vehicle-on-archive\">([\s\S]*?)</article>", html)

    for art in articles:
        v = {"source": "matrix-automobile", "pays": "DE", "devise": "EUR"}

        # Detail-page URL and first gallery image.
        link = re.search(r'href="(https://matrix-automobile\.de/fahrzeuge/[^"]+)"', art)
        v["url"] = link.group(1) if link else None

        img = re.search(r'src="(https://matrix-automobile\.de/wp-content/uploads/[^"]+)"', art)
        v["image"] = img.group(1) if img else None

        # Make + model + optional variant description -> display title.
        make = re.search(r'<span class="make">([^<]+)</span>', art)
        model = re.search(r'<span class="model">([^<]+)</span>', art)
        desc = re.search(r'<span class="model_description">([^<]+)</span>', art)
        v["marque"] = make.group(1).strip() if make else ""
        v["modele"] = model.group(1).strip() if model else ""
        variant = desc.group(1).strip() if desc else ""
        v["titre"] = f"{v['marque']} {v['modele']}".strip()
        if variant:
            v["titre"] += f" {variant}"

        # Mileage: strip every non-digit ("12.500 km" -> 12500).
        # FIX: guard against a cell with no digits at all — the previous
        # int(re.sub(...)) raised ValueError on int("").
        km_m = re.search(r'class="fact mileage">([^<]+)', art)
        km_digits = re.sub(r"[^\d]", "", km_m.group(1)) if km_m else ""
        v["km"] = int(km_digits) if km_digits else None

        # First registration is "MM/YYYY" — keep the year part only.
        reg_m = re.search(r'class="fact first_registration">([^<]+)', art)
        if reg_m:
            parts = reg_m.group(1).strip().split("/")
            v["annee"] = int(parts[-1]) if len(parts) >= 2 and parts[-1].isdigit() else None
        else:
            v["annee"] = None

        power_m = re.search(r'class="fact power">([^<]+)', art)
        v["puissance"] = power_m.group(1).strip() if power_m else None

        # Prices are not inside the <article>; filled in by the caller.
        v["prix"] = None
        v["prix_ht"] = None
        vehicles.append(v)

    return vehicles


def extract_prices(html):
    """Extract gross/net price entries from the page's car_price blocks.

    Pattern example: ``<div class="car_price">574.950 € (Brutto)</div>``.

    Args:
        html: Raw HTML of a listing page.

    Returns:
        List of ``{"amount": int, "brutto": bool, "netto": bool}`` dicts,
        in document order (amounts are integer euros, dots stripped).
    """
    prices = []
    blocks = re.findall(r'class="car_price">([^<]+)', html)
    for block in blocks:
        # FIX: the match must start with a digit. The previous r"([\d.]+)"
        # could match a lone "." (e.g. the dot in "ca. 574.950 €"), which
        # made int("") raise ValueError after stripping the dots.
        val = re.search(r"(\d[\d.]*)", block)
        if val:
            amount = int(val.group(1).replace(".", ""))
            label = block.lower()  # one case-fold instead of Brutto/brutto pairs
            prices.append({
                "amount": amount,
                "brutto": "brutto" in label,
                "netto": "netto" in label,
            })
    return prices


def scrape():
    """Scrape every listing page and return the deduplicated vehicle list."""
    results = []
    seen = set()

    # The first page embeds a JS config object that tells us the page count.
    first_html = fetch_page(1)
    max_pages = 3  # fallback when the config blob is absent or unparsable
    match = re.search(r'var vehicle_list_properties\s*=\s*(\{[^;]+\})', first_html)
    if match:
        try:
            config = json.loads(match.group(1))
            max_pages = int(config.get("max_pages", 3))
        except (json.JSONDecodeError, ValueError):
            pass

    for page_num in range(1, max_pages + 1):
        # Reuse the page we already fetched for the pagination probe.
        html = first_html if page_num == 1 else fetch_page(page_num)
        articles = parse_articles(html)
        prices = extract_prices(html)

        # Prices appear in document order as Brutto/Netto pairs, one pair
        # per vehicle — split them into two parallel lists and match by index.
        brutto_prices = [p["amount"] for p in prices if p["brutto"]]
        netto_prices = [p["amount"] for p in prices if p["netto"]]

        for i, vehicle in enumerate(articles):
            if i < len(brutto_prices):
                vehicle["prix"] = brutto_prices[i]
            if i < len(netto_prices):
                vehicle["prix_ht"] = netto_prices[i]
            # Deduplicate across pages by detail-page URL.
            url = vehicle["url"]
            if url and url not in seen:
                seen.add(url)
                results.append(vehicle)

        print(f"  Page {page_num}/{max_pages} : {len(articles)} véhicules, {len(brutto_prices)} prix")

    return results


if __name__ == "__main__":
    vehicles = scrape()
    print(f"\n  Matrix Motors Berlin — {len(vehicles)} véhicule(s)\n")
    for v in vehicles:
        # Format the gross price with dot thousand separators (German style).
        if v["prix"]:
            prix_str = f"{v['prix']:,} €".replace(",", ".")
        else:
            prix_str = "N/A"
        print(f"  {v['titre']}")
        print(f"    {v.get('annee','?')} · {v.get('km','?')} km · {prix_str}")
        print(f"    URL: {v['url']}")
        print()
    # Persist the full result set as pretty-printed UTF-8 JSON.
    with open("matrix_results.json", "w", encoding="utf-8") as out:
        json.dump(vehicles, out, ensure_ascii=False, indent=2)
    print(f"  Sauvegardé dans matrix_results.json")
