import html as _html_lib
import os
import re
import json
from playwright.async_api import async_playwright

# Chromium flags for containerized/headless environments: disable the sandbox,
# work around small /dev/shm, skip GPU, and run a single renderer process.
_LAUNCH_ARGS = [
    '--no-sandbox', '--disable-dev-shm-usage',
    '--disable-gpu', '--single-process',
]

_BOT_MARKERS = ['captcha', 'blocked', 'zugriff verweigert', 'access denied', 'forbidden']


def _is_bot_blocked(html: str) -> bool:
    lc = html.lower()
    return any(m in lc for m in _BOT_MARKERS) and len(html) < 50000


async def search_autoscout24(url: str) -> list:
    """Scrape AutoScout24 DE/NL via Playwright headless, fallback Firecrawl HTML."""
    try:
        results = await _playwright_search(url)
    except Exception as exc:
        print(f'[AS24] Playwright erreur: {exc} — fallback Firecrawl')
    else:
        if results:
            print(f'[AS24-Playwright] ✅ {len(results)} annonces')
            return results
        print('[AS24] Playwright 0 résultat — fallback Firecrawl')
    # Reached on Playwright error or empty result: best-effort Firecrawl pass.
    return await _firecrawl_search(url)


# ─────────────────────────────────────────────────────────────────────────────
# PLAYWRIGHT — domcontentloaded + __NEXT_DATA__ + DOM fallback
# ─────────────────────────────────────────────────────────────────────────────

async def _playwright_search(url: str) -> list:
    """Render the AS24 search page in headless Chromium and extract listings.

    Flow: load with 'domcontentloaded', dismiss the cookie banner if shown,
    wait briefly for article nodes, then prefer the embedded __NEXT_DATA__
    JSON and fall back to regex parsing of the rendered HTML.
    Returns [] when a bot-block page is detected.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=_LAUNCH_ARGS)
        # Desktop Chrome UA + German locale/headers to match the .de site.
        context = await browser.new_context(
            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            locale='de-DE',
            extra_http_headers={'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8'},
        )
        page = await context.new_page()
        try:
            # 'domcontentloaded' avoids timeouts on SPAs that keep polling the network
            await page.goto(url, wait_until='domcontentloaded', timeout=30000)

            # Accept the cookie-consent dialog if present (selectors cover
            # the data-cy button, a German-label button, and OneTrust).
            try:
                await page.click(
                    'button[data-cy="uc-accept-all-button"], '
                    'button:has-text("Alle akzeptieren"), '
                    '#onetrust-accept-btn-handler',
                    timeout=3000,
                )
                await page.wait_for_timeout(2000)
            except Exception:
                pass

            # Wait for listing articles to appear (max 10s; proceed either way)
            try:
                await page.wait_for_selector('article[data-guid]', timeout=10000)
            except Exception:
                pass

            html = await page.content()
            if _is_bot_blocked(html):
                print('[AS24-Playwright] page de blocage détectée')
                return []

            # Try the __NEXT_DATA__ JSON payload first
            next_data_raw = await page.evaluate('''() => {
                const el = document.getElementById('__NEXT_DATA__');
                try { return el ? el.textContent : null; } catch(e) { return null; }
            }''')
            if next_data_raw:
                try:
                    nd = json.loads(next_data_raw)
                    items = _parse_next_data(nd, url)
                    if items:
                        return items
                except Exception as e:
                    print(f'[AS24-Playwright] __NEXT_DATA__ parse error: {e}')

            # DOM fallback — extracts data-* attrs + title spans from raw HTML
            return _parse_as24_html(html, url)
        finally:
            await browser.close()


# ─────────────────────────────────────────────────────────────────────────────
# PARSER HTML — basé sur les <article data-guid="…"> d'AutoScout24
#
# Structure validée sur le HTML Firecrawl/Playwright d'AS24 (avril 2026) :
#   <article id="{guid}" data-guid="{guid}" data-price="{int}"
#            data-make="ford" data-model="mustang"
#            data-first-registration="02-2023" data-mileage="18685">
#     <span class="ListItemTitle_title__{hash}">Ford Mustang</span>
#     <span class="ListItemTitle_subtitle__{hash}"> <!-- -->5.0 V8 GT<!-- --> </span>
#   </article>
#
# URL listing : https://www.autoscout24.de/angebote/{guid}  (vérifié 200 OK)
#               https://www.autoscout24.nl/aanbod/{guid}
# ─────────────────────────────────────────────────────────────────────────────

def _parse_as24_html(html: str, base_url: str) -> list:
    is_nl = 'autoscout24.nl' in base_url
    origin = 'https://www.autoscout24.nl' if is_nl else 'https://www.autoscout24.de'
    seg = 'aanbod' if is_nl else 'angebote'

    # Découper le HTML par article
    article_pattern = re.compile(r'<article\s([^>]{50,2000})>', re.DOTALL)
    listings = []
    seen = set()

    for m in article_pattern.finditer(html):
        tag = m.group(1)
        attrs = dict(re.findall(r'data-([a-z\-]+)=["\']([^"\']+)["\']', tag))

        guid = attrs.get('guid', '')
        if not guid or guid in seen:
            continue
        if attrs.get('testid', '') not in ('list-item', ''):
            continue  # ignorer les éléments qui ne sont pas des annonces
        seen.add(guid)

        # Prix
        prix_raw = attrs.get('price', '')
        try:
            prix = int(prix_raw) if prix_raw and prix_raw != 'unknown' else None
        except ValueError:
            prix = None
        if prix is not None and prix < 1000:
            continue

        # Kilométrage
        km_raw = attrs.get('mileage', '')
        try:
            km = int(km_raw) if km_raw and km_raw != 'unknown' else None
        except ValueError:
            km = None

        # Année (format "MM-YYYY" ou "YYYY" ou "new")
        reg = attrs.get('first-registration', '')
        annee = None
        if reg and reg not in ('new', 'unknown', ''):
            y_m = re.search(r'(\d{4})', reg)
            if y_m:
                annee = int(y_m.group(1))

        # URL
        item_url = f'{origin}/{seg}/{guid}'

        # Titre — extraire depuis le bloc HTML de l'article
        art_start = m.end()
        art_end = html.find('</article>', art_start)
        art_body = html[art_start:art_end] if art_end > art_start else ''

        title_m = re.search(r'ListItemTitle_title[^"]*"[^>]*>([^<]+)</span>', art_body)
        sub_m = re.search(r'ListItemTitle_subtitle[^"]*"[^>]*>(.*?)</span>', art_body, re.DOTALL)

        main_title = title_m.group(1).strip() if title_m else ''
        subtitle = ''
        if sub_m:
            subtitle = re.sub(r'<!--.*?-->', '', sub_m.group(1), flags=re.DOTALL).strip()

        # Fallback titre depuis data-make + data-model
        if not main_title:
            mk = attrs.get('make', '').capitalize()
            mo = attrs.get('model', '').capitalize()
            main_title = f'{mk} {mo}'.strip()

        title = _html_lib.unescape(f'{main_title} {subtitle}'.strip())
        title = re.sub(r'\s+', ' ', title)

        # Photo
        photo_m = re.search(
            r'(https://prod\.pictures\.autoscout24\.net/listing-images/[^\s"\']+\.(?:jpg|jpeg|webp))',
            art_body
        )
        photo = None
        if photo_m:
            photo = re.sub(r'/\d+x\d+\.webp$', '/1200x900.jpg', photo_m.group(1))
            photo = re.sub(r'/\d+x\d+\.(jpg|jpeg)$', '/1200x900.\\1', photo)

        if title or prix:
            listings.append({
                'title': title, 'prix': prix, 'km': km,
                'annee': annee, 'url': item_url,
                'photo': photo, 'devise': 'EUR',
            })

    return listings


# ─────────────────────────────────────────────────────────────────────────────
# __NEXT_DATA__ — fallback si disponible dans le rendu Playwright
# ─────────────────────────────────────────────────────────────────────────────

def _parse_next_data(data: dict, base_url: str) -> list:
    """Map a Next.js __NEXT_DATA__ payload to the common listing dict format.

    Locates the listings array anywhere in the payload, then normalizes at
    most 30 entries (title, price, mileage, year, url, photo).
    """
    on_nl = 'autoscout24.nl' in base_url
    site_root = 'https://www.autoscout24.nl' if on_nl else 'https://www.autoscout24.de'
    path_seg = 'aanbod' if on_nl else 'angebote'

    raw_items = _find_listings_array(data)
    if not raw_items:
        return []

    out = []
    for entry in raw_items[:30]:
        if not isinstance(entry, dict):
            continue

        # Prefer the listing's own URL; otherwise rebuild it from the id/guid.
        link = entry.get('url', '')
        if not link:
            entry_id = entry.get('id', entry.get('guid', ''))
            if entry_id:
                link = f'{site_root}/{path_seg}/{entry_id}'
        if link and not link.startswith('http'):
            link = site_root + link

        pieces = [entry.get('make', ''), entry.get('model', ''),
                  entry.get('version', entry.get('modelVersion', ''))]
        label = ' '.join(p for p in pieces if p).strip() or entry.get('title', '')

        price = _extract_price(entry)
        # Keep only entries with a URL, some identity, and a plausible price.
        if link and (label or price) and (price is None or price >= 1000):
            out.append({
                'title': label, 'prix': price, 'km': _extract_mileage(entry),
                'annee': _extract_year(entry), 'url': link,
                'photo': _extract_photo(entry), 'devise': 'EUR',
            })
    return out


def _find_listings_array(obj, depth=0):
    if depth > 8 or not isinstance(obj, dict):
        return None
    for key in ('classifieds', 'listings', 'ads', 'results', 'items', 'vehicles', 'data'):
        val = obj.get(key)
        if isinstance(val, list) and val and isinstance(val[0], dict):
            if 'price' in val[0] or 'id' in val[0]:
                return val
    for v in obj.values():
        if isinstance(v, dict):
            r = _find_listings_array(v, depth + 1)
            if r:
                return r
    return None


def _extract_price(item):
    p = item.get('price') or item.get('priceValue')
    if isinstance(p, (int, float)):
        return int(p) if p > 0 else None
    if isinstance(p, dict):
        for k in ('value', 'amount', 'gross', 'net'):
            v = p.get(k)
            if isinstance(v, (int, float)) and v > 0:
                return int(v)
    return None


def _extract_year(item):
    reg = item.get('firstRegistration') or item.get('registrationDate') or item.get('year')
    if isinstance(reg, int) and 1950 < reg < 2030:
        return reg
    if isinstance(reg, dict):
        y = reg.get('year') or reg.get('y')
        if y:
            return int(y)
    if isinstance(reg, str):
        m = re.search(r'(\d{4})', reg)
        if m:
            return int(m.group(1))
    return None


def _extract_mileage(item):
    km = item.get('mileageInKm') or item.get('mileage')
    if isinstance(km, (int, float)):
        return int(km) if km > 0 else None
    if isinstance(km, dict):
        v = km.get('value') or km.get('mileageInKm')
        if isinstance(v, (int, float)) and v > 0:
            return int(v)
    return None


def _extract_photo(item):
    for key in ('images', 'pictures', 'photos', 'media'):
        imgs = item.get(key)
        if isinstance(imgs, list) and imgs:
            img = imgs[0]
            if isinstance(img, str) and img.startswith('http'):
                return img
            if isinstance(img, dict):
                for k in ('url', 'src', 'uri', 'href'):
                    v = img.get(k, '')
                    if v and v.startswith('http'):
                        return v
    for key in ('imageUrl', 'thumbnailUrl', 'photo', 'image'):
        v = item.get(key, '')
        if v and v.startswith('http'):
            return v
    return None


# ─────────────────────────────────────────────────────────────────────────────
# FIRECRAWL — format HTML, parsing via <article data-guid>
# ─────────────────────────────────────────────────────────────────────────────

async def _firecrawl_search(url: str) -> list:
    """Firecrawl fallback for AutoScout24 — fetch rendered HTML and parse the articles.

    Best-effort last resort after Playwright: returns [] on any failure
    (missing/placeholder API key, empty HTML, SDK errors) instead of raising.
    """
    try:
        # Imported lazily so the module still loads when the SDK is absent.
        from firecrawl import Firecrawl
        api_key = os.environ.get('FIRECRAWL_API_KEY', '')
        # Treat placeholder values (keys still starting with "REMPLACER") as unset.
        if not api_key or api_key.startswith('REMPLACER'):
            print('[AS24-Firecrawl] FIRECRAWL_API_KEY non configurée')
            return []
        fc = Firecrawl(api_key=api_key)
        result = fc.scrape(url, formats=['html'])
        # The SDK returns either an object exposing .html or a plain dict,
        # depending on version — handle both shapes.
        html = result.html if hasattr(result, 'html') else ''
        if not html:
            html = result.get('html', '') if isinstance(result, dict) else ''
        if not html:
            print('[AS24-Firecrawl] HTML vide')
            return []
        listings = _parse_as24_html(html, url)
        print(f'[AS24-Firecrawl] ✅ {len(listings)} annonces parsées')
        return listings
    except Exception as e:
        # Broad catch is deliberate: any Firecrawl failure degrades to [].
        print(f'[AS24-Firecrawl] ❌ Erreur: {e}')
        return []
