#!/usr/bin/env python3
"""Convert collected foerderdatenbank.de listing entries (_raw_pages.json) into
schema-compliant JSON Lines. Derives traeger/region from the detail-URL path.
Idempotent: dedupes by quelle_url. Append-safe: merges with existing jsonl."""
import json, re, os, glob, unicodedata

# Site root that each listing URL path is appended to when building quelle_url.
BASE = "https://www.foerderdatenbank.de/"
# Directory containing this script; the raw inputs are globbed from here.
HERE = os.path.dirname(os.path.abspath(__file__))
# JSON Lines output file; read for merging (if present) and then rewritten.
OUT = os.path.join(HERE, "foerderdatenbank-de.jsonl")

# Träger codes as they appear in the URL path, mapped to readable names where
# known. Codes missing from this table are passed through unchanged by parse().
TRAEGER_MAP = {
    "BMWi": "Bundesministerium für Wirtschaft",
    "BMWK": "Bundesministerium für Wirtschaft und Klimaschutz",
    "BMAS": "Bundesministerium für Arbeit und Soziales",
    "BMBF": "Bundesministerium für Bildung und Forschung",
    "BMFTR": "Bundesministerium für Forschung, Technologie und Raumfahrt",
    "KfW": "KfW",
    "BMU": "Bundesministerium für Umwelt",
    "BMI": "Bundesministerium des Innern",
    "BMFSFJ": "Bundesministerium für Familie, Senioren, Frauen und Jugend",
    "LR": "Landwirtschaftliche Rentenbank",
}

# Federal-state URL path segments (ASCII transliterations / abbreviations)
# mapped to their display names.
LAND_NAMES = {
    "Baden-Wuerttemberg": "Baden-Württemberg",
    "Bayern": "Bayern",
    "Berlin": "Berlin",
    "Brandenburg": "Brandenburg",
    "Bremen": "Bremen",
    "Hamburg": "Hamburg",
    "Hessen": "Hessen",
    "Mecklenburg-Vorpommern": "Mecklenburg-Vorpommern",
    "Niedersachsen": "Niedersachsen",
    "NRW": "Nordrhein-Westfalen",
    "Rheinland-Pfalz": "Rheinland-Pfalz",
    "Saarland": "Saarland",
    "Sachsen": "Sachsen",
    "Sachsen-Anhalt": "Sachsen-Anhalt",
    "Schleswig-Holstein": "Schleswig-Holstein",
    "Thueringen": "Thüringen",
}

def slugify(s):
    """Return a lowercase ASCII slug of *s*, at most 90 characters long.

    German umlauts and ß are transliterated (ä→ae, ö→oe, ü→ue, ß→ss), the
    listed quote characters are dropped, any remaining non-ASCII characters
    are reduced to their ASCII base or removed, and every run of characters
    outside ``[a-z0-9]`` collapses into a single hyphen. The result never
    starts or ends with a hyphen; an input without usable characters yields
    an empty string.
    """
    # Compose first so that decomposed umlauts ("o" + U+0308) still match the
    # transliteration table below instead of degrading to a bare "o".
    s = unicodedata.normalize("NFC", s).lower()
    repl = {"ä":"ae","ö":"oe","ü":"ue","ß":"ss","–":"-","„":"","\"":"","“":""}
    for k, v in repl.items():
        s = s.replace(k, v)
    # Decompose what is left and drop everything that is not plain ASCII.
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode()
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    # Cutting at 90 characters can land right after a separator; strip again
    # so the truncated slug does not end in "-".
    return s[:90].rstrip("-")

def parse(path):
    """Derive ``(region, traeger)`` from a detail-URL path.

    Expected layout:
    ``FDB/Content/DE/Foerderprogramm/<Bund|Land|EU>/<Traeger-or-Land?>/<slug>.html``

    Returns ``(None, None)`` when the path has no ``Foerderprogramm`` segment
    or nothing follows it. Otherwise:

    * ``Bund``: region is ``"Bund"``; traeger is the mapped träger code, the
      raw code if it is not in ``TRAEGER_MAP``, or ``"Bund"`` when the path
      has no träger segment.
    * ``EU``: ``("EU", "Europäische Union")``.
    * ``Land``: region and traeger are both the mapped state name, the raw
      segment if it is not in ``LAND_NAMES``, or ``"Land"`` when the path has
      no state segment.
    * anything else: the scope segment is used for both values.
    """
    parts = path.split("/")
    try:
        i = parts.index("Foerderprogramm")
    except ValueError:
        return None, None
    rest = parts[i + 1:]
    if not rest:
        return None, None
    scope = rest[0]
    # The final segment is the programme's own <slug>.html, so a träger/Land
    # segment only exists when at least three segments follow
    # "Foerderprogramm". Using the same test for both branches keeps a
    # "Land/<slug>.html" path from reporting the slug file name as the region.
    qualifier = rest[1] if len(rest) > 2 else None
    if scope == "Bund":
        region = "Bund"
        traeger = "Bund" if qualifier is None else TRAEGER_MAP.get(qualifier, qualifier)
    elif scope == "EU":
        region = "EU"
        traeger = "Europäische Union"
    elif scope == "Land":
        region = "Land" if qualifier is None else LAND_NAMES.get(qualifier, qualifier)
        traeger = region
    else:
        region = scope
        traeger = scope
    return region, traeger

def main():
    """Rebuild the JSON Lines catalogue from the raw listing dumps.

    Loads every ``_raw_*.json`` file next to this script (in sorted file-name
    order) and concatenates their contents; each item is read via its
    ``"url"`` and ``"name"`` keys. Entries already present in ``OUT`` are
    loaded keyed by ``quelle_url``; every raw row then replaces or adds the
    entry for its URL, and the merged set is written back to ``OUT``.

    Ids are slugs of the row name (falling back to the last URL path segment)
    and are kept unique across the whole output, including existing entries
    that do not reappear in the current raw files: a clashing slug gets a
    ``-2``, ``-3``, ... suffix. A URL that already owns an id keeps it on
    re-runs, regardless of row order.

    Raises:
        KeyError: if a raw row lacks ``"url"``/``"name"`` or an existing line
            lacks ``"quelle_url"``.
        json.JSONDecodeError: if a raw file or an existing line is not valid JSON.
    """
    rows = []
    for fn in sorted(glob.glob(os.path.join(HERE, "_raw_*.json"))):
        with open(fn, encoding="utf-8") as f:
            rows.extend(json.load(f))

    # Load the previous output so entries missing from this run are retained.
    existing = {}
    if os.path.exists(OUT):
        with open(OUT, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    o = json.loads(line)
                    existing[o["quelle_url"]] = o

    # id -> URL currently holding it. Seeding this from the existing file is
    # what prevents a new row from taking an id that belongs to an entry
    # which is not part of the current raw files.
    id_owner = {o["id"]: u for u, o in existing.items() if "id" in o}

    for r in rows:
        path = r["url"].lstrip("/")
        url = BASE + path
        region, traeger = parse(path)
        jur = "EU" if region == "EU" else "DE"
        base_id = slugify(r["name"]) or slugify(path.split("/")[-1])
        sid = base_id
        n = 2
        # An id is usable when nobody holds it yet or this very URL holds it
        # (a repeated row, or a re-run over the same data).
        while id_owner.get(sid, url) != url:
            sid = f"{base_id}-{n}"
            n += 1
        # If this URL's id changed, release the old one for later rows.
        previous_id = existing.get(url, {}).get("id")
        if previous_id != sid and id_owner.get(previous_id) == url:
            del id_owner[previous_id]
        id_owner[sid] = url
        entry = {
            "id": sid, "name": r["name"], "traeger": traeger or "",
            "jurisdiktion": jur, "region": region or "", "themen": [],
            "foerderart_grob": "", "status_grob": "",
            "quelle_url": url, "source_catalog": "foerderdatenbank.de",
            # NOTE(review): fixed snapshot date, not the run date — confirm intended.
            "last_seen": "2026-05-25", "vertieft": False, "relevanz_breact": "ungeprueft",
        }
        existing[url] = entry

    with open(OUT, "w", encoding="utf-8") as f:
        for o in existing.values():
            f.write(json.dumps(o, ensure_ascii=False) + "\n")
    print(f"wrote {len(existing)} unique entries to {OUT}")

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
