"""Mantara v8 — in-engine normalisation pass.

Runs immediately after the LLM produces schema_json. Owns every repair that
used to live in step-05 post-processors:

    1. cfg_* tables are the source of truth for status/type/category lookups.
       enum_types[] is lifted into proper cfg_* tables.
    2. LLM-emitted cfg_* tables are coerced to the canonical Mantara shape
       (PK = <cfg_name>_id, columns = code|label|description|is_active).
    3. Every FK target referenced by any business table actually exists.
       Missing cfg_* tables are auto-created. FKs to missing business tables
       are stripped (kept as comments).
    4. Business-table columns referencing cfg_* are rewritten to the canonical
       <col>_id INT shape with FK to <cfg>(<cfg>_id).
    5. CIR rules are surfaced into table comments / assumptions.
    6. CIR workflow states backfill cfg seed values.
    7. SaaS audit columns (version, notes, metadata, deleted_at) injected.
    8. Paired *_history tables auto-created for DOCUMENT/MASTER archetypes.
    9. Indexes auto-emitted for FKs, UNIQUE keys, status/type/is_active.
   10. schema_name regenerated to a unique sch_<date>_<time>_<rand> (36 chars).

Deterministic, idempotent, $0.
"""
from __future__ import annotations

import re
import uuid
from datetime import datetime, timezone
from typing import Any


# --------------------------------------------------------------------------- #
# Schema name regenerator — unique per generation
# --------------------------------------------------------------------------- #
# Format: sch_{YYYYMMDD}_{HHMMSS}_{16-hex-chars}  →  exactly 36 chars
#   sch_ + YYYYMMDD + _ + HHMMSS + _ = 20 chars
#   16-hex suffix = 16 chars
#   Total = 36 (well under PostgreSQL's 63-char identifier limit)
#   Alphanumeric + underscore only — safe as an unquoted SQL identifier.
# Idempotence guard for regenerate_schema_name(): names already in the
# canonical sch_* shape are skipped unless force=True.
_SCHEMA_NAME_PATTERN = re.compile(r"^sch_\d{8}_\d{6}_[0-9a-f]{16}$")


def regenerate_schema_name(schema_json: dict, force: bool = False) -> str | None:
    """Replace schema_name with a fresh `sch_<date>_<time>_<rand>` (36 chars).

    Rewrites every FK reference in the JSON so the new name propagates to
    rendered SQL. Idempotent — skips if already in sch_* shape unless
    `force=True`. Returns the new schema_name (or None if skipped).
    """
    old = schema_json.get("schema_name") or "schema"
    if not force and _SCHEMA_NAME_PATTERN.match(old):
        return None

    now = datetime.now(timezone.utc)
    new_name = (
        f"sch_{now.strftime('%Y%m%d')}_{now.strftime('%H%M%S')}_"
        f"{uuid.uuid4().hex[:16]}"
    )
    assert len(new_name) == 36, f"schema_name length {len(new_name)} != 36"

    schema_json["schema_name"] = new_name
    old_prefix = f"{old}."
    new_prefix = f"{new_name}."

    for menu in schema_json.get("menus", []) or []:
        for sub in menu.get("submenus") or []:
            for tbl in sub.get("tables") or []:
                if not isinstance(tbl, dict):
                    continue
                for col in tbl.get("columns") or []:
                    cons = col.get("constraints") or ""
                    if old_prefix in cons:
                        col["constraints"] = cons.replace(old_prefix, new_prefix)
                    cmt = col.get("comment") or ""
                    if old_prefix in cmt:
                        col["comment"] = cmt.replace(old_prefix, new_prefix)
                for fk in tbl.get("foreign_keys") or []:
                    ref = fk.get("references") or ""
                    if old_prefix in ref:
                        fk["references"] = ref.replace(old_prefix, new_prefix)
    return new_name


# UI menu that hosts all cfg_* lookup tables (created on demand by
# _find_or_create_cfg_menu).
_CFG_MENU_NAME = "Configuration"
_CFG_MENU_DESC = (
    "System configuration lookup tables. All status, type, and category "
    "values are managed here and can be edited by authorised users."
)

# Canonical column set every cfg_* table must have (in this order).
# Maps column name -> (SQL type, constraint clause); "" means nullable.
_CANONICAL_CFG_COLUMNS = {
    "code":        ("VARCHAR(50)",  "NOT NULL UNIQUE"),
    "label":       ("VARCHAR(100)", "NOT NULL"),
    "description": ("TEXT",         ""),                    # nullable
    "is_active":   ("BOOLEAN",      "NOT NULL DEFAULT TRUE"),
}


# --------------------------------------------------------------------------- #
# Helpers
# --------------------------------------------------------------------------- #
def _snake(name: str) -> str:
    if not name:
        return ""
    s = re.sub(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", "_", name)
    return re.sub(r"[^A-Za-z0-9]+", "_", s).strip("_").lower()


def _strip_enum_suffix(type_name: str) -> str:
    bare = type_name.split(".")[-1]
    if bare.endswith("_enum"):
        bare = bare[: -len("_enum")]
    if not bare.startswith("cfg_"):
        bare = f"cfg_{bare}"
    return bare


def _value_to_code(v: str) -> str:
    return re.sub(r"[^a-z0-9_]", "_", str(v).lower()).strip("_")


def _value_to_label(v: str) -> str:
    return str(v).replace("_", " ").title()


def _all_tables(schema_json: dict) -> list[dict]:
    out: list[dict] = []
    for menu in schema_json.get("menus", []):
        for sub in menu.get("submenus") or []:
            for tbl in sub.get("tables") or []:
                if isinstance(tbl, dict):
                    out.append(tbl)
    return out


def _all_table_names(schema_json: dict) -> set[str]:
    """All known table names, seeded with the built-in menu/submenu tables."""
    names = {
        tbl.get("table_name")
        for tbl in _all_tables(schema_json)
        if tbl.get("table_name")
    }
    return names | {"menu", "submenu"}


def _existing_cfg_tables(schema_json: dict) -> dict[str, dict]:
    """Map cfg_* table name → table dict for every lookup table already defined."""
    found: dict[str, dict] = {}
    for tbl in _all_tables(schema_json):
        name = tbl.get("table_name", "")
        if name.startswith("cfg_"):
            found[name] = tbl
    return found


def _next_menu_id(schema_json: dict) -> int:
    return max((m.get("menu_id", 0) for m in schema_json.get("menus", [])), default=0) + 1


def _find_or_create_cfg_menu(schema_json: dict) -> dict:
    """Return the Configuration menu, creating and appending it when absent."""
    wanted = _CFG_MENU_NAME.lower()
    for menu in schema_json.get("menus", []):
        if (menu.get("menu_name") or "").lower() == wanted:
            return menu
    menu_id = _next_menu_id(schema_json)
    created = {
        "menu_id": menu_id,
        "menu_name": _CFG_MENU_NAME,
        "sequence_number": menu_id,
        "description": _CFG_MENU_DESC,
        "submenus": [],
    }
    schema_json.setdefault("menus", []).append(created)
    return created


# --------------------------------------------------------------------------- #
# Step 1 — lift enum_types[] into cfg_* tables
# --------------------------------------------------------------------------- #
def _build_cfg_table(
    cfg_name: str,
    values: list[str],
    description: str,
    submenu_id: int,
    schema_name: str,
) -> dict:
    pk_col = f"{cfg_name}_id"
    return {
        "table_name": cfg_name,
        "comment": description or f"Configuration lookup: {cfg_name}",
        "columns": [
            {"name": pk_col, "type": "SERIAL", "constraints": "PRIMARY KEY"},
            {"name": "code", "type": "VARCHAR(50)", "constraints": "NOT NULL UNIQUE"},
            {"name": "label", "type": "VARCHAR(100)", "constraints": "NOT NULL"},
            {"name": "description", "type": "TEXT"},
            {"name": "is_active", "type": "BOOLEAN", "constraints": "NOT NULL DEFAULT TRUE"},
            {
                "name": "submenu_id",
                "type": "INT",
                "constraints": (
                    f"NOT NULL DEFAULT {submenu_id} "
                    f"REFERENCES {schema_name}.submenu(submenu_id)"
                ),
            },
        ],
        "foreign_keys": [],
        "_seed_values": list(values or []),
    }


def lift_enum_types(schema_json: dict) -> int:
    """Convert every enum_types[] entry to a real cfg_* table. Returns count lifted.

    Each enum becomes its own submenu under the Configuration menu, holding a
    single canonical cfg_* table seeded with the enum's values. Enums whose
    cfg_* table already exists are skipped. Always clears enum_types[]
    afterwards so the renderer never emits CREATE TYPE.
    """
    enum_types = schema_json.get("enum_types") or []
    if not enum_types:
        return 0
    schema_name = schema_json.get("schema_name") or "schema"
    cfg_tables = _existing_cfg_tables(schema_json)
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    existing_sm_ids = {
        sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []
    }
    # Submenu ids live in the menu's 100-block (menu 3 → 301, 302, ...).
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        # Existing ids were below the block floor — clamp up into the block.
        next_sm = cfg_menu_id * 100 + 1

    lifted = 0
    for enum in enum_types:
        cfg_name = _strip_enum_suffix(enum.get("type_name", ""))
        # Skip unusable names ("cfg_" means the type name was empty) and
        # enums that already have a cfg_* table.
        if not cfg_name or cfg_name == "cfg_" or cfg_name in cfg_tables:
            continue
        cfg_table = _build_cfg_table(
            cfg_name=cfg_name,
            values=enum.get("values", []),
            description=enum.get("description", ""),
            submenu_id=next_sm,
            schema_name=schema_name,
        )
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": cfg_name.replace("_", " ").title(),
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": enum.get("description") or f"Configuration: {cfg_name}",
            "tables": [cfg_table],
        })
        cfg_tables[cfg_name] = cfg_table
        next_sm += 1
        lifted += 1

    schema_json["enum_types"] = []  # renderer must not emit CREATE TYPE
    return lifted


# --------------------------------------------------------------------------- #
# Step 2 — auto-create cfg_* tables for FK targets the LLM forgot
# --------------------------------------------------------------------------- #
def _scan_cfg_fk_targets(schema_json: dict) -> set[str]:
    """Collect every cfg_* table name referenced by business-table FKs.

    Scans both inline constraints and table-level foreign_keys entries;
    cfg_* tables themselves are skipped. Trailing _enum suffixes are stripped.
    """
    pat = re.compile(r"(?:\w+\.)?(cfg_[a-zA-Z0-9_]+)\s*\(", re.IGNORECASE)
    found: set[str] = set()
    for tbl in _all_tables(schema_json):
        if tbl.get("table_name", "").startswith("cfg_"):
            continue
        sources = [col.get("constraints") or "" for col in tbl.get("columns") or []]
        sources += [fk.get("references") or "" for fk in tbl.get("foreign_keys") or []]
        for text in sources:
            found.update(m.group(1) for m in pat.finditer(text))
    return {name[:-5] if name.endswith("_enum") else name for name in found}


def create_missing_cfg_tables(schema_json: dict) -> int:
    """Create stub cfg_* tables for every FK target not yet defined.

    Scans business-table FKs for cfg_* targets and materialises an empty
    canonical cfg_* table (no seed values) in its own Configuration submenu
    for each missing one. Returns the number of tables created.
    """
    existing = set(_existing_cfg_tables(schema_json).keys())
    # Sorted so auto-assigned submenu ids are deterministic across runs.
    missing = sorted(_scan_cfg_fk_targets(schema_json) - existing)
    if not missing:
        return 0

    schema_name = schema_json.get("schema_name") or "schema"
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    existing_sm_ids = {
        sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []
    }
    # Same 100-block submenu numbering as lift_enum_types (menu 3 → 301, ...).
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        next_sm = cfg_menu_id * 100 + 1

    for cfg_name in missing:
        cfg_table = _build_cfg_table(
            cfg_name=cfg_name,
            values=[],
            description=f"Configuration lookup (auto-created from FK target): {cfg_name}",
            submenu_id=next_sm,
            schema_name=schema_name,
        )
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": cfg_name.replace("_", " ").title(),
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": f"Auto-created lookup: {cfg_name}",
            "tables": [cfg_table],
        })
        next_sm += 1
    return len(missing)


# --------------------------------------------------------------------------- #
# Step 3 — coerce every cfg_* table to canonical shape
# --------------------------------------------------------------------------- #
def normalise_cfg_table(tbl: dict, schema_name: str, submenu_id: int | None = None) -> int:
    """Coerce one cfg_* table to the canonical Mantara shape, in place.

    Repairs, in order: rename the PRIMARY KEY column to <cfg_name>_id (type
    forced to SERIAL), insert missing canonical columns
    (code/label/description/is_active) before submenu_id, relax NOT NULL on
    description, add DEFAULT 0 to a NOT NULL display_order, and ensure
    submenu_id exists with a usable DEFAULT so seed inserts (which omit it)
    never hit a NOT NULL violation. Non-cfg tables are left untouched.
    Returns the number of individual repairs made.
    """
    name = tbl.get("table_name", "")
    if not name.startswith("cfg_"):
        return 0
    cols = tbl.get("columns") or []
    repairs = 0
    canonical_pk = f"{name}_id"

    # 1. PK rename
    pk_idx: int | None = None
    for i, c in enumerate(cols):
        cons = (c.get("constraints") or "").upper()
        if "PRIMARY KEY" in cons:
            pk_idx = i
            break
    if pk_idx is not None:
        pk_col = cols[pk_idx]
        if pk_col.get("name") != canonical_pk:
            old = pk_col.get("name")
            pk_col["name"] = canonical_pk
            if "SERIAL" not in (pk_col.get("type") or "").upper():
                pk_col["type"] = "SERIAL"
            pk_col["constraints"] = "PRIMARY KEY"
            # Record the rename so downstream passes can trace the change.
            tbl.setdefault("_normalised", {})["pk_renamed_from"] = old
            repairs += 1

    existing_names = {c.get("name") for c in cols}

    # 2. Ensure canonical columns
    for cname, (ctype, ccons) in _CANONICAL_CFG_COLUMNS.items():
        if cname in existing_names:
            if cname == "description":
                # Relax NOT NULL so seed inserts NULL safely
                for c in cols:
                    if c.get("name") == "description":
                        cur = c.get("constraints") or ""
                        if re.search(r"\bNOT\s+NULL\b", cur, re.IGNORECASE):
                            c["constraints"] = re.sub(
                                r"\bNOT\s+NULL\b", "", cur, flags=re.IGNORECASE
                            ).strip()
                            repairs += 1
            continue
        new_col = {"name": cname, "type": ctype, "constraints": ccons}
        # Insert before submenu_id (if present) so that stays the last column.
        insert_at = len(cols)
        for i, c in enumerate(cols):
            if c.get("name") == "submenu_id":
                insert_at = i
                break
        cols.insert(insert_at, new_col)
        repairs += 1

    # 3. display_order — give DEFAULT 0 if NOT NULL without DEFAULT
    for c in cols:
        if c.get("name") != "display_order":
            continue
        cur = c.get("constraints") or ""
        if re.search(r"\bNOT\s+NULL\b", cur, re.IGNORECASE) and not re.search(
            r"\bDEFAULT\b", cur, re.IGNORECASE
        ):
            c["constraints"] = (cur + " DEFAULT 0").strip()
            repairs += 1
        break

    # 4. Ensure submenu_id present
    if "submenu_id" not in {c.get("name") for c in cols}:
        if submenu_id is not None:
            sm_constraints = (
                f"NOT NULL DEFAULT {submenu_id} "
                f"REFERENCES {schema_name}.submenu(submenu_id)"
            )
        else:
            # No submenu known — keep the column nullable rather than break inserts.
            sm_constraints = f"REFERENCES {schema_name}.submenu(submenu_id)"
        cols.append({
            "name": "submenu_id",
            "type": "INT",
            "constraints": sm_constraints,
        })
        repairs += 1
    else:
        # 4.5. Repair existing submenu_id NOT NULL without DEFAULT — seed inserts skip
        # submenu_id, so a missing DEFAULT causes a NOT NULL violation at insert time.
        for c in cols:
            if c.get("name") != "submenu_id":
                continue
            cur = c.get("constraints") or ""
            if (re.search(r"\bNOT\s+NULL\b", cur, re.IGNORECASE)
                    and not re.search(r"\bDEFAULT\b", cur, re.IGNORECASE)):
                if submenu_id is not None:
                    c["constraints"] = re.sub(
                        r"\bNOT\s+NULL\b",
                        f"NOT NULL DEFAULT {submenu_id}",
                        cur, flags=re.IGNORECASE, count=1,
                    )
                else:
                    # No default available — drop NOT NULL instead.
                    c["constraints"] = re.sub(
                        r"\bNOT\s+NULL\b", "", cur, flags=re.IGNORECASE
                    ).strip()
                repairs += 1
            break

    tbl["columns"] = cols
    return repairs


# --------------------------------------------------------------------------- #
# Step 4 — rewrite business-table FKs to use cfg_*(<cfg>_id)
# --------------------------------------------------------------------------- #
def rewrite_business_fks_to_cfg(
    table: dict, cfg_tables: dict[str, dict], schema_name: str
) -> int:
    """Rewrite one business table's cfg_* references to the canonical shape.

    Each column referencing a known cfg_* table is renamed to <col>_id, typed
    INT, and given an inline ``REFERENCES <schema>.<cfg>(<cfg>_id)`` clause;
    stale "References cfg_X" comment wording is dropped. Table-level
    foreign_keys entries pointing at cfg_* tables are removed (the inline
    clause replaces them). Returns the number of columns rewritten.
    """
    rewritten = 0
    fks = table.get("foreign_keys") or []
    new_fks: list[dict] = []
    # NOTE(review): assumes every column dict carries a "name" key (KeyError
    # otherwise) — other passes use .get(); confirm upstream guarantees this.
    cols_by_name = {c["name"]: c for c in table.get("columns") or []}
    cfg_targets: dict[str, str] = {}

    # Pass 1 — table-level FK entries that point at known cfg_* tables.
    for fk in fks:
        ref = fk.get("references") or ""
        m = re.search(
            r"(?:\w+\.)?(cfg_[a-zA-Z0-9_]+?)(?:_enum)?\s*\(\w+\)", ref
        )
        if m and m.group(1) in cfg_tables:
            cfg_targets[fk["column"]] = m.group(1)
        else:
            new_fks.append(fk)

    # Pass 2 — inline REFERENCES, only when pass 1 found nothing.
    # NOTE(review): presumably intentional (table-level FKs win); confirm the
    # LLM never emits both forms for different columns of one table.
    if not cfg_targets:
        for col in table.get("columns") or []:
            cons = col.get("constraints") or ""
            m = re.search(
                r"REFERENCES\s+(?:\w+\.)?(cfg_[a-zA-Z0-9_]+?)(?:_enum)?\s*\(\w+\)",
                cons, re.IGNORECASE
            )
            if m and m.group(1) in cfg_tables:
                cfg_targets[col["name"]] = m.group(1)

    for orig_col, cfg_tbl in cfg_targets.items():
        new_col = orig_col if orig_col.endswith("_id") else f"{orig_col}_id"
        col = cols_by_name.get(orig_col)
        if col is None:
            # FK names a column the table doesn't actually have — skip.
            continue
        col["name"] = new_col
        col["type"] = "INT"
        cons = col.get("constraints") or ""
        # Strip any existing REFERENCES (we'll re-insert with canonical target)
        cons = re.sub(r"REFERENCES\s+\S+\s*\([^)]+\)", "", cons, flags=re.IGNORECASE)
        cons = re.sub(r"\s{2,}", " ", cons).strip()
        # Re-insert as inline REFERENCES so renderer keeps it on the column line
        # (cfg_* tables render first, so this is a backward ref — never deferred)
        target = f"{schema_name}.{cfg_tbl}({cfg_tbl}_id)"
        col["constraints"] = (cons + f" REFERENCES {target}").strip() if cons else f"REFERENCES {target}"
        # Update / clean the comment so it points at the canonical target
        prev_comment = col.get("comment") or ""
        # Drop any stale "References cfg_X.code" wording — the inline clause
        # is now self-documenting.
        prev_comment = re.sub(
            r"References\s+cfg_[a-zA-Z0-9_]+(?:\.\w+)?\.?\s*",
            "", prev_comment, flags=re.IGNORECASE,
        ).strip(" .;,")
        if prev_comment:
            col["comment"] = prev_comment
        elif "comment" in col:
            del col["comment"]
        rewritten += 1

    # Keep table-level FKs only for non-cfg references (the LLM's other FKs).
    table["foreign_keys"] = new_fks
    return rewritten


# --------------------------------------------------------------------------- #
# Step 5 — strip dangling FKs to non-existent tables
# --------------------------------------------------------------------------- #
def strip_dangling_fks(schema_json: dict) -> int:
    """Remove FK references whose target table does not exist in the schema.

    Handles both inline ``REFERENCES schema.table(col)`` clauses in column
    constraints and table-level ``foreign_keys`` entries. A stripped inline
    reference leaves an explanatory note in the column comment. Returns the
    number of references removed.
    """
    known = _all_table_names(schema_json)
    stripped = 0
    inline_pat = re.compile(
        r"REFERENCES\s+(?:(\w+)\.)?(\w+)\s*\(\s*(\w+)\s*\)", re.IGNORECASE
    )

    # Hoisted out of the loops — the original redefined this closure for
    # every column, but it only depends on `known`, which is invariant.
    def _maybe_strip(match: "re.Match[str]") -> str:
        target = match.group(2)
        return match.group(0) if target in known else ""

    for tbl in _all_tables(schema_json):
        for col in tbl.get("columns") or []:
            cons = col.get("constraints") or ""
            if not cons:
                continue
            new_cons = inline_pat.sub(_maybe_strip, cons)
            if new_cons != cons:
                # Collapse the whitespace hole left by the removed clause.
                col["constraints"] = re.sub(r"\s{2,}", " ", new_cons).strip()
                prev = (col.get("comment") or "").rstrip(". ")
                col["comment"] = (
                    (prev + ". " if prev else "")
                    + "FK target table missing — reference removed"
                )
                stripped += 1

        # Table-level FK entries: keep only those whose target table exists.
        kept: list[dict] = []
        for fk in tbl.get("foreign_keys") or []:
            ref = fk.get("references") or ""
            m = re.match(r"(?:\w+\.)?(\w+)\s*\(", ref)
            target = m.group(1) if m else None
            if target and target not in known:
                stripped += 1
                continue
            kept.append(fk)
        tbl["foreign_keys"] = kept
    return stripped


def repair_fk_types(schema_json: dict) -> int:
    """Coerce FK columns that reference integer PKs to INT.

    The LLM occasionally emits UUID or BIGINT for FK columns even though all
    PKs are SERIAL (integer).  A type mismatch is a hard Postgres error:
    'Key columns X and id are of incompatible types: uuid and integer.'
    """
    wrong_type = re.compile(r"\bUUID\b|\bBIGINT\b", re.IGNORECASE)
    has_ref = re.compile(r"\bREFERENCES\b", re.IGNORECASE)
    fixed = 0
    for tbl in _all_tables(schema_json):
        for col in tbl.get("columns") or []:
            if not has_ref.search(col.get("constraints") or ""):
                continue  # not an FK column — leave the type alone
            if wrong_type.search(col.get("type") or ""):
                col["type"] = "INT"
                fixed += 1
    return fixed


def repair_fk_referenced_columns(schema_json: dict) -> int:
    """Rewrite FK references that name a non-existent column to use the real PK.

    The LLM sometimes writes REFERENCES schema.bmiresult(bmi_result_id) when
    the actual PK column is `id`, producing:
      'column bmi_result_id referenced in foreign key constraint does not exist'

    For every REFERENCES clause we look up the target table's real PK column
    name and replace the wrong column with the correct one.
    """
    # Build map: table_name → pk_column_name
    pk_map: dict[str, str] = {}
    for tbl in _all_tables(schema_json):
        name = tbl.get("table_name", "")
        for col in tbl.get("columns") or []:
            if "PRIMARY KEY" in (col.get("constraints") or "").upper():
                pk_map[name] = col["name"]
                break
        else:
            # No explicit PK column found — assume the conventional `id`.
            pk_map.setdefault(name, "id")

    ref_pat = re.compile(
        r"(REFERENCES\s+(?:\w+\.)?(\w+)\s*\(\s*)(\w+)(\s*\))",
        re.IGNORECASE,
    )
    fixed = 0

    def _fix_ref(m: "re.Match[str]") -> str:
        # Swap the referenced column for the target table's real PK name.
        nonlocal fixed
        prefix, tbl_name, col_name, suffix = m.group(1), m.group(2), m.group(3), m.group(4)
        real_pk = pk_map.get(tbl_name)
        if real_pk and col_name != real_pk:
            fixed += 1
            return prefix + real_pk + suffix
        return m.group(0)

    for tbl in _all_tables(schema_json):
        for col in tbl.get("columns") or []:
            cons = col.get("constraints") or ""
            if "REFERENCES" not in cons.upper():
                continue
            new_cons = ref_pat.sub(_fix_ref, cons)
            if new_cons != cons:
                col["constraints"] = new_cons

        for fk in tbl.get("foreign_keys") or []:
            ref = fk.get("references") or ""
            if "REFERENCES" not in ref.upper():
                # foreign_keys entries store just the target, e.g. "schema.table(col)"
                # NOTE(review): entries that DO contain the REFERENCES keyword are
                # skipped entirely — presumably they never occur here; confirm.
                m = re.match(r"((?:\w+\.)?(\w+)\s*\(\s*)(\w+)(\s*\))", ref)
                if m:
                    prefix, tbl_name, col_name, suffix = m.group(1), m.group(2), m.group(3), m.group(4)
                    real_pk = pk_map.get(tbl_name)
                    if real_pk and col_name != real_pk:
                        fk["references"] = prefix + real_pk + suffix
                        fixed += 1

    return fixed


# --------------------------------------------------------------------------- #
# Step 6 — backfill cfg seed values from CIR workflow states
# --------------------------------------------------------------------------- #
def backfill_seeds_from_cir(schema_json: dict, cir: dict) -> int:
    """Fill empty cfg_* seed lists from CIR workflow states.

    Workflow states are grouped by the snake-cased bound entity (falling back
    to the workflow name). Each cfg_* table that has no seed values yet is
    matched by its stem — common status/type suffixes stripped — first
    exactly, then by substring overlap. Returns the number of tables
    backfilled.
    """
    workflows = [
        wf for wf in (cir.get("workflows") or [])
        if isinstance(wf, dict) and (wf.get("states") or [])
    ]
    if not workflows:
        return 0
    by_entity: dict[str, list[str]] = {}
    for wf in workflows:
        bound = wf.get("bound_entity") or wf.get("name") or ""
        key = _snake(bound)
        if key:
            by_entity.setdefault(key, []).extend(
                str(s) for s in (wf.get("states") or []) if s
            )

    backfilled = 0
    for tbl in _all_tables(schema_json):
        name = tbl.get("table_name", "")
        if not name.startswith("cfg_"):
            continue
        if tbl.get("_seed_values") or tbl.get("values"):
            continue  # already has seeds — never overwrite
        # cfg_order_status → stem "order" (first matching suffix stripped).
        stem = name[4:]
        for suffix in ("_status", "_state", "_type", "_category", "_kind"):
            if stem.endswith(suffix):
                stem = stem[: -len(suffix)]
                break
        states = by_entity.get(stem)
        if not states:
            # Fuzzy fallback: first entity whose key overlaps the stem.
            for k, v in by_entity.items():
                if k in stem or stem in k:
                    states = v
                    break
        if states:
            tbl["_seed_values"] = states
            backfilled += 1
    return backfilled


# --------------------------------------------------------------------------- #
# Step 7 — surface CIR rules into table comments / assumptions
# --------------------------------------------------------------------------- #
def _find_table_for_rule(schema_json: dict, applies_to: str) -> dict | None:
    """Best-effort match of a CIR rule's applies_to entity to a schema table.

    First pass: exact or substring match against business tables (cfg_*
    skipped). Second pass, for plural targets only: retry with the naive
    singular form (trailing 's' dropped).
    """
    if not applies_to:
        return None
    target = _snake(applies_to)
    tables = _all_tables(schema_json)

    for tbl in tables:
        name = tbl.get("table_name", "")
        if not name or name.startswith("cfg_"):
            continue
        if name == target or target in name or name in target:
            return tbl

    if not target.endswith("s"):
        return None
    singular = target[:-1]
    for tbl in tables:
        name = tbl.get("table_name", "")
        if name == singular or singular in name:
            return tbl
    return None


def _rule_already_present(rule_name: str, table: dict) -> bool:
    """True when the rule's snake-cased name already appears in the table's
    comment or in any column comment/constraint (case-insensitive)."""
    needle = _snake(rule_name)
    parts: list[str] = []
    if table.get("comment"):
        parts.append(str(table["comment"]).lower())
    for col in table.get("columns") or []:
        for key in ("comment", "constraints"):
            if col.get(key):
                parts.append(str(col[key]).lower())
    return needle in " ".join(parts)


def surface_cir_rules(schema_json: dict, cir: dict) -> dict:
    """Surface CIR business rules into the schema.

    Rules that match a business table are appended to that table's comment;
    the rest are recorded in the top-level assumptions list. Rules already
    present (by snake-cased name) are only counted. Returns a stats dict
    with one counter per outcome.
    """
    rules = [r for r in (cir.get("rules") or []) if isinstance(r, dict) and r.get("name")]
    stats = {
        "rules_total": len(rules),
        "appended_to_table_comment": 0,
        "added_to_assumptions": 0,
        "already_present": 0,
    }
    if not rules:
        return stats
    schema_json.setdefault("assumptions", [])

    for rule in rules:
        rule_name = rule.get("name", "")
        rule_desc = (rule.get("description") or "").strip()
        applies_to = rule.get("applies_to", "")
        table = _find_table_for_rule(schema_json, applies_to)

        if table is None:
            # No matching table — record as a schema-level assumption.
            needle = _snake(rule_name)
            existing = schema_json.get("assumptions") or []
            if any(needle in str(a).lower() for a in existing):
                stats["already_present"] += 1
                continue
            schema_json["assumptions"].append(
                f"Rule {rule_name} (applies to {applies_to or 'system'}): {rule_desc}"
            )
            stats["added_to_assumptions"] += 1
            continue

        if _rule_already_present(rule_name, table):
            stats["already_present"] += 1
            continue
        current = (table.get("comment") or "").rstrip(". ")
        addition = f"Rule {rule_name}: {rule_desc}"
        table["comment"] = f"{current}. {addition}" if current else addition
        stats["appended_to_table_comment"] += 1
    return stats


# --------------------------------------------------------------------------- #
# Match a cfg_* stem to its likely business table — multi-strategy
# --------------------------------------------------------------------------- #
# Industry-standard abbreviations the LLM commonly uses for cfg names even
# when the business table uses the long form. Bidirectional match.
# Keys are cfg stems; values are snake_case business-table names. Looked up
# in _match_business_for_cfg_stem (strategy 3); extend as new domain
# abbreviations appear.
_BIZ_ABBREVIATIONS = {
    "asn":  "advanced_shipment_notice",
    "rcr":  "receiving_confirmation_report",
    "po":   "purchase_order",
    "so":   "sales_order",
    "wo":   "work_order",
    "rfq":  "request_for_quotation",
    "rma":  "return_merchandise_authorization",
    "bol":  "bill_of_lading",
    "gr":   "goods_receipt",
    "pcn":  "product_change_notice",
    "ecn":  "engineering_change_notice",
    "kpi":  "key_performance_indicator",
    "sla":  "service_level_agreement",
    "sku":  "stock_keeping_unit",
    "upc":  "universal_product_code",
    "ean":  "european_article_number",
    "pos":  "point_of_sale",
    "crm":  "customer_relationship",
    "erp":  "enterprise_resource_plan",
    "wms":  "warehouse_management",
}


def _initials(name: str) -> str:
    """`receiving_confirmation_report` → `rcr`."""
    return "".join(p[:1] for p in (name or "").split("_") if p).lower()


def _singularise(name: str) -> str:
    """Cheap English singulariser. `orders` → `order`, `entities` → `entity`."""
    if not name or len(name) < 3:
        return name
    if name.endswith("ies"):
        return name[:-3] + "y"
    if name.endswith("ses") or name.endswith("xes") or name.endswith("zes"):
        return name[:-2]
    if name.endswith("s") and not name.endswith("ss"):
        return name[:-1]
    return name


def _match_business_for_cfg_stem(stem: str, business_index: dict[str, dict]) -> dict | None:
    """Try every reasonable strategy to match a cfg_<stem>_<suffix> to an
    existing business table. Returns the first matching table, or None.

    Strategies, in priority order:
      1. Exact match (`stem == table_name`)
      2. Singular/plural pivot (`stem` <-> singular(table_name))
      3. Industry abbreviation expansion (`asn` → `advanced_shipment_notice`)
      4. Initials match (`rcr` matches initials of `receiving_confirmation_report`)
      5. Substring match (longest overlap wins)
      6. Reverse abbreviation lookup (table name's initials == stem)
    """
    if not stem or not business_index:
        return None

    # Strategy 1 — exact
    if stem in business_index:
        return business_index[stem]

    # Strategy 2 — plural/singular pivot
    sing_stem = _singularise(stem)
    if sing_stem in business_index:
        return business_index[sing_stem]
    for tname in business_index:
        if _singularise(tname) == stem:
            return business_index[tname]

    # Strategy 3 — known abbreviation expansion
    expanded = _BIZ_ABBREVIATIONS.get(stem)
    if expanded:
        if expanded in business_index:
            return business_index[expanded]
        # also try plural/singular of the expansion
        for tname in business_index:
            if tname == expanded or _singularise(tname) == expanded:
                return business_index[tname]

    # Strategy 4 — initials match (forward: table → initials == stem)
    for tname, tbl in business_index.items():
        if _initials(tname) == stem:
            return tbl

    # Strategy 5 — substring overlap (longest wins)
    # Strict '>' keeps the first-seen table on equal-length ties, so the
    # result is stable for a given insertion order.
    best, best_overlap = None, 0
    for tname, tbl in business_index.items():
        if stem in tname or tname in stem:
            overlap = min(len(stem), len(tname))
            if overlap > best_overlap:
                best, best_overlap = tbl, overlap
    if best is not None:
        return best

    # Strategy 6 — reverse: stem might itself be an initialism we don't have
    # in the dictionary, but matches some business table's initials anyway
    if 2 <= len(stem) <= 5:  # plausible initialism length
        for tname, tbl in business_index.items():
            if "_" in tname and _initials(tname).startswith(stem):
                return tbl

    return None


# --------------------------------------------------------------------------- #
# Generic archetype-default status values (fallback only — used when neither
# the LLM nor the upstream artifacts have data for an entity)
# --------------------------------------------------------------------------- #
# Keyed by the archetype labels _classify_archetype_local returns. Values are
# seed codes in workflow order. NOTE(review): "_GENERIC" is presumably the
# catch-all consumed by code beyond this view — confirm at the usage site.
_ARCHETYPE_STATUS_DEFAULTS: dict[str, list[str]] = {
    "DOCUMENT":   ["draft", "open", "in_progress", "closed", "cancelled"],
    "MASTER":     ["active", "inactive", "archived"],
    "ASSIGNMENT": ["pending", "active", "completed", "cancelled"],
    "LINE_ITEM":  ["pending", "received", "short", "rejected"],
    "EVENT_LOG":  ["new", "acknowledged", "resolved"],
    "_GENERIC":   ["active", "inactive"],
}


def _classify_archetype_local(tbl: dict) -> str | None:
    """Early archetype classifier for passes that run before
    auto_create_history_tables' sibling _classify_archetype is defined.

    Returns one of DOCUMENT, MASTER, LINE_ITEM, ASSIGNMENT, EVENT_LOG, or
    None when *tbl* is not a business table or matches no heuristic.

    NOTE(review): the original docstring claimed "same logic as
    _classify_archetype", but the trailing fallbacks differ — here
    has_doc_num alone yields DOCUMENT, and has_doc_num + has_name yields
    MASTER even when a business date column exists, both of which the
    later classifier rejects. Confirm whether the two should be unified.
    """
    if not _is_business_table(tbl):
        return None
    cols = tbl.get("columns") or []
    # Only truthy names take part in the membership heuristics below.
    col_names = {c.get("name") for c in cols if c.get("name")}
    name = tbl.get("table_name", "")
    # EVENT_LOG: log-style table name or event/actor-shaped columns.
    if name.endswith("_log") or name in {"audit_log", "notifications"}:
        return "EVENT_LOG"
    if any(n in col_names for n in {"event_type", "event_timestamp", "actor_id"}):
        return "EVENT_LOG"
    # LINE_ITEM: quantity and price columns present together.
    has_qty = any(n in col_names for n in {"quantity", "qty", "quantity_expected"})
    has_price = any(n in col_names for n in {"unit_price", "price", "line_total"})
    if has_qty and has_price:
        return "LINE_ITEM"
    # ASSIGNMENT: assignment-lifecycle columns.
    if any(n in col_names for n in {"assigned_at", "released_at", "assigned_to"}):
        return "ASSIGNMENT"
    # "Document number": any UNIQUE column other than the PK / submenu FK.
    has_doc_num = any(
        "UNIQUE" in (c.get("constraints") or "").upper()
        for c in cols if c.get("name") not in {"id", "submenu_id"}
    )
    has_status = any(n in col_names for n in {"status_id", "status"})
    # Business date column: *_date / *_at, excluding the audit timestamps.
    # (`or ""` guards against an explicit `"name": None` in the column dict.)
    has_date = any(
        re.search(r"_(date|at)$", c.get("name", "") or "") and
        c.get("name") not in {"created_at", "updated_at", "deleted_at"}
        for c in cols
    )
    if has_doc_num and has_status and has_date:
        return "DOCUMENT"
    has_name = any(n in col_names for n in {"name", "label", "title"}) or any(
        (n or "").endswith("_name") for n in col_names if isinstance(n, str)
    )
    if has_doc_num and (has_name or has_status):
        return "MASTER"
    return "DOCUMENT" if has_doc_num else None


def _is_business_table(tbl: dict) -> bool:
    """Same as the function defined later — duplicated here to avoid forward-ref."""
    name = tbl.get("table_name", "")
    if not name or name in {"menu", "submenu"}:
        return False
    if name.startswith("cfg_") or name.endswith("_history") or name.endswith("_settings"):
        return False
    return True


def _ensure_every_cfg_has_seeds(schema_json: dict, bag=None) -> int:
    """Backstop pass: give every cfg_* table a seed-value list.

    Selection order per table:
      1. existing _seed_values win — the table is skipped
      2. bag lookup keyed by the entity stem from cfg_<stem>_<suffix>
      3. richer per-suffix defaults
      4. archetype defaults mapped from the suffix (generic last resort)

    Returns the number of cfg tables whose seeds were filled in.
    """
    suffix_to_archetype = {
        "status":   "DOCUMENT",   # most cfg_X_status fits document/master statuses
        "type":     "DOCUMENT",
        "category": "DOCUMENT",
        "priority": "_GENERIC",   # priority is its own thing — handled inline
        "severity": "EVENT_LOG",
        "level":    "_GENERIC",
        "kind":     "_GENERIC",
        "stage":    "DOCUMENT",
        "phase":    "DOCUMENT",
        "condition":"_GENERIC",
        "role":     "_GENERIC",
    }
    # Suffix-specific seed lists — richer than the archetype-only fallback.
    suffix_defaults = {
        "priority": ["low", "medium", "high", "urgent"],
        "type":     ["standard", "expedited", "special"],
        "category": ["primary", "secondary", "tertiary"],
        "severity": ["info", "warning", "error", "critical"],
        "role":     ["admin", "user", "viewer"],
    }
    known_suffixes = ("status", "type", "category", "priority", "severity",
                      "level", "kind", "stage", "phase", "condition", "role")
    filled = 0
    for tbl in _all_tables(schema_json):
        table_name = tbl.get("table_name", "")
        if not table_name.startswith("cfg_") or tbl.get("_seed_values"):
            continue

        # Split cfg_<stem>_<suffix>; an unknown suffix leaves the whole
        # remainder as the stem and the suffix empty.
        stem = table_name[4:]
        suffix = ""
        for candidate in known_suffixes:
            if stem.endswith("_" + candidate):
                suffix = candidate
                stem = stem[: -(len(candidate) + 1)]
                break

        seeds: list[str] = []
        # 2. bag lookup — best-effort; any failure falls through to defaults
        if bag is not None and stem:
            try:
                found = (bag.lookup_status_values(stem) if suffix == "status"
                         else bag.lookup_field_values(stem, suffix))
                seeds = found or []
            except Exception:
                seeds = []

        # 3. suffix-specific default
        if not seeds:
            seeds = list(suffix_defaults.get(suffix, []))

        # 4. archetype default via the suffix → archetype map
        if not seeds:
            archetype = suffix_to_archetype.get(suffix, "_GENERIC")
            seeds = list(_ARCHETYPE_STATUS_DEFAULTS.get(
                archetype, _ARCHETYPE_STATUS_DEFAULTS["_GENERIC"]))

        if seeds:
            tbl["_seed_values"] = seeds
            filled += 1
    return filled


def enforce_status_per_business_table(schema_json: dict, bag=None) -> int:
    """For every DOCUMENT/MASTER business table without a status FK:
       1. Find status values from the bag (CIR workflows + field values + glossary)
          OR fall back to archetype defaults
       2. Auto-create cfg_<table>_status with those seeds
       3. Inject `status_id INT NOT NULL DEFAULT 1 REFERENCES cfg_<table>_status(...)`
          on the business table
       4. Attach a glossary-derived comment to the new cfg table when available

    Args:
        schema_json: Mantara schema dict — mutated in place.
        bag: optional upstream-artifact bag; lookup failures are swallowed
             and the archetype defaults are used instead.

    Idempotent. Returns count of cfg+FK pairs created (backfilling seeds on
    an LLM-emitted cfg table is also counted as one fix).
    """
    schema_name = schema_json.get("schema_name") or "schema"
    cfg_tables = _existing_cfg_tables(schema_json)
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    # Submenu ids for the Configuration menu live in its menu_id*100 block;
    # pick the next free id at or above menu_id*100 + 1.
    existing_sm_ids = {sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []}
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        next_sm = cfg_menu_id * 100 + 1

    created = 0
    for tbl in _all_tables(schema_json):
        if not _is_business_table(tbl):
            continue
        archetype = _classify_archetype_local(tbl)
        if archetype not in ("DOCUMENT", "MASTER", "LINE_ITEM", "ASSIGNMENT"):
            continue

        # Already has a status FK (or a raw status column)? Leave it alone.
        cols = tbl.get("columns") or []
        existing_names = {c.get("name") for c in cols}
        if "status_id" in existing_names or "status" in existing_names:
            continue

        biz_name = tbl["table_name"]
        cfg_name = f"cfg_{biz_name}_status"

        # Source seeds — bag first, then archetype fallback
        seeds: list[str] = []
        seed_source = "archetype_default"
        if bag is not None:
            try:
                seeds = bag.lookup_status_values(biz_name) or []
                if seeds:
                    seed_source = "input_artifacts"
            except Exception:
                # Bag is best-effort; fall through to archetype defaults.
                seeds = []
        if not seeds:
            seeds = list(_ARCHETYPE_STATUS_DEFAULTS.get(archetype, _ARCHETYPE_STATUS_DEFAULTS["_GENERIC"]))

        # If the cfg already exists (LLM emitted it), backfill seeds + ensure
        # status_id FK on the business table — but don't recreate the table.
        if cfg_name in cfg_tables:
            existing_cfg = cfg_tables[cfg_name]
            if not existing_cfg.get("_seed_values"):
                existing_cfg["_seed_values"] = list(seeds)
                created += 1  # count seed-population as a fix
            # also ensure the business table actually FKs to it
            # NOTE(review): substring test on constraints; and the column is
            # inserted at fixed index 1 here, while the create path below
            # inserts after submenu_id — confirm the asymmetry is intended.
            biz_has_fk = any(
                cfg_name in (c.get("constraints") or "")
                for c in cols
            )
            if not biz_has_fk:
                cfg_pk = f"{cfg_name}_id"
                cols.insert(1, {
                    "name": "status_id",
                    "type": "INT",
                    "constraints": (
                        f"NOT NULL DEFAULT 1 "
                        f"REFERENCES {schema_name}.{cfg_name}({cfg_pk})"
                    ),
                    "comment": f"Auto-injected FK to existing cfg ({seed_source})",
                })
            continue

        # Glossary-derived comment for the cfg table
        comment = None
        if bag is not None:
            try:
                ent_comment = bag.lookup_entity_comment(biz_name)
                if ent_comment:
                    comment = f"Status lookup for {biz_name}. {ent_comment}"
            except Exception:
                pass
        if not comment:
            comment = (
                f"Status lookup for {biz_name}. "
                f"{'Seed values from input.' if seed_source == 'input_artifacts' else 'Generic placeholder seeds — operator may customise via admin UI.'}"
            )

        # Create the cfg table
        cfg_table = _build_cfg_table(
            cfg_name=cfg_name,
            values=seeds,
            description=comment,
            submenu_id=next_sm,
            schema_name=schema_name,
        )
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": f"{biz_name.replace('_', ' ').title()} Status",
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": f"Configurable status values for {biz_name}",
            "tables": [cfg_table],
        })
        # Register so a later table with the same name cannot re-create it.
        cfg_tables[cfg_name] = cfg_table
        next_sm += 1

        # Inject status_id FK column on the business table
        cfg_pk = f"{cfg_name}_id"
        new_col = {
            "name": "status_id",
            "type": "INT",
            "constraints": (
                f"NOT NULL DEFAULT 1 "
                f"REFERENCES {schema_name}.{cfg_name}({cfg_pk})"
            ),
            "comment": f"Auto-injected: {seed_source}",
        }
        # Insert after submenu_id if present, else after id
        insert_at = 1
        for i, c in enumerate(cols):
            if c.get("name") == "submenu_id":
                insert_at = i + 1
                break
        cols.insert(insert_at, new_col)
        created += 1

    return created


# --------------------------------------------------------------------------- #
# Fix A — Inject missing FK columns when an orphan cfg_* table exists
# --------------------------------------------------------------------------- #
def inject_missing_cfg_fks(schema_json: dict) -> int:
    """For every cfg_<X>_<suffix> table that no business table FK-references,
    find a business table whose name matches X (or close match) and inject
    the missing `<suffix>_id INT NOT NULL DEFAULT 1 REFERENCES cfg_X_suffix(...)`
    column.

    Args:
        schema_json: Mantara schema dict — mutated in place.

    Idempotent. Returns count of FK columns injected.

    Catches the class of bug where the LLM emits a cfg lookup but forgets the
    business-table column that references it (the orphan-cfg problem).
    """
    schema_name = schema_json.get("schema_name") or "schema"
    cfg_tables = _existing_cfg_tables(schema_json)
    if not cfg_tables:
        return 0

    # Collect all currently-referenced cfg targets (so we know which are orphan)
    # Matches `cfg_foo(` with an optional `<schema>.` prefix, in both inline
    # column constraints and structured foreign_keys entries.
    referenced: set[str] = set()
    pat = re.compile(r"(?:\w+\.)?(cfg_[a-zA-Z0-9_]+)\s*\(", re.IGNORECASE)
    for tbl in _all_tables(schema_json):
        if tbl.get("table_name", "").startswith("cfg_"):
            continue  # cfg→cfg references don't count as business usage
        for col in tbl.get("columns") or []:
            for m in pat.finditer(col.get("constraints") or ""):
                referenced.add(m.group(1))
        for fk in tbl.get("foreign_keys") or []:
            for m in pat.finditer(fk.get("references") or ""):
                referenced.add(m.group(1))

    # Candidate FK hosts: business tables only (no cfg/history/settings).
    business_index = {
        tbl["table_name"]: tbl
        for tbl in _all_tables(schema_json)
        if tbl.get("table_name") and not tbl["table_name"].startswith("cfg_")
        and not tbl["table_name"].endswith("_history")
        and not tbl["table_name"].endswith("_settings")
    }
    injected = 0
    cfg_pat = re.compile(r"^cfg_(.+?)_(status|type|category|priority|severity|level|kind|stage|phase|condition|role)$")

    # Sorted for deterministic injection order across runs.
    for cfg_name in sorted(cfg_tables.keys()):
        if cfg_name in referenced:
            continue
        m = cfg_pat.match(cfg_name)
        if not m:
            continue  # unrecognised cfg shape — nothing to wire
        entity_part, suffix = m.group(1), m.group(2)
        col_name = f"{suffix}_id"

        # Find the matching business table — multi-strategy in priority order.
        target = _match_business_for_cfg_stem(entity_part, business_index)
        if target is None:
            continue

        # NOTE(review): only `<suffix>_id` blocks injection; a plain `status`
        # or `type` column on the target would not — confirm that's desired.
        existing_cols = {c.get("name") for c in target.get("columns") or []}
        if col_name in existing_cols:
            continue

        cfg_pk = f"{cfg_name}_id"
        new_col = {
            "name": col_name,
            "type": "INT",
            "constraints": (
                f"NOT NULL DEFAULT 1 "
                f"REFERENCES {schema_name}.{cfg_name}({cfg_pk})"
            ),
            "comment": f"Auto-injected: {cfg_name} was orphan; wired here via name match",
        }
        cols = target.setdefault("columns", [])
        # Insert after `submenu_id` if present, else after `id`
        insert_at = 1
        for i, c in enumerate(cols):
            if c.get("name") == "submenu_id":
                insert_at = i + 1
                break
        cols.insert(insert_at, new_col)
        injected += 1

    return injected


# --------------------------------------------------------------------------- #
# Fix C — Dedupe tables (keep first definition, drop subsequent)
# --------------------------------------------------------------------------- #
def dedupe_tables(schema_json: dict) -> int:
    """Drop subsequent definitions of any table_name already seen.

    The LLM occasionally emits the same table twice (e.g., `audit_log` under
    two different menus) which would crash CREATE TABLE on Postgres. Keep
    the first occurrence in render order; remove duplicates.

    Fix: entries without a usable table_name (or non-dict entries) cannot be
    deduplicated, so they are now kept as-is instead of being silently
    discarded — the previous behaviour deleted content this function was
    never asked to remove, and did so without counting it in the result.

    Idempotent. Returns count of duplicates removed.
    """
    seen: set[str] = set()
    removed = 0
    for menu in schema_json.get("menus", []) or []:
        for sub in menu.get("submenus") or []:
            tables = sub.get("tables") or []
            kept = []
            for tbl in tables:
                name = tbl.get("table_name") if isinstance(tbl, dict) else None
                if not name:
                    # Unnamed/malformed entry — not dedupable; preserve it.
                    kept.append(tbl)
                    continue
                if name in seen:
                    removed += 1
                    continue
                seen.add(name)
                kept.append(tbl)
            sub["tables"] = kept
    return removed


# --------------------------------------------------------------------------- #
# Step 8 — Richness booster: SaaS audit columns (deterministic, generic)
# --------------------------------------------------------------------------- #
# Column names considered audit/system boilerplate. NOTE(review): no consumer
# is visible in this chunk — presumably used elsewhere to exclude these names
# from "richness" checks; verify against the rest of the file.
_BOILERPLATE_NAMES = {
    "id", "uid", "submenu_id",
    "user_id", "created_by", "updated_by", "parent_id", "obtained_marks",
    "created_at", "updated_at", "deleted",
    "is_parent_deleted", "is_auto", "is_draft",
    # legacy columns (preserved if already present in schema JSON)
    "is_active", "deleted_at", "deleted_by", "version", "notes", "metadata",
}

# Standard columns injected into every non-system, non-cfg table.
# HEAD group: inserted after submenu_id; TAIL group: appended at end.
# inject_saas_columns copies each spec via dict(spec), so these templates
# are never mutated in place.
_STANDARD_COLUMNS = [
    # HEAD — identity and ownership
    {"name": "uid",               "type": "VARCHAR",     "constraints": "NOT NULL UNIQUE"},
    {"name": "user_id",           "type": "BIGINT",      "constraints": "NOT NULL"},
    {"name": "created_by",        "type": "BIGINT",      "constraints": ""},
    {"name": "updated_by",        "type": "BIGINT",      "constraints": ""},
    {"name": "parent_id",         "type": "BIGINT",      "constraints": ""},
    {"name": "obtained_marks",    "type": "FLOAT8",      "constraints": ""},
    # TAIL — timestamps and soft-delete flags
    {"name": "created_at",        "type": "TIMESTAMPTZ", "constraints": "DEFAULT now() NOT NULL"},
    {"name": "updated_at",        "type": "TIMESTAMPTZ", "constraints": "DEFAULT now() NOT NULL"},
    {"name": "deleted",           "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
    {"name": "is_parent_deleted", "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
    {"name": "is_auto",           "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
    {"name": "is_draft",          "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
]

# Membership sets used to split _STANDARD_COLUMNS into its two insertion
# groups (see inject_saas_columns).
_HEAD_COLUMN_NAMES = {"uid", "user_id", "created_by", "updated_by", "parent_id", "obtained_marks"}
_TAIL_COLUMN_NAMES = {"created_at", "updated_at", "deleted", "is_parent_deleted", "is_auto", "is_draft"}


def _is_business_table(tbl: dict) -> bool:
    """Decide whether a table is a 'business' table that should get richness."""
    name = tbl.get("table_name", "")
    if not name:
        return False
    if name in {"menu", "submenu"}:
        return False
    if name.startswith("cfg_"):
        return False
    if name.endswith("_history"):
        return False
    if name.endswith("_settings"):
        return False  # settings are sparse key/value tables
    return True


def _needs_standard_columns(tbl: dict) -> bool:
    """Tables that receive the full standard column set (all except cfg_*, menu, submenu)."""
    name = tbl.get("table_name", "")
    if not name or name in {"menu", "submenu"}:
        return False
    if name.startswith("cfg_"):
        return False
    return True


def inject_saas_columns(schema_json: dict) -> int:
    """Ensure the standard column set on every non-system, non-cfg table.

    HEAD columns (uid, user_id, created_by, updated_by, parent_id,
    obtained_marks) are inserted right after submenu_id; TAIL columns
    (created_at, updated_at, deleted, is_parent_deleted, is_auto, is_draft)
    are appended at the end. Pre-existing created_at / updated_at are
    upgraded to TIMESTAMPTZ DEFAULT now().

    Idempotent. Returns count of columns injected or upgraded.
    """
    touched = 0
    for tbl in _all_tables(schema_json):
        if not _needs_standard_columns(tbl):
            continue
        cols = tbl.setdefault("columns", [])
        present = {c.get("name") for c in cols}

        # Normalise timestamp columns that already exist on the table.
        for col in cols:
            if col.get("name") not in ("created_at", "updated_at"):
                continue
            type_ok = col.get("type", "").upper() == "TIMESTAMPTZ"
            default_ok = "now()" in (col.get("constraints") or "").lower()
            if not (type_ok and default_ok):
                col["type"] = "TIMESTAMPTZ"
                col["constraints"] = "DEFAULT now() NOT NULL"
                touched += 1

        # Split the missing specs into their insertion groups.
        missing = [s for s in _STANDARD_COLUMNS if s["name"] not in present]
        head_specs = [s for s in missing if s["name"] in _HEAD_COLUMN_NAMES]
        tail_specs = [s for s in missing if s["name"] in _TAIL_COLUMN_NAMES]

        if head_specs:
            # Anchor right after submenu_id, falling back to just after id.
            anchor = 1
            for i, col in enumerate(cols):
                if col.get("name") == "submenu_id":
                    anchor = i + 1
                    break
            # Reverse insertion at a fixed anchor preserves spec order.
            for spec in reversed(head_specs):
                cols.insert(anchor, dict(spec))
                touched += 1

        for spec in tail_specs:
            cols.append(dict(spec))
            touched += 1

    return touched


# --------------------------------------------------------------------------- #
# Step 9 — Auto-create paired *_history tables for DOCUMENT and MASTER entities
# --------------------------------------------------------------------------- #
def _classify_archetype(tbl: dict) -> str | None:
    """Heuristically classify a business table into one of the v8 archetypes.

    Returns one of: DOCUMENT, MASTER, LINE_ITEM, ASSIGNMENT, EVENT_LOG, None.
    Heuristics use column shape only — no entity-name hardcoding.

    Fix: a column dict carrying an explicit ``"name": None`` previously
    reached ``re.search`` as None and raised TypeError; the name is now
    coerced to "" first, matching _classify_archetype_local's guard.
    """
    if not _is_business_table(tbl):
        return None
    cols = tbl.get("columns") or []
    # Only truthy names take part in the membership heuristics below.
    col_names = {c.get("name") for c in cols if c.get("name")}
    name = tbl.get("table_name", "")

    # EVENT_LOG: name-suffix or has event_* / action / actor columns
    if name.endswith("_log") or name in {"audit_log", "notifications"}:
        return "EVENT_LOG"
    if any(n in col_names for n in {"event_type", "event_timestamp", "actor_id"}):
        return "EVENT_LOG"

    # LINE_ITEM: has parent FK + qty/quantity + price
    has_qty = any(n in col_names for n in {"quantity", "qty", "quantity_expected"})
    has_price = any(n in col_names for n in {"unit_price", "price", "line_total"})
    if has_qty and has_price:
        return "LINE_ITEM"

    # ASSIGNMENT: has assigned_at / released_at / assigned_to
    if any(n in col_names for n in {"assigned_at", "released_at", "assigned_to"}):
        return "ASSIGNMENT"

    # DOCUMENT: has any UNIQUE business key + status_id + a date column.
    # The business key may be `<doc>_number`, `<doc>_id` (LLM naming), or any
    # other UNIQUE column on the table.
    has_doc_num = any(
        "UNIQUE" in (c.get("constraints") or "").upper()
        for c in cols
        if c.get("name") not in {"id", "submenu_id"}
    )
    has_status = any(n in col_names for n in {"status_id", "status"})
    # Business date column: *_date / *_at, excluding the audit timestamps.
    # `or ""` guards against an explicit `"name": None` in the column dict.
    has_date = any(
        re.search(r"_(date|at)$", c.get("name", "") or "")
        and c.get("name") not in {"created_at", "updated_at", "deleted_at"}
        for c in cols
    )
    if has_doc_num and has_status and has_date:
        return "DOCUMENT"

    # MASTER: has unique business key + name/description but no document-y dates
    has_name = any(n in col_names for n in {"name", "label", "title"}) or any(
        n.endswith("_name") for n in col_names if isinstance(n, str)
    )
    if has_doc_num and has_name and not has_date:
        return "MASTER"
    if has_doc_num and has_status:
        return "MASTER"

    return None


def auto_create_history_tables(schema_json: dict) -> int:
    """For every DOCUMENT and MASTER table, ensure a paired *_history exists.

    Idempotent. Places new history tables under a Configuration menu submenu.
    Generic across domains — uses universal change-tracking columns only.

    Args:
        schema_json: Mantara schema dict — mutated in place.

    Returns the number of history tables created.
    """
    schema_name = schema_json.get("schema_name") or "schema"
    existing_tables = _all_table_names(schema_json)

    # Pass 1 — collect parents that need a history twin, so we don't mutate
    # the menu tree while _all_tables is still iterating it.
    candidates: list[dict] = []
    for tbl in _all_tables(schema_json):
        arch = _classify_archetype(tbl)
        if arch in ("DOCUMENT", "MASTER"):
            history_name = f"{tbl['table_name']}_history"
            if history_name not in existing_tables:
                candidates.append(tbl)

    if not candidates:
        return 0

    # Pass 2 — allocate submenu ids from the Configuration menu's
    # menu_id*100 block, starting past any ids already taken.
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    existing_sm_ids = {sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []}
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        next_sm = cfg_menu_id * 100 + 1

    created = 0
    for parent in candidates:
        parent_name = parent["table_name"]
        history_name = f"{parent_name}_history"
        # Universal change-tracking shape: who / what / when / from where,
        # with the full field snapshot stored as JSONB.
        history_table = {
            "table_name": history_name,
            "comment": f"Append-only audit trail for {parent_name}.",
            "columns": [
                {"name": "id", "type": "SERIAL", "constraints": "PRIMARY KEY"},
                {"name": "submenu_id", "type": "INT",
                 "constraints": f"NOT NULL DEFAULT {next_sm} REFERENCES {schema_name}.submenu(submenu_id)"},
                {"name": f"{parent_name}_id", "type": "INT",
                 "constraints": f"NOT NULL REFERENCES {schema_name}.{parent_name}(id)",
                 "comment": f"FK to {parent_name}"},
                {"name": "change_type", "type": "VARCHAR(30)",
                 "constraints": "NOT NULL CHECK (change_type IN ('CREATE','UPDATE','DELETE'))",
                 "comment": "Type of change"},
                {"name": "change_details", "type": "JSONB",
                 "constraints": "NOT NULL",
                 "comment": "Full snapshot of changed fields"},
                {"name": "changed_by", "type": "INT",
                 "constraints": "",
                 "comment": "Actor user id (FK added later if users table exists)"},
                {"name": "change_date", "type": "TIMESTAMP",
                 "constraints": "NOT NULL DEFAULT CURRENT_TIMESTAMP"},
                {"name": "ip_address", "type": "INET",
                 "constraints": "",
                 "comment": "Source IP of the change"},
                {"name": "created_at", "type": "TIMESTAMP",
                 "constraints": "DEFAULT CURRENT_TIMESTAMP"},
            ],
            "foreign_keys": [],
        }
        # Each history table gets its own submenu under the Configuration menu.
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": f"{parent_name.replace('_', ' ').title()} History",
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": f"Audit history for {parent_name}",
            "tables": [history_table],
        })
        next_sm += 1
        created += 1
    return created


# --------------------------------------------------------------------------- #
# Step 10 — Generate INDEX statements (deterministic, generic)
# --------------------------------------------------------------------------- #
def generate_index_statements(schema_json: dict) -> list[str]:
    """Build CREATE INDEX statements for the whole schema.

    Covered columns, per table:
      - every `*_id` FK column
      - every UNIQUE business key (Postgres already auto-indexes UNIQUE;
        emitted explicitly for documentation)
      - columns containing `_status` / `_type` / `_priority` / `_category`
      - activity flags and timestamps commonly used in WHERE clauses

    Deterministic and duplicate-free; business tables are emitted before
    cfg_* lookups, and the `id` PK column is skipped.
    """
    schema_name = schema_json.get("schema_name") or "schema"
    emitted: set[tuple[str, str]] = set()
    statements: list[str] = []

    def _emit(table: str, column: str) -> None:
        # One index per (table, column) pair — first request wins.
        if (table, column) in emitted:
            return
        emitted.add((table, column))
        statements.append(
            f"CREATE INDEX idx_{table}_{column} "
            f"ON {schema_name}.{table}({column});"
        )

    # Partition into business tables (first) and cfg lookups (second);
    # the menu/submenu system tables are excluded entirely.
    business: list[dict] = []
    lookups: list[dict] = []
    for tbl in _all_tables(schema_json):
        tname = tbl.get("table_name", "")
        if tname.startswith("cfg_"):
            lookups.append(tbl)
        elif tname not in {"menu", "submenu"}:
            business.append(tbl)

    hot_columns = {
        "is_active", "created_at", "updated_at", "deleted_at",
        "deleted", "created_by", "updated_by",
        "is_draft", "is_auto", "is_parent_deleted",
    }
    for tbl in business + lookups:
        tname = tbl.get("table_name", "")
        for col in tbl.get("columns") or []:
            cname = col.get("name") or ""
            if not cname or cname == "id":
                continue
            cons_upper = (col.get("constraints") or "").upper()
            wants_index = (
                cname.endswith("_id")                      # FK columns
                or "UNIQUE" in cons_upper                  # business keys
                or any(tag in cname for tag in
                       ("_status", "_type", "_priority", "_category"))
                or cname in hot_columns                    # hot filter columns
            )
            if wants_index:
                _emit(tname, cname)
    return statements


# --------------------------------------------------------------------------- #
# Public entry point — single call wires it all together
# --------------------------------------------------------------------------- #
def normalise(schema_json: dict, cir: dict | None = None,
              richness: bool = True,
              regenerate_name: bool = True,
              bag: Any = None) -> dict:
    """Run the full normalisation pipeline. Mutates schema_json. Idempotent.

    Args:
        schema_json: Mantara schema dict (will be mutated)
        cir: optional CIR dict for seed backfill + rule surfacing
        richness: when True (default), runs the deterministic richness booster
                  (SaaS audit columns, history tables, indexes). Set False if
                  you want pure structural correctness without column injection.
        regenerate_name: when True (default), replaces schema_name with a
                  unique `sch_<date>_<time>_<rand>` (36 chars). Idempotent —
                  skipped if schema_name already matches the sch_ pattern.
                  Pass False to keep the LLM-emitted name (e.g. inside the
                  LLM repair loop, where stable names help the model).
        bag: optional DomainKnowledgeBag used for input-derived status seeds
                  (presumably a project type — passed through untyped to
                  ``enforce_status_per_business_table`` and
                  ``_ensure_every_cfg_has_seeds``; archetype defaults are
                  used when None).

    Returns a stats dict written under `schema_json["_normaliser"]`.
    """
    # Pre-declare every counter so the stats dict always has the same shape,
    # regardless of which pipeline steps actually fire.
    stats: dict[str, Any] = {
        "schema_name_old": schema_json.get("schema_name"),
        "schema_name_new": None,
        "lifted_enums": 0,
        "auto_created_cfg": 0,
        "normalised_repairs": 0,
        "rewritten_columns": 0,
        "stripped_dangling_fks": 0,
        "fk_types_repaired": 0,
        "fk_columns_repaired": 0,
        "backfilled_seeds": 0,
        "duplicate_tables_removed": 0,
        "missing_fks_injected": 0,
        "status_cfg_enforced": 0,
        "seeds_default_filled": 0,
        "saas_columns_injected": 0,
        "history_tables_created": 0,
        "indexes_emitted": 0,
        "rules": {},
    }

    # 0. Regenerate schema_name (FIRST — so all subsequent FK rewrites use
    #    the new prefix). Skipped inside the LLM repair loop.
    if regenerate_name:
        new_sn = regenerate_schema_name(schema_json)
        if new_sn:
            stats["schema_name_new"] = new_sn

    schema_name = schema_json.get("schema_name") or "schema"
    stats["schema_name"] = schema_name

    # 0.5. dedupe duplicate tables (LLM occasionally emits the same name twice)
    stats["duplicate_tables_removed"] = dedupe_tables(schema_json)

    # 1. enums → cfg
    stats["lifted_enums"] = lift_enum_types(schema_json)

    # 2. auto-create missing cfg targets
    stats["auto_created_cfg"] = create_missing_cfg_tables(schema_json)

    # 3. normalise every cfg table
    cfg_tables = _existing_cfg_tables(schema_json)
    # Build table_name → submenu_id so each cfg table gets a proper DEFAULT.
    cfg_submenu_map: dict[str, int] = {}
    for _menu in schema_json.get("menus", []):
        for _sub in _menu.get("submenus") or []:
            _sid = _sub.get("submenu_id")
            if _sid is None:
                continue
            for _tbl in _sub.get("tables") or []:
                _tname = _tbl.get("table_name", "")
                if _tname.startswith("cfg_"):
                    cfg_submenu_map[_tname] = _sid
    repairs = 0
    for tbl_name, tbl in cfg_tables.items():
        repairs += normalise_cfg_table(tbl, schema_name, submenu_id=cfg_submenu_map.get(tbl_name))
    stats["normalised_repairs"] = repairs

    # 4. rewrite business→cfg refs
    rewritten = 0
    for tbl in _all_tables(schema_json):
        if tbl.get("table_name", "").startswith("cfg_"):
            continue
        rewritten += rewrite_business_fks_to_cfg(tbl, cfg_tables, schema_name)
    stats["rewritten_columns"] = rewritten

    # 5. strip dangling FKs
    stats["stripped_dangling_fks"] = strip_dangling_fks(schema_json)

    # 5.1. coerce UUID/BIGINT FK columns to INT (all PKs are SERIAL/integer)
    stats["fk_types_repaired"] = repair_fk_types(schema_json)

    # 5.2. fix FK references naming a non-existent column (e.g. bmi_result_id → id)
    stats["fk_columns_repaired"] = repair_fk_referenced_columns(schema_json)

    # 5.5. inject missing FK columns for orphan cfg tables (Fix A — cfg → biz)
    stats["missing_fks_injected"] = inject_missing_cfg_fks(schema_json)

    # 5.7. enforce status FK on every business table (Lever 3 — biz → cfg).
    # Uses DomainKnowledgeBag for input-derived seeds when available, falls
    # back to archetype defaults otherwise. Generic across domains.
    stats["status_cfg_enforced"] = enforce_status_per_business_table(schema_json, bag)

    # 5.8. final pass — every cfg_* table must have seeds. If neither the LLM,
    # CIR backfill, nor Lever 3 populated them, use archetype-derived defaults
    # based on the cfg's stem name.
    stats["seeds_default_filled"] = _ensure_every_cfg_has_seeds(schema_json, bag)

    # 6. CIR backfill + rules
    if cir:
        stats["backfilled_seeds"] = backfill_seeds_from_cir(schema_json, cir)
        stats["rules"] = surface_cir_rules(schema_json, cir)

    # 7-9. Richness booster (deterministic, generic, $0)
    if richness:
        stats["saas_columns_injected"] = inject_saas_columns(schema_json)
        stats["history_tables_created"] = auto_create_history_tables(schema_json)
        # Generate indexes AFTER history tables so they're indexed too
        index_stmts = generate_index_statements(schema_json)
        stats["indexes_emitted"] = len(index_stmts)
        # Cache for the seed/index renderer to consume — kept under a private key
        schema_json["_indexes"] = index_stmts

    schema_json["_normaliser"] = stats
    return stats


# Public API of the normaliser module. The orchestrator comes first; the
# remaining names are the individual pipeline steps in the order `normalise`
# invokes them, so they can be called (or tested) independently.
__all__ = [
    # entry point
    "normalise",
    # naming + structural repairs
    "regenerate_schema_name",
    "lift_enum_types",
    "create_missing_cfg_tables",
    "normalise_cfg_table",
    "rewrite_business_fks_to_cfg",
    "strip_dangling_fks",
    "repair_fk_types",
    "repair_fk_referenced_columns",
    "inject_missing_cfg_fks",
    "enforce_status_per_business_table",
    "dedupe_tables",
    # CIR integration
    "backfill_seeds_from_cir",
    "surface_cir_rules",
    # richness booster
    "inject_saas_columns",
    "auto_create_history_tables",
    "generate_index_statements",
]
