"""Mantara v8 — in-engine normalisation pass.

Runs immediately after the LLM produces schema_json. Owns every repair that
used to live in step-05 post-processors:

    1. cfg_* tables are the source of truth for status/type/category lookups.
       enum_types[] is lifted into proper cfg_* tables.
    2. LLM-emitted cfg_* tables are coerced to the canonical Mantara shape
       (PK = <cfg_name>_id, columns = code|label|description|is_active).
    3. Every FK target referenced by any business table actually exists.
       Missing cfg_* tables are auto-created. FKs to missing business tables
       are stripped (kept as comments).
    4. Business-table columns referencing cfg_* are rewritten to the canonical
       <col>_id INT shape with FK to <cfg>(<cfg>_id).
    5. CIR rules are surfaced into table comments / assumptions.
    6. CIR workflow states backfill cfg seed values.
    7. SaaS audit columns (version, notes, metadata, deleted_at) injected.
    8. Paired *_history tables auto-created for DOCUMENT/MASTER archetypes.
    9. Indexes auto-emitted for FKs, UNIQUE keys, status/type/is_active.
   10. schema_name regenerated to a unique sch_<date>_<time>_<rand> (36 chars).

Deterministic, idempotent, $0.
"""
from __future__ import annotations

import re
import uuid
from datetime import datetime, timezone
from typing import Any


# --------------------------------------------------------------------------- #
# Schema name regenerator — unique per generation
# --------------------------------------------------------------------------- #
# Format: sch_{YYYYMMDD}_{HHMMSS}_{16-hex-chars}  →  exactly 36 chars
#   sch_ + YYYYMMDD + _ + HHMMSS + _ = 20 chars
#   16-hex suffix = 16 chars
#   Total = 36 (well under PostgreSQL's 63-char identifier limit)
#   Alphanumeric + underscore only — safe as an unquoted SQL identifier.
# Idempotence guard for regenerate_schema_name(): names already in the
# canonical sch_* shape are skipped unless force=True.
_SCHEMA_NAME_PATTERN = re.compile(r"^sch_\d{8}_\d{6}_[0-9a-f]{16}$")


def regenerate_schema_name(schema_json: dict, force: bool = False) -> str | None:
    """Replace schema_name with a fresh `sch_<date>_<time>_<rand>` (36 chars).

    Rewrites every FK reference in the JSON so the new name propagates to
    rendered SQL. Idempotent — skips if already in sch_* shape unless
    `force=True`. Returns the new schema_name (or None if skipped).
    """
    old = schema_json.get("schema_name") or "schema"
    if not force and _SCHEMA_NAME_PATTERN.match(old):
        return None

    now = datetime.now(timezone.utc)
    new_name = (
        f"sch_{now.strftime('%Y%m%d')}_{now.strftime('%H%M%S')}_"
        f"{uuid.uuid4().hex[:16]}"
    )
    assert len(new_name) == 36, f"schema_name length {len(new_name)} != 36"

    schema_json["schema_name"] = new_name
    old_prefix = f"{old}."
    new_prefix = f"{new_name}."

    for menu in schema_json.get("menus", []) or []:
        for sub in menu.get("submenus") or []:
            for tbl in sub.get("tables") or []:
                if not isinstance(tbl, dict):
                    continue
                for col in tbl.get("columns") or []:
                    cons = col.get("constraints") or ""
                    if old_prefix in cons:
                        col["constraints"] = cons.replace(old_prefix, new_prefix)
                    cmt = col.get("comment") or ""
                    if old_prefix in cmt:
                        col["comment"] = cmt.replace(old_prefix, new_prefix)
                for fk in tbl.get("foreign_keys") or []:
                    ref = fk.get("references") or ""
                    if old_prefix in ref:
                        fk["references"] = ref.replace(old_prefix, new_prefix)
    return new_name


# UI menu that hosts all cfg_* lookup tables (created on demand by
# _find_or_create_cfg_menu).
_CFG_MENU_NAME = "Configuration"
_CFG_MENU_DESC = (
    "System configuration lookup tables. All status, type, and category "
    "values are managed here and can be edited by authorised users."
)

# Canonical column set every cfg_* table must have (in this order).
# Maps column name -> (SQL type, constraint clause); "" means nullable.
_CANONICAL_CFG_COLUMNS = {
    "code":        ("VARCHAR(50)",  "NOT NULL UNIQUE"),
    "label":       ("VARCHAR(100)", "NOT NULL"),
    "description": ("TEXT",         ""),                    # nullable
    "is_active":   ("BOOLEAN",      "NOT NULL DEFAULT TRUE"),
}


# --------------------------------------------------------------------------- #
# Helpers
# --------------------------------------------------------------------------- #
def _snake(name: str) -> str:
    if not name:
        return ""
    s = re.sub(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", "_", name)
    return re.sub(r"[^A-Za-z0-9]+", "_", s).strip("_").lower()


def _strip_enum_suffix(type_name: str) -> str:
    bare = type_name.split(".")[-1]
    if bare.endswith("_enum"):
        bare = bare[: -len("_enum")]
    if not bare.startswith("cfg_"):
        bare = f"cfg_{bare}"
    return bare


def _value_to_code(v: str) -> str:
    return re.sub(r"[^a-z0-9_]", "_", str(v).lower()).strip("_")


def _value_to_label(v: str) -> str:
    return str(v).replace("_", " ").title()


def _all_tables(schema_json: dict) -> list[dict]:
    out: list[dict] = []
    for menu in schema_json.get("menus", []):
        for sub in menu.get("submenus") or []:
            for tbl in sub.get("tables") or []:
                if isinstance(tbl, dict):
                    out.append(tbl)
    return out


def _all_table_names(schema_json: dict) -> set[str]:
    """All known table names, seeded with the built-in menu/submenu tables."""
    names = {
        tbl.get("table_name")
        for tbl in _all_tables(schema_json)
        if tbl.get("table_name")
    }
    return names | {"menu", "submenu"}


def _existing_cfg_tables(schema_json: dict) -> dict[str, dict]:
    """Map cfg_* table name → table dict for every lookup table already defined."""
    found: dict[str, dict] = {}
    for tbl in _all_tables(schema_json):
        name = tbl.get("table_name", "")
        if name.startswith("cfg_"):
            found[name] = tbl
    return found


def _next_menu_id(schema_json: dict) -> int:
    return max((m.get("menu_id", 0) for m in schema_json.get("menus", [])), default=0) + 1


def _find_or_create_cfg_menu(schema_json: dict) -> dict:
    """Return the Configuration menu, creating and appending it when absent."""
    wanted = _CFG_MENU_NAME.lower()
    for menu in schema_json.get("menus", []):
        if (menu.get("menu_name") or "").lower() == wanted:
            return menu
    menu_id = _next_menu_id(schema_json)
    created = {
        "menu_id": menu_id,
        "menu_name": _CFG_MENU_NAME,
        "sequence_number": menu_id,
        "description": _CFG_MENU_DESC,
        "submenus": [],
    }
    schema_json.setdefault("menus", []).append(created)
    return created


# --------------------------------------------------------------------------- #
# Step 1 — lift enum_types[] into cfg_* tables
# --------------------------------------------------------------------------- #
def _build_cfg_table(
    cfg_name: str,
    values: list[str],
    description: str,
    submenu_id: int,
    schema_name: str,
) -> dict:
    pk_col = f"{cfg_name}_id"
    return {
        "table_name": cfg_name,
        "comment": description or f"Configuration lookup: {cfg_name}",
        "columns": [
            {"name": pk_col, "type": "SERIAL", "constraints": "PRIMARY KEY"},
            {"name": "code", "type": "VARCHAR(50)", "constraints": "NOT NULL UNIQUE"},
            {"name": "label", "type": "VARCHAR(100)", "constraints": "NOT NULL"},
            {"name": "description", "type": "TEXT"},
            {"name": "is_active", "type": "BOOLEAN", "constraints": "NOT NULL DEFAULT TRUE"},
            {
                "name": "submenu_id",
                "type": "INT",
                "constraints": (
                    f"NOT NULL DEFAULT {submenu_id} "
                    f"REFERENCES {schema_name}.submenu(submenu_id)"
                ),
            },
        ],
        "foreign_keys": [],
        "_seed_values": list(values or []),
    }


def lift_enum_types(schema_json: dict) -> int:
    """Convert every enum_types[] entry to a real cfg_* table. Returns count lifted.

    Each enum becomes its own submenu under the Configuration menu, holding a
    single canonical cfg_* table seeded with the enum's values. Enums whose
    cfg_* table already exists are skipped. Always clears enum_types[]
    afterwards so the renderer never emits CREATE TYPE.
    """
    enum_types = schema_json.get("enum_types") or []
    if not enum_types:
        return 0
    schema_name = schema_json.get("schema_name") or "schema"
    cfg_tables = _existing_cfg_tables(schema_json)
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    existing_sm_ids = {
        sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []
    }
    # Submenu ids live in the menu's 100-block (menu 3 → 301, 302, ...).
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        # Existing ids were below the block floor — clamp up into the block.
        next_sm = cfg_menu_id * 100 + 1

    lifted = 0
    for enum in enum_types:
        cfg_name = _strip_enum_suffix(enum.get("type_name", ""))
        # Skip unusable names ("cfg_" means the type name was empty) and
        # enums that already have a cfg_* table.
        if not cfg_name or cfg_name == "cfg_" or cfg_name in cfg_tables:
            continue
        cfg_table = _build_cfg_table(
            cfg_name=cfg_name,
            values=enum.get("values", []),
            description=enum.get("description", ""),
            submenu_id=next_sm,
            schema_name=schema_name,
        )
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": cfg_name.replace("_", " ").title(),
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": enum.get("description") or f"Configuration: {cfg_name}",
            "tables": [cfg_table],
        })
        cfg_tables[cfg_name] = cfg_table
        next_sm += 1
        lifted += 1

    schema_json["enum_types"] = []  # renderer must not emit CREATE TYPE
    return lifted


# --------------------------------------------------------------------------- #
# Step 2 — auto-create cfg_* tables for FK targets the LLM forgot
# --------------------------------------------------------------------------- #
def _scan_cfg_fk_targets(schema_json: dict) -> set[str]:
    """Collect every cfg_* table name referenced by business-table FKs.

    Scans both inline constraints and table-level foreign_keys entries;
    cfg_* tables themselves are skipped. Trailing _enum suffixes are stripped.
    """
    pat = re.compile(r"(?:\w+\.)?(cfg_[a-zA-Z0-9_]+)\s*\(", re.IGNORECASE)
    found: set[str] = set()
    for tbl in _all_tables(schema_json):
        if tbl.get("table_name", "").startswith("cfg_"):
            continue
        sources = [col.get("constraints") or "" for col in tbl.get("columns") or []]
        sources += [fk.get("references") or "" for fk in tbl.get("foreign_keys") or []]
        for text in sources:
            found.update(m.group(1) for m in pat.finditer(text))
    return {name[:-5] if name.endswith("_enum") else name for name in found}


def create_missing_cfg_tables(schema_json: dict) -> int:
    """Create stub cfg_* tables for every FK target not yet defined.

    Scans business-table FKs for cfg_* targets and materialises an empty
    canonical cfg_* table (no seed values) in its own Configuration submenu
    for each missing one. Returns the number of tables created.
    """
    existing = set(_existing_cfg_tables(schema_json).keys())
    # Sorted so auto-assigned submenu ids are deterministic across runs.
    missing = sorted(_scan_cfg_fk_targets(schema_json) - existing)
    if not missing:
        return 0

    schema_name = schema_json.get("schema_name") or "schema"
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    existing_sm_ids = {
        sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []
    }
    # Same 100-block submenu numbering as lift_enum_types (menu 3 → 301, ...).
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        next_sm = cfg_menu_id * 100 + 1

    for cfg_name in missing:
        cfg_table = _build_cfg_table(
            cfg_name=cfg_name,
            values=[],
            description=f"Configuration lookup (auto-created from FK target): {cfg_name}",
            submenu_id=next_sm,
            schema_name=schema_name,
        )
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": cfg_name.replace("_", " ").title(),
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": f"Auto-created lookup: {cfg_name}",
            "tables": [cfg_table],
        })
        next_sm += 1
    return len(missing)


# --------------------------------------------------------------------------- #
# Step 3 — coerce every cfg_* table to canonical shape
# --------------------------------------------------------------------------- #
def normalise_cfg_table(tbl: dict, schema_name: str, submenu_id: int | None = None) -> int:
    """Coerce one cfg_* table to the canonical Mantara shape, in place.

    Repairs, in order: rename the PRIMARY KEY column to <cfg_name>_id (type
    forced to SERIAL), insert missing canonical columns
    (code/label/description/is_active) before submenu_id, relax NOT NULL on
    description, add DEFAULT 0 to a NOT NULL display_order, and ensure
    submenu_id exists with a usable DEFAULT so seed inserts (which omit it)
    never hit a NOT NULL violation. Non-cfg tables are left untouched.
    Returns the number of individual repairs made.
    """
    name = tbl.get("table_name", "")
    if not name.startswith("cfg_"):
        return 0
    cols = tbl.get("columns") or []
    repairs = 0
    canonical_pk = f"{name}_id"

    # 1. PK rename
    pk_idx: int | None = None
    for i, c in enumerate(cols):
        cons = (c.get("constraints") or "").upper()
        if "PRIMARY KEY" in cons:
            pk_idx = i
            break
    if pk_idx is not None:
        pk_col = cols[pk_idx]
        if pk_col.get("name") != canonical_pk:
            old = pk_col.get("name")
            pk_col["name"] = canonical_pk
            if "SERIAL" not in (pk_col.get("type") or "").upper():
                pk_col["type"] = "SERIAL"
            pk_col["constraints"] = "PRIMARY KEY"
            # Record the rename so downstream passes can trace the change.
            tbl.setdefault("_normalised", {})["pk_renamed_from"] = old
            repairs += 1

    existing_names = {c.get("name") for c in cols}

    # 2. Ensure canonical columns
    for cname, (ctype, ccons) in _CANONICAL_CFG_COLUMNS.items():
        if cname in existing_names:
            if cname == "description":
                # Relax NOT NULL so seed inserts NULL safely
                for c in cols:
                    if c.get("name") == "description":
                        cur = c.get("constraints") or ""
                        if re.search(r"\bNOT\s+NULL\b", cur, re.IGNORECASE):
                            c["constraints"] = re.sub(
                                r"\bNOT\s+NULL\b", "", cur, flags=re.IGNORECASE
                            ).strip()
                            repairs += 1
            continue
        new_col = {"name": cname, "type": ctype, "constraints": ccons}
        # Insert before submenu_id (if present) so that stays the last column.
        insert_at = len(cols)
        for i, c in enumerate(cols):
            if c.get("name") == "submenu_id":
                insert_at = i
                break
        cols.insert(insert_at, new_col)
        repairs += 1

    # 3. display_order — give DEFAULT 0 if NOT NULL without DEFAULT
    for c in cols:
        if c.get("name") != "display_order":
            continue
        cur = c.get("constraints") or ""
        if re.search(r"\bNOT\s+NULL\b", cur, re.IGNORECASE) and not re.search(
            r"\bDEFAULT\b", cur, re.IGNORECASE
        ):
            c["constraints"] = (cur + " DEFAULT 0").strip()
            repairs += 1
        break

    # 4. Ensure submenu_id present
    if "submenu_id" not in {c.get("name") for c in cols}:
        if submenu_id is not None:
            sm_constraints = (
                f"NOT NULL DEFAULT {submenu_id} "
                f"REFERENCES {schema_name}.submenu(submenu_id)"
            )
        else:
            # No submenu known — keep the column nullable rather than break inserts.
            sm_constraints = f"REFERENCES {schema_name}.submenu(submenu_id)"
        cols.append({
            "name": "submenu_id",
            "type": "INT",
            "constraints": sm_constraints,
        })
        repairs += 1
    else:
        # 4.5. Repair existing submenu_id NOT NULL without DEFAULT — seed inserts skip
        # submenu_id, so a missing DEFAULT causes a NOT NULL violation at insert time.
        for c in cols:
            if c.get("name") != "submenu_id":
                continue
            cur = c.get("constraints") or ""
            if (re.search(r"\bNOT\s+NULL\b", cur, re.IGNORECASE)
                    and not re.search(r"\bDEFAULT\b", cur, re.IGNORECASE)):
                if submenu_id is not None:
                    c["constraints"] = re.sub(
                        r"\bNOT\s+NULL\b",
                        f"NOT NULL DEFAULT {submenu_id}",
                        cur, flags=re.IGNORECASE, count=1,
                    )
                else:
                    # No default available — drop NOT NULL instead.
                    c["constraints"] = re.sub(
                        r"\bNOT\s+NULL\b", "", cur, flags=re.IGNORECASE
                    ).strip()
                repairs += 1
            break

    tbl["columns"] = cols
    return repairs


# --------------------------------------------------------------------------- #
# Step 4 — rewrite business-table FKs to use cfg_*(<cfg>_id)
# --------------------------------------------------------------------------- #
def rewrite_business_fks_to_cfg(
    table: dict, cfg_tables: dict[str, dict], schema_name: str
) -> int:
    """Rewrite one business table's cfg_* references to the canonical shape.

    Each column referencing a known cfg_* table is renamed to <col>_id, typed
    INT, and given an inline ``REFERENCES <schema>.<cfg>(<cfg>_id)`` clause;
    stale "References cfg_X" comment wording is dropped. Table-level
    foreign_keys entries pointing at cfg_* tables are removed (the inline
    clause replaces them). Returns the number of columns rewritten.
    """
    rewritten = 0
    fks = table.get("foreign_keys") or []
    new_fks: list[dict] = []
    # NOTE(review): assumes every column dict carries a "name" key (KeyError
    # otherwise) — other passes use .get(); confirm upstream guarantees this.
    cols_by_name = {c["name"]: c for c in table.get("columns") or []}
    cfg_targets: dict[str, str] = {}

    # Pass 1 — table-level FK entries that point at known cfg_* tables.
    for fk in fks:
        ref = fk.get("references") or ""
        m = re.search(
            r"(?:\w+\.)?(cfg_[a-zA-Z0-9_]+?)(?:_enum)?\s*\(\w+\)", ref
        )
        if m and m.group(1) in cfg_tables:
            cfg_targets[fk["column"]] = m.group(1)
        else:
            new_fks.append(fk)

    # Pass 2 — inline REFERENCES, only when pass 1 found nothing.
    # NOTE(review): presumably intentional (table-level FKs win); confirm the
    # LLM never emits both forms for different columns of one table.
    if not cfg_targets:
        for col in table.get("columns") or []:
            cons = col.get("constraints") or ""
            m = re.search(
                r"REFERENCES\s+(?:\w+\.)?(cfg_[a-zA-Z0-9_]+?)(?:_enum)?\s*\(\w+\)",
                cons, re.IGNORECASE
            )
            if m and m.group(1) in cfg_tables:
                cfg_targets[col["name"]] = m.group(1)

    for orig_col, cfg_tbl in cfg_targets.items():
        new_col = orig_col if orig_col.endswith("_id") else f"{orig_col}_id"
        col = cols_by_name.get(orig_col)
        if col is None:
            # FK names a column the table doesn't actually have — skip.
            continue
        col["name"] = new_col
        col["type"] = "INT"
        cons = col.get("constraints") or ""
        # Strip any existing REFERENCES (we'll re-insert with canonical target)
        cons = re.sub(r"REFERENCES\s+\S+\s*\([^)]+\)", "", cons, flags=re.IGNORECASE)
        cons = re.sub(r"\s{2,}", " ", cons).strip()
        # Re-insert as inline REFERENCES so renderer keeps it on the column line
        # (cfg_* tables render first, so this is a backward ref — never deferred)
        target = f"{schema_name}.{cfg_tbl}({cfg_tbl}_id)"
        col["constraints"] = (cons + f" REFERENCES {target}").strip() if cons else f"REFERENCES {target}"
        # Update / clean the comment so it points at the canonical target
        prev_comment = col.get("comment") or ""
        # Drop any stale "References cfg_X.code" wording — the inline clause
        # is now self-documenting.
        prev_comment = re.sub(
            r"References\s+cfg_[a-zA-Z0-9_]+(?:\.\w+)?\.?\s*",
            "", prev_comment, flags=re.IGNORECASE,
        ).strip(" .;,")
        if prev_comment:
            col["comment"] = prev_comment
        elif "comment" in col:
            del col["comment"]
        rewritten += 1

    # Keep table-level FKs only for non-cfg references (the LLM's other FKs).
    table["foreign_keys"] = new_fks
    return rewritten


# --------------------------------------------------------------------------- #
# Step 5 — strip dangling FKs to non-existent tables
# --------------------------------------------------------------------------- #
def strip_dangling_fks(schema_json: dict) -> int:
    """Remove FK references whose target table does not exist in the schema.

    Handles both inline ``REFERENCES schema.table(col)`` clauses in column
    constraints and table-level ``foreign_keys`` entries. A stripped inline
    reference leaves an explanatory note in the column comment. Returns the
    number of references removed.
    """
    known = _all_table_names(schema_json)
    stripped = 0
    inline_pat = re.compile(
        r"REFERENCES\s+(?:(\w+)\.)?(\w+)\s*\(\s*(\w+)\s*\)", re.IGNORECASE
    )

    # Hoisted out of the loops — the original redefined this closure for
    # every column, but it only depends on `known`, which is invariant.
    def _maybe_strip(match: "re.Match[str]") -> str:
        target = match.group(2)
        return match.group(0) if target in known else ""

    for tbl in _all_tables(schema_json):
        for col in tbl.get("columns") or []:
            cons = col.get("constraints") or ""
            if not cons:
                continue
            new_cons = inline_pat.sub(_maybe_strip, cons)
            if new_cons != cons:
                # Collapse the whitespace hole left by the removed clause.
                col["constraints"] = re.sub(r"\s{2,}", " ", new_cons).strip()
                prev = (col.get("comment") or "").rstrip(". ")
                col["comment"] = (
                    (prev + ". " if prev else "")
                    + "FK target table missing — reference removed"
                )
                stripped += 1

        # Table-level FK entries: keep only those whose target table exists.
        kept: list[dict] = []
        for fk in tbl.get("foreign_keys") or []:
            ref = fk.get("references") or ""
            m = re.match(r"(?:\w+\.)?(\w+)\s*\(", ref)
            target = m.group(1) if m else None
            if target and target not in known:
                stripped += 1
                continue
            kept.append(fk)
        tbl["foreign_keys"] = kept
    return stripped


def repair_fk_types(schema_json: dict) -> int:
    """Coerce FK columns that reference integer PKs to INT.

    The LLM occasionally emits UUID or BIGINT for FK columns even though all
    PKs are SERIAL (integer).  A type mismatch is a hard Postgres error:
    'Key columns X and id are of incompatible types: uuid and integer.'
    """
    wrong_type = re.compile(r"\bUUID\b|\bBIGINT\b", re.IGNORECASE)
    has_ref = re.compile(r"\bREFERENCES\b", re.IGNORECASE)
    fixed = 0
    for tbl in _all_tables(schema_json):
        for col in tbl.get("columns") or []:
            if not has_ref.search(col.get("constraints") or ""):
                continue  # not an FK column — leave the type alone
            if wrong_type.search(col.get("type") or ""):
                col["type"] = "INT"
                fixed += 1
    return fixed


def repair_fk_referenced_columns(schema_json: dict) -> int:
    """Rewrite FK references that name a non-existent column to use the real PK.

    The LLM sometimes writes REFERENCES schema.bmiresult(bmi_result_id) when
    the actual PK column is `id`, producing:
      'column bmi_result_id referenced in foreign key constraint does not exist'

    For every REFERENCES clause we look up the target table's real PK column
    name and replace the wrong column with the correct one.
    """
    # Build map: table_name → pk_column_name
    pk_map: dict[str, str] = {}
    for tbl in _all_tables(schema_json):
        name = tbl.get("table_name", "")
        for col in tbl.get("columns") or []:
            if "PRIMARY KEY" in (col.get("constraints") or "").upper():
                pk_map[name] = col["name"]
                break
        else:
            # No explicit PK column found — assume the conventional `id`.
            pk_map.setdefault(name, "id")

    ref_pat = re.compile(
        r"(REFERENCES\s+(?:\w+\.)?(\w+)\s*\(\s*)(\w+)(\s*\))",
        re.IGNORECASE,
    )
    fixed = 0

    def _fix_ref(m: "re.Match[str]") -> str:
        # Swap the referenced column for the target table's real PK name.
        nonlocal fixed
        prefix, tbl_name, col_name, suffix = m.group(1), m.group(2), m.group(3), m.group(4)
        real_pk = pk_map.get(tbl_name)
        if real_pk and col_name != real_pk:
            fixed += 1
            return prefix + real_pk + suffix
        return m.group(0)

    for tbl in _all_tables(schema_json):
        for col in tbl.get("columns") or []:
            cons = col.get("constraints") or ""
            if "REFERENCES" not in cons.upper():
                continue
            new_cons = ref_pat.sub(_fix_ref, cons)
            if new_cons != cons:
                col["constraints"] = new_cons

        for fk in tbl.get("foreign_keys") or []:
            ref = fk.get("references") or ""
            if "REFERENCES" not in ref.upper():
                # foreign_keys entries store just the target, e.g. "schema.table(col)"
                # NOTE(review): entries that DO contain the REFERENCES keyword are
                # skipped entirely — presumably they never occur here; confirm.
                m = re.match(r"((?:\w+\.)?(\w+)\s*\(\s*)(\w+)(\s*\))", ref)
                if m:
                    prefix, tbl_name, col_name, suffix = m.group(1), m.group(2), m.group(3), m.group(4)
                    real_pk = pk_map.get(tbl_name)
                    if real_pk and col_name != real_pk:
                        fk["references"] = prefix + real_pk + suffix
                        fixed += 1

    return fixed


# --------------------------------------------------------------------------- #
# Step 6 — backfill cfg seed values from CIR workflow states
# --------------------------------------------------------------------------- #
def backfill_seeds_from_cir(schema_json: dict, cir: dict) -> int:
    """Fill empty cfg_* seed lists from CIR workflow states.

    Workflow states are grouped by the snake-cased bound entity (falling back
    to the workflow name). Each cfg_* table that has no seed values yet is
    matched by its stem — common status/type suffixes stripped — first
    exactly, then by substring overlap. Returns the number of tables
    backfilled.
    """
    workflows = [
        wf for wf in (cir.get("workflows") or [])
        if isinstance(wf, dict) and (wf.get("states") or [])
    ]
    if not workflows:
        return 0
    by_entity: dict[str, list[str]] = {}
    for wf in workflows:
        bound = wf.get("bound_entity") or wf.get("name") or ""
        key = _snake(bound)
        if key:
            by_entity.setdefault(key, []).extend(
                str(s) for s in (wf.get("states") or []) if s
            )

    backfilled = 0
    for tbl in _all_tables(schema_json):
        name = tbl.get("table_name", "")
        if not name.startswith("cfg_"):
            continue
        if tbl.get("_seed_values") or tbl.get("values"):
            continue  # already has seeds — never overwrite
        # cfg_order_status → stem "order" (first matching suffix stripped).
        stem = name[4:]
        for suffix in ("_status", "_state", "_type", "_category", "_kind"):
            if stem.endswith(suffix):
                stem = stem[: -len(suffix)]
                break
        states = by_entity.get(stem)
        if not states:
            # Fuzzy fallback: first entity whose key overlaps the stem.
            for k, v in by_entity.items():
                if k in stem or stem in k:
                    states = v
                    break
        if states:
            tbl["_seed_values"] = states
            backfilled += 1
    return backfilled


# --------------------------------------------------------------------------- #
# Step 7 — surface CIR rules into table comments / assumptions
# --------------------------------------------------------------------------- #
def _find_table_for_rule(schema_json: dict, applies_to: str) -> dict | None:
    """Best-effort match of a CIR rule's applies_to entity to a schema table.

    First pass: exact or substring match against business tables (cfg_*
    skipped). Second pass, for plural targets only: retry with the naive
    singular form (trailing 's' dropped).
    """
    if not applies_to:
        return None
    target = _snake(applies_to)
    tables = _all_tables(schema_json)

    for tbl in tables:
        name = tbl.get("table_name", "")
        if not name or name.startswith("cfg_"):
            continue
        if name == target or target in name or name in target:
            return tbl

    if not target.endswith("s"):
        return None
    singular = target[:-1]
    for tbl in tables:
        name = tbl.get("table_name", "")
        if name == singular or singular in name:
            return tbl
    return None


def _rule_already_present(rule_name: str, table: dict) -> bool:
    """True when the rule's snake-cased name already appears in the table's
    comment or in any column comment/constraint (case-insensitive)."""
    needle = _snake(rule_name)
    parts: list[str] = []
    if table.get("comment"):
        parts.append(str(table["comment"]).lower())
    for col in table.get("columns") or []:
        for key in ("comment", "constraints"):
            if col.get(key):
                parts.append(str(col[key]).lower())
    return needle in " ".join(parts)


def surface_cir_rules(schema_json: dict, cir: dict) -> dict:
    """Surface CIR business rules into the schema.

    Rules that match a business table are appended to that table's comment;
    the rest are recorded in the top-level assumptions list. Rules already
    present (by snake-cased name) are only counted. Returns a stats dict
    with one counter per outcome.
    """
    rules = [r for r in (cir.get("rules") or []) if isinstance(r, dict) and r.get("name")]
    stats = {
        "rules_total": len(rules),
        "appended_to_table_comment": 0,
        "added_to_assumptions": 0,
        "already_present": 0,
    }
    if not rules:
        return stats
    schema_json.setdefault("assumptions", [])

    for rule in rules:
        rule_name = rule.get("name", "")
        rule_desc = (rule.get("description") or "").strip()
        applies_to = rule.get("applies_to", "")
        table = _find_table_for_rule(schema_json, applies_to)

        if table is None:
            # No matching table — record as a schema-level assumption.
            needle = _snake(rule_name)
            existing = schema_json.get("assumptions") or []
            if any(needle in str(a).lower() for a in existing):
                stats["already_present"] += 1
                continue
            schema_json["assumptions"].append(
                f"Rule {rule_name} (applies to {applies_to or 'system'}): {rule_desc}"
            )
            stats["added_to_assumptions"] += 1
            continue

        if _rule_already_present(rule_name, table):
            stats["already_present"] += 1
            continue
        current = (table.get("comment") or "").rstrip(". ")
        addition = f"Rule {rule_name}: {rule_desc}"
        table["comment"] = f"{current}. {addition}" if current else addition
        stats["appended_to_table_comment"] += 1
    return stats


# --------------------------------------------------------------------------- #
# Match a cfg_* stem to its likely business table — multi-strategy
# --------------------------------------------------------------------------- #
# Industry-standard abbreviations the LLM commonly uses for cfg names even
# when the business table uses the long form. Bidirectional match.
# Keys are cfg stems; values are snake_case business-table names. Looked up
# in _match_business_for_cfg_stem (strategy 3); extend as new domain
# abbreviations appear.
_BIZ_ABBREVIATIONS = {
    "asn":  "advanced_shipment_notice",
    "rcr":  "receiving_confirmation_report",
    "po":   "purchase_order",
    "so":   "sales_order",
    "wo":   "work_order",
    "rfq":  "request_for_quotation",
    "rma":  "return_merchandise_authorization",
    "bol":  "bill_of_lading",
    "gr":   "goods_receipt",
    "pcn":  "product_change_notice",
    "ecn":  "engineering_change_notice",
    "kpi":  "key_performance_indicator",
    "sla":  "service_level_agreement",
    "sku":  "stock_keeping_unit",
    "upc":  "universal_product_code",
    "ean":  "european_article_number",
    "pos":  "point_of_sale",
    "crm":  "customer_relationship",
    "erp":  "enterprise_resource_plan",
    "wms":  "warehouse_management",
}


def _initials(name: str) -> str:
    """`receiving_confirmation_report` → `rcr`."""
    return "".join(p[:1] for p in (name or "").split("_") if p).lower()


def _singularise(name: str) -> str:
    """Cheap English singulariser. `orders` → `order`, `entities` → `entity`."""
    if not name or len(name) < 3:
        return name
    if name.endswith("ies"):
        return name[:-3] + "y"
    if name.endswith("ses") or name.endswith("xes") or name.endswith("zes"):
        return name[:-2]
    if name.endswith("s") and not name.endswith("ss"):
        return name[:-1]
    return name


def _match_business_for_cfg_stem(stem: str, business_index: dict[str, dict]) -> dict | None:
    """Try every reasonable strategy to match a cfg_<stem>_<suffix> to an
    existing business table. Returns the first matching table, or None.

    Strategies, in priority order:
      1. Exact match (`stem == table_name`)
      2. Singular/plural pivot (`stem` <-> singular(table_name))
      3. Industry abbreviation expansion (`asn` → `advanced_shipment_notice`)
      4. Initials match (`rcr` matches initials of `receiving_confirmation_report`)
      5. Substring match (longest overlap wins)
      6. Reverse abbreviation lookup (table name's initials == stem)
    """
    if not stem or not business_index:
        return None

    # Strategy 1 — exact
    if stem in business_index:
        return business_index[stem]

    # Strategy 2 — plural/singular pivot
    sing_stem = _singularise(stem)
    if sing_stem in business_index:
        return business_index[sing_stem]
    for tname in business_index:
        if _singularise(tname) == stem:
            return business_index[tname]

    # Strategy 3 — known abbreviation expansion
    expanded = _BIZ_ABBREVIATIONS.get(stem)
    if expanded:
        if expanded in business_index:
            return business_index[expanded]
        # also try plural/singular of the expansion
        for tname in business_index:
            if tname == expanded or _singularise(tname) == expanded:
                return business_index[tname]

    # Strategy 4 — initials match (forward: table → initials == stem)
    for tname, tbl in business_index.items():
        if _initials(tname) == stem:
            return tbl

    # Strategy 5 — substring overlap (longest wins)
    # Strict '>' keeps the first-seen table on equal-length ties, so the
    # result is stable for a given insertion order.
    best, best_overlap = None, 0
    for tname, tbl in business_index.items():
        if stem in tname or tname in stem:
            overlap = min(len(stem), len(tname))
            if overlap > best_overlap:
                best, best_overlap = tbl, overlap
    if best is not None:
        return best

    # Strategy 6 — reverse: stem might itself be an initialism we don't have
    # in the dictionary, but matches some business table's initials anyway
    if 2 <= len(stem) <= 5:  # plausible initialism length
        for tname, tbl in business_index.items():
            if "_" in tname and _initials(tname).startswith(stem):
                return tbl

    return None


# --------------------------------------------------------------------------- #
# Generic archetype-default status values (fallback only — used when neither
# the LLM nor the upstream artifacts have data for an entity)
# --------------------------------------------------------------------------- #
# Keyed by the archetype labels _classify_archetype_local returns. Values are
# seed codes in workflow order. NOTE(review): "_GENERIC" is presumably the
# catch-all consumed by code beyond this view — confirm at the usage site.
_ARCHETYPE_STATUS_DEFAULTS: dict[str, list[str]] = {
    "DOCUMENT":   ["draft", "open", "in_progress", "closed", "cancelled"],
    "MASTER":     ["active", "inactive", "archived"],
    "ASSIGNMENT": ["pending", "active", "completed", "cancelled"],
    "LINE_ITEM":  ["pending", "received", "short", "rejected"],
    "EVENT_LOG":  ["new", "acknowledged", "resolved"],
    "_GENERIC":   ["active", "inactive"],
}


def _classify_archetype_local(tbl: dict) -> str | None:
    """Early archetype classifier for passes that run before
    auto_create_history_tables' sibling _classify_archetype is defined.

    Returns one of DOCUMENT, MASTER, LINE_ITEM, ASSIGNMENT, EVENT_LOG, or
    None when *tbl* is not a business table or matches no heuristic.

    NOTE(review): the original docstring claimed "same logic as
    _classify_archetype", but the trailing fallbacks differ — here
    has_doc_num alone yields DOCUMENT, and has_doc_num + has_name yields
    MASTER even when a business date column exists, both of which the
    later classifier rejects. Confirm whether the two should be unified.
    """
    if not _is_business_table(tbl):
        return None
    cols = tbl.get("columns") or []
    # Only truthy names take part in the membership heuristics below.
    col_names = {c.get("name") for c in cols if c.get("name")}
    name = tbl.get("table_name", "")
    # EVENT_LOG: log-style table name or event/actor-shaped columns.
    if name.endswith("_log") or name in {"audit_log", "notifications"}:
        return "EVENT_LOG"
    if any(n in col_names for n in {"event_type", "event_timestamp", "actor_id"}):
        return "EVENT_LOG"
    # LINE_ITEM: quantity and price columns present together.
    has_qty = any(n in col_names for n in {"quantity", "qty", "quantity_expected"})
    has_price = any(n in col_names for n in {"unit_price", "price", "line_total"})
    if has_qty and has_price:
        return "LINE_ITEM"
    # ASSIGNMENT: assignment-lifecycle columns.
    if any(n in col_names for n in {"assigned_at", "released_at", "assigned_to"}):
        return "ASSIGNMENT"
    # "Document number": any UNIQUE column other than the PK / submenu FK.
    has_doc_num = any(
        "UNIQUE" in (c.get("constraints") or "").upper()
        for c in cols if c.get("name") not in {"id", "submenu_id"}
    )
    has_status = any(n in col_names for n in {"status_id", "status"})
    # Business date column: *_date / *_at, excluding the audit timestamps.
    # (`or ""` guards against an explicit `"name": None` in the column dict.)
    has_date = any(
        re.search(r"_(date|at)$", c.get("name", "") or "") and
        c.get("name") not in {"created_at", "updated_at", "deleted_at"}
        for c in cols
    )
    if has_doc_num and has_status and has_date:
        return "DOCUMENT"
    has_name = any(n in col_names for n in {"name", "label", "title"}) or any(
        (n or "").endswith("_name") for n in col_names if isinstance(n, str)
    )
    if has_doc_num and (has_name or has_status):
        return "MASTER"
    return "DOCUMENT" if has_doc_num else None


def _is_business_table(tbl: dict) -> bool:
    """Same as the function defined later — duplicated here to avoid forward-ref."""
    name = tbl.get("table_name", "")
    if not name or name in {"menu", "submenu"}:
        return False
    if name.startswith("cfg_") or name.endswith("_history") or name.endswith("_settings"):
        return False
    return True


def _ensure_every_cfg_has_seeds(schema_json: dict, bag=None) -> int:
    """Backstop pass: give every cfg_* table a seed-value list.

    Selection order per table:
      1. existing _seed_values win — the table is skipped
      2. bag lookup keyed by the entity stem from cfg_<stem>_<suffix>
      3. richer per-suffix defaults
      4. archetype defaults mapped from the suffix (generic last resort)

    Returns the number of cfg tables whose seeds were filled in.
    """
    suffix_to_archetype = {
        "status":   "DOCUMENT",   # most cfg_X_status fits document/master statuses
        "type":     "DOCUMENT",
        "category": "DOCUMENT",
        "priority": "_GENERIC",   # priority is its own thing — handled inline
        "severity": "EVENT_LOG",
        "level":    "_GENERIC",
        "kind":     "_GENERIC",
        "stage":    "DOCUMENT",
        "phase":    "DOCUMENT",
        "condition":"_GENERIC",
        "role":     "_GENERIC",
    }
    # Suffix-specific seed lists — richer than the archetype-only fallback.
    suffix_defaults = {
        "priority": ["low", "medium", "high", "urgent"],
        "type":     ["standard", "expedited", "special"],
        "category": ["primary", "secondary", "tertiary"],
        "severity": ["info", "warning", "error", "critical"],
        "role":     ["admin", "user", "viewer"],
    }
    known_suffixes = ("status", "type", "category", "priority", "severity",
                      "level", "kind", "stage", "phase", "condition", "role")
    filled = 0
    for tbl in _all_tables(schema_json):
        table_name = tbl.get("table_name", "")
        if not table_name.startswith("cfg_") or tbl.get("_seed_values"):
            continue

        # Split cfg_<stem>_<suffix>; an unknown suffix leaves the whole
        # remainder as the stem and the suffix empty.
        stem = table_name[4:]
        suffix = ""
        for candidate in known_suffixes:
            if stem.endswith("_" + candidate):
                suffix = candidate
                stem = stem[: -(len(candidate) + 1)]
                break

        seeds: list[str] = []
        # 2. bag lookup — best-effort; any failure falls through to defaults
        if bag is not None and stem:
            try:
                found = (bag.lookup_status_values(stem) if suffix == "status"
                         else bag.lookup_field_values(stem, suffix))
                seeds = found or []
            except Exception:
                seeds = []

        # 3. suffix-specific default
        if not seeds:
            seeds = list(suffix_defaults.get(suffix, []))

        # 4. archetype default via the suffix → archetype map
        if not seeds:
            archetype = suffix_to_archetype.get(suffix, "_GENERIC")
            seeds = list(_ARCHETYPE_STATUS_DEFAULTS.get(
                archetype, _ARCHETYPE_STATUS_DEFAULTS["_GENERIC"]))

        if seeds:
            tbl["_seed_values"] = seeds
            filled += 1
    return filled


def enforce_status_per_business_table(schema_json: dict, bag=None) -> int:
    """For every DOCUMENT/MASTER business table without a status FK:
       1. Find status values from the bag (CIR workflows + field values + glossary)
          OR fall back to archetype defaults
       2. Auto-create cfg_<table>_status with those seeds
       3. Inject `status_id INT NOT NULL DEFAULT 1 REFERENCES cfg_<table>_status(...)`
          on the business table
       4. Attach a glossary-derived comment to the new cfg table when available

    Args:
        schema_json: Mantara schema dict — mutated in place.
        bag: optional upstream-artifact bag; lookup failures are swallowed
             and the archetype defaults are used instead.

    Idempotent. Returns count of cfg+FK pairs created (backfilling seeds on
    an LLM-emitted cfg table is also counted as one fix).
    """
    schema_name = schema_json.get("schema_name") or "schema"
    cfg_tables = _existing_cfg_tables(schema_json)
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    # Submenu ids for the Configuration menu live in its menu_id*100 block;
    # pick the next free id at or above menu_id*100 + 1.
    existing_sm_ids = {sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []}
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        next_sm = cfg_menu_id * 100 + 1

    created = 0
    for tbl in _all_tables(schema_json):
        if not _is_business_table(tbl):
            continue
        archetype = _classify_archetype_local(tbl)
        if archetype not in ("DOCUMENT", "MASTER", "LINE_ITEM", "ASSIGNMENT"):
            continue

        # Already has a status FK (or a raw status column)? Leave it alone.
        cols = tbl.get("columns") or []
        existing_names = {c.get("name") for c in cols}
        if "status_id" in existing_names or "status" in existing_names:
            continue

        biz_name = tbl["table_name"]
        cfg_name = f"cfg_{biz_name}_status"

        # Source seeds — bag first, then archetype fallback
        seeds: list[str] = []
        seed_source = "archetype_default"
        if bag is not None:
            try:
                seeds = bag.lookup_status_values(biz_name) or []
                if seeds:
                    seed_source = "input_artifacts"
            except Exception:
                # Bag is best-effort; fall through to archetype defaults.
                seeds = []
        if not seeds:
            seeds = list(_ARCHETYPE_STATUS_DEFAULTS.get(archetype, _ARCHETYPE_STATUS_DEFAULTS["_GENERIC"]))

        # If the cfg already exists (LLM emitted it), backfill seeds + ensure
        # status_id FK on the business table — but don't recreate the table.
        if cfg_name in cfg_tables:
            existing_cfg = cfg_tables[cfg_name]
            if not existing_cfg.get("_seed_values"):
                existing_cfg["_seed_values"] = list(seeds)
                created += 1  # count seed-population as a fix
            # also ensure the business table actually FKs to it
            # NOTE(review): substring test on constraints; and the column is
            # inserted at fixed index 1 here, while the create path below
            # inserts after submenu_id — confirm the asymmetry is intended.
            biz_has_fk = any(
                cfg_name in (c.get("constraints") or "")
                for c in cols
            )
            if not biz_has_fk:
                cfg_pk = f"{cfg_name}_id"
                cols.insert(1, {
                    "name": "status_id",
                    "type": "INT",
                    "constraints": (
                        f"NOT NULL DEFAULT 1 "
                        f"REFERENCES {schema_name}.{cfg_name}({cfg_pk})"
                    ),
                    "comment": f"Auto-injected FK to existing cfg ({seed_source})",
                })
            continue

        # Glossary-derived comment for the cfg table
        comment = None
        if bag is not None:
            try:
                ent_comment = bag.lookup_entity_comment(biz_name)
                if ent_comment:
                    comment = f"Status lookup for {biz_name}. {ent_comment}"
            except Exception:
                pass
        if not comment:
            comment = (
                f"Status lookup for {biz_name}. "
                f"{'Seed values from input.' if seed_source == 'input_artifacts' else 'Generic placeholder seeds — operator may customise via admin UI.'}"
            )

        # Create the cfg table
        cfg_table = _build_cfg_table(
            cfg_name=cfg_name,
            values=seeds,
            description=comment,
            submenu_id=next_sm,
            schema_name=schema_name,
        )
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": f"{biz_name.replace('_', ' ').title()} Status",
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": f"Configurable status values for {biz_name}",
            "tables": [cfg_table],
        })
        # Register so a later table with the same name cannot re-create it.
        cfg_tables[cfg_name] = cfg_table
        next_sm += 1

        # Inject status_id FK column on the business table
        cfg_pk = f"{cfg_name}_id"
        new_col = {
            "name": "status_id",
            "type": "INT",
            "constraints": (
                f"NOT NULL DEFAULT 1 "
                f"REFERENCES {schema_name}.{cfg_name}({cfg_pk})"
            ),
            "comment": f"Auto-injected: {seed_source}",
        }
        # Insert after submenu_id if present, else after id
        insert_at = 1
        for i, c in enumerate(cols):
            if c.get("name") == "submenu_id":
                insert_at = i + 1
                break
        cols.insert(insert_at, new_col)
        created += 1

    return created


# --------------------------------------------------------------------------- #
# Fix A — Inject missing FK columns when an orphan cfg_* table exists
# --------------------------------------------------------------------------- #
def inject_missing_cfg_fks(schema_json: dict) -> int:
    """For every cfg_<X>_<suffix> table that no business table FK-references,
    find a business table whose name matches X (or close match) and inject
    the missing `<suffix>_id INT NOT NULL DEFAULT 1 REFERENCES cfg_X_suffix(...)`
    column.

    Args:
        schema_json: Mantara schema dict — mutated in place.

    Idempotent. Returns count of FK columns injected.

    Catches the class of bug where the LLM emits a cfg lookup but forgets the
    business-table column that references it (the orphan-cfg problem).
    """
    schema_name = schema_json.get("schema_name") or "schema"
    cfg_tables = _existing_cfg_tables(schema_json)
    if not cfg_tables:
        return 0

    # Collect all currently-referenced cfg targets (so we know which are orphan)
    # Matches `cfg_foo(` with an optional `<schema>.` prefix, in both inline
    # column constraints and structured foreign_keys entries.
    referenced: set[str] = set()
    pat = re.compile(r"(?:\w+\.)?(cfg_[a-zA-Z0-9_]+)\s*\(", re.IGNORECASE)
    for tbl in _all_tables(schema_json):
        if tbl.get("table_name", "").startswith("cfg_"):
            continue  # cfg→cfg references don't count as business usage
        for col in tbl.get("columns") or []:
            for m in pat.finditer(col.get("constraints") or ""):
                referenced.add(m.group(1))
        for fk in tbl.get("foreign_keys") or []:
            for m in pat.finditer(fk.get("references") or ""):
                referenced.add(m.group(1))

    # Candidate FK hosts: business tables only (no cfg/history/settings).
    business_index = {
        tbl["table_name"]: tbl
        for tbl in _all_tables(schema_json)
        if tbl.get("table_name") and not tbl["table_name"].startswith("cfg_")
        and not tbl["table_name"].endswith("_history")
        and not tbl["table_name"].endswith("_settings")
    }
    injected = 0
    cfg_pat = re.compile(r"^cfg_(.+?)_(status|type|category|priority|severity|level|kind|stage|phase|condition|role)$")

    # Sorted for deterministic injection order across runs.
    for cfg_name in sorted(cfg_tables.keys()):
        if cfg_name in referenced:
            continue
        m = cfg_pat.match(cfg_name)
        if not m:
            continue  # unrecognised cfg shape — nothing to wire
        entity_part, suffix = m.group(1), m.group(2)
        col_name = f"{suffix}_id"

        # Find the matching business table — multi-strategy in priority order.
        target = _match_business_for_cfg_stem(entity_part, business_index)
        if target is None:
            continue

        # NOTE(review): only `<suffix>_id` blocks injection; a plain `status`
        # or `type` column on the target would not — confirm that's desired.
        existing_cols = {c.get("name") for c in target.get("columns") or []}
        if col_name in existing_cols:
            continue

        cfg_pk = f"{cfg_name}_id"
        new_col = {
            "name": col_name,
            "type": "INT",
            "constraints": (
                f"NOT NULL DEFAULT 1 "
                f"REFERENCES {schema_name}.{cfg_name}({cfg_pk})"
            ),
            "comment": f"Auto-injected: {cfg_name} was orphan; wired here via name match",
        }
        cols = target.setdefault("columns", [])
        # Insert after `submenu_id` if present, else after `id`
        insert_at = 1
        for i, c in enumerate(cols):
            if c.get("name") == "submenu_id":
                insert_at = i + 1
                break
        cols.insert(insert_at, new_col)
        injected += 1

    return injected


# --------------------------------------------------------------------------- #
# Fix C — Dedupe tables (keep first definition, drop subsequent)
# --------------------------------------------------------------------------- #
def dedupe_tables(schema_json: dict) -> int:
    """Drop subsequent definitions of any table_name already seen.

    The LLM occasionally emits the same table twice (e.g., `audit_log` under
    two different menus) which would crash CREATE TABLE on Postgres. Keep
    the first occurrence in render order; remove duplicates.

    Fix: entries without a usable table_name (or non-dict entries) cannot be
    deduplicated, so they are now kept as-is instead of being silently
    discarded — the previous behaviour deleted content this function was
    never asked to remove, and did so without counting it in the result.

    Idempotent. Returns count of duplicates removed.
    """
    seen: set[str] = set()
    removed = 0
    for menu in schema_json.get("menus", []) or []:
        for sub in menu.get("submenus") or []:
            tables = sub.get("tables") or []
            kept = []
            for tbl in tables:
                name = tbl.get("table_name") if isinstance(tbl, dict) else None
                if not name:
                    # Unnamed/malformed entry — not dedupable; preserve it.
                    kept.append(tbl)
                    continue
                if name in seen:
                    removed += 1
                    continue
                seen.add(name)
                kept.append(tbl)
            sub["tables"] = kept
    return removed


# --------------------------------------------------------------------------- #
# Step 8 — Richness booster: SaaS audit columns (deterministic, generic)
# --------------------------------------------------------------------------- #
# Column names considered audit/system boilerplate. NOTE(review): no consumer
# is visible in this chunk — presumably used elsewhere to exclude these names
# from "richness" checks; verify against the rest of the file.
_BOILERPLATE_NAMES = {
    "id", "uid", "submenu_id",
    "user_id", "created_by", "updated_by", "parent_id", "obtained_marks",
    "created_at", "updated_at", "deleted",
    "is_parent_deleted", "is_auto", "is_draft",
    # legacy columns (preserved if already present in schema JSON)
    "is_active", "deleted_at", "deleted_by", "version", "notes", "metadata",
}

# Standard columns injected into every non-system, non-cfg table.
# HEAD group: inserted after submenu_id; TAIL group: appended at end.
# inject_saas_columns copies each spec via dict(spec), so these templates
# are never mutated in place.
_STANDARD_COLUMNS = [
    # HEAD — identity and ownership
    {"name": "uid",               "type": "VARCHAR",     "constraints": "NOT NULL UNIQUE"},
    {"name": "user_id",           "type": "BIGINT",      "constraints": "NOT NULL"},
    {"name": "created_by",        "type": "BIGINT",      "constraints": ""},
    {"name": "updated_by",        "type": "BIGINT",      "constraints": ""},
    {"name": "parent_id",         "type": "BIGINT",      "constraints": ""},
    {"name": "obtained_marks",    "type": "FLOAT8",      "constraints": ""},
    # TAIL — timestamps and soft-delete flags
    {"name": "created_at",        "type": "TIMESTAMPTZ", "constraints": "DEFAULT now() NOT NULL"},
    {"name": "updated_at",        "type": "TIMESTAMPTZ", "constraints": "DEFAULT now() NOT NULL"},
    {"name": "deleted",           "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
    {"name": "is_parent_deleted", "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
    {"name": "is_auto",           "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
    {"name": "is_draft",          "type": "BOOLEAN",     "constraints": "DEFAULT false NOT NULL"},
]

# Membership sets used to split _STANDARD_COLUMNS into its two insertion
# groups (see inject_saas_columns).
_HEAD_COLUMN_NAMES = {"uid", "user_id", "created_by", "updated_by", "parent_id", "obtained_marks"}
_TAIL_COLUMN_NAMES = {"created_at", "updated_at", "deleted", "is_parent_deleted", "is_auto", "is_draft"}


def _is_business_table(tbl: dict) -> bool:
    """Decide whether a table is a 'business' table that should get richness."""
    name = tbl.get("table_name", "")
    if not name:
        return False
    if name in {"menu", "submenu"}:
        return False
    if name.startswith("cfg_"):
        return False
    if name.endswith("_history"):
        return False
    if name.endswith("_settings"):
        return False  # settings are sparse key/value tables
    return True


def _needs_standard_columns(tbl: dict) -> bool:
    """Tables that receive the full standard column set (all except cfg_*, menu, submenu)."""
    name = tbl.get("table_name", "")
    if not name or name in {"menu", "submenu"}:
        return False
    if name.startswith("cfg_"):
        return False
    return True


def inject_saas_columns(schema_json: dict) -> int:
    """Ensure the standard column set on every non-system, non-cfg table.

    HEAD columns (uid, user_id, created_by, updated_by, parent_id,
    obtained_marks) are inserted right after submenu_id; TAIL columns
    (created_at, updated_at, deleted, is_parent_deleted, is_auto, is_draft)
    are appended at the end. Pre-existing created_at / updated_at are
    upgraded to TIMESTAMPTZ DEFAULT now().

    Idempotent. Returns count of columns injected or upgraded.
    """
    touched = 0
    for tbl in _all_tables(schema_json):
        if not _needs_standard_columns(tbl):
            continue
        cols = tbl.setdefault("columns", [])
        present = {c.get("name") for c in cols}

        # Normalise timestamp columns that already exist on the table.
        for col in cols:
            if col.get("name") not in ("created_at", "updated_at"):
                continue
            type_ok = col.get("type", "").upper() == "TIMESTAMPTZ"
            default_ok = "now()" in (col.get("constraints") or "").lower()
            if not (type_ok and default_ok):
                col["type"] = "TIMESTAMPTZ"
                col["constraints"] = "DEFAULT now() NOT NULL"
                touched += 1

        # Split the missing specs into their insertion groups.
        missing = [s for s in _STANDARD_COLUMNS if s["name"] not in present]
        head_specs = [s for s in missing if s["name"] in _HEAD_COLUMN_NAMES]
        tail_specs = [s for s in missing if s["name"] in _TAIL_COLUMN_NAMES]

        if head_specs:
            # Anchor right after submenu_id, falling back to just after id.
            anchor = 1
            for i, col in enumerate(cols):
                if col.get("name") == "submenu_id":
                    anchor = i + 1
                    break
            # Reverse insertion at a fixed anchor preserves spec order.
            for spec in reversed(head_specs):
                cols.insert(anchor, dict(spec))
                touched += 1

        for spec in tail_specs:
            cols.append(dict(spec))
            touched += 1

    return touched


# --------------------------------------------------------------------------- #
# Step 9 — Auto-create paired *_history tables for DOCUMENT and MASTER entities
# --------------------------------------------------------------------------- #
def _classify_archetype(tbl: dict) -> str | None:
    """Heuristically classify a business table into one of the v8 archetypes.

    Returns one of: DOCUMENT, MASTER, LINE_ITEM, ASSIGNMENT, EVENT_LOG, None.
    Heuristics use column shape only — no entity-name hardcoding.

    Fix: a column dict carrying an explicit ``"name": None`` previously
    reached ``re.search`` as None and raised TypeError; the name is now
    coerced to "" first, matching _classify_archetype_local's guard.
    """
    if not _is_business_table(tbl):
        return None
    cols = tbl.get("columns") or []
    # Only truthy names take part in the membership heuristics below.
    col_names = {c.get("name") for c in cols if c.get("name")}
    name = tbl.get("table_name", "")

    # EVENT_LOG: name-suffix or has event_* / action / actor columns
    if name.endswith("_log") or name in {"audit_log", "notifications"}:
        return "EVENT_LOG"
    if any(n in col_names for n in {"event_type", "event_timestamp", "actor_id"}):
        return "EVENT_LOG"

    # LINE_ITEM: has parent FK + qty/quantity + price
    has_qty = any(n in col_names for n in {"quantity", "qty", "quantity_expected"})
    has_price = any(n in col_names for n in {"unit_price", "price", "line_total"})
    if has_qty and has_price:
        return "LINE_ITEM"

    # ASSIGNMENT: has assigned_at / released_at / assigned_to
    if any(n in col_names for n in {"assigned_at", "released_at", "assigned_to"}):
        return "ASSIGNMENT"

    # DOCUMENT: has any UNIQUE business key + status_id + a date column.
    # The business key may be `<doc>_number`, `<doc>_id` (LLM naming), or any
    # other UNIQUE column on the table.
    has_doc_num = any(
        "UNIQUE" in (c.get("constraints") or "").upper()
        for c in cols
        if c.get("name") not in {"id", "submenu_id"}
    )
    has_status = any(n in col_names for n in {"status_id", "status"})
    # Business date column: *_date / *_at, excluding the audit timestamps.
    # `or ""` guards against an explicit `"name": None` in the column dict.
    has_date = any(
        re.search(r"_(date|at)$", c.get("name", "") or "")
        and c.get("name") not in {"created_at", "updated_at", "deleted_at"}
        for c in cols
    )
    if has_doc_num and has_status and has_date:
        return "DOCUMENT"

    # MASTER: has unique business key + name/description but no document-y dates
    has_name = any(n in col_names for n in {"name", "label", "title"}) or any(
        n.endswith("_name") for n in col_names if isinstance(n, str)
    )
    if has_doc_num and has_name and not has_date:
        return "MASTER"
    if has_doc_num and has_status:
        return "MASTER"

    return None


def auto_create_history_tables(schema_json: dict) -> int:
    """For every DOCUMENT and MASTER table, ensure a paired *_history exists.

    Idempotent. Places new history tables under a Configuration menu submenu.
    Generic across domains — uses universal change-tracking columns only.

    Args:
        schema_json: Mantara schema dict — mutated in place.

    Returns the number of history tables created.
    """
    schema_name = schema_json.get("schema_name") or "schema"
    existing_tables = _all_table_names(schema_json)

    # Pass 1 — collect parents that need a history twin, so we don't mutate
    # the menu tree while _all_tables is still iterating it.
    candidates: list[dict] = []
    for tbl in _all_tables(schema_json):
        arch = _classify_archetype(tbl)
        if arch in ("DOCUMENT", "MASTER"):
            history_name = f"{tbl['table_name']}_history"
            if history_name not in existing_tables:
                candidates.append(tbl)

    if not candidates:
        return 0

    # Pass 2 — allocate submenu ids from the Configuration menu's
    # menu_id*100 block, starting past any ids already taken.
    cfg_menu = _find_or_create_cfg_menu(schema_json)
    cfg_menu_id = cfg_menu["menu_id"]
    existing_sm_ids = {sm.get("submenu_id", 0) for sm in cfg_menu.get("submenus", []) or []}
    next_sm = max(existing_sm_ids, default=cfg_menu_id * 100) + 1
    if next_sm < cfg_menu_id * 100 + 1:
        next_sm = cfg_menu_id * 100 + 1

    created = 0
    for parent in candidates:
        parent_name = parent["table_name"]
        history_name = f"{parent_name}_history"
        # Universal change-tracking shape: who / what / when / from where,
        # with the full field snapshot stored as JSONB.
        history_table = {
            "table_name": history_name,
            "comment": f"Append-only audit trail for {parent_name}.",
            "columns": [
                {"name": "id", "type": "SERIAL", "constraints": "PRIMARY KEY"},
                {"name": "submenu_id", "type": "INT",
                 "constraints": f"NOT NULL DEFAULT {next_sm} REFERENCES {schema_name}.submenu(submenu_id)"},
                {"name": f"{parent_name}_id", "type": "INT",
                 "constraints": f"NOT NULL REFERENCES {schema_name}.{parent_name}(id)",
                 "comment": f"FK to {parent_name}"},
                {"name": "change_type", "type": "VARCHAR(30)",
                 "constraints": "NOT NULL CHECK (change_type IN ('CREATE','UPDATE','DELETE'))",
                 "comment": "Type of change"},
                {"name": "change_details", "type": "JSONB",
                 "constraints": "NOT NULL",
                 "comment": "Full snapshot of changed fields"},
                {"name": "changed_by", "type": "INT",
                 "constraints": "",
                 "comment": "Actor user id (FK added later if users table exists)"},
                {"name": "change_date", "type": "TIMESTAMP",
                 "constraints": "NOT NULL DEFAULT CURRENT_TIMESTAMP"},
                {"name": "ip_address", "type": "INET",
                 "constraints": "",
                 "comment": "Source IP of the change"},
                {"name": "created_at", "type": "TIMESTAMP",
                 "constraints": "DEFAULT CURRENT_TIMESTAMP"},
            ],
            "foreign_keys": [],
        }
        # Each history table gets its own submenu under the Configuration menu.
        cfg_menu.setdefault("submenus", []).append({
            "submenu_id": next_sm,
            "submenu_name": f"{parent_name.replace('_', ' ').title()} History",
            "sequence_number": next_sm - cfg_menu_id * 100,
            "description": f"Audit history for {parent_name}",
            "tables": [history_table],
        })
        next_sm += 1
        created += 1
    return created


# --------------------------------------------------------------------------- #
# Step 10 — Generate INDEX statements (deterministic, generic)
# --------------------------------------------------------------------------- #
def generate_index_statements(schema_json: dict) -> list[str]:
    """Build CREATE INDEX statements for the whole schema.

    Covered columns, per table:
      - every `*_id` FK column
      - every UNIQUE business key (Postgres already auto-indexes UNIQUE;
        emitted explicitly for documentation)
      - columns containing `_status` / `_type` / `_priority` / `_category`
      - activity flags and timestamps commonly used in WHERE clauses

    Deterministic and duplicate-free; business tables are emitted before
    cfg_* lookups, and the `id` PK column is skipped.
    """
    schema_name = schema_json.get("schema_name") or "schema"
    emitted: set[tuple[str, str]] = set()
    statements: list[str] = []

    def _emit(table: str, column: str) -> None:
        # One index per (table, column) pair — first request wins.
        if (table, column) in emitted:
            return
        emitted.add((table, column))
        statements.append(
            f"CREATE INDEX idx_{table}_{column} "
            f"ON {schema_name}.{table}({column});"
        )

    # Partition into business tables (first) and cfg lookups (second);
    # the menu/submenu system tables are excluded entirely.
    business: list[dict] = []
    lookups: list[dict] = []
    for tbl in _all_tables(schema_json):
        tname = tbl.get("table_name", "")
        if tname.startswith("cfg_"):
            lookups.append(tbl)
        elif tname not in {"menu", "submenu"}:
            business.append(tbl)

    hot_columns = {
        "is_active", "created_at", "updated_at", "deleted_at",
        "deleted", "created_by", "updated_by",
        "is_draft", "is_auto", "is_parent_deleted",
    }
    for tbl in business + lookups:
        tname = tbl.get("table_name", "")
        for col in tbl.get("columns") or []:
            cname = col.get("name") or ""
            if not cname or cname == "id":
                continue
            cons_upper = (col.get("constraints") or "").upper()
            wants_index = (
                cname.endswith("_id")                      # FK columns
                or "UNIQUE" in cons_upper                  # business keys
                or any(tag in cname for tag in
                       ("_status", "_type", "_priority", "_category"))
                or cname in hot_columns                    # hot filter columns
            )
            if wants_index:
                _emit(tname, cname)
    return statements


# --------------------------------------------------------------------------- #
# Public entry point — single call wires it all together
# --------------------------------------------------------------------------- #
def normalise(schema_json: dict, cir: dict | None = None,
              richness: bool = True,
              regenerate_name: bool = True,
              bag: Any = None) -> dict:
    """Run the full normalisation pipeline. Mutates schema_json. Idempotent.

    Args:
        schema_json: Mantara schema dict (will be mutated)
        cir: optional CIR dict for seed backfill + rule surfacing
        richness: when True (default), runs the deterministic richness booster
                  (SaaS audit columns, history tables, indexes). Set False if
                  you want pure structural correctness without column injection.
        regenerate_name: when True (default), replaces schema_name with a
                  unique `sch_<date>_<time>_<rand>` (36 chars). Idempotent —
                  skipped if schema_name already matches the sch_ pattern.
                  Pass False to keep the LLM-emitted name (e.g. inside the
                  LLM repair loop, where stable names help the model).
        bag: optional DomainKnowledgeBag used for input-derived status seeds
                  (presumably a project type — passed through untyped to
                  ``enforce_status_per_business_table`` and
                  ``_ensure_every_cfg_has_seeds``; archetype defaults are
                  used when None).

    Returns a stats dict written under `schema_json["_normaliser"]`.
    """
    # Pre-declare every counter so the stats dict always has the same shape,
    # regardless of which pipeline steps actually fire.
    stats: dict[str, Any] = {
        "schema_name_old": schema_json.get("schema_name"),
        "schema_name_new": None,
        "lifted_enums": 0,
        "auto_created_cfg": 0,
        "normalised_repairs": 0,
        "rewritten_columns": 0,
        "stripped_dangling_fks": 0,
        "fk_types_repaired": 0,
        "fk_columns_repaired": 0,
        "backfilled_seeds": 0,
        "duplicate_tables_removed": 0,
        "missing_fks_injected": 0,
        "status_cfg_enforced": 0,
        "seeds_default_filled": 0,
        "saas_columns_injected": 0,
        "history_tables_created": 0,
        "indexes_emitted": 0,
        "rules": {},
    }

    # 0. Regenerate schema_name (FIRST — so all subsequent FK rewrites use
    #    the new prefix). Skipped inside the LLM repair loop.
    if regenerate_name:
        new_sn = regenerate_schema_name(schema_json)
        if new_sn:
            stats["schema_name_new"] = new_sn

    schema_name = schema_json.get("schema_name") or "schema"
    stats["schema_name"] = schema_name

    # 0.5. dedupe duplicate tables (LLM occasionally emits the same name twice)
    stats["duplicate_tables_removed"] = dedupe_tables(schema_json)

    # 1. enums → cfg
    stats["lifted_enums"] = lift_enum_types(schema_json)

    # 2. auto-create missing cfg targets
    stats["auto_created_cfg"] = create_missing_cfg_tables(schema_json)

    # 3. normalise every cfg table
    cfg_tables = _existing_cfg_tables(schema_json)
    # Build table_name → submenu_id so each cfg table gets a proper DEFAULT.
    cfg_submenu_map: dict[str, int] = {}
    for _menu in schema_json.get("menus", []):
        for _sub in _menu.get("submenus") or []:
            _sid = _sub.get("submenu_id")
            if _sid is None:
                continue
            for _tbl in _sub.get("tables") or []:
                _tname = _tbl.get("table_name", "")
                if _tname.startswith("cfg_"):
                    cfg_submenu_map[_tname] = _sid
    repairs = 0
    for tbl_name, tbl in cfg_tables.items():
        repairs += normalise_cfg_table(tbl, schema_name, submenu_id=cfg_submenu_map.get(tbl_name))
    stats["normalised_repairs"] = repairs

    # 4. rewrite business→cfg refs
    rewritten = 0
    for tbl in _all_tables(schema_json):
        if tbl.get("table_name", "").startswith("cfg_"):
            continue
        rewritten += rewrite_business_fks_to_cfg(tbl, cfg_tables, schema_name)
    stats["rewritten_columns"] = rewritten

    # 5. strip dangling FKs
    stats["stripped_dangling_fks"] = strip_dangling_fks(schema_json)

    # 5.1. coerce UUID/BIGINT FK columns to INT (all PKs are SERIAL/integer)
    stats["fk_types_repaired"] = repair_fk_types(schema_json)

    # 5.2. fix FK references naming a non-existent column (e.g. bmi_result_id → id)
    stats["fk_columns_repaired"] = repair_fk_referenced_columns(schema_json)

    # 5.5. inject missing FK columns for orphan cfg tables (Fix A — cfg → biz)
    stats["missing_fks_injected"] = inject_missing_cfg_fks(schema_json)

    # 5.7. enforce status FK on every business table (Lever 3 — biz → cfg).
    # Uses DomainKnowledgeBag for input-derived seeds when available, falls
    # back to archetype defaults otherwise. Generic across domains.
    stats["status_cfg_enforced"] = enforce_status_per_business_table(schema_json, bag)

    # 5.8. final pass — every cfg_* table must have seeds. If neither the LLM,
    # CIR backfill, nor Lever 3 populated them, use archetype-derived defaults
    # based on the cfg's stem name.
    stats["seeds_default_filled"] = _ensure_every_cfg_has_seeds(schema_json, bag)

    # 6. CIR backfill + rules
    if cir:
        stats["backfilled_seeds"] = backfill_seeds_from_cir(schema_json, cir)
        stats["rules"] = surface_cir_rules(schema_json, cir)

    # 7-9. Richness booster (deterministic, generic, $0)
    if richness:
        stats["saas_columns_injected"] = inject_saas_columns(schema_json)
        stats["history_tables_created"] = auto_create_history_tables(schema_json)
        # Generate indexes AFTER history tables so they're indexed too
        index_stmts = generate_index_statements(schema_json)
        stats["indexes_emitted"] = len(index_stmts)
        # Cache for the seed/index renderer to consume — kept under a private key
        schema_json["_indexes"] = index_stmts

    schema_json["_normaliser"] = stats
    return stats


# Public API of the normaliser module. The orchestrator comes first; the
# remaining names are the individual pipeline steps in the order `normalise`
# invokes them, so they can be called (or tested) independently.
__all__ = [
    # entry point
    "normalise",
    # naming + structural repairs
    "regenerate_schema_name",
    "lift_enum_types",
    "create_missing_cfg_tables",
    "normalise_cfg_table",
    "rewrite_business_fks_to_cfg",
    "strip_dangling_fks",
    "repair_fk_types",
    "repair_fk_referenced_columns",
    "inject_missing_cfg_fks",
    "enforce_status_per_business_table",
    "dedupe_tables",
    # CIR integration
    "backfill_seeds_from_cir",
    "surface_cir_rules",
    # richness booster
    "inject_saas_columns",
    "auto_create_history_tables",
    "generate_index_statements",
]
