"""Mantara FSD Analyzer -- Extracts structured information from Functional
Specification Documents before they enter the schema generation pipeline.

The analyzer identifies:
  - Document structure (sections, headings, hierarchy)
  - Modules and features
  - Entity names and their attributes
  - Business rules and constraints
  - Enum candidates from lists/options

The output is an FSDAnalysis object whose pre-parsed context can be used to
enrich the LLM prompt (see build_fsd_context / enrich_user_input).
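
Typical usage (illustrative):

    analysis = analyze_fsd(document_text)
    if analysis.is_fsd:
        enriched = enrich_user_input(document_text, analysis)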
"""

import re
from dataclasses import dataclass, field


# ---------------------------------------------------------------------------
# Data classes for extracted information
# ---------------------------------------------------------------------------

@dataclass
class ExtractedEntity:
    name: str
    attributes: list[str] = field(default_factory=list)
    source_section: str = ""


@dataclass
class ExtractedRelationship:
    from_entity: str
    to_entity: str
    cardinality: str = ""  # e.g. "1:N", "M:N", "1:1"
    description: str = ""


@dataclass
class ExtractedBusinessRule:
    rule: str
    source_section: str = ""
    entities_involved: list[str] = field(default_factory=list)


@dataclass
class ExtractedEnum:
    name: str
    values: list[str] = field(default_factory=list)
    source_section: str = ""


@dataclass
class ExtractedSection:
    level: int  # 1 = top-level, 2 = subsection, etc.
    title: str
    content: str
    line_number: int = 0


@dataclass
class FSDAnalysis:
    """Complete analysis result from an FSD document."""
    is_fsd: bool = False
    confidence: float = 0.0  # 0.0 to 1.0
    sections: list[ExtractedSection] = field(default_factory=list)
    modules: list[str] = field(default_factory=list)
    features: list[str] = field(default_factory=list)
    entities: list[ExtractedEntity] = field(default_factory=list)
    relationships: list[ExtractedRelationship] = field(default_factory=list)
    business_rules: list[ExtractedBusinessRule] = field(default_factory=list)
    enum_candidates: list[ExtractedEnum] = field(default_factory=list)
    summary_text: str = ""

    @property
    def entity_count(self) -> int:
        return len(self.entities)

    @property
    def module_count(self) -> int:
        return len(self.modules)

    @property
    def has_structure(self) -> bool:
        return len(self.sections) > 2


# ---------------------------------------------------------------------------
# Detection: Is this text likely an FSD?
# ---------------------------------------------------------------------------

# Patterns that strongly suggest FSD content
_FSD_INDICATORS = [
    r"(?i)\bfunctional\s+specification\b",
    r"(?i)\bfsd\b",
    r"(?i)\brequirement[s]?\b",
    r"(?i)\buse\s+case\b",
    r"(?i)\bmodule[s]?\b",
    r"(?i)\bfeature[s]?\b",
    r"(?i)\bentit(?:y|ies)\b",
    r"(?i)\bworkflow[s]?\b",
    r"(?i)\bbusiness\s+rule[s]?\b",
    r"(?i)\bdata\s+model\b",
    r"(?i)\buser\s+stor(?:y|ies)\b",
    r"(?i)\bsystem\s+(?:shall|should|must|will)\b",
    r"(?i)\bscope\b",
    r"(?i)\bstakeholder[s]?\b",
    r"(?i)\bacceptance\s+criteria\b",
]

# Section heading patterns (markdown-style and numbered)
_HEADING_PATTERNS = [
    r"^#{1,4}\s+.+",                          # Markdown: ## Heading
    r"^\d+(?:\.\d+)*\s+[A-Z]",                # Numbered: 1.2 Something
    r"^[A-Z][A-Za-z\s&/]+:$",                 # Title Case followed by colon
    r"^(?:Section|Chapter|Module|Part)\s+\d+", # Explicit section markers
]
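
# Lines each pattern above matches, in order (illustrative):
#   "## Data Model"         (markdown)
#   "2.3 Stock Transfers"   (numbered)
#   "Business Rules:"       (title case + colon)
#   "Module 4 Reporting"    (explicit marker)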


def _detect_fsd(text: str) -> tuple[bool, float]:
    """Return (is_fsd, confidence) based on heuristics.

    A document is classified as an FSD when it scores highly enough on:
    - Length (at least 500 characters, with bonuses above 2000/5000)
    - FSD indicator keyword matches
    - Section headings / structure
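
    For example, a trivially short input is rejected outright:

        >>> _detect_fsd("too short to be an FSD")
        (False, 0.0)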
    """
    if len(text) < 500:
        return False, 0.0

    # Count indicator matches
    indicator_hits = 0
    for pattern in _FSD_INDICATORS:
        if re.search(pattern, text):
            indicator_hits += 1

    # Count heading matches
    heading_hits = 0
    for line in text.split("\n"):
        stripped = line.strip()
        for hp in _HEADING_PATTERNS:
            if re.match(hp, stripped):
                heading_hits += 1
                break

    # Scoring
    score = 0.0
    if len(text) > 2000:
        score += 0.2
    if len(text) > 5000:
        score += 0.1
    score += min(indicator_hits * 0.08, 0.4)
    score += min(heading_hits * 0.05, 0.3)

    score = min(score, 1.0)
    return score >= 0.3, score


# ---------------------------------------------------------------------------
# Section extraction
# ---------------------------------------------------------------------------

def _extract_sections(text: str) -> list[ExtractedSection]:
    """Split the document into hierarchical sections based on headings."""
    sections = []
    lines = text.split("\n")
    current_section = None
    content_lines = []

    for i, line in enumerate(lines):
        stripped = line.strip()

        # Detect heading
        heading_level = 0
        heading_title = ""

        # Markdown headings
        md_match = re.match(r"^(#{1,4})\s+(.+)", stripped)
        if md_match:
            heading_level = len(md_match.group(1))
            heading_title = md_match.group(2).strip()

        # Numbered headings; dot depth sets the level ("2 Title" -> 1, "2.1 Title" -> 2)
        if not heading_level:
            num_match = re.match(r"^(\d+(?:\.\d+)*)\s+([A-Z].{2,})", stripped)
            if num_match:
                parts = num_match.group(1).split(".")
                heading_level = len(parts)
                heading_title = num_match.group(2).strip()

        if heading_level and heading_title:
            # Save the previous section; lines accumulated before the first
            # heading are preamble and are deliberately discarded
            if current_section is not None:
                current_section.content = "\n".join(content_lines).strip()
                sections.append(current_section)
            content_lines = []

            current_section = ExtractedSection(
                level=heading_level,
                title=heading_title,
                content="",
                line_number=i + 1,
            )
        else:
            content_lines.append(line)

    # Save last section
    if current_section is not None:
        current_section.content = "\n".join(content_lines).strip()
        sections.append(current_section)

    return sections


# ---------------------------------------------------------------------------
# Entity extraction
# ---------------------------------------------------------------------------

# Common patterns for entity mentions in FSDs
_ENTITY_PATTERNS = [
    # "manage/track/store X, Y, and Z"
    r"(?i)(?:manage|track|store|maintain|handle|process|record)[s]?\s+(.+?)(?:\.|$)",
    # "X table", "X entity", "X record"
    r"(?i)(\w+)\s+(?:table|entity|record|master|register|registry)",
    # Entities listed after colons
    r"(?i)entit(?:y|ies)\s*:\s*(.+?)(?:\.|$)",
    # Semicolon-separated entity lists (common in CSV FSDs)
    r"(?i)(?:^|\n)\s*(?:entities?\s*:\s*)?(\w+(?:\s*;\s*\w+)+)",
]
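
# Prose these patterns are meant to catch (illustrative):
#   "The system shall manage customers, vendors, and purchase orders."
#   "Each movement is logged in the stock_movement table."
#   "Entities: product; warehouse; stock_movement"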

# Words that look like entity names but are not
_ENTITY_STOPWORDS = {
    "the", "a", "an", "and", "or", "for", "with", "from", "this", "that",
    "all", "each", "every", "any", "some", "many", "multiple", "various",
    "system", "application", "platform", "software", "solution", "module",
    "feature", "function", "process", "data", "information", "detail",
    "management", "description", "overview", "scope", "purpose",
}

# Attribute/field patterns (reference shapes only; _find_entities_in_section
# below matches a tightened inline variant of these)
_ATTRIBUTE_PATTERNS = [
    r"(?i)(?:fields?|columns?|attributes?|properties)\s*:\s*(.+?)(?:\.|$)",
    r"(?i)(?:includes?|contains?|has|with)\s+(.+?)(?:\.|$)",
]


def _extract_entities(text: str, sections: list[ExtractedSection]) -> list[ExtractedEntity]:
    """Extract entity names and their attributes from the text."""
    entities_dict: dict[str, ExtractedEntity] = {}

    # Strategy 1: Look for semicolon-separated entity lists (CSV FSD style)
    for match in re.finditer(r"(?i)(?:entities?\s*[;:]\s*)([a-z_]+(?:\s*;\s*[a-z_]+)+)", text):
        names = [n.strip() for n in match.group(1).split(";") if n.strip()]
        for name in names:
            clean = _clean_entity_name(name)
            if clean and clean not in entities_dict:
                entities_dict[clean] = ExtractedEntity(name=clean)

    # Strategy 2: Pattern matching on prose
    for pattern in _ENTITY_PATTERNS:
        for match in re.finditer(pattern, text):
            candidates = match.group(1)
            # Split on commas, semicolons, "and"
            for raw in re.split(r"[,;]|\band\b", candidates):
                clean = _clean_entity_name(raw.strip())
                if clean and clean not in entities_dict:
                    entities_dict[clean] = ExtractedEntity(name=clean)

    # Strategy 3: Section-aware extraction (look for entity mentions near headings)
    for section in sections:
        section_entities = _find_entities_in_section(section)
        for ent in section_entities:
            if ent.name not in entities_dict:
                entities_dict[ent.name] = ent
            else:
                # Merge attributes
                existing = entities_dict[ent.name]
                for attr in ent.attributes:
                    if attr not in existing.attributes:
                        existing.attributes.append(attr)
                if not existing.source_section:
                    existing.source_section = ent.source_section

    return list(entities_dict.values())


def _clean_entity_name(raw: str) -> str:
    """Clean and normalize a raw entity name candidate."""
    # Remove common prefixes/suffixes
    cleaned = re.sub(r"(?i)^(?:the|a|an)\s+", "", raw.strip())
    cleaned = re.sub(r"[^a-zA-Z0-9_\s]", "", cleaned).strip()
    # Convert to snake_case
    cleaned = re.sub(r"\s+", "_", cleaned).lower()
    # Remove trailing _table, _entity, etc.
    cleaned = re.sub(r"_(?:table|entity|record|master)$", "", cleaned)

    if not cleaned or cleaned in _ENTITY_STOPWORDS or len(cleaned) < 2:
        return ""
    return cleaned
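
# Illustrative transformations (each traceable through the steps above):
#   _clean_entity_name("The Customer table") -> "customer"
#   _clean_entity_name("Purchase Orders")    -> "purchase_orders"
#   _clean_entity_name("system")             -> ""  (stopword)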


def _find_entities_in_section(section: ExtractedSection) -> list[ExtractedEntity]:
    """Extract entities specifically from a section's content."""
    entities = []
    content = section.content

    # Look for attribute lists (bulleted or comma-separated near entity names)
    # Pattern: "Entity has: field1, field2, field3"
    for match in re.finditer(
        r"(?i)(\w+)\s+(?:has|contains?|includes?|with)\s*:\s*(.+?)(?:\.|$)",
        content, re.MULTILINE
    ):
        name = _clean_entity_name(match.group(1))
        if name:
            attrs = [a.strip() for a in re.split(r"[,;]", match.group(2)) if a.strip()]
            entities.append(ExtractedEntity(
                name=name,
                attributes=attrs[:20],  # cap at 20
                source_section=section.title,
            ))

    # Look for "table_name (attr1, attr2, ...)" pattern
    for match in re.finditer(r"(\w+)\s*\(([^)]+)\)", content):
        name = _clean_entity_name(match.group(1))
        if name:
            attrs = [a.strip() for a in match.group(2).split(",") if a.strip()]
            entities.append(ExtractedEntity(
                name=name,
                attributes=attrs[:20],
                source_section=section.title,
            ))

    return entities
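
# The two shapes recognised above, illustrated:
#   "Invoice has: number, date, total_amount"  -> invoice [number, date, total_amount]
#   "warehouse (name, location, capacity)"     -> warehouse [name, location, capacity]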


# ---------------------------------------------------------------------------
# Relationship extraction
# ---------------------------------------------------------------------------

# Reference shapes only; _extract_relationships below applies tightened
# inline variants of these patterns.
_RELATIONSHIP_PATTERNS = [
    # "X → Y" or "X -> Y"
    r"(\w+)\s*(?:→|->)\s*(\w+)",
    # "X has many Y" / "X belongs to Y"
    r"(?i)(\w+)\s+(?:has\s+many|has\s+multiple)\s+(\w+)",
    r"(?i)(\w+)\s+(?:belongs?\s+to)\s+(\w+)",
    # "X (1:N) Y" / "X (M:N) Y"
    r"(\w+)\s*\((\d+:[NM\d]+)\)\s*(\w+)",
    # NOTE: semicolon-joined chains like "X → Y; A → B" need no separate
    # pattern -- finditer on the arrow pattern above already matches each pair.
]


def _extract_relationships(text: str) -> list[ExtractedRelationship]:
    """Extract entity relationships from the text."""
    relationships = []
    seen = set()

    # Arrow notation: entity_a → entity_b
    for match in re.finditer(r"([a-z_]+)\s*(?:→|->)\s*([a-z_]+)", text, re.IGNORECASE):
        from_e = match.group(1).strip().lower()
        to_e = match.group(2).strip().lower()
        key = (from_e, to_e)
        if key not in seen and from_e != to_e:
            seen.add(key)
            relationships.append(ExtractedRelationship(
                from_entity=from_e, to_entity=to_e
            ))

    # Cardinality notation: (1:N), (M:N)
    for match in re.finditer(
        r"(\w+)\s*\(([1MN]+:[1MN]+)\)\s*(\w+)", text, re.IGNORECASE
    ):
        from_e = _clean_entity_name(match.group(1))
        card = match.group(2).upper()
        to_e = _clean_entity_name(match.group(3))
        if from_e and to_e:
            key = (from_e, to_e)
            if key not in seen:
                seen.add(key)
                relationships.append(ExtractedRelationship(
                    from_entity=from_e, to_entity=to_e, cardinality=card
                ))

    # Prose: "X has many Y"
    for match in re.finditer(
        r"(?i)(\w+)\s+has\s+(?:many|multiple)\s+(\w+)", text
    ):
        from_e = _clean_entity_name(match.group(1))
        to_e = _clean_entity_name(match.group(2))
        if from_e and to_e:
            key = (from_e, to_e)
            if key not in seen:
                seen.add(key)
                relationships.append(ExtractedRelationship(
                    from_entity=from_e, to_entity=to_e, cardinality="1:N"
                ))

    return relationships
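
# Notations recognised above, illustrated:
#   "sales_order -> customer"         => plain relationship
#   "warehouse (1:N) stock_movement"  => cardinality "1:N"
#   "invoice has many invoice_lines"  => inferred cardinality "1:N"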


# ---------------------------------------------------------------------------
# Business rule extraction
# ---------------------------------------------------------------------------

_RULE_PATTERNS = [
    r"(?i)(?:rule|constraint|requirement|validation)\s*:\s*(.+?)(?:\.|$)",
    r"(?i)(?:must|shall|should|cannot|must\s+not)\s+(.+?)(?:\.|$)",
    r"(?i)(?:only|at\s+least|at\s+most|maximum|minimum)\s+(.+?)(?:\.|$)",
    r"(?i)(?:if|when|unless)\s+(.+?),\s*(?:then)\s+(.+?)(?:\.|$)",
]
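
# Sentences these patterns are meant to catch (illustrative):
#   "Validation: order total must equal the sum of its line amounts."
#   "Users cannot delete a posted invoice."
#   "If stock falls below the reorder level, then a purchase request is raised."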


def _extract_business_rules(text: str, sections: list[ExtractedSection]) -> list[ExtractedBusinessRule]:
    """Extract business rules and constraints from the text."""
    rules = []
    seen = set()

    # Look in sections with rule-related titles first
    for section in sections:
        title_lower = section.title.lower()
        if any(kw in title_lower for kw in ["rule", "constraint", "validation", "requirement", "logic"]):
            # Every line/sentence in a rules section is likely a rule
            for line in section.content.split("\n"):
                line = line.strip()
                line = re.sub(r"^[-*\d.)\s]+", "", line).strip()
                if len(line) > 15:
                    rule_key = line[:50].lower()
                    if rule_key not in seen:
                        seen.add(rule_key)
                        rules.append(ExtractedBusinessRule(
                            rule=line,
                            source_section=section.title,
                        ))

    # Pattern-based extraction from full text
    for pattern in _RULE_PATTERNS:
        for match in re.finditer(pattern, text, re.MULTILINE):
            rule_text = match.group(0).strip()
            if len(rule_text) > 15:
                rule_key = rule_text[:50].lower()
                if rule_key not in seen:
                    seen.add(rule_key)
                    rules.append(ExtractedBusinessRule(rule=rule_text))

    return rules[:50]  # Cap at 50 rules


# ---------------------------------------------------------------------------
# Enum candidate extraction
# ---------------------------------------------------------------------------

# Reference shapes only; _extract_enum_candidates below applies extended
# inline variants (with additional suffixes) of these patterns.
_ENUM_PATTERNS = [
    # Explicit enum definitions: "status_enum: value1 | value2 | value3"
    r"(\w+_enum)\s*:\s*([a-z_]+(?:\s*\|\s*[a-z_]+)+)",
    # "type: A, B, C" or "status: active, inactive, archived"
    r"(?i)(\w+(?:_type|_status|_category|_method|_role|_level|_priority))\s*:\s*([a-z_]+(?:\s*[,|]\s*[a-z_]+)+)",
    # Parenthetical options: "(active, inactive, suspended)"
    r"(?i)(\w+)\s*\(([a-z_]+(?:\s*,\s*[a-z_]+){2,})\)",
    # "X can be: A, B, or C"
    r"(?i)(\w+)\s+can\s+be\s*:\s*(.+?)(?:\.|$)",
]
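
# Strings these shapes are meant to catch (illustrative):
#   "payment_status_enum: pending | paid | refunded"
#   "order_status: draft, confirmed, shipped"
#   "Priority can be: low, medium, or high."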


def _extract_enum_candidates(text: str) -> list[ExtractedEnum]:
    """Extract potential ENUM types from the text."""
    enums: dict[str, ExtractedEnum] = {}

    # Explicit enum definitions (highest confidence)
    for match in re.finditer(
        r"(\w+_enum)\s*:\s*([a-z_]+(?:\s*\|\s*[a-z_]+)+)", text, re.IGNORECASE
    ):
        name = match.group(1).lower()
        values = [v.strip().lower() for v in re.split(r"[|,]", match.group(2)) if v.strip()]
        if name not in enums:
            enums[name] = ExtractedEnum(name=name, values=values)

    # Status/type/role columns with listed values
    for match in re.finditer(
        r"(?i)(\w+(?:_type|_status|_category|_method|_role|_level|_priority|_channel|_format|_condition|_strategy))\s*:\s*([a-z_]+(?:\s*[,|]\s*[a-z_]+)+)",
        text
    ):
        name = match.group(1).lower()
        if not name.endswith("_enum"):
            name = name + "_enum"
        values = [v.strip().lower() for v in re.split(r"[|,]", match.group(2)) if v.strip()]
        if name not in enums and len(values) >= 2:
            enums[name] = ExtractedEnum(name=name, values=values)

    # "can be" pattern
    for match in re.finditer(
        r"(?i)(\w+)\s+can\s+be\s*:\s*(.+?)(?:\.|$)", text
    ):
        field_name = match.group(1).lower()
        values_raw = match.group(2)
        values = [
            re.sub(r"[^a-z0-9_]", "", v.strip().lower().replace(" ", "_"))
            for v in re.split(r"[,;]|\bor\b|\band\b", values_raw)
            if v.strip()
        ]
        values = [v for v in values if v and len(v) > 1]
        if len(values) >= 2:
            enum_name = f"{field_name}_enum"
            if enum_name not in enums:
                enums[enum_name] = ExtractedEnum(name=enum_name, values=values)

    return list(enums.values())


# ---------------------------------------------------------------------------
# Module/Feature extraction
# ---------------------------------------------------------------------------

def _extract_modules_and_features(
    sections: list[ExtractedSection],
) -> tuple[list[str], list[str]]:
    """Extract module and feature names from section hierarchy."""
    modules = []
    features = []

    for section in sections:
        title = section.title.strip()
        if not title:
            continue

        # Level 1 sections with domain-sounding names are modules
        if section.level == 1:
            # Filter out non-module headings
            lower = title.lower()
            if any(skip in lower for skip in [
                "overview", "introduction", "appendix", "glossary",
                "revision", "change log", "table of contents", "toc",
                "references", "acronyms",
            ]):
                continue
            modules.append(title)

        # Level 2 sections are features
        elif section.level == 2:
            lower = title.lower()
            if not any(skip in lower for skip in [
                "overview", "introduction", "scope", "purpose",
            ]):
                features.append(title)

    # "Module: X" mentions in prose are not scanned here: well-structured
    # docs expose modules as headings, which the section pass above captures.

    return modules, features


# ---------------------------------------------------------------------------
# Main analysis function
# ---------------------------------------------------------------------------

def analyze_fsd(text: str) -> FSDAnalysis:
    """Analyze text and extract FSD-relevant information.

    This is the main entry point. It:
    1. Detects whether the text is likely an FSD
    2. Extracts document structure (sections, headings)
    3. Identifies modules and features
    4. Extracts entity names and attributes
    5. Identifies relationships between entities
    6. Extracts business rules and constraints
    7. Identifies ENUM candidates from listed options

    Returns:
        FSDAnalysis with all extracted information.
    """
    is_fsd, confidence = _detect_fsd(text)

    # Bail out only for short inputs that also failed detection; for anything
    # larger, extract what we can even when confidence is low
    if len(text) < 200 and not is_fsd:
        return FSDAnalysis(is_fsd=False, confidence=confidence)

    sections = _extract_sections(text)
    modules, features = _extract_modules_and_features(sections)
    entities = _extract_entities(text, sections)
    relationships = _extract_relationships(text)
    business_rules = _extract_business_rules(text, sections)
    enum_candidates = _extract_enum_candidates(text)

    # Build summary
    summary_parts = []
    if modules:
        summary_parts.append(f"{len(modules)} modules: {', '.join(modules[:8])}")
    if features:
        summary_parts.append(f"{len(features)} features identified")
    if entities:
        entity_names = [e.name for e in entities[:10]]
        summary_parts.append(f"{len(entities)} entities: {', '.join(entity_names)}")
    if relationships:
        summary_parts.append(f"{len(relationships)} relationships")
    if enum_candidates:
        summary_parts.append(f"{len(enum_candidates)} ENUM candidates")
    if business_rules:
        summary_parts.append(f"{len(business_rules)} business rules")

    summary_text = " | ".join(summary_parts) if summary_parts else "No structured information extracted."

    # Adjust confidence upward if we found a lot of structure
    if entities and len(entities) >= 3:
        confidence = min(confidence + 0.15, 1.0)
    if enum_candidates and len(enum_candidates) >= 2:
        confidence = min(confidence + 0.1, 1.0)

    return FSDAnalysis(
        is_fsd=is_fsd or confidence >= 0.3,
        confidence=confidence,
        sections=sections,
        modules=modules,
        features=features,
        entities=entities,
        relationships=relationships,
        business_rules=business_rules,
        enum_candidates=enum_candidates,
        summary_text=summary_text,
    )


# ---------------------------------------------------------------------------
# Prompt enrichment: convert FSD analysis into LLM context
# ---------------------------------------------------------------------------

_FSD_CONTEXT_HEADER = """
=== FSD ANALYSIS (Pre-Parsed Context) ===

This input is a Functional Specification Document. Pay special attention to:
- Module boundaries (these become menus)
- Feature lists (these become submenus)
- Entity descriptions (these become tables)
- Attribute lists (these become columns)
- Enumerated options (these become ENUMs)
- Business rules (these become CHECK constraints and validation logic)

"""


def build_fsd_context(analysis: FSDAnalysis) -> str:
    """Convert FSD analysis into structured context to prepend to the LLM prompt.

    This gives the LLM a pre-parsed summary so it does not have to discover
    the document structure itself -- it can focus on schema design decisions.
    """
    if not analysis.is_fsd:
        return ""

    parts = [_FSD_CONTEXT_HEADER]

    # Modules -> Menus mapping hint
    if analysis.modules:
        parts.append("DETECTED MODULES (map these to menus):")
        for i, mod in enumerate(analysis.modules, 1):
            parts.append(f"  {i}. {mod}")
        parts.append("")

    # Features -> Submenus mapping hint
    if analysis.features:
        parts.append("DETECTED FEATURES (map these to submenus):")
        for i, feat in enumerate(analysis.features, 1):
            parts.append(f"  {i}. {feat}")
        parts.append("")

    # Entities -> Tables mapping hint
    if analysis.entities:
        parts.append("DETECTED ENTITIES (each must have a corresponding table):")
        for ent in analysis.entities:
            if ent.attributes:
                attrs = ", ".join(ent.attributes[:10])
                parts.append(f"  - {ent.name} (attributes: {attrs})")
            else:
                parts.append(f"  - {ent.name}")
        parts.append("")

    # Relationships -> FK hints
    if analysis.relationships:
        parts.append("DETECTED RELATIONSHIPS (implement as foreign keys):")
        for rel in analysis.relationships:
            card = f" ({rel.cardinality})" if rel.cardinality else ""
            parts.append(f"  - {rel.from_entity} -> {rel.to_entity}{card}")
        parts.append("")

    # Enum candidates -> ENUM types
    if analysis.enum_candidates:
        parts.append("DETECTED ENUM CANDIDATES (create these as ENUM types):")
        for enum in analysis.enum_candidates:
            vals = " | ".join(enum.values[:15])
            parts.append(f"  - {enum.name}: {vals}")
        parts.append("")

    # Business rules -> constraints
    if analysis.business_rules:
        parts.append("DETECTED BUSINESS RULES (implement as CHECK constraints or application logic):")
        for rule in analysis.business_rules[:20]:
            parts.append(f"  - {rule.rule}")
        parts.append("")

    # Section markers for long documents
    if analysis.sections and len(analysis.sections) > 3:
        parts.append("DOCUMENT STRUCTURE:")
        for section in analysis.sections:
            indent = "  " * section.level
            parts.append(f"{indent}[Section {section.level}] {section.title}")
        parts.append("")

    parts.append("=== END FSD ANALYSIS ===\n")

    return "\n".join(parts)


def enrich_user_input(user_input: str, analysis: FSDAnalysis) -> str:
    """Prepend FSD context to the user input for the LLM pipeline.

    If the analysis indicates the input is an FSD, the extracted context is
    prepended so the LLM has a structured summary to work from.
    """
    fsd_context = build_fsd_context(analysis)
    if not fsd_context:
        return user_input

    return fsd_context + user_input
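

if __name__ == "__main__":
    # Minimal smoke test on a synthetic FSD snippet. The sample below is
    # illustrative only; real documents are longer and far messier.
    _SAMPLE = """\
# Inventory Management FSD

## 1. Scope
The system shall track products, warehouses, and stock movements across all
regional distribution centers. Requirements use "shall" for mandatory
behavior. Each module maps to a top-level menu in the generated application.

## 2. Business Rules
- Stock quantity must not be negative.
- A transfer cannot exceed the available quantity at the source warehouse.

## 3. Data Model
product has: name, sku, unit_price, stock_status
warehouse has: name, location, capacity
stock_status: in_stock, low_stock, out_of_stock
product -> warehouse
"""

    result = analyze_fsd(_SAMPLE)
    print(f"is_fsd={result.is_fsd} confidence={result.confidence:.2f}")
    print(result.summary_text)
    print(build_fsd_context(result))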
