"""Ollama backend — local LLM inference via Ollama's OpenAI-compatible API.

Uses regular chat completions + JSON mode (no Structured Outputs).
The JSON schema is injected into the system prompt so the model knows
the exact shape to produce. Response is parsed with Pydantic.

Setup:
    brew install ollama
    ollama serve
    ollama pull qwen2.5-coder:7b
"""

import json
import re
import sys
import time

from openai import OpenAI, APITimeoutError, APIConnectionError

from config import (
    OLLAMA_BASE_URL, OLLAMA_MODEL, MAX_TOKENS, TEMPERATURE,
    TIMEOUT_SECONDS, MAX_RETRIES, JSON_SCHEMA_PATH,
)
from models import MantaraSchema


# Transient transport failures worth retrying with backoff; model-output
# problems (ValueError from bad JSON) are deliberately NOT in this tuple.
_RETRYABLE = (APITimeoutError, APIConnectionError, ConnectionError)


def _log(msg: str):
    print(f"  [ollama] {msg}", file=sys.stderr)


def _load_json_schema() -> str:
    """Read the mantara JSON schema file and render it as indented JSON text.

    The round-trip through json.load/json.dumps normalizes formatting so the
    schema injected into the prompt is compact but readable (indent=2).
    """
    with open(JSON_SCHEMA_PATH) as handle:
        return json.dumps(json.load(handle), indent=2)


# Appended verbatim to the caller's system prompt; the {json_schema}
# placeholder is filled in with the schema text via str.format() before use.
_SCHEMA_INSTRUCTION = """

---

OUTPUT FORMAT: You MUST respond with ONLY a valid JSON object — no markdown, no code fences, no explanation.
The JSON must conform exactly to the following JSON Schema:

{json_schema}

CRITICAL:
- Output raw JSON only. Do NOT wrap in ```json``` or any markdown.
- Every field marked "required" MUST be present.
- All pattern constraints (snake_case, schema.table(col) format) MUST be followed.
- ENUM type_name must match pattern: schema_name.xxx_enum
- ENUM values must be lowercase snake_case (no symbols, no spaces).
"""


def _extract_json(text: str) -> str:
    """Extract JSON from model response, stripping markdown fences if present."""
    # Strip markdown code fences
    text = text.strip()
    if text.startswith("```"):
        # Remove opening fence (```json or ```)
        text = re.sub(r'^```(?:json)?\s*\n?', '', text)
        # Remove closing fence
        text = re.sub(r'\n?```\s*$', '', text)
        text = text.strip()

    # Find the outermost JSON object
    start = text.find('{')
    if start == -1:
        return text

    # Match braces to find the complete object
    depth = 0
    for i in range(start, len(text)):
        if text[i] == '{':
            depth += 1
        elif text[i] == '}':
            depth -= 1
            if depth == 0:
                return text[start:i + 1]

    # Fallback: return from first { to end
    return text[start:]


class OllamaBackend:
    """Ollama backend using OpenAI-compatible API."""

    def __init__(self):
        # Load the schema once; it is reused verbatim on every generate() call.
        self._json_schema = _load_json_schema()

    def generate(self, system_prompt: str, user_input: str, model: str | None = None) -> MantaraSchema:
        """Generate a MantaraSchema via Ollama.

        Since Ollama doesn't support OpenAI Structured Outputs, we:
        1. Inject the JSON schema into the system prompt
        2. Use response_format={"type": "json_object"} for JSON mode
        3. Parse the response manually with Pydantic

        Raises:
            ValueError: empty response, unparseable JSON, or Pydantic
                validation failure. These are model-output problems and are
                deliberately not retried (they are not in ``_RETRYABLE``).
            RuntimeError: every attempt hit a retryable transport error;
                chained to the last underlying exception.
        """
        client = OpenAI(
            base_url=OLLAMA_BASE_URL,
            api_key="ollama",  # Ollama doesn't need a real key
            timeout=TIMEOUT_SECONDS,
        )
        use_model = model or OLLAMA_MODEL
        last_error = None

        # Augment system prompt with JSON schema instructions
        full_prompt = system_prompt + _SCHEMA_INSTRUCTION.format(json_schema=self._json_schema)

        for attempt in range(1 + MAX_RETRIES):
            try:
                start = time.time()

                completion = client.chat.completions.create(
                    model=use_model,
                    messages=[
                        {"role": "system", "content": full_prompt},
                        {"role": "user", "content": user_input},
                    ],
                    max_tokens=MAX_TOKENS,
                    temperature=TEMPERATURE,
                    response_format={"type": "json_object"},
                )

                elapsed = round(time.time() - start, 1)

                # Token accounting is best-effort: usage may be absent.
                usage = completion.usage
                if usage:
                    _log(
                        f"model={use_model}  "
                        f"prompt_tokens={usage.prompt_tokens}  "
                        f"completion_tokens={usage.completion_tokens}  "
                        f"total_tokens={usage.total_tokens}  "
                        f"latency={elapsed}s"
                    )

                raw = completion.choices[0].message.content
                if not raw:
                    raise ValueError("Model returned empty response")

                # Extract and parse JSON (strips fences / surrounding prose)
                json_str = _extract_json(raw)
                try:
                    data = json.loads(json_str)
                except json.JSONDecodeError as e:
                    raise ValueError(
                        f"Model returned invalid JSON: {e}\n"
                        f"Raw response (first 500 chars): {raw[:500]}"
                    ) from e

                # Remove $schema field if present (Pydantic model doesn't have it)
                data.pop("$schema", None)

                try:
                    schema = MantaraSchema.model_validate(data)
                except Exception as e:
                    raise ValueError(
                        f"JSON parsed but failed Pydantic validation: {e}\n"
                        f"Keys in response: {list(data.keys())}"
                    ) from e

                return schema

            except _RETRYABLE as e:
                last_error = e
                if attempt < MAX_RETRIES:
                    wait = 2 ** (attempt + 1)  # exponential backoff: 2s, 4s, 8s...
                    _log(
                        f"Transient error (attempt {attempt + 1}/{1 + MAX_RETRIES}): "
                        f"{type(e).__name__}: {e}  — retrying in {wait}s"
                    )
                    time.sleep(wait)
                else:
                    _log(f"All {1 + MAX_RETRIES} attempts failed.")

        # Chain the final failure to its cause so tracebacks show the
        # underlying transport error (original code dropped the chain).
        raise RuntimeError(
            f"Failed after {1 + MAX_RETRIES} attempts. "
            f"Last error: {type(last_error).__name__}: {last_error}"
        ) from last_error


class OllamaChat:
    """Lightweight wrapper for V2 pipeline steps that need plain chat completions.

    Mimics the OpenAI client interface used by generator_v2.py so the
    V2 pipeline can work with Ollama without major refactoring.
    """

    def __init__(self, base_url: str | None = None, timeout: int | None = None):
        self._client = OpenAI(
            base_url=base_url or OLLAMA_BASE_URL,
            api_key="ollama",  # Ollama doesn't need a real key
            timeout=timeout or TIMEOUT_SECONDS,
        )

    # The two properties below let callers write client.chat.completions.create(...)
    # exactly as they would with a real OpenAI client.
    @property
    def chat(self):
        return self

    @property
    def completions(self):
        return self

    def create(self, model: str, messages: list, max_tokens: int = 4000,
               temperature: float = 0.2, **kwargs) -> object:
        """Plain chat completion — used by V2 analyze/plan steps.

        Extra keyword arguments are forwarded to the underlying client
        (the previous version silently discarded them, breaking the
        drop-in-replacement contract). ``response_format`` defaults to
        JSON mode but can be overridden by the caller.
        """
        # Map OpenAI model names to Ollama models
        model = self._resolve_model(model)

        # Default to JSON mode unless the caller explicitly chose otherwise.
        kwargs.setdefault("response_format", {"type": "json_object"})

        return self._client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            **kwargs,
        )

    def _resolve_model(self, model: str) -> str:
        """Map OpenAI model names to Ollama equivalents; unknown names pass through."""
        mapping = {
            "gpt-4o": OLLAMA_MODEL,
            "gpt-4o-mini": OLLAMA_MODEL,  # Use same model for both on local
        }
        return mapping.get(model, model)
