# contexta_be/app/routers/analytics.py

"""
Analytics router - provides chatbot performance data for Starter+ users.

Available to: Starter, Pro, Enterprise plans only.
No LLM cost data is exposed to users.
"""
from fastapi import APIRouter, HTTPException, Depends
from app.database import get_supabase
from app.dependencies import get_current_user
from app.config import PLAN_LIMITS
from typing import List, Optional, Dict
from collections import defaultdict
from pydantic import BaseModel
from datetime import datetime, timedelta
import logging

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/analytics", tags=["Analytics"])


# ─── Response Models ───────────────────────────────────────────────────────────

class DailyConversations(BaseModel):
    """One point of the per-day conversation series returned by the analytics endpoints."""
    # Day key. The endpoints in this module fill it with the first 10 characters
    # of a conversation's `created_at` string (expected to be YYYY-MM-DD).
    date: str
    # Number of conversations whose `created_at` starts with that day key.
    count: int


class TopQuery(BaseModel):
    """A user query and how many times it was seen.

    Used both for "top queries" and for "unanswered"/knowledge-gap queries.
    """
    # User message content, truncated to 100 characters and stripped by the endpoints.
    query: str
    # Number of occurrences of exactly that (truncated) text.
    count: int


class ChatbotAnalyticsResponse(BaseModel):
    """Analytics for a single chatbot.

    Returned directly by GET /analytics/chatbot/{chatbot_id} and embedded in the
    overview response (one entry per chatbot).
    """
    chatbot_id: str
    chatbot_name: str
    total_conversations: int
    # Count of distinct non-empty `session_id` values among the chatbot's conversations.
    unique_sessions: int
    total_messages: int
    # total_messages / total_conversations rounded to 1 decimal; 0.0 when there are no conversations.
    average_messages_per_conversation: float
    # Taken as-is from the chatbot row's `average_rating` column; may be None.
    average_rating: Optional[float]
    # Number of `message_feedback` rows for the chatbot (positive + negative).
    total_ratings: int
    # Time-window counts are derived by comparing `created_at` strings against
    # boundaries computed from `datetime.utcnow()`.
    conversations_today: int
    conversations_this_week: int
    conversations_this_month: int
    # Per-day counts for conversations created within the last 30 days, sorted by date.
    daily_conversations: List[DailyConversations]
    top_queries: List[TopQuery]
    # Conversation count keyed by the conversation's `language` value.
    languages_used: Dict[str, int]
    # Hour of day (0-23, parsed from characters 11-12 of `created_at`) with the
    # most conversations; None when no hour could be parsed.
    peak_hour: Optional[int]  # 0-23
    # The two `unanswered_*` fields are only populated by the per-chatbot endpoint;
    # the overview endpoint leaves them at their defaults.
    unanswered_count: int = 0
    unanswered_queries: List[TopQuery] = []
    # Feedback rows whose `feedback` equals "positive" / every other feedback row.
    feedback_positive: int = 0
    feedback_negative: int = 0


class OverviewAnalyticsResponse(BaseModel):
    """Account-wide analytics returned by GET /analytics/overview."""
    total_chatbots: int
    # Chatbots whose row has a truthy `is_published` value.
    published_chatbots: int
    total_conversations: int
    total_messages: int
    # Sum of the per-chatbot unique session counts (not de-duplicated across chatbots).
    unique_sessions: int
    conversations_this_month: int
    # Mean of the chatbots' truthy `average_rating` values, rounded to 1 decimal;
    # None when no chatbot has one.
    average_rating: Optional[float]
    # One entry per chatbot of the user's company.
    chatbots: List[ChatbotAnalyticsResponse]
    # Plan name as resolved from the user's active subscription ("free" when none).
    plan: str
    # `conversations_limit` from the plan's PLAN_LIMITS entry (0 when the key is absent).
    conversations_limit: int
    # Set to the same value as `conversations_this_month` by the overview endpoint.
    conversations_used: int


# ─── Helpers ───────────────────────────────────────────────────────────────────

def _get_user_plan(user_id: str) -> str:
    """Return the plan name of the user's active subscription.

    Looks up rows in the ``subscriptions`` table matching ``user_id`` with
    ``status == "active"`` and returns the ``plan`` of the first row.

    Args:
        user_id: Identifier of the user whose subscription is looked up.

    Returns:
        The ``plan`` value of the first active subscription row, or ``"free"``
        when the query returns no rows.
    """
    subscription_query = (
        get_supabase()
        .table("subscriptions")
        .select("plan")
        .eq("user_id", user_id)
        .eq("status", "active")
    )
    rows = subscription_query.execute().data
    if not rows:
        # No active subscription row: treat the user as being on the free plan.
        return "free"
    return rows[0]["plan"]


def _check_analytics_access(plan: str):
    """Guard that rejects plans without the analytics feature.

    The plan's entry in ``PLAN_LIMITS`` is consulted (unknown plan names fall
    back to the ``"free"`` entry); access is granted only when its
    ``"analytics"`` flag is truthy.

    Args:
        plan: Plan name, e.g. as returned by ``_get_user_plan``.

    Raises:
        HTTPException: 402 when the plan's config does not enable analytics.
    """
    limits = PLAN_LIMITS.get(plan, PLAN_LIMITS["free"])
    analytics_enabled = limits.get("analytics", False)
    if analytics_enabled:
        return

    raise HTTPException(
        status_code=402,
        detail="Analytics is available on Starter and Pro plans. Upgrade to access your chatbot analytics."
    )


# ─── Endpoints ─────────────────────────────────────────────────────────────────

def _summarize_chatbot_for_overview(
    chatbot: Dict,
    conv_data: List[Dict],
    msgs_by_conv: Dict[str, List[Dict]],
    chatbot_fb: List[Dict],
    now: datetime,
) -> ChatbotAnalyticsResponse:
    """Fold one chatbot's pre-fetched rows into a ChatbotAnalyticsResponse.

    Performs no database access; everything is derived from the arguments.

    Args:
        chatbot: Row from the ``chatbots`` table (``id``, ``name``, ``average_rating`` are read).
        conv_data: Conversation rows belonging to this chatbot.
        msgs_by_conv: Message rows grouped by ``conversation_id``.
        chatbot_fb: ``message_feedback`` rows belonging to this chatbot.
        now: Reference time used for the today/week/month/30-day windows.

    Returns:
        The per-chatbot analytics entry (``unanswered_*`` fields keep their defaults).
    """
    # Window boundaries are rendered once; `created_at` values are compared to
    # them as strings, exactly as the timestamps come back from the database.
    midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
    today_str = midnight.strftime("%Y-%m-%d")
    week_start_iso = (midnight - timedelta(days=now.weekday())).isoformat()
    month_start_iso = midnight.replace(day=1).isoformat()
    thirty_days_ago_iso = (now - timedelta(days=30)).isoformat()

    sessions = set()
    today_count = week_count = month_count = 0
    daily: Dict[str, int] = defaultdict(int)
    lang_counts: Dict[str, int] = defaultdict(int)
    hour_counts: Dict[int, int] = defaultdict(int)
    chatbot_msgs: List[Dict] = []

    for conv in conv_data:
        chatbot_msgs.extend(msgs_by_conv.get(conv["id"], ()))

        session_id = conv.get("session_id")
        if session_id:
            sessions.add(session_id)

        # A NULL `language` column arrives as an explicit None, which is not a
        # valid key for `languages_used: Dict[str, int]`; fall back to "en".
        lang_counts[conv.get("language") or "en"] += 1

        created_at = conv.get("created_at")
        if not created_at:
            continue

        day = created_at[:10]
        if day == today_str:
            today_count += 1
        if created_at >= week_start_iso:
            week_count += 1
        if created_at >= month_start_iso:
            month_count += 1
        if created_at >= thirty_days_ago_iso:
            daily[day] += 1

        # Characters 11-12 hold the hour in an ISO-style timestamp.
        if len(created_at) > 13:
            try:
                hour_counts[int(created_at[11:13])] += 1
            except (ValueError, IndexError):
                pass

    daily_list = [DailyConversations(date=d, count=n) for d, n in sorted(daily.items())]
    peak = max(hour_counts, key=hour_counts.get) if hour_counts else None

    # Top queries: user message text, truncated to 100 chars, most frequent first.
    query_counts: Dict[str, int] = defaultdict(int)
    for msg in chatbot_msgs:
        if msg.get("role") == "user":
            content = (msg.get("content") or "")[:100].strip()
            if content:
                query_counts[content] += 1
    ranked = sorted(query_counts.items(), key=lambda item: -item[1])[:5]
    top_queries = [TopQuery(query=q, count=n) for q, n in ranked]

    # Any feedback value other than "positive" is counted as negative.
    fb_pos = sum(1 for f in chatbot_fb if f.get("feedback") == "positive")
    fb_neg = len(chatbot_fb) - fb_pos

    conv_count = len(conv_data)
    msg_count = len(chatbot_msgs)
    avg_msgs = round(msg_count / conv_count, 1) if conv_count > 0 else 0.0

    return ChatbotAnalyticsResponse(
        chatbot_id=chatbot["id"],
        # `name` may be present but NULL; `chatbot_name` must be a string.
        chatbot_name=chatbot.get("name") or "Untitled",
        total_conversations=conv_count,
        unique_sessions=len(sessions),
        total_messages=msg_count,
        average_messages_per_conversation=avg_msgs,
        average_rating=chatbot.get("average_rating"),
        total_ratings=len(chatbot_fb),
        conversations_today=today_count,
        conversations_this_week=week_count,
        conversations_this_month=month_count,
        daily_conversations=daily_list,
        top_queries=top_queries,
        languages_used=dict(lang_counts),
        peak_hour=peak,
        feedback_positive=fb_pos,
        feedback_negative=fb_neg,
    )


@router.get("/overview", response_model=OverviewAnalyticsResponse)
async def get_analytics_overview(user=Depends(get_current_user)):
    """
    Get analytics overview across all chatbots for the current user.
    Requires Starter+ plan.

    Conversations, messages and feedback are fetched in batches for all of the
    company's chatbots and then aggregated in memory per chatbot.

    Raises:
        HTTPException: 402 when the user's plan has no analytics access,
            404 when no company is owned by the user.
    """
    plan = _get_user_plan(user.id)
    _check_analytics_access(plan)
    plan_config = PLAN_LIMITS.get(plan, PLAN_LIMITS["free"])
    conversations_limit = plan_config.get("conversations_limit", 0)

    supabase = get_supabase()

    # Get user's company
    company = supabase.table("companies").select("id").eq("owner_id", user.id).execute()
    if not company.data:
        raise HTTPException(status_code=404, detail="Company not found")
    company_id = company.data[0]["id"]

    # Get all chatbots
    chatbots = supabase.table("chatbots").select("*").eq("company_id", company_id).execute()
    chatbot_list = chatbots.data or []
    chatbot_ids = [c["id"] for c in chatbot_list]

    if not chatbot_ids:
        return OverviewAnalyticsResponse(
            total_chatbots=0,
            published_chatbots=0,
            total_conversations=0,
            total_messages=0,
            unique_sessions=0,
            conversations_this_month=0,
            average_rating=None,
            chatbots=[],
            plan=plan,
            conversations_limit=conversations_limit,
            conversations_used=0,
        )

    # ── Batch queries (fixes N+1) ────────────────────────────────────────────────
    # NOTE(review): none of these queries paginate; if the Supabase project
    # enforces a max-rows cap, the totals below are computed from truncated
    # result sets — confirm against the project configuration.
    now = datetime.utcnow()

    # Batch query 1: ALL conversations for all chatbots (single query)
    all_convos_resp = supabase.table("conversations") \
        .select("id, chatbot_id, session_id, language, created_at") \
        .in_("chatbot_id", chatbot_ids) \
        .execute()
    all_convos = all_convos_resp.data or []
    all_conv_ids = [c["id"] for c in all_convos]

    # Batch query 2: ALL messages, fetched in chunks of 500 conversation ids
    # to avoid URL length limits
    all_msgs: List[Dict] = []
    for start in range(0, len(all_conv_ids), 500):
        chunk = all_conv_ids[start:start + 500]
        msgs_resp = supabase.table("messages") \
            .select("id, conversation_id, role, content, created_at") \
            .in_("conversation_id", chunk) \
            .execute()
        all_msgs.extend(msgs_resp.data or [])

    # Batch query 3: ALL feedback for all chatbots (single query)
    fb_resp = supabase.table("message_feedback") \
        .select("chatbot_id, feedback") \
        .in_("chatbot_id", chatbot_ids) \
        .execute()
    all_feedback = fb_resp.data or []

    # Index data by owner id for O(1) lookups
    convos_by_chatbot: Dict[str, List[Dict]] = defaultdict(list)
    for c in all_convos:
        convos_by_chatbot[c["chatbot_id"]].append(c)

    msgs_by_conv: Dict[str, List[Dict]] = defaultdict(list)
    for m in all_msgs:
        msgs_by_conv[m["conversation_id"]].append(m)

    fb_by_chatbot: Dict[str, List[Dict]] = defaultdict(list)
    for f in all_feedback:
        fb_by_chatbot[f["chatbot_id"]].append(f)

    # ── Aggregate per chatbot ────────────────────────────────────────────────────
    chatbot_analytics = [
        _summarize_chatbot_for_overview(
            chatbot,
            convos_by_chatbot[chatbot["id"]],
            msgs_by_conv,
            fb_by_chatbot[chatbot["id"]],
            now,
        )
        for chatbot in chatbot_list
    ]

    # Overall average rating: mean of the chatbots' own averages; chatbots
    # without a (truthy) rating are left out.
    all_ratings = [c["average_rating"] for c in chatbot_list if c.get("average_rating")]
    avg_rating = round(sum(all_ratings) / len(all_ratings), 1) if all_ratings else None

    month_convos = sum(a.conversations_this_month for a in chatbot_analytics)

    return OverviewAnalyticsResponse(
        total_chatbots=len(chatbot_list),
        published_chatbots=sum(1 for c in chatbot_list if c.get("is_published")),
        total_conversations=sum(a.total_conversations for a in chatbot_analytics),
        total_messages=sum(a.total_messages for a in chatbot_analytics),
        # Per-chatbot session counts are summed, not de-duplicated across chatbots.
        unique_sessions=sum(a.unique_sessions for a in chatbot_analytics),
        conversations_this_month=month_convos,
        average_rating=avg_rating,
        chatbots=chatbot_analytics,
        plan=plan,
        conversations_limit=conversations_limit,
        conversations_used=month_convos,
    )


@router.get("/chatbot/{chatbot_id}", response_model=ChatbotAnalyticsResponse)
async def get_chatbot_analytics(chatbot_id: str, user=Depends(get_current_user)):
    """
    Get detailed analytics for a specific chatbot.
    Requires Starter+ plan and ownership of the chatbot.

    Args:
        chatbot_id: Id of the chatbot; it must belong to the company owned by the user.

    Raises:
        HTTPException: 402 when the user's plan has no analytics access,
            404 when the user owns no company or the chatbot is not found
            under that company.
    """
    plan = _get_user_plan(user.id)
    _check_analytics_access(plan)

    supabase = get_supabase()

    # Verify ownership
    company = supabase.table("companies").select("id").eq("owner_id", user.id).execute()
    if not company.data:
        raise HTTPException(status_code=404, detail="Company not found")

    chatbot = supabase.table("chatbots").select("*") \
        .eq("id", chatbot_id) \
        .eq("company_id", company.data[0]["id"]).execute()

    if not chatbot.data:
        raise HTTPException(status_code=404, detail="Chatbot not found")

    cb = chatbot.data[0]

    # Window boundaries are rendered once; `created_at` values are compared to
    # them as strings below.
    now = datetime.utcnow()
    midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
    today_str = midnight.strftime("%Y-%m-%d")
    week_start_iso = (midnight - timedelta(days=now.weekday())).isoformat()
    month_start_iso = midnight.replace(day=1).isoformat()
    thirty_days_ago_iso = (now - timedelta(days=30)).isoformat()

    # Conversations (`count` is the exact total; `data` holds the returned rows)
    convos = supabase.table("conversations").select("id, session_id, language, created_at", count="exact") \
        .eq("chatbot_id", chatbot_id).execute()
    conv_count = convos.count or 0
    conv_data = convos.data or []
    conv_ids = [c["id"] for c in conv_data]

    # Messages: counted in chunks of 500 conversation ids to avoid URL length
    # limits. With no conversations no query is issued at all (previously a
    # placeholder id of "" was sent to the database).
    msg_count = 0
    for start in range(0, len(conv_ids), 500):
        msgs = supabase.table("messages").select("id", count="exact") \
            .in_("conversation_id", conv_ids[start:start + 500]).execute()
        msg_count += msgs.count or 0

    # Single pass over the conversations for sessions, windows, languages, hours
    sessions = set()
    today_count = week_count = month_count = 0
    daily: Dict[str, int] = defaultdict(int)
    lang_counts: Dict[str, int] = defaultdict(int)
    hour_counts: Dict[int, int] = defaultdict(int)

    for conv in conv_data:
        session_id = conv.get("session_id")
        if session_id:
            sessions.add(session_id)

        # A NULL `language` column arrives as an explicit None, which is not a
        # valid key for `languages_used: Dict[str, int]`; fall back to "en".
        lang_counts[conv.get("language") or "en"] += 1

        created_at = conv.get("created_at")
        if not created_at:
            continue

        day = created_at[:10]
        if day == today_str:
            today_count += 1
        if created_at >= week_start_iso:
            week_count += 1
        if created_at >= month_start_iso:
            month_count += 1
        if created_at >= thirty_days_ago_iso:
            daily[day] += 1

        # Characters 11-12 hold the hour in an ISO-style timestamp.
        if len(created_at) > 13:
            try:
                hour_counts[int(created_at[11:13])] += 1
            except (ValueError, IndexError):
                pass

    daily_list = [DailyConversations(date=d, count=n) for d, n in sorted(daily.items())]
    peak = max(hour_counts, key=hour_counts.get) if hour_counts else None

    # Top queries: sampled from at most 200 user messages of the first 100
    # returned conversations.
    top_queries: List[TopQuery] = []
    if conv_data:
        sample_ids = conv_ids[:100]
        user_msgs = supabase.table("messages").select("content") \
            .in_("conversation_id", sample_ids) \
            .eq("role", "user") \
            .limit(200).execute()
        query_counts: Dict[str, int] = {}
        for m in (user_msgs.data or []):
            content = (m.get("content") or "")[:100].strip()
            if content:
                query_counts[content] = query_counts.get(content, 0) + 1
        top_sorted = sorted(query_counts.items(), key=lambda x: -x[1])[:10]
        top_queries = [TopQuery(query=q, count=n) for q, n in top_sorted]

    avg_msgs = round(msg_count / conv_count, 1) if conv_count > 0 else 0.0

    # Feedback counts (any value other than "positive" counts as negative)
    fb_pos = 0
    fb_neg = 0
    if conv_ids:
        feedback = supabase.table("message_feedback").select("feedback") \
            .eq("chatbot_id", chatbot_id).execute()
        for f in (feedback.data or []):
            if f["feedback"] == "positive":
                fb_pos += 1
            else:
                fb_neg += 1

    # Unanswered queries (low confidence). Best-effort: a failure here must not
    # fail the whole endpoint, but it is logged instead of silently dropped.
    unanswered_queries: List[TopQuery] = []
    unanswered_count = 0
    if conv_ids:
        try:
            # `created_at` has to be selected here: it is what locates the user
            # message that preceded each low-confidence reply.
            low_conf_msgs = supabase.table("messages") \
                .select("id, conversation_id, confidence_score, created_at") \
                .in_("conversation_id", conv_ids[:100]) \
                .eq("role", "assistant") \
                .lt("confidence_score", 0.2) \
                .limit(200).execute()
            low_conf_rows = low_conf_msgs.data or []
            unanswered_count = len(low_conf_rows)

            unanswered_q_counts: Dict[str, int] = {}
            for lm in low_conf_rows[:20]:  # limit work: one lookup per message
                prev_query = supabase.table("messages").select("content") \
                    .eq("conversation_id", lm["conversation_id"]) \
                    .eq("role", "user")
                # Without a timestamp the best available answer is the
                # conversation's latest user message.
                if lm.get("created_at"):
                    prev_query = prev_query.lt("created_at", lm["created_at"])
                prev_user = prev_query.order("created_at", desc=True).limit(1).execute()
                if prev_user.data:
                    q = (prev_user.data[0].get("content") or "")[:100].strip()
                    if q:
                        unanswered_q_counts[q] = unanswered_q_counts.get(q, 0) + 1
            top_unanswered = sorted(unanswered_q_counts.items(), key=lambda x: -x[1])[:5]
            unanswered_queries = [TopQuery(query=q, count=n) for q, n in top_unanswered]
        except Exception:
            # unanswered queries is optional
            logger.warning(
                "Failed to compute unanswered queries for chatbot %s",
                chatbot_id,
                exc_info=True,
            )

    return ChatbotAnalyticsResponse(
        chatbot_id=chatbot_id,
        # `name` may be present but NULL; `chatbot_name` must be a string.
        chatbot_name=cb.get("name") or "Untitled",
        total_conversations=conv_count,
        unique_sessions=len(sessions),
        total_messages=msg_count,
        average_messages_per_conversation=avg_msgs,
        average_rating=cb.get("average_rating"),
        total_ratings=fb_pos + fb_neg,
        conversations_today=today_count,
        conversations_this_week=week_count,
        conversations_this_month=month_count,
        daily_conversations=daily_list,
        top_queries=top_queries,
        languages_used=dict(lang_counts),
        peak_hour=peak,
        unanswered_count=unanswered_count,
        unanswered_queries=unanswered_queries,
        feedback_positive=fb_pos,
        feedback_negative=fb_neg,
    )


@router.get("/chatbot/{chatbot_id}/gaps", response_model=List[TopQuery])
async def get_knowledge_gaps(chatbot_id: str, user=Depends(get_current_user)):
    """Returns top queries where the bot had low confidence (knowledge gaps). Starter+ only.

    Low-confidence assistant messages (``confidence_score`` < 0.2) are looked
    up in the chatbot's first 100 returned conversations; for up to 30 of them
    the user message that preceded the reply is fetched and counted.

    Args:
        chatbot_id: Id of the chatbot; it must belong to the company owned by the user.

    Returns:
        Up to 10 ``TopQuery`` entries, most frequent first; an empty list when
        the chatbot has no conversations or no low-confidence replies.

    Raises:
        HTTPException: 402 when the user's plan has no analytics access,
            404 when the user owns no company or the chatbot is not found
            under that company.
    """
    plan = _get_user_plan(user.id)
    _check_analytics_access(plan)

    supabase = get_supabase()
    company = supabase.table("companies").select("id").eq("owner_id", user.id).execute()
    if not company.data:
        raise HTTPException(status_code=404, detail="Company not found")

    chatbot = supabase.table("chatbots").select("id") \
        .eq("id", chatbot_id).eq("company_id", company.data[0]["id"]).execute()
    if not chatbot.data:
        raise HTTPException(status_code=404, detail="Chatbot not found")

    # Find conversations
    convs = supabase.table("conversations").select("id").eq("chatbot_id", chatbot_id).execute()
    conv_ids = [c["id"] for c in (convs.data or [])]
    if not conv_ids:
        return []

    # Low confidence assistant messages
    low_conf = supabase.table("messages").select("id, conversation_id, created_at") \
        .in_("conversation_id", conv_ids[:100]) \
        .eq("role", "assistant") \
        .lt("confidence_score", 0.2) \
        .limit(100).execute()

    if not low_conf.data:
        return []

    q_counts: Dict[str, int] = {}
    for msg in low_conf.data[:30]:  # limit work: one lookup per message
        prev_query = supabase.table("messages").select("content") \
            .eq("conversation_id", msg["conversation_id"]) \
            .eq("role", "user")
        # Restrict to user messages sent before the low-confidence reply, so the
        # question that triggered it is found rather than whatever the user
        # typed last in the conversation. Without a timestamp, fall back to the
        # latest user message.
        if msg.get("created_at"):
            prev_query = prev_query.lt("created_at", msg["created_at"])
        prev = prev_query.order("created_at", desc=True).limit(1).execute()
        if prev.data:
            content = (prev.data[0].get("content") or "")[:100].strip()
            if content:
                q_counts[content] = q_counts.get(content, 0) + 1

    sorted_gaps = sorted(q_counts.items(), key=lambda x: -x[1])[:10]
    return [TopQuery(query=q, count=n) for q, n in sorted_gaps]