# contexta_be/app/services/rag.py

from app.services.embeddings import embedding_service
from app.services.vector_store import vector_store
from app.services.llm import llm_service
from app.models import SourceDocument
from typing import List, Dict, Any, Optional, Tuple
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# System prompt template for the RAG chat completion. It is filled with
# str.format using three fields: {company_name}, {custom_instructions} and
# {context}. Because str.format parses this template, any literal brace added
# to the text below must be doubled ("{{" / "}}").
RAG_SYSTEM_PROMPT = """You are a helpful AI assistant for {company_name}.
Your role is to answer questions based on the provided context from company documents.

IMPORTANT RULES:
1. Answer based on the provided context below
2. If the context does not contain enough information, say so, but also try to be helpful with what IS available
3. Be concise and helpful
4. Always maintain a professional, friendly tone
5. If asked about topics completely outside the context, politely redirect to relevant topics

{custom_instructions}

Context from knowledge base:
{context}
"""


class RAGEngine:
    """Retrieval-augmented generation pipeline: embed → retrieve → generate.

    The engine delegates to three module-level services (embedding, vector
    store, LLM). Failures in any of the three stages are logged and turned
    into a user-facing fallback response instead of propagating to the caller.
    """

    # Maximum number of chunks requested from the vector store per query.
    RETRIEVAL_LIMIT = 5
    # Minimum similarity score for a chunk to be returned. Kept low on purpose
    # (previously 0.3): with cosine similarity, 0.3 can be too aggressive for
    # many document types and query patterns and may filter out all results.
    SCORE_THRESHOLD = 0.1
    # Number of most recent history messages forwarded to the LLM, to keep the
    # prompt within the context window.
    MAX_HISTORY_MESSAGES = 10
    # Model used when the chatbot configuration does not name one.
    DEFAULT_MODEL = "accounts/fireworks/models/kimi-k2-instruct-0905"

    def __init__(self):
        self.embedding_svc = embedding_service
        self.vector_svc = vector_store
        self.llm_svc = llm_service

    @staticmethod
    def _fallback(
        message: str,
        model: str,
        sources: Optional[List[Any]] = None,
    ) -> Dict[str, Any]:
        """Build the response dict returned when a pipeline stage fails."""
        return {
            "response": message,
            "sources": sources if sources is not None else [],
            "tokens_used": 0,
            "model": model,
        }

    @staticmethod
    def _build_context(
        retrieved: List[Dict[str, Any]],
    ) -> Tuple[List[str], List["SourceDocument"]]:
        """Turn retrieved hits into de-duplicated context texts and sources."""
        context_parts: List[str] = []
        sources: List["SourceDocument"] = []
        seen_texts = set()

        for item in retrieved:
            # `or` guards against keys that are present but explicitly None.
            payload = item.get("payload") or {}
            text = payload.get("text") or ""
            if not text or text in seen_texts:
                continue
            seen_texts.add(text)
            context_parts.append(text)
            sources.append(
                SourceDocument(
                    document_name=payload.get("file_name", "Document"),
                    # Sources carry only a short preview of the chunk.
                    chunk_text=text[:200] + "..." if len(text) > 200 else text,
                    score=item.get("score", 0.0),
                    page_number=payload.get("page_number"),
                )
            )

        return context_parts, sources

    def _build_messages(
        self,
        query: str,
        context: str,
        chatbot_config: Dict[str, Any],
        history: List[Dict[str, str]],
    ) -> List[Dict[str, str]]:
        """Assemble the chat messages: system prompt, recent history, query."""
        system_prompt = RAG_SYSTEM_PROMPT.format(
            company_name=chatbot_config.get("company_name", ""),
            custom_instructions=chatbot_config.get("system_prompt") or "",
            context=context,
        )
        messages = [{"role": "system", "content": system_prompt}]

        # History is expected in chronological order (oldest first) from the
        # caller; only the most recent entries are forwarded. The explicit
        # `limit > 0` check avoids `history[-0:]`, which would be the full list.
        limit = self.MAX_HISTORY_MESSAGES
        recent = history[-limit:] if limit > 0 else []
        messages.extend(
            {"role": msg["role"], "content": msg["content"]} for msg in recent
        )

        messages.append({"role": "user", "content": query})
        return messages

    async def process_query(
        self,
        query: str,
        collection_name: str,
        chatbot_config: Dict[str, Any],
        conversation_history: Optional[List[Dict[str, str]]] = None,
        language: str = "en",
    ) -> Dict[str, Any]:
        """
        Full RAG pipeline: embed → retrieve → generate

        Args:
            query: The user's question.
            collection_name: Vector-store collection to search.
            chatbot_config: Chatbot settings. Keys read here: "model",
                "company_name", "system_prompt", "max_tokens", "temperature".
            conversation_history: Prior messages as dicts with "role" and
                "content" keys, oldest first. Defaults to no history.
            language: Accepted for interface compatibility; not currently
                used by the pipeline.

        Returns:
            A dict with keys "response" (answer or fallback text), "sources"
            (list of SourceDocument), "tokens_used" and "model".
        """
        history = conversation_history or []
        # Model name reported by fallbacks raised before the LLM is called.
        configured_model = chatbot_config.get("model", "unknown")
        processing_error = "I'm having trouble processing your request. Please try again."

        # Step 1: Embed the query
        try:
            query_embedding = self.embedding_svc.embed_text(query)
            logger.info(
                "[RAG] Query embedded successfully. Vector length: %s",
                len(query_embedding),
            )
        except Exception as e:
            logger.error("[RAG] Embedding error: %s", e, exc_info=True)
            return self._fallback(processing_error, configured_model)

        # Step 2: Retrieve relevant chunks. A vector-store failure is handled
        # like an embedding failure rather than propagating to the caller.
        try:
            retrieved = self.vector_svc.search(
                collection_name=collection_name,
                query_vector=query_embedding,
                limit=self.RETRIEVAL_LIMIT,
                score_threshold=self.SCORE_THRESHOLD,
            )
        except Exception as e:
            logger.error("[RAG] Vector search error: %s", e, exc_info=True)
            return self._fallback(processing_error, configured_model)

        logger.info(
            "[RAG] Retrieved %s chunks from collection '%s'",
            len(retrieved),
            collection_name,
        )
        for i, item in enumerate(retrieved, start=1):
            preview = ((item.get("payload") or {}).get("text") or "")[:80]
            logger.info(
                "[RAG]   Chunk %d: score=%.4f, preview='%s...'",
                i,
                item.get("score") or 0.0,
                preview,
            )

        # Step 3: Build sources and context
        context_parts, sources = self._build_context(retrieved)
        if context_parts:
            context = "\n\n---\n\n".join(context_parts)
            logger.info(
                "[RAG] Built context from %s chunks (%s chars)",
                len(context_parts),
                len(context),
            )
        else:
            context = "No relevant information found in the knowledge base."
            logger.warning(
                "[RAG] No context found for query: '%s' in collection '%s'",
                query,
                collection_name,
            )

        # Step 4: Build messages
        messages = self._build_messages(query, context, chatbot_config, history)

        # Resolve the model before logging so the log shows the model that is
        # actually used, including when the default applies.
        model = chatbot_config.get("model", self.DEFAULT_MODEL)
        logger.info(
            "[RAG] Sending %s messages to LLM (model: %s)", len(messages), model
        )

        # Step 5: Generate response
        try:
            result = await self.llm_svc.generate(
                messages=messages,
                model=model,
                max_tokens=chatbot_config.get("max_tokens", 1000),
                temperature=chatbot_config.get("temperature", 0.7),
            )
            logger.info(
                "[RAG] LLM response generated. Tokens used: %s",
                result.get("tokens_used", 0),
            )
            return {
                "response": result["content"],
                "sources": sources,
                "tokens_used": result.get("tokens_used", 0),
                "model": result.get("model", model),
            }
        except Exception as e:
            logger.error("[RAG] LLM generation error: %s", e, exc_info=True)
            return self._fallback(
                "I'm having trouble generating a response. Please try again later.",
                model,
                sources,
            )


# Module-level RAGEngine instance, created once at import time.
rag_engine = RAGEngine()