Fixed the RAG issue in the test pipeline

2026-06-13 08:45:24 +00:00 · 2026-04-26 18:51:48 +00:00
parent 205d9d7901
commit 97a501097d
14 changed files with 249 additions and 57 deletions
--- a/app/services/rag.py
+++ b/app/services/rag.py
@@ -1,8 +1,9 @@
 from app.services.embeddings import embedding_service
 from app.services.vector_store import vector_store
 from app.services.llm import llm_service
+from app.services import cache as response_cache
 from app.models import SourceDocument
-from typing import List, Dict, Any, Optional, Tuple
+from typing import List, Dict, Any, Optional
 import logging

 logger = logging.getLogger(__name__)
@@ -44,6 +45,7 @@ class RAGEngine:
        chatbot_config: Dict[str, Any],
        conversation_history: List[Dict[str, str]] = None,
        language: str = "en",
+        bypass_cache: bool = False,
    ) -> Dict[str, Any]:
        """
        Full RAG pipeline: embed → retrieve → generate
@@ -51,6 +53,13 @@ class RAGEngine:
        if conversation_history is None:
            conversation_history = []

+        # Cache hit — only for stateless (no history) queries, and not bypassed
+        if not conversation_history and not bypass_cache:
+            cached = response_cache.get(collection_name, query)
+            if cached is not None:
+                logger.info(f"[RAG] Cache hit for query in '{collection_name}'")
+                return cached
+
        # Step 1: Embed the query
        try:
            query_embedding = self.embedding_svc.embed_text(query)
@@ -65,14 +74,14 @@ class RAGEngine:
            }

        # Step 2: Retrieve relevant chunks
-        # FIX: Lowered score_threshold from 0.3 to 0.1 to avoid filtering out
-        # all results. With cosine similarity, 0.3 can be too aggressive for
-        # many document types and query patterns.
+        # Fetch more than needed so that after filtering low-quality results
+        # we still have enough context. score_threshold=0.55 keeps only chunks
+        # that are genuinely relevant for text-embedding-3-small cosine similarity.
        retrieved = self.vector_svc.search(
            collection_name=collection_name,
            query_vector=query_embedding,
-            limit=5,
-            score_threshold=0.1,  # FIX: was 0.3, now 0.1 to avoid over-filtering
+            limit=8,
+            score_threshold=0.55,
        )

        logger.info(f"[RAG] Retrieved {len(retrieved)} chunks from collection '{collection_name}'")
@@ -108,11 +117,15 @@ class RAGEngine:
            context = "No relevant information found in the knowledge base."
            logger.warning(f"[RAG] No context found for query: '{query}' in collection '{collection_name}'")

+        # Confidence: mean of top-3 scores (more stable than max alone)
+        top_scores = sorted([s.score for s in sources], reverse=True)[:3]
+        confidence_score = round(sum(top_scores) / len(top_scores), 4) if top_scores else 0.0
+
        # Step 4: Build messages
-        lang_name = LANGUAGE_NAMES.get(language, "English") if language and language != "en" else ""
        language_instruction = (
-            f"\n6. Respond in {lang_name}. Match the language of the user's message."
-            if lang_name else ""
+            "\n6. CRITICAL: Always reply in the exact same language the user wrote in. "
+            "If they write in French, reply in French. If Spanish, reply in Spanish. "
+            "Never switch to English unless the user writes in English."
        )

        system_prompt = RAG_SYSTEM_PROMPT.format(
@@ -137,7 +150,7 @@ class RAGEngine:
        logger.info(f"[RAG] Sending {len(messages)} messages to LLM (model: {chatbot_config.get('model')})")

        # Step 5: Generate response
-        model = chatbot_config.get("model", "accounts/fireworks/models/kimi-k2-instruct-0905")
+        model = chatbot_config.get("model", "accounts/fireworks/models/kimi-k2-instruct")
        try:
            result = await self.llm_svc.generate(
                messages=messages,
@@ -146,17 +159,22 @@ class RAGEngine:
                temperature=chatbot_config.get("temperature", 0.7),
            )
            logger.info(f"[RAG] LLM response generated. Tokens used: {result.get('tokens_used', 0)}")
-            return {
+            payload = {
                "response": result["content"],
                "sources": sources,
+                "confidence_score": confidence_score,
                "tokens_used": result.get("tokens_used", 0),
                "model": result.get("model", model),
            }
+            if not conversation_history and not bypass_cache:
+                response_cache.set(collection_name, query, payload)
+            return payload
        except Exception as e:
            logger.error(f"[RAG] LLM generation error: {e}", exc_info=True)
            return {
                "response": "I'm having trouble generating a response. Please try again later.",
                "sources": sources,
+                "confidence_score": confidence_score,
                "tokens_used": 0,
                "model": model,
            }