Fixed the RAG issue in the test pipeline

This commit is contained in:
belviskhoremk
2026-04-26 18:51:48 +00:00
parent 205d9d7901
commit 97a501097d
14 changed files with 249 additions and 57 deletions

View File

@@ -31,14 +31,9 @@ class LLMService:
return await self._call_openai(messages, model, max_tokens, temperature)
except Exception as e:
logger.error(f"LLM error ({provider}/{model}): {e}")
# Fallback to a basic model if available
if model != "accounts/fireworks/models/kimi-k2-instruct-0905" and settings.fireworks_api_key:
return await self._call_fireworks(
messages,
"accounts/fireworks/models/kimi-k2-instruct-0905",
max_tokens,
temperature,
)
fallback = "accounts/fireworks/models/llama-v3p3-70b-instruct"
if model != fallback and settings.fireworks_api_key:
return await self._call_fireworks(messages, fallback, max_tokens, temperature)
raise
async def _call_fireworks(

View File

@@ -1,8 +1,9 @@
from app.services.embeddings import embedding_service
from app.services.vector_store import vector_store
from app.services.llm import llm_service
from app.services import cache as response_cache
from app.models import SourceDocument
from typing import List, Dict, Any, Optional, Tuple
from typing import List, Dict, Any, Optional
import logging
logger = logging.getLogger(__name__)
@@ -44,6 +45,7 @@ class RAGEngine:
chatbot_config: Dict[str, Any],
conversation_history: List[Dict[str, str]] = None,
language: str = "en",
bypass_cache: bool = False,
) -> Dict[str, Any]:
"""
Full RAG pipeline: embed → retrieve → generate
@@ -51,6 +53,13 @@ class RAGEngine:
if conversation_history is None:
conversation_history = []
# Cache hit — only for stateless (no history) queries, and not bypassed
if not conversation_history and not bypass_cache:
cached = response_cache.get(collection_name, query)
if cached is not None:
logger.info(f"[RAG] Cache hit for query in '{collection_name}'")
return cached
# Step 1: Embed the query
try:
query_embedding = self.embedding_svc.embed_text(query)
@@ -65,14 +74,14 @@ class RAGEngine:
}
# Step 2: Retrieve relevant chunks
# FIX: Lowered score_threshold from 0.3 to 0.1 to avoid filtering out
# all results. With cosine similarity, 0.3 can be too aggressive for
# many document types and query patterns.
# Fetch more than needed so that after filtering low-quality results
# we still have enough context. score_threshold=0.55 keeps only chunks
# that are genuinely relevant for text-embedding-3-small cosine similarity.
retrieved = self.vector_svc.search(
collection_name=collection_name,
query_vector=query_embedding,
limit=5,
score_threshold=0.1, # FIX: was 0.3, now 0.1 to avoid over-filtering
limit=8,
score_threshold=0.55,
)
logger.info(f"[RAG] Retrieved {len(retrieved)} chunks from collection '{collection_name}'")
@@ -108,11 +117,15 @@ class RAGEngine:
context = "No relevant information found in the knowledge base."
logger.warning(f"[RAG] No context found for query: '{query}' in collection '{collection_name}'")
# Confidence: mean of top-3 scores (more stable than max alone)
top_scores = sorted([s.score for s in sources], reverse=True)[:3]
confidence_score = round(sum(top_scores) / len(top_scores), 4) if top_scores else 0.0
# Step 4: Build messages
lang_name = LANGUAGE_NAMES.get(language, "English") if language and language != "en" else ""
language_instruction = (
f"\n6. Respond in {lang_name}. Match the language of the user's message."
if lang_name else ""
"\n6. CRITICAL: Always reply in the exact same language the user wrote in. "
"If they write in French, reply in French. If Spanish, reply in Spanish. "
"Never switch to English unless the user writes in English."
)
system_prompt = RAG_SYSTEM_PROMPT.format(
@@ -137,7 +150,7 @@ class RAGEngine:
logger.info(f"[RAG] Sending {len(messages)} messages to LLM (model: {chatbot_config.get('model')})")
# Step 5: Generate response
model = chatbot_config.get("model", "accounts/fireworks/models/kimi-k2-instruct-0905")
model = chatbot_config.get("model", "accounts/fireworks/models/kimi-k2-instruct")
try:
result = await self.llm_svc.generate(
messages=messages,
@@ -146,17 +159,22 @@ class RAGEngine:
temperature=chatbot_config.get("temperature", 0.7),
)
logger.info(f"[RAG] LLM response generated. Tokens used: {result.get('tokens_used', 0)}")
return {
payload = {
"response": result["content"],
"sources": sources,
"confidence_score": confidence_score,
"tokens_used": result.get("tokens_used", 0),
"model": result.get("model", model),
}
if not conversation_history and not bypass_cache:
response_cache.set(collection_name, query, payload)
return payload
except Exception as e:
logger.error(f"[RAG] LLM generation error: {e}", exc_info=True)
return {
"response": "I'm having trouble generating a response. Please try again later.",
"sources": sources,
"confidence_score": confidence_score,
"tokens_used": 0,
"model": model,
}