mirror of
http://88.130.71.182:3000/BlitTech/contexta_be.git
synced 2026-06-13 08:45:24 +00:00
Fixed the RAG issue in the test pipeline
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
from app.services.embeddings import embedding_service
|
||||
from app.services.vector_store import vector_store
|
||||
from app.services.llm import llm_service
|
||||
from app.services import cache as response_cache
|
||||
from app.models import SourceDocument
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from typing import List, Dict, Any, Optional
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -44,6 +45,7 @@ class RAGEngine:
|
||||
chatbot_config: Dict[str, Any],
|
||||
conversation_history: List[Dict[str, str]] = None,
|
||||
language: str = "en",
|
||||
bypass_cache: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Full RAG pipeline: embed → retrieve → generate
|
||||
@@ -51,6 +53,13 @@ class RAGEngine:
|
||||
if conversation_history is None:
|
||||
conversation_history = []
|
||||
|
||||
# Cache hit — only for stateless (no history) queries, and not bypassed
|
||||
if not conversation_history and not bypass_cache:
|
||||
cached = response_cache.get(collection_name, query)
|
||||
if cached is not None:
|
||||
logger.info(f"[RAG] Cache hit for query in '{collection_name}'")
|
||||
return cached
|
||||
|
||||
# Step 1: Embed the query
|
||||
try:
|
||||
query_embedding = self.embedding_svc.embed_text(query)
|
||||
@@ -65,14 +74,14 @@ class RAGEngine:
|
||||
}
|
||||
|
||||
# Step 2: Retrieve relevant chunks
|
||||
# FIX: Lowered score_threshold from 0.3 to 0.1 to avoid filtering out
|
||||
# all results. With cosine similarity, 0.3 can be too aggressive for
|
||||
# many document types and query patterns.
|
||||
# Fetch more than needed so that after filtering low-quality results
|
||||
# we still have enough context. score_threshold=0.55 keeps only chunks
|
||||
# that are genuinely relevant for text-embedding-3-small cosine similarity.
|
||||
retrieved = self.vector_svc.search(
|
||||
collection_name=collection_name,
|
||||
query_vector=query_embedding,
|
||||
limit=5,
|
||||
score_threshold=0.1, # FIX: was 0.3, now 0.1 to avoid over-filtering
|
||||
limit=8,
|
||||
score_threshold=0.55,
|
||||
)
|
||||
|
||||
logger.info(f"[RAG] Retrieved {len(retrieved)} chunks from collection '{collection_name}'")
|
||||
@@ -108,11 +117,15 @@ class RAGEngine:
|
||||
context = "No relevant information found in the knowledge base."
|
||||
logger.warning(f"[RAG] No context found for query: '{query}' in collection '{collection_name}'")
|
||||
|
||||
# Confidence: mean of top-3 scores (more stable than max alone)
|
||||
top_scores = sorted([s.score for s in sources], reverse=True)[:3]
|
||||
confidence_score = round(sum(top_scores) / len(top_scores), 4) if top_scores else 0.0
|
||||
|
||||
# Step 4: Build messages
|
||||
lang_name = LANGUAGE_NAMES.get(language, "English") if language and language != "en" else ""
|
||||
language_instruction = (
|
||||
f"\n6. Respond in {lang_name}. Match the language of the user's message."
|
||||
if lang_name else ""
|
||||
"\n6. CRITICAL: Always reply in the exact same language the user wrote in. "
|
||||
"If they write in French, reply in French. If Spanish, reply in Spanish. "
|
||||
"Never switch to English unless the user writes in English."
|
||||
)
|
||||
|
||||
system_prompt = RAG_SYSTEM_PROMPT.format(
|
||||
@@ -137,7 +150,7 @@ class RAGEngine:
|
||||
logger.info(f"[RAG] Sending {len(messages)} messages to LLM (model: {chatbot_config.get('model')})")
|
||||
|
||||
# Step 5: Generate response
|
||||
model = chatbot_config.get("model", "accounts/fireworks/models/kimi-k2-instruct-0905")
|
||||
model = chatbot_config.get("model", "accounts/fireworks/models/kimi-k2-instruct")
|
||||
try:
|
||||
result = await self.llm_svc.generate(
|
||||
messages=messages,
|
||||
@@ -146,17 +159,22 @@ class RAGEngine:
|
||||
temperature=chatbot_config.get("temperature", 0.7),
|
||||
)
|
||||
logger.info(f"[RAG] LLM response generated. Tokens used: {result.get('tokens_used', 0)}")
|
||||
return {
|
||||
payload = {
|
||||
"response": result["content"],
|
||||
"sources": sources,
|
||||
"confidence_score": confidence_score,
|
||||
"tokens_used": result.get("tokens_used", 0),
|
||||
"model": result.get("model", model),
|
||||
}
|
||||
if not conversation_history and not bypass_cache:
|
||||
response_cache.set(collection_name, query, payload)
|
||||
return payload
|
||||
except Exception as e:
|
||||
logger.error(f"[RAG] LLM generation error: {e}", exc_info=True)
|
||||
return {
|
||||
"response": "I'm having trouble generating a response. Please try again later.",
|
||||
"sources": sources,
|
||||
"confidence_score": confidence_score,
|
||||
"tokens_used": 0,
|
||||
"model": model,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user