Fixed the RAG issue in the test pipeline

This commit is contained in:
belviskhoremk
2026-04-26 21:43:19 +00:00
parent 78023ae9c5
commit 260a9c6353
9 changed files with 262 additions and 78 deletions

View File

@@ -9,7 +9,7 @@ import logging
logger = logging.getLogger(__name__)
RAG_SYSTEM_PROMPT = """You are a helpful AI assistant for {company_name}.
Your role is to answer questions based on the provided context from company documents.
Your role is to answer questions based on the provided context from the knowledge base (documents and web pages).
IMPORTANT RULES:
1. Answer based on the provided context below
@@ -20,7 +20,7 @@ IMPORTANT RULES:
{language_instruction}
{custom_instructions}
Context from knowledge base:
Knowledge base context:
{context}
"""
@@ -74,14 +74,22 @@ class RAGEngine:
}
# Step 2: Retrieve relevant chunks
# Fetch more than needed so that after filtering low-quality results
# we still have enough context. score_threshold=0.55 keeps only chunks
# that are genuinely relevant for text-embedding-3-small cosine similarity.
# Retrieve more candidates than needed (10) with a slightly relaxed threshold (0.45)
# so that content from both document and URL sources gets fair representation.
# Scraped web text embeds less cleanly than structured documents, so 0.55 was
# filtering out valid URL chunks. Context is capped by char limit below.
total_in_collection = self.vector_svc.count_vectors(collection_name)
logger.info(f"[RAG] Collection '{collection_name}' has {total_in_collection} vectors total")
# No score_threshold — always return the top-N most similar chunks by rank.
# Absolute cosine scores vary widely by document type and embedding model;
# filtering by a fixed cutoff here discards valid context when scores are
# uniformly low. The confidence_score below captures retrieval quality for
# handoff/fallback decisions without silencing the LLM's context.
retrieved = self.vector_svc.search(
collection_name=collection_name,
query_vector=query_embedding,
limit=8,
score_threshold=0.55,
limit=10,
)
logger.info(f"[RAG] Retrieved {len(retrieved)} chunks from collection '{collection_name}'")
@@ -90,25 +98,38 @@ class RAGEngine:
text_preview = item.get("payload", {}).get("text", "")[:80]
logger.info(f"[RAG] Chunk {i+1}: score={score:.4f}, preview='{text_preview}...'")
# Step 3: Build sources
# Step 3: Build sources and labeled context
# Each chunk is prefixed with its source so the LLM can synthesize
# correctly when mixing document and URL content.
MAX_CONTEXT_CHARS = 10_000
sources = []
context_parts = []
seen_texts = set()
total_chars = 0
for item in retrieved:
payload = item.get("payload", {})
text = payload.get("text", "")
if text and text not in seen_texts:
seen_texts.add(text)
context_parts.append(text)
sources.append(
SourceDocument(
document_name=payload.get("file_name", "Document"),
chunk_text=text[:200] + "..." if len(text) > 200 else text,
score=item.get("score", 0.0),
page_number=payload.get("page_number"),
)
if not text or text in seen_texts:
continue
if total_chars + len(text) > MAX_CONTEXT_CHARS:
break
seen_texts.add(text)
total_chars += len(text)
file_name = payload.get("file_name", "Document")
source_url = payload.get("source_url")
label = f"[Source: {source_url}]" if source_url else f"[Source: {file_name}]"
context_parts.append(f"{label}\n{text}")
sources.append(
SourceDocument(
document_name=file_name,
chunk_text=text[:200] + "..." if len(text) > 200 else text,
score=item.get("score", 0.0),
page_number=payload.get("page_number"),
)
)
if context_parts:
context = "\n\n---\n\n".join(context_parts)