Fixed the RAG issue in the test pipeline

2026-06-13 08:45:24 +00:00 · 2026-04-26 21:43:19 +00:00
parent 78023ae9c5
commit 260a9c6353
9 changed files with 262 additions and 78 deletions
--- a/app/services/rag.py
+++ b/app/services/rag.py
@@ -9,7 +9,7 @@ import logging
 logger = logging.getLogger(__name__)

 RAG_SYSTEM_PROMPT = """You are a helpful AI assistant for {company_name}.
-Your role is to answer questions based on the provided context from company documents.
+Your role is to answer questions based on the provided context from the knowledge base (documents and web pages).

 IMPORTANT RULES:
 1. Answer based on the provided context below
@@ -20,7 +20,7 @@ IMPORTANT RULES:
 {language_instruction}
 {custom_instructions}

-Context from knowledge base:
+Knowledge base context:
 {context}
 """

@@ -74,14 +74,22 @@ class RAGEngine:
            }

        # Step 2: Retrieve relevant chunks
-        # Fetch more than needed so that after filtering low-quality results
-        # we still have enough context. score_threshold=0.55 keeps only chunks
-        # that are genuinely relevant for text-embedding-3-small cosine similarity.
+        # Retrieve more candidates than needed (10), with no score threshold applied,
+        # so that content from both document and URL sources gets fair representation.
+        # Scraped web text embeds less cleanly than structured documents, so the previous
+        # 0.55 cutoff was filtering out valid URL chunks. Context is capped by char limit below.
+        total_in_collection = self.vector_svc.count_vectors(collection_name)
+        logger.info(f"[RAG] Collection '{collection_name}' has {total_in_collection} vectors total")
+
+        # No score_threshold — always return the top-N most similar chunks by rank.
+        # Absolute cosine scores vary widely by document type and embedding model;
+        # filtering by a fixed cutoff here discards valid context when scores are
+        # uniformly low. The confidence_score below captures retrieval quality for
+        # handoff/fallback decisions without silencing the LLM's context.
        retrieved = self.vector_svc.search(
            collection_name=collection_name,
            query_vector=query_embedding,
-            limit=8,
-            score_threshold=0.55,
+            limit=10,
        )

        logger.info(f"[RAG] Retrieved {len(retrieved)} chunks from collection '{collection_name}'")
@@ -90,25 +98,38 @@ class RAGEngine:
            text_preview = item.get("payload", {}).get("text", "")[:80]
            logger.info(f"[RAG]   Chunk {i+1}: score={score:.4f}, preview='{text_preview}...'")

-        # Step 3: Build sources
+        # Step 3: Build sources and labeled context
+        # Each chunk is prefixed with its source so the LLM can synthesize
+        # correctly when mixing document and URL content.
+        MAX_CONTEXT_CHARS = 10_000
        sources = []
        context_parts = []
        seen_texts = set()
+        total_chars = 0

        for item in retrieved:
            payload = item.get("payload", {})
            text = payload.get("text", "")
-            if text and text not in seen_texts:
-                seen_texts.add(text)
-                context_parts.append(text)
-                sources.append(
-                    SourceDocument(
-                        document_name=payload.get("file_name", "Document"),
-                        chunk_text=text[:200] + "..." if len(text) > 200 else text,
-                        score=item.get("score", 0.0),
-                        page_number=payload.get("page_number"),
-                    )
+            if not text or text in seen_texts:
+                continue
+            if total_chars + len(text) > MAX_CONTEXT_CHARS:
+                break
+            seen_texts.add(text)
+            total_chars += len(text)
+
+            file_name = payload.get("file_name", "Document")
+            source_url = payload.get("source_url")
+            label = f"[Source: {source_url}]" if source_url else f"[Source: {file_name}]"
+            context_parts.append(f"{label}\n{text}")
+
+            sources.append(
+                SourceDocument(
+                    document_name=file_name,
+                    chunk_text=text[:200] + "..." if len(text) > 200 else text,
+                    score=item.get("score", 0.0),
+                    page_number=payload.get("page_number"),
                )
+            )

        if context_parts:
            context = "\n\n---\n\n".join(context_parts)