Fixed the RAG issue in the test pipeline

This commit is contained in:
belviskhoremk
2026-04-26 21:43:19 +00:00
parent 78023ae9c5
commit 260a9c6353
9 changed files with 262 additions and 78 deletions

View File

@@ -94,7 +94,7 @@ async def upload_document(
file_bytes=file_bytes,
file_name=file.filename,
doc_id=doc_id,
chatbot=chatbot,
chatbot_id=chatbot_id,
supabase=supabase,
)
@@ -105,16 +105,28 @@ async def _process_document_bg(
file_bytes: bytes,
file_name: str,
doc_id: str,
chatbot: dict,
chatbot_id: str,
supabase,
):
"""Background task to process and embed a document"""
try:
# Re-fetch chatbot to guarantee we use the canonical collection and company_id,
# not a snapshot that could have been captured before an update.
chatbot_row = supabase.table("chatbots").select("company_id, qdrant_collection_name").eq("id", chatbot_id).execute()
if not chatbot_row.data:
logger.error(f"Chatbot {chatbot_id} not found during document processing")
supabase.table("documents").update({
"status": "failed",
"error_message": "Chatbot not found"
}).eq("id", doc_id).execute()
return
chatbot = chatbot_row.data[0]
company_id = chatbot.get("company_id", "")
collection_name = chatbot.get("qdrant_collection_name")
if not collection_name:
logger.error(f"No Qdrant collection for chatbot {chatbot['id']}")
logger.error(f"No Qdrant collection for chatbot {chatbot_id}")
supabase.table("documents").update({
"status": "failed",
"error_message": "Vector store not configured"
@@ -168,7 +180,7 @@ async def _process_document_bg(
}).eq("id", doc_id).execute()
response_cache.invalidate(collection_name)
logger.info(f"Document {doc_id} processed: {len(chunks)} chunks")
logger.info(f"Document {doc_id} processed: {len(chunks)} chunks → collection='{collection_name}' company='{company_id}'")
except Exception as e:
logger.error(f"Document processing error for {doc_id}: {e}")
@@ -274,7 +286,7 @@ async def retry_document_processing(
file_bytes=file_bytes,
file_name=document["file_name"],
doc_id=document_id,
chatbot=chatbot,
chatbot_id=chatbot_id,
supabase=supabase,
)
@@ -333,7 +345,7 @@ async def add_url_source(
_process_url_source,
source_id=source_id,
url=data.url,
chatbot=chatbot,
chatbot_id=chatbot_id,
supabase=supabase,
)
@@ -394,12 +406,12 @@ async def refresh_url_source(
"chunk_count": 0,
}).eq("id", source_id).returning("representation").execute()
background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot, supabase)
background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot_id, supabase)
return UrlSourceResponse(**{**src, "status": "pending", "chunk_count": 0})
async def _process_url_source(source_id: str, url: str, chatbot: dict, supabase):
async def _process_url_source(source_id: str, url: str, chatbot_id: str, supabase):
"""Background task to scrape a URL and add its content to the vector store."""
from app.services.web_scraper import scrape_url
from app.services.document_processor import chunk_text
@@ -407,6 +419,18 @@ async def _process_url_source(source_id: str, url: str, chatbot: dict, supabase)
from app.services.vector_store import vector_store
try:
# Re-fetch chatbot to guarantee we use the canonical collection and company_id.
chatbot_row = supabase.table("chatbots").select("company_id, qdrant_collection_name").eq("id", chatbot_id).execute()
if not chatbot_row.data:
logger.error(f"Chatbot {chatbot_id} not found during URL source processing")
supabase.table("url_sources").update({
"status": "failed",
"error_message": "Chatbot not found",
}).eq("id", source_id).execute()
return
chatbot = chatbot_row.data[0]
# Update status to processing
supabase.table("url_sources").update({"status": "processing"}).eq("id", source_id).execute()
@@ -480,7 +504,8 @@ async def _process_url_source(source_id: str, url: str, chatbot: dict, supabase)
}).eq("id", source_id).execute()
response_cache.invalidate(collection_name)
logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url}")
logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url} → collection='{collection_name}' company='{chatbot.get('company_id', '')}'")
except Exception as e:
logger.error(f"URL source processing error {source_id}: {e}")