Fixed the RAG issue in the test pipeline

2026-06-12 23:23:21 +00:00 · 2026-04-26 18:51:48 +00:00
parent 205d9d7901
commit 97a501097d
14 changed files with 249 additions and 57 deletions
--- a/app/routers/documents.py
+++ b/app/routers/documents.py
@@ -6,6 +6,7 @@ from app.services.document_processor import process_document
 from app.services.embeddings import embedding_service
 from app.services.vector_store import vector_store
 from app.services.storage import delete_from_storage, extract_storage_path
+from app.services import cache as response_cache
 from app.config import settings
 from typing import List
 import uuid
@@ -166,6 +167,7 @@ async def _process_document_bg(
            "chunk_count": len(chunks),
        }).eq("id", doc_id).execute()

+        response_cache.invalidate(collection_name)
        logger.info(f"Document {doc_id} processed: {len(chunks)} chunks")

    except Exception as e:
@@ -211,6 +213,8 @@ async def delete_document(chatbot_id: str, document_id: str, user=Depends(get_cu
        delete_from_storage(supabase, "documents", doc.data[0]["file_url"])

    supabase.table("documents").delete().eq("id", document_id).execute()
+    if collection_name:
+        response_cache.invalidate(collection_name)
    return SuccessResponse(success=True, message="Document deleted")


@@ -259,6 +263,11 @@ async def retry_document_processing(
        "chunk_count": 0,
    }).eq("id", document_id).execute()

+    # Clear stale cache before re-processing so tests see fresh results
+    collection_name = chatbot.get("qdrant_collection_name")
+    if collection_name:
+        response_cache.invalidate(collection_name)
+
    # Re-enqueue background processing
    background_tasks.add_task(
        _process_document_bg,
@@ -340,10 +349,56 @@ async def delete_url_source(chatbot_id: str, source_id: str, user=Depends(get_cu
    if not source.data:
        raise HTTPException(status_code=404, detail="URL source not found")

+    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
+    collection_name = chatbot.get("qdrant_collection_name")
+    if collection_name:
+        try:
+            vector_store.delete_by_document_id(collection_name, source_id)
+        except Exception:
+            pass
+        response_cache.invalidate(collection_name)
    supabase.table("url_sources").delete().eq("id", source_id).execute()
    return SuccessResponse(success=True, message="URL source deleted")


+@url_router.post("/{source_id}/refresh", response_model=UrlSourceResponse)
+async def refresh_url_source(
+    chatbot_id: str,
+    source_id: str,
+    background_tasks: BackgroundTasks,
+    user=Depends(get_current_user),
+):
+    """Re-scrape a URL source and rebuild its vectors.
+
+    Drops the source's existing vectors (best effort), invalidates the
+    response cache for the chatbot's collection, resets the row to
+    ``pending`` and queues ``_process_url_source`` as a background task.
+
+    Returns:
+        The URL source as it looks right after the reset (status
+        ``pending``, no error message, zero chunks).
+
+    Raises:
+        HTTPException: 404 when no URL source with ``source_id`` exists for
+            ``chatbot_id``.
+    """
+    supabase = get_supabase()
+    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
+
+    source = supabase.table("url_sources").select("*").eq("id", source_id).eq("chatbot_id", chatbot_id).execute()
+    if not source.data:
+        raise HTTPException(status_code=404, detail="URL source not found")
+
+    src = source.data[0]
+    collection_name = chatbot.get("qdrant_collection_name")
+
+    # Drop existing vectors for this source
+    if collection_name:
+        try:
+            vector_store.delete_by_document_id(collection_name, source_id)
+        except Exception as e:
+            # Best effort: a failed vector cleanup must not block the refresh.
+            logger.warning(f"Could not delete old vectors for url source {source_id}: {e}")
+        response_cache.invalidate(collection_name)
+
+    # Reset to pending and reprocess. Uses the same update/eq/execute chain as
+    # the other updates in this module; the returned row is not needed.
+    reset_fields = {"status": "pending", "error_message": None, "chunk_count": 0}
+    supabase.table("url_sources").update(reset_fields).eq("id", source_id).execute()
+
+    background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot, supabase)
+
+    # Overlay the reset values on the fetched row so the response matches what
+    # was just written (including the cleared error message).
+    return UrlSourceResponse(**{**src, **reset_fields})
+
+
 async def _process_url_source(source_id: str, url: str, chatbot: dict, supabase):
    """Background task to scrape a URL and add its content to the vector store."""
    from app.services.web_scraper import scrape_url
@@ -424,6 +479,7 @@ async def _process_url_source(source_id: str, url: str, chatbot: dict, supabase)
            "chunk_count": len(chunks),
        }).eq("id", source_id).execute()

+        response_cache.invalidate(collection_name)
        logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url}")

    except Exception as e: