updates Mar6

2026-06-12 23:23:21 +00:00 · 2026-03-06 22:37:40 +00:00
parent 2ed998058e
commit 9dccc83293
23 changed files with 2257 additions and 74 deletions
--- a/app/routers/documents.py
+++ b/app/routers/documents.py
@@ -1,5 +1,5 @@
 from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, BackgroundTasks
-from app.models import DocumentResponse, SuccessResponse
+from app.models import DocumentResponse, SuccessResponse, UrlSourceCreate, UrlSourceResponse
 from app.database import get_supabase
 from app.dependencies import get_current_user
 from app.services.document_processor import process_document
@@ -12,6 +12,7 @@ import logging

 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/chatbots/{chatbot_id}/documents", tags=["Documents"])
+# Separate router so URL-source routes sit under /url-sources instead of the /documents prefix used by `router`.
+url_router = APIRouter(prefix="/chatbots/{chatbot_id}/url-sources", tags=["URL Sources"])

 ALLOWED_TYPES = {
    "application/pdf": ".pdf",
@@ -206,3 +207,168 @@ async def delete_document(chatbot_id: str, document_id: str, user=Depends(get_cu

    supabase.table("documents").delete().eq("id", document_id).execute()
    return SuccessResponse(success=True, message="Document deleted")
+
+
+# ── URL Sources ───────────────────────────────────────────────────────────────
+
+# List every URL source attached to a chatbot, newest first.
+# _get_user_chatbot is called before the query; presumably it rejects chatbots
+# the caller does not own (defined outside this view — confirm).
+@url_router.get("", response_model=List[UrlSourceResponse])
+async def list_url_sources(chatbot_id: str, user=Depends(get_current_user)):
+    supabase = get_supabase()
+    _get_user_chatbot(chatbot_id, user.id, supabase)
+
+    query = (
+        supabase.table("url_sources")
+        .select("*")
+        .eq("chatbot_id", chatbot_id)
+        .order("created_at", desc=True)
+    )
+    rows = query.execute().data
+    # A missing or empty payload is normalised to an empty list.
+    return rows if rows else []
+
+
+# Register a new URL source for a chatbot and schedule it for scraping.
+#
+# Flow: chatbot lookup -> plan allowance -> per-chatbot quota -> insert a
+# "pending" row -> queue _process_url_source as a background task.
+# Responds 402 when the plan allows no URL sources or the quota is used up,
+# and 500 when the insert returns no row. The created row is returned through
+# UrlSourceResponse, the same model the list route uses.
+@url_router.post("", status_code=201, response_model=UrlSourceResponse)
+async def add_url_source(
+    chatbot_id: str,
+    data: UrlSourceCreate,
+    background_tasks: BackgroundTasks,
+    user=Depends(get_current_user),
+):
+    from app.config import PLAN_LIMITS
+    supabase = get_supabase()
+    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
+
+    # Plan check: no active subscription row is treated as the "free" plan,
+    # and an unknown plan name falls back to the "free" limits.
+    sub = supabase.table("subscriptions").select("plan").eq("user_id", user.id).eq("status", "active").execute()
+    plan = sub.data[0]["plan"] if sub.data else "free"
+    max_sources = PLAN_LIMITS.get(plan, PLAN_LIMITS["free"]).get("url_sources", 0)
+    if max_sources == 0:
+        raise HTTPException(status_code=402, detail="URL sources require Starter plan or higher")
+
+    # Count existing sources for this chatbot against the plan quota.
+    existing = supabase.table("url_sources").select("id", count="exact").eq("chatbot_id", chatbot_id).execute()
+    if (existing.count or 0) >= max_sources:
+        raise HTTPException(status_code=402, detail=f"URL source limit reached ({max_sources}). Upgrade to add more.")
+
+    # Normalise once: str() is a no-op for a plain string and keeps the value
+    # JSON-serialisable if the model field is a URL type.
+    # NOTE(review): confirm the declared type of UrlSourceCreate.url.
+    url = str(data.url)
+
+    source_id = str(uuid.uuid4())
+    source_data = {
+        "id": source_id,
+        "chatbot_id": chatbot_id,
+        "url": url,
+        "status": "pending",
+    }
+    result = supabase.table("url_sources").insert(source_data).execute()
+    if not result.data:
+        raise HTTPException(status_code=500, detail="Failed to create URL source")
+
+    # Scraping and embedding run after the response has been sent.
+    background_tasks.add_task(
+        _process_url_source,
+        source_id=source_id,
+        url=url,
+        chatbot=chatbot,
+        supabase=supabase,
+    )
+
+    return result.data[0]
+
+
+# Remove one URL source row belonging to the given chatbot.
+# Responds 404 when no row matches both the source id and the chatbot id.
+# NOTE(review): only the url_sources row is deleted here; nothing in this
+# handler removes the chunks that _process_url_source upserted into the vector
+# store — confirm whether that cleanup happens elsewhere.
+@url_router.delete("/{source_id}", response_model=SuccessResponse)
+async def delete_url_source(chatbot_id: str, source_id: str, user=Depends(get_current_user)):
+    supabase = get_supabase()
+    _get_user_chatbot(chatbot_id, user.id, supabase)
+
+    lookup = (
+        supabase.table("url_sources")
+        .select("*")
+        .eq("id", source_id)
+        .eq("chatbot_id", chatbot_id)
+        .execute()
+    )
+    if lookup.data:
+        supabase.table("url_sources").delete().eq("id", source_id).execute()
+        return SuccessResponse(success=True, message="URL source deleted")
+
+    raise HTTPException(status_code=404, detail="URL source not found")
+
+
+def _mark_url_source_failed(supabase, source_id: str, message) -> None:
+    """Set a url_sources row to "failed" and store the reason, capped at 500 characters."""
+    supabase.table("url_sources").update({
+        "status": "failed",
+        "error_message": str(message)[:500],
+    }).eq("id", source_id).execute()
+
+
+async def _process_url_source(source_id: str, url: str, chatbot: dict, supabase):
+    """Background task to scrape a URL and add its content to the vector store.
+
+    Writes the row's status to ``url_sources`` as it goes: "processing" on
+    start, then either "completed" (with ``page_title`` and ``chunk_count``)
+    or "failed" (with ``error_message``). Ordinary exceptions are not
+    propagated; they are logged with a traceback and recorded on the row
+    when that write succeeds.
+
+    Args:
+        source_id: Id of the ``url_sources`` row; also stored as
+            ``document_id`` in every vector payload.
+        url: Address to scrape; stored as ``source_url`` in every payload.
+        chatbot: Chatbot record; ``qdrant_collection_name`` and
+            ``company_id`` are read from it.
+        supabase: Client used for all status updates.
+    """
+    from app.services.web_scraper import scrape_url
+    from app.services.document_processor import chunk_text
+    from app.services.embeddings import embedding_service
+    from app.services.vector_store import vector_store
+
+    try:
+        # Update status to processing
+        supabase.table("url_sources").update({"status": "processing"}).eq("id", source_id).execute()
+
+        # Scrape URL; the scraper reports failure through an "error" key.
+        scraped = await scrape_url(url)
+        if "error" in scraped:
+            _mark_url_source_failed(supabase, source_id, scraped["error"])
+            return
+
+        text = scraped["text"]
+        # Fall back to the URL when the title is missing, None or empty, so the
+        # stored name never reads "[URL] None".
+        title = scraped.get("title") or url
+        collection_name = chatbot.get("qdrant_collection_name")
+
+        if not collection_name:
+            _mark_url_source_failed(supabase, source_id, "No vector store configured")
+            return
+
+        # Ensure collection exists
+        if not vector_store.collection_exists(collection_name):
+            vector_store.create_collection(collection_name)
+
+        # Chunk text
+        chunks = chunk_text(text)
+        if not chunks:
+            _mark_url_source_failed(supabase, source_id, "No content extracted")
+            return
+
+        # Payload fields that are the same for every chunk.
+        company_id = chatbot.get("company_id", "")
+        file_name = f"[URL] {title}"
+
+        # Embed in batches, then upsert everything in one call.
+        all_ids = []
+        all_vectors = []
+        all_payloads = []
+        batch_size = 50
+
+        for start in range(0, len(chunks), batch_size):
+            batch = chunks[start:start + batch_size]
+            vectors = embedding_service.embed_batch(batch)
+            all_ids.extend([str(uuid.uuid4()) for _ in vectors])
+            all_vectors.extend(vectors)
+            all_payloads.extend({
+                "document_id": source_id,
+                "company_id": company_id,
+                "file_name": file_name,
+                # A web page has no real pages: this is the 1-based batch number.
+                "page_number": start // batch_size + 1,
+                # Position of the chunk within the whole page text.
+                "chunk_index": start + offset,
+                "text": chunk,
+                "source_url": url,
+            } for offset, chunk in enumerate(batch))
+
+        vector_store.upsert_vectors(
+            collection_name=collection_name,
+            vectors=all_vectors,
+            payloads=all_payloads,
+            ids=all_ids,
+        )
+
+        supabase.table("url_sources").update({
+            "status": "completed",
+            "page_title": title,
+            "chunk_count": len(chunks),
+        }).eq("id", source_id).execute()
+
+        logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url}")
+
+    except Exception as e:
+        # logger.exception logs at ERROR level and keeps the traceback.
+        logger.exception(f"URL source processing error {source_id}: {e}")
+        try:
+            _mark_url_source_failed(supabase, source_id, e)
+        except Exception:
+            # The status write can fail too (e.g. the database is what broke);
+            # log it rather than let it escape the background task.
+            logger.exception(f"Could not record failure for URL source {source_id}")
+
+
+# Second name bound to the same APIRouter object as url_router.
+# NOTE(review): presumably the name other modules import when registering routes — confirm before removing either name.
+router_url_sources = url_router