from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, BackgroundTasks
from app.models import DocumentResponse, SuccessResponse, UrlSourceCreate, UrlSourceResponse
from app.database import get_supabase
from app.dependencies import get_current_user
from app.services.document_processor import process_document
from app.services.embeddings import embedding_service
from app.services.vector_store import vector_store
from app.services.storage import delete_from_storage, extract_storage_path
from app.services import cache as response_cache
from app.config import settings
from typing import List
import uuid
import logging

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/chatbots/{chatbot_id}/documents", tags=["Documents"])
url_router = APIRouter(prefix="/chatbots/{chatbot_id}/url-sources", tags=["URL Sources"])

# MIME type -> file extension for the supported upload formats.
ALLOWED_TYPES = {
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "text/csv": ".csv",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "text/plain": ".txt",
    "text/markdown": ".md",
}

# Upload validation is done on the (lower-cased) filename extension.
ALLOWED_EXTENSIONS = {".pdf", ".docx", ".csv", ".xlsx", ".txt", ".md"}

# Number of chunks sent to the embedding service per call.
_EMBED_BATCH_SIZE = 50


def _get_user_chatbot(chatbot_id: str, user_id: str, supabase) -> dict:
    """Return the chatbot row if it belongs to the company owned by *user_id*.

    Raises:
        HTTPException: 404 when the user owns no company, or when no chatbot
            with *chatbot_id* exists under that company.
    """
    company = supabase.table("companies").select("id").eq("owner_id", user_id).execute()
    if not company.data:
        raise HTTPException(status_code=404, detail="Company not found")

    company_id = company.data[0]["id"]
    chatbot = (
        supabase.table("chatbots")
        .select("*")
        .eq("id", chatbot_id)
        .eq("company_id", company_id)
        .execute()
    )
    if not chatbot.data:
        raise HTTPException(status_code=404, detail="Chatbot not found")

    return chatbot.data[0]


def _mark_failed(supabase, table: str, row_id: str, message: str) -> None:
    """Set ``status='failed'`` and store *message* on row *row_id* of *table*."""
    supabase.table(table).update({
        "status": "failed",
        "error_message": message,
    }).eq("id", row_id).execute()


@router.post("", response_model=DocumentResponse, status_code=201)
async def upload_document(
    chatbot_id: str,
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    user=Depends(get_current_user),
):
    """Validate an uploaded file, record it, and queue it for embedding.

    The document row is inserted with status ``processing``; the actual
    extraction/embedding happens in ``_process_document_bg`` after the
    response has been sent.

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension,
            404 when the chatbot is not owned by the caller, 413 when the file
            exceeds ``settings.max_file_size_mb``, 500 when the insert fails.
    """
    supabase = get_supabase()
    # Called for its ownership check only; the row itself is not needed here.
    _get_user_chatbot(chatbot_id, user.id, supabase)

    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    if "." in file.filename:
        ext = "." + file.filename.rsplit(".", 1)[-1].lower()
    else:
        ext = ""
    if ext not in ALLOWED_EXTENSIONS:
        # sorted(): a set has no stable order, so the message would otherwise
        # differ between processes.
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Read file
    file_bytes = await file.read()
    file_size = len(file_bytes)
    max_size = settings.max_file_size_mb * 1024 * 1024
    if file_size > max_size:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max size: {settings.max_file_size_mb}MB"
        )

    # Create document record
    doc_id = str(uuid.uuid4())
    doc_data = {
        "id": doc_id,
        "chatbot_id": chatbot_id,
        "file_name": file.filename,
        "file_type": ext,
        "file_size": file_size,
        "chunk_count": 0,
        "status": "processing",
    }
    result = supabase.table("documents").insert(doc_data).execute()
    if not result.data:
        raise HTTPException(status_code=500, detail="Failed to create document record")

    # Process in background
    background_tasks.add_task(
        _process_document_bg,
        file_bytes=file_bytes,
        file_name=file.filename,
        doc_id=doc_id,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )

    return DocumentResponse(**result.data[0])


async def _process_document_bg(
    file_bytes: bytes,
    file_name: str,
    doc_id: str,
    chatbot_id: str,
    supabase,
):
    """Background task to process and embed a document.

    Chunks the file, embeds the chunks in batches, upserts the vectors into
    the chatbot's collection and marks the document ``completed``. Any failure
    is logged and recorded on the document row as ``failed``.
    """
    try:
        # Re-fetch chatbot to guarantee we use the canonical collection and company_id,
        # not a snapshot that could have been captured before an update.
        chatbot_row = (
            supabase.table("chatbots")
            .select("company_id, qdrant_collection_name")
            .eq("id", chatbot_id)
            .execute()
        )
        if not chatbot_row.data:
            logger.error(f"Chatbot {chatbot_id} not found during document processing")
            _mark_failed(supabase, "documents", doc_id, "Chatbot not found")
            return

        chatbot = chatbot_row.data[0]
        company_id = chatbot.get("company_id", "")
        collection_name = chatbot.get("qdrant_collection_name")
        if not collection_name:
            logger.error(f"No Qdrant collection for chatbot {chatbot_id}")
            _mark_failed(supabase, "documents", doc_id, "Vector store not configured")
            return

        # Ensure collection exists
        if not vector_store.collection_exists(collection_name):
            vector_store.create_collection(collection_name)

        # Process document
        chunks, payloads = process_document(
            file_bytes=file_bytes,
            file_name=file_name,
            document_id=doc_id,
            company_id=company_id,
        )
        if not chunks:
            raise ValueError("No text extracted from document")

        # Generate embeddings in batches
        all_ids = []
        all_vectors = []
        all_payloads = []
        for start in range(0, len(chunks), _EMBED_BATCH_SIZE):
            stop = start + _EMBED_BATCH_SIZE
            vectors = embedding_service.embed_batch(chunks[start:stop])
            all_ids.extend(str(uuid.uuid4()) for _ in vectors)
            all_vectors.extend(vectors)
            all_payloads.extend(payloads[start:stop])

        # Upsert to Qdrant
        vector_store.upsert_vectors(
            collection_name=collection_name,
            vectors=all_vectors,
            payloads=all_payloads,
            ids=all_ids,
        )

        # Update document record
        supabase.table("documents").update({
            "status": "completed",
            "chunk_count": len(chunks),
        }).eq("id", doc_id).execute()

        response_cache.invalidate(collection_name)
        logger.info(f"Document {doc_id} processed: {len(chunks)} chunks → collection='{collection_name}' company='{company_id}'")
    except Exception as e:
        logger.exception(f"Document processing error for {doc_id}: {e}")
        # The status write can fail too (e.g. the database is what broke);
        # log instead of letting a second exception escape the task.
        try:
            _mark_failed(supabase, "documents", doc_id, str(e)[:500])
        except Exception:
            logger.exception(f"Could not mark document {doc_id} as failed")


@router.get("", response_model=List[DocumentResponse])
async def list_documents(chatbot_id: str, user=Depends(get_current_user)):
    """List the chatbot's documents, newest first."""
    supabase = get_supabase()
    _get_user_chatbot(chatbot_id, user.id, supabase)

    result = (
        supabase.table("documents")
        .select("*")
        .eq("chatbot_id", chatbot_id)
        .order("created_at", desc=True)
        .execute()
    )
    return [DocumentResponse(**d) for d in (result.data or [])]


@router.delete("/{document_id}", response_model=SuccessResponse)
async def delete_document(chatbot_id: str, document_id: str, user=Depends(get_current_user)):
    """Delete a document together with its vectors and stored file.

    Vector deletion is best-effort: a failure is logged and the document row
    is still removed.

    Raises:
        HTTPException: 404 when the chatbot or document is not found.
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    doc = (
        supabase.table("documents")
        .select("*")
        .eq("id", document_id)
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    if not doc.data:
        raise HTTPException(status_code=404, detail="Document not found")

    # Remove vectors from Qdrant
    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        try:
            vector_store.delete_by_document_id(collection_name, document_id)
        except Exception as e:
            logger.warning(f"Failed to delete vectors: {e}")

    # Delete file from Supabase Storage
    file_url = doc.data[0].get("file_url")
    if file_url:
        delete_from_storage(supabase, "documents", file_url)

    supabase.table("documents").delete().eq("id", document_id).execute()

    if collection_name:
        response_cache.invalidate(collection_name)

    return SuccessResponse(success=True, message="Document deleted")


@router.post("/{document_id}/retry", response_model=DocumentResponse)
async def retry_document_processing(
    chatbot_id: str,
    document_id: str,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Retry processing a failed document.

    Downloads the original file from storage, resets the row to
    ``processing`` and re-queues ``_process_document_bg``.

    Raises:
        HTTPException: 404 when the chatbot or document is not found, 400 when
            the document is not in ``failed`` state or its file cannot be
            retrieved, 500 when the status reset returns no row.
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    doc = (
        supabase.table("documents")
        .select("*")
        .eq("id", document_id)
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    if not doc.data:
        raise HTTPException(status_code=404, detail="Document not found")

    document = doc.data[0]
    if document.get("status") != "failed":
        raise HTTPException(status_code=400, detail="Only failed documents can be retried")

    file_url = document.get("file_url")
    if not file_url:
        raise HTTPException(
            status_code=400,
            detail="No file URL stored. Please re-upload this document."
        )

    # Download file from storage
    try:
        path = extract_storage_path(file_url, "documents")
        if not path:
            raise HTTPException(status_code=400, detail="Cannot locate file in storage. Please re-upload.")
        file_bytes = supabase.storage.from_("documents").download(path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to download document {document_id} for retry: {e}")
        raise HTTPException(status_code=400, detail="Cannot retrieve file from storage. Please re-upload.")

    # Reset status to processing
    result = supabase.table("documents").update({
        "status": "processing",
        "error_message": None,
        "chunk_count": 0,
    }).eq("id", document_id).execute()
    if not result.data:
        # Without this check the result.data[0] below raises IndexError.
        raise HTTPException(status_code=500, detail="Failed to update document record")

    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        # A previous attempt may have upserted vectors before it failed (the
        # status write comes after the upsert); drop them so the retry does
        # not duplicate chunks. Best-effort, like the URL refresh endpoint.
        try:
            vector_store.delete_by_document_id(collection_name, document_id)
        except Exception as e:
            logger.warning(f"Could not delete old vectors for document {document_id}: {e}")
        # Clear stale cache before re-processing so tests see fresh results
        response_cache.invalidate(collection_name)

    # Re-enqueue background processing
    background_tasks.add_task(
        _process_document_bg,
        file_bytes=file_bytes,
        file_name=document["file_name"],
        doc_id=document_id,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )

    return DocumentResponse(**result.data[0])


# ── URL Sources ───────────────────────────────────────────────────────────────

@url_router.get("", response_model=List[UrlSourceResponse])
async def list_url_sources(chatbot_id: str, user=Depends(get_current_user)):
    """List the chatbot's URL sources, newest first."""
    supabase = get_supabase()
    _get_user_chatbot(chatbot_id, user.id, supabase)

    result = (
        supabase.table("url_sources")
        .select("*")
        .eq("chatbot_id", chatbot_id)
        .order("created_at", desc=True)
        .execute()
    )
    return result.data or []


@url_router.post("", status_code=201)
async def add_url_source(
    chatbot_id: str,
    data: UrlSourceCreate,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Register a URL source and queue it for scraping.

    The number of sources per chatbot is capped by the ``url_sources`` entry
    of the caller's active plan in ``PLAN_LIMITS`` (``free`` when there is no
    active subscription).

    Raises:
        HTTPException: 404 when the chatbot is not owned by the caller, 402
            when the plan allows no URL sources or the limit is reached, 500
            when the insert fails.
    """
    from app.config import PLAN_LIMITS

    supabase = get_supabase()
    # Called for its ownership check only; the row itself is not needed here.
    _get_user_chatbot(chatbot_id, user.id, supabase)

    # Plan check
    sub = (
        supabase.table("subscriptions")
        .select("plan")
        .eq("user_id", user.id)
        .eq("status", "active")
        .execute()
    )
    plan = sub.data[0]["plan"] if sub.data else "free"
    max_sources = PLAN_LIMITS.get(plan, PLAN_LIMITS["free"]).get("url_sources", 0)
    if max_sources == 0:
        raise HTTPException(status_code=402, detail="URL sources require Starter plan or higher")

    # Count existing
    existing = (
        supabase.table("url_sources")
        .select("id", count="exact")
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    if (existing.count or 0) >= max_sources:
        raise HTTPException(status_code=402, detail=f"URL source limit reached ({max_sources}). Upgrade to add more.")

    # NOTE(review): data.url is user-supplied and is fetched server-side by
    # scrape_url; confirm the scraper or UrlSourceCreate rejects internal
    # addresses (SSRF).
    source_id = str(uuid.uuid4())
    source_data = {
        "id": source_id,
        "chatbot_id": chatbot_id,
        "url": data.url,
        "status": "pending",
    }
    result = supabase.table("url_sources").insert(source_data).execute()
    if not result.data:
        raise HTTPException(status_code=500, detail="Failed to create URL source")

    # Process in background
    background_tasks.add_task(
        _process_url_source,
        source_id=source_id,
        url=data.url,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )

    return result.data[0]


@url_router.delete("/{source_id}", response_model=SuccessResponse)
async def delete_url_source(chatbot_id: str, source_id: str, user=Depends(get_current_user)):
    """Delete a URL source and its vectors.

    Vector deletion is best-effort: a failure is logged and the source row is
    still removed.

    Raises:
        HTTPException: 404 when the chatbot or URL source is not found.
    """
    supabase = get_supabase()
    # One lookup serves both the ownership check and the collection name.
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    source = (
        supabase.table("url_sources")
        .select("*")
        .eq("id", source_id)
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    if not source.data:
        raise HTTPException(status_code=404, detail="URL source not found")

    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        try:
            vector_store.delete_by_document_id(collection_name, source_id)
        except Exception as e:
            logger.warning(f"Failed to delete vectors for url source {source_id}: {e}")
        response_cache.invalidate(collection_name)

    supabase.table("url_sources").delete().eq("id", source_id).execute()
    return SuccessResponse(success=True, message="URL source deleted")


@url_router.post("/{source_id}/refresh", response_model=UrlSourceResponse)
async def refresh_url_source(
    chatbot_id: str,
    source_id: str,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Re-scrape a URL source and rebuild its vectors.

    Existing vectors for the source are dropped (best-effort), the row is
    reset to ``pending`` and ``_process_url_source`` is re-queued.

    Raises:
        HTTPException: 404 when the chatbot or URL source is not found.
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    source = (
        supabase.table("url_sources")
        .select("*")
        .eq("id", source_id)
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    if not source.data:
        raise HTTPException(status_code=404, detail="URL source not found")

    src = source.data[0]
    collection_name = chatbot.get("qdrant_collection_name")

    # Drop existing vectors for this source
    if collection_name:
        try:
            vector_store.delete_by_document_id(collection_name, source_id)
        except Exception as e:
            logger.warning(f"Could not delete old vectors for url source {source_id}: {e}")
        response_cache.invalidate(collection_name)

    # Reset to pending and reprocess
    reset_fields = {
        "status": "pending",
        "error_message": None,
        "chunk_count": 0,
    }
    supabase.table("url_sources").update(reset_fields).eq("id", source_id).execute()

    background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot_id, supabase)

    # Echo exactly what was written, so a stale error_message is not returned.
    return UrlSourceResponse(**{**src, **reset_fields})


async def _process_url_source(source_id: str, url: str, chatbot_id: str, supabase):
    """Background task to scrape a URL and add its content to the vector store.

    Marks the source ``processing``, scrapes and chunks the page, embeds the
    chunks in batches, upserts them and marks the source ``completed``. Any
    failure is recorded on the source row as ``failed``.
    """
    from app.services.web_scraper import scrape_url
    from app.services.document_processor import chunk_text

    try:
        # Re-fetch chatbot to guarantee we use the canonical collection and company_id.
        chatbot_row = (
            supabase.table("chatbots")
            .select("company_id, qdrant_collection_name")
            .eq("id", chatbot_id)
            .execute()
        )
        if not chatbot_row.data:
            logger.error(f"Chatbot {chatbot_id} not found during URL source processing")
            _mark_failed(supabase, "url_sources", source_id, "Chatbot not found")
            return
        chatbot = chatbot_row.data[0]

        # Update status to processing
        supabase.table("url_sources").update({"status": "processing"}).eq("id", source_id).execute()

        # Scrape URL
        scraped = await scrape_url(url)
        if "error" in scraped:
            _mark_failed(supabase, "url_sources", source_id, scraped["error"])
            return

        text = scraped["text"]
        title = scraped.get("title", url)

        collection_name = chatbot.get("qdrant_collection_name")
        if not collection_name:
            _mark_failed(supabase, "url_sources", source_id, "No vector store configured")
            return

        # Ensure collection exists
        if not vector_store.collection_exists(collection_name):
            vector_store.create_collection(collection_name)

        # Chunk text
        chunks = chunk_text(text)
        if not chunks:
            _mark_failed(supabase, "url_sources", source_id, "No content extracted")
            return

        # Embed and upsert
        company_id = chatbot.get("company_id", "")
        all_ids = []
        all_vectors = []
        all_payloads = []
        for start in range(0, len(chunks), _EMBED_BATCH_SIZE):
            batch = chunks[start:start + _EMBED_BATCH_SIZE]
            vectors = embedding_service.embed_batch(batch)
            all_ids.extend(str(uuid.uuid4()) for _ in vectors)
            all_vectors.extend(vectors)
            all_payloads.extend(
                {
                    "document_id": source_id,
                    "company_id": company_id,
                    "file_name": f"[URL] {title}",
                    # A web page has no pages; this is the 1-based batch number.
                    "page_number": start // _EMBED_BATCH_SIZE + 1,
                    "chunk_index": start + offset,
                    "text": chunk,
                    "source_url": url,
                }
                for offset, chunk in enumerate(batch)
            )

        vector_store.upsert_vectors(
            collection_name=collection_name,
            vectors=all_vectors,
            payloads=all_payloads,
            ids=all_ids,
        )

        supabase.table("url_sources").update({
            "status": "completed",
            "page_title": title,
            "chunk_count": len(chunks),
        }).eq("id", source_id).execute()

        response_cache.invalidate(collection_name)
        logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url} → collection='{collection_name}' company='{chatbot.get('company_id', '')}'")
    except Exception as e:
        logger.exception(f"URL source processing error {source_id}: {e}")
        # The status write can fail too; log instead of letting a second
        # exception escape the task.
        try:
            _mark_failed(supabase, "url_sources", source_id, str(e)[:500])
        except Exception:
            logger.exception(f"Could not mark URL source {source_id} as failed")


router_url_sources = url_router