Initial commit

2026-06-12 23:23:21 +00:00 · 2026-02-22 21:59:37 +00:00
commit 5bd496d355
27 changed files with 4172 additions and 0 deletions
--- a/app/routers/documents.py
+++ b/app/routers/documents.py
@@ -0,0 +1,208 @@
+from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, BackgroundTasks
+from app.models import DocumentResponse, SuccessResponse
+from app.database import get_supabase
+from app.dependencies import get_current_user
+from app.services.document_processor import process_document
+from app.services.embeddings import embedding_service
+from app.services.vector_store import vector_store
+from app.config import settings
+from typing import List
+import uuid
+import logging
+
+logger = logging.getLogger(__name__)
+router = APIRouter(prefix="/chatbots/{chatbot_id}/documents", tags=["Documents"])
+
+ALLOWED_TYPES = {
+    "application/pdf": ".pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+    "text/csv": ".csv",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+    "text/plain": ".txt",
+    "text/markdown": ".md",
+}
+
+ALLOWED_EXTENSIONS = {".pdf", ".docx", ".csv", ".xlsx", ".txt", ".md"}
+
+
+def _get_user_chatbot(chatbot_id: str, user_id: str, supabase) -> dict:
+    company = supabase.table("companies").select("id").eq("owner_id", user_id).execute()
+    if not company.data:
+        raise HTTPException(status_code=404, detail="Company not found")
+    company_id = company.data[0]["id"]
+
+    chatbot = supabase.table("chatbots").select("*").eq("id", chatbot_id).eq("company_id", company_id).execute()
+    if not chatbot.data:
+        raise HTTPException(status_code=404, detail="Chatbot not found")
+    return chatbot.data[0]
+
+
+@router.post("", response_model=DocumentResponse, status_code=201)
+async def upload_document(
+    chatbot_id: str,
+    background_tasks: BackgroundTasks,
+    file: UploadFile = File(...),
+    user=Depends(get_current_user),
+):
+    supabase = get_supabase()
+    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
+
+    # Validate file
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No filename provided")
+
+    ext = "." + file.filename.rsplit(".", 1)[-1].lower() if "." in file.filename else ""
+    if ext not in ALLOWED_EXTENSIONS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"File type not supported. Allowed: {', '.join(ALLOWED_EXTENSIONS)}"
+        )
+
+    # Read file
+    file_bytes = await file.read()
+    file_size = len(file_bytes)
+    max_size = settings.max_file_size_mb * 1024 * 1024
+
+    if file_size > max_size:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File too large. Max size: {settings.max_file_size_mb}MB"
+        )
+
+    # Create document record
+    doc_id = str(uuid.uuid4())
+    doc_data = {
+        "id": doc_id,
+        "chatbot_id": chatbot_id,
+        "file_name": file.filename,
+        "file_type": ext,
+        "file_size": file_size,
+        "chunk_count": 0,
+        "status": "processing",
+    }
+
+    result = supabase.table("documents").insert(doc_data).execute()
+    if not result.data:
+        raise HTTPException(status_code=500, detail="Failed to create document record")
+
+    # Process in background
+    background_tasks.add_task(
+        _process_document_bg,
+        file_bytes=file_bytes,
+        file_name=file.filename,
+        doc_id=doc_id,
+        chatbot=chatbot,
+        supabase=supabase,
+    )
+
+    return DocumentResponse(**result.data[0])
+
+
+async def _process_document_bg(
+    file_bytes: bytes,
+    file_name: str,
+    doc_id: str,
+    chatbot: dict,
+    supabase,
+):
+    """Background task to process and embed a document"""
+    try:
+        company_id = chatbot.get("company_id", "")
+        collection_name = chatbot.get("qdrant_collection_name")
+
+        if not collection_name:
+            logger.error(f"No Qdrant collection for chatbot {chatbot['id']}")
+            supabase.table("documents").update({
+                "status": "failed",
+                "error_message": "Vector store not configured"
+            }).eq("id", doc_id).execute()
+            return
+
+        # Ensure collection exists
+        if not vector_store.collection_exists(collection_name):
+            vector_store.create_collection(collection_name)
+
+        # Process document
+        chunks, payloads = process_document(
+            file_bytes=file_bytes,
+            file_name=file_name,
+            document_id=doc_id,
+            company_id=company_id,
+        )
+
+        if not chunks:
+            raise ValueError("No text extracted from document")
+
+        # Generate embeddings in batches
+        batch_size = 50
+        all_ids = []
+        all_vectors = []
+        all_payloads = []
+
+        for i in range(0, len(chunks), batch_size):
+            batch_chunks = chunks[i:i + batch_size]
+            batch_payloads = payloads[i:i + batch_size]
+
+            vectors = embedding_service.embed_batch(batch_chunks)
+            ids = [str(uuid.uuid4()) for _ in vectors]
+
+            all_ids.extend(ids)
+            all_vectors.extend(vectors)
+            all_payloads.extend(batch_payloads)
+
+        # Upsert to Qdrant
+        vector_store.upsert_vectors(
+            collection_name=collection_name,
+            vectors=all_vectors,
+            payloads=all_payloads,
+            ids=all_ids,
+        )
+
+        # Update document record
+        supabase.table("documents").update({
+            "status": "completed",
+            "chunk_count": len(chunks),
+        }).eq("id", doc_id).execute()
+
+        logger.info(f"Document {doc_id} processed: {len(chunks)} chunks")
+
+    except Exception as e:
+        logger.error(f"Document processing error for {doc_id}: {e}")
+        supabase.table("documents").update({
+            "status": "failed",
+            "error_message": str(e)[:500],
+        }).eq("id", doc_id).execute()
+
+
+@router.get("", response_model=List[DocumentResponse])
+async def list_documents(chatbot_id: str, user=Depends(get_current_user)):
+    supabase = get_supabase()
+    _get_user_chatbot(chatbot_id, user.id, supabase)
+
+    result = supabase.table("documents").select("*") \
+        .eq("chatbot_id", chatbot_id) \
+        .order("created_at", desc=True) \
+        .execute()
+
+    return [DocumentResponse(**d) for d in (result.data or [])]
+
+
+@router.delete("/{document_id}", response_model=SuccessResponse)
+async def delete_document(chatbot_id: str, document_id: str, user=Depends(get_current_user)):
+    supabase = get_supabase()
+    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
+
+    doc = supabase.table("documents").select("*").eq("id", document_id).eq("chatbot_id", chatbot_id).execute()
+    if not doc.data:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    # Remove vectors from Qdrant
+    collection_name = chatbot.get("qdrant_collection_name")
+    if collection_name:
+        try:
+            vector_store.delete_by_document_id(collection_name, document_id)
+        except Exception as e:
+            logger.warning(f"Failed to delete vectors: {e}")
+
+    supabase.table("documents").delete().eq("id", document_id).execute()
+    return SuccessResponse(success=True, message="Document deleted")