Files
contexta_be/app/routers/documents.py
2026-04-26 21:43:19 +00:00

519 lines
19 KiB
Python

from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, BackgroundTasks
from app.models import DocumentResponse, SuccessResponse, UrlSourceCreate, UrlSourceResponse
from app.database import get_supabase
from app.dependencies import get_current_user
from app.services.document_processor import process_document
from app.services.embeddings import embedding_service
from app.services.vector_store import vector_store
from app.services.storage import delete_from_storage, extract_storage_path
from app.services import cache as response_cache
from app.config import settings
from typing import List
import uuid
import logging
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Router for the document endpoints, nested under a chatbot id.
router = APIRouter(prefix="/chatbots/{chatbot_id}/documents", tags=["Documents"])
# Router for the URL-source endpoints, nested under a chatbot id.
url_router = APIRouter(prefix="/chatbots/{chatbot_id}/url-sources", tags=["URL Sources"])
# MIME type -> file extension for the supported document formats.
# NOTE(review): not referenced by the code visible in this module; uploads are
# validated against ALLOWED_EXTENSIONS only. Confirm whether it is used elsewhere.
ALLOWED_TYPES = {
"application/pdf": ".pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"text/csv": ".csv",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"text/plain": ".txt",
"text/markdown": ".md",
}
# Lower-case extensions (with leading dot) that upload_document accepts.
ALLOWED_EXTENSIONS = {".pdf", ".docx", ".csv", ".xlsx", ".txt", ".md"}
def _get_user_chatbot(chatbot_id: str, user_id: str, supabase) -> dict:
    """Return the chatbot row if it belongs to the user's company.

    Looks up the first company whose ``owner_id`` equals ``user_id``, then the
    chatbot with ``chatbot_id`` scoped to that company's id.

    Raises:
        HTTPException: 404 when the user owns no company, or when no chatbot
            with that id exists under the company.
    """
    owned = (
        supabase.table("companies")
        .select("id")
        .eq("owner_id", user_id)
        .execute()
    )
    if not owned.data:
        raise HTTPException(status_code=404, detail="Company not found")

    owner_company_id = owned.data[0]["id"]
    matches = (
        supabase.table("chatbots")
        .select("*")
        .eq("id", chatbot_id)
        .eq("company_id", owner_company_id)
        .execute()
    )
    if matches.data:
        return matches.data[0]
    raise HTTPException(status_code=404, detail="Chatbot not found")
@router.post("", response_model=DocumentResponse, status_code=201)
async def upload_document(
    chatbot_id: str,
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    user=Depends(get_current_user),
):
    """Accept a document upload and queue it for background processing.

    Validates the file name, extension and size, inserts a ``documents`` row
    with status ``"processing"``, and schedules ``_process_document_bg`` with
    the raw bytes.

    Returns:
        DocumentResponse built from the inserted row.

    Raises:
        HTTPException: 404 if the chatbot is not the caller's, 400 for a
            missing file name or unsupported extension, 413 when the file
            exceeds ``settings.max_file_size_mb``, 500 if the insert returns
            no row.
    """
    supabase = get_supabase()
    # Called for its ownership check only; it raises 404 for foreign chatbots.
    _get_user_chatbot(chatbot_id, user.id, supabase)

    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")
    # Extension is the text after the last dot, lower-cased; "" when there is no dot.
    ext = "." + file.filename.rsplit(".", 1)[-1].lower() if "." in file.filename else ""
    if ext not in ALLOWED_EXTENSIONS:
        # sorted(): ALLOWED_EXTENSIONS is a set, so its iteration order is not
        # stable and the message would otherwise vary between processes.
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Read file (the size limit is enforced on the fully-read payload)
    file_bytes = await file.read()
    file_size = len(file_bytes)
    max_size = settings.max_file_size_mb * 1024 * 1024
    if file_size > max_size:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max size: {settings.max_file_size_mb}MB"
        )

    # Create document record
    # NOTE(review): no file_url is written here, while the retry endpoint
    # requires one; unless it is set elsewhere, retries of documents created
    # by this handler will be rejected. Confirm the intended storage flow.
    doc_id = str(uuid.uuid4())
    doc_data = {
        "id": doc_id,
        "chatbot_id": chatbot_id,
        "file_name": file.filename,
        "file_type": ext,
        "file_size": file_size,
        "chunk_count": 0,
        "status": "processing",
    }
    result = supabase.table("documents").insert(doc_data).execute()
    if not result.data:
        raise HTTPException(status_code=500, detail="Failed to create document record")

    # Process in background
    background_tasks.add_task(
        _process_document_bg,
        file_bytes=file_bytes,
        file_name=file.filename,
        doc_id=doc_id,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )
    return DocumentResponse(**result.data[0])
async def _process_document_bg(
    file_bytes: bytes,
    file_name: str,
    doc_id: str,
    chatbot_id: str,
    supabase,
):
    """Background task to process and embed a document.

    Re-reads the chatbot row, chunks the file with ``process_document``,
    embeds the chunks in batches, upserts them into the chatbot's collection
    and marks the ``documents`` row ``"completed"``. Any failure marks the row
    ``"failed"`` with a (truncated) error message; nothing is raised to the
    caller.
    """

    def _mark_failed(reason: str) -> None:
        # Single place that records a failure on the document row.
        supabase.table("documents").update({
            "status": "failed",
            "error_message": reason,
        }).eq("id", doc_id).execute()

    try:
        # Re-fetch chatbot to guarantee we use the canonical collection and company_id,
        # not a snapshot that could have been captured before an update.
        chatbot_row = supabase.table("chatbots").select("company_id, qdrant_collection_name").eq("id", chatbot_id).execute()
        if not chatbot_row.data:
            logger.error(f"Chatbot {chatbot_id} not found during document processing")
            _mark_failed("Chatbot not found")
            return
        chatbot = chatbot_row.data[0]
        company_id = chatbot.get("company_id", "")
        collection_name = chatbot.get("qdrant_collection_name")
        if not collection_name:
            logger.error(f"No Qdrant collection for chatbot {chatbot_id}")
            _mark_failed("Vector store not configured")
            return

        # Ensure collection exists
        if not vector_store.collection_exists(collection_name):
            vector_store.create_collection(collection_name)

        # Process document
        chunks, payloads = process_document(
            file_bytes=file_bytes,
            file_name=file_name,
            document_id=doc_id,
            company_id=company_id,
        )
        if not chunks:
            raise ValueError("No text extracted from document")

        # Generate embeddings in batches
        batch_size = 50
        all_ids = []
        all_vectors = []
        all_payloads = []
        for start in range(0, len(chunks), batch_size):
            batch_chunks = chunks[start:start + batch_size]
            batch_payloads = payloads[start:start + batch_size]
            vectors = embedding_service.embed_batch(batch_chunks)
            # One fresh point id per returned vector.
            all_ids.extend(str(uuid.uuid4()) for _ in vectors)
            all_vectors.extend(vectors)
            all_payloads.extend(batch_payloads)

        # Upsert to Qdrant
        vector_store.upsert_vectors(
            collection_name=collection_name,
            vectors=all_vectors,
            payloads=all_payloads,
            ids=all_ids,
        )

        # Update document record
        supabase.table("documents").update({
            "status": "completed",
            "chunk_count": len(chunks),
        }).eq("id", doc_id).execute()
        response_cache.invalidate(collection_name)
        logger.info(f"Document {doc_id} processed: {len(chunks)} chunks → collection='{collection_name}' company='{company_id}'")
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Document processing error for {doc_id}: {e}")
        try:
            _mark_failed(str(e)[:500])
        except Exception:
            # The status write can fail too (e.g. the database is the thing
            # that is down); log it instead of raising from the handler.
            logger.exception(f"Could not mark document {doc_id} as failed")
@router.get("", response_model=List[DocumentResponse])
async def list_documents(chatbot_id: str, user=Depends(get_current_user)):
    """List the chatbot's documents ordered by ``created_at``, newest first.

    Raises whatever ``_get_user_chatbot`` raises when the chatbot does not
    belong to the caller.
    """
    client = get_supabase()
    # Ownership check only; the returned row is not needed here.
    _get_user_chatbot(chatbot_id, user.id, client)

    query = client.table("documents").select("*").eq("chatbot_id", chatbot_id)
    rows = query.order("created_at", desc=True).execute().data
    if not rows:
        return []
    return [DocumentResponse(**row) for row in rows]
@router.delete("/{document_id}", response_model=SuccessResponse)
async def delete_document(chatbot_id: str, document_id: str, user=Depends(get_current_user)):
    """Delete a document together with its vectors and stored file.

    Vector deletion is best-effort (failures are logged as warnings). The
    stored file is removed only when the row carries a ``file_url``. The
    response cache for the collection is invalidated after the row is deleted.

    Raises:
        HTTPException: 404 when the document does not exist under this
            chatbot (or the chatbot is not the caller's).
    """
    client = get_supabase()
    bot = _get_user_chatbot(chatbot_id, user.id, client)

    found = (
        client.table("documents")
        .select("*")
        .eq("id", document_id)
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    if not found.data:
        raise HTTPException(status_code=404, detail="Document not found")
    record = found.data[0]

    # Remove vectors from Qdrant
    target_collection = bot.get("qdrant_collection_name")
    if target_collection:
        try:
            vector_store.delete_by_document_id(target_collection, document_id)
        except Exception as e:
            logger.warning(f"Failed to delete vectors: {e}")

    # Delete file from Supabase Storage
    stored_url = record.get("file_url")
    if stored_url:
        delete_from_storage(client, "documents", stored_url)

    client.table("documents").delete().eq("id", document_id).execute()

    if target_collection:
        response_cache.invalidate(target_collection)
    return SuccessResponse(success=True, message="Document deleted")
@router.post("/{document_id}/retry", response_model=DocumentResponse)
async def retry_document_processing(
    chatbot_id: str,
    document_id: str,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Retry processing a failed document.

    Downloads the stored file, clears any vectors left by the failed run,
    resets the row to ``"processing"`` and re-queues ``_process_document_bg``.

    Returns:
        DocumentResponse built from the updated row.

    Raises:
        HTTPException: 404 when the document is not found under this chatbot;
            400 when it is not in ``"failed"`` state, has no ``file_url``, or
            the file cannot be located/downloaded; 500 when the status reset
            returns no row.
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
    doc = supabase.table("documents").select("*").eq("id", document_id).eq("chatbot_id", chatbot_id).execute()
    if not doc.data:
        raise HTTPException(status_code=404, detail="Document not found")
    document = doc.data[0]
    if document.get("status") != "failed":
        raise HTTPException(status_code=400, detail="Only failed documents can be retried")
    file_url = document.get("file_url")
    if not file_url:
        raise HTTPException(
            status_code=400,
            detail="No file URL stored. Please re-upload this document."
        )

    # Download file from storage
    try:
        path = extract_storage_path(file_url, "documents")
        if not path:
            raise HTTPException(status_code=400, detail="Cannot locate file in storage. Please re-upload.")
        file_bytes = supabase.storage.from_("documents").download(path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to download document {document_id} for retry: {e}")
        raise HTTPException(status_code=400, detail="Cannot retrieve file from storage. Please re-upload.")

    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        # A failed run may already have written vectors (e.g. the upsert
        # succeeded but the status update raised), and every run generates new
        # point ids, so re-processing without this would duplicate chunks.
        # Best-effort, mirroring refresh_url_source.
        try:
            vector_store.delete_by_document_id(collection_name, document_id)
        except Exception as e:
            logger.warning(f"Could not delete old vectors for document {document_id}: {e}")
        # Clear stale cache before re-processing so tests see fresh results
        response_cache.invalidate(collection_name)

    # Reset status to processing
    result = supabase.table("documents").update({
        "status": "processing",
        "error_message": None,
        "chunk_count": 0,
    }).eq("id", document_id).execute()
    if not result.data:
        # Without a returned row there is nothing to build the response from;
        # fail explicitly instead of raising IndexError below.
        raise HTTPException(status_code=500, detail="Failed to update document record")

    # Re-enqueue background processing
    background_tasks.add_task(
        _process_document_bg,
        file_bytes=file_bytes,
        file_name=document["file_name"],
        doc_id=document_id,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )
    return DocumentResponse(**result.data[0])
# ── URL Sources ───────────────────────────────────────────────────────────────
@url_router.get("", response_model=List[UrlSourceResponse])
async def list_url_sources(chatbot_id: str, user=Depends(get_current_user)):
    """List the chatbot's URL sources ordered by ``created_at``, newest first.

    Returns the raw rows (or an empty list); raises whatever
    ``_get_user_chatbot`` raises when the chatbot is not the caller's.
    """
    client = get_supabase()
    # Ownership check only; the returned row is not needed here.
    _get_user_chatbot(chatbot_id, user.id, client)

    query = client.table("url_sources").select("*").eq("chatbot_id", chatbot_id)
    rows = query.order("created_at", desc=True).execute().data
    return rows if rows else []
@url_router.post("", status_code=201)
async def add_url_source(
    chatbot_id: str,
    data: UrlSourceCreate,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Register a URL source for the chatbot and queue it for scraping.

    The caller's active subscription plan (``"free"`` when none is active)
    determines how many URL sources the chatbot may have.

    Returns:
        The inserted ``url_sources`` row.

    Raises:
        HTTPException: 402 when the plan allows no URL sources or the limit is
            reached; 500 when the insert returns no row; 404 from the
            ownership check.
    """
    from app.config import PLAN_LIMITS

    supabase = get_supabase()
    # Ownership check only; the chatbot row itself is not used below.
    _get_user_chatbot(chatbot_id, user.id, supabase)

    # Plan check
    active_sub = (
        supabase.table("subscriptions")
        .select("plan")
        .eq("user_id", user.id)
        .eq("status", "active")
        .execute()
    )
    if active_sub.data:
        plan_name = active_sub.data[0]["plan"]
    else:
        plan_name = "free"
    # Unknown plan names fall back to the free plan's limits.
    plan_limits = PLAN_LIMITS.get(plan_name, PLAN_LIMITS["free"])
    max_sources = plan_limits.get("url_sources", 0)
    if max_sources == 0:
        raise HTTPException(status_code=402, detail="URL sources require Starter plan or higher")

    # Count existing
    current = (
        supabase.table("url_sources")
        .select("id", count="exact")
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    used = current.count or 0
    if used >= max_sources:
        raise HTTPException(status_code=402, detail=f"URL source limit reached ({max_sources}). Upgrade to add more.")

    new_id = str(uuid.uuid4())
    inserted = supabase.table("url_sources").insert({
        "id": new_id,
        "chatbot_id": chatbot_id,
        "url": data.url,
        "status": "pending",
    }).execute()
    if not inserted.data:
        raise HTTPException(status_code=500, detail="Failed to create URL source")

    # Process in background
    background_tasks.add_task(
        _process_url_source,
        source_id=new_id,
        url=data.url,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )
    return inserted.data[0]
@url_router.delete("/{source_id}", response_model=SuccessResponse)
async def delete_url_source(chatbot_id: str, source_id: str, user=Depends(get_current_user)):
    """Delete a URL source and the vectors stored for it.

    Vector deletion is best-effort: a failure is logged and the row is still
    deleted. The collection's response cache is invalidated either way.

    Raises:
        HTTPException: 404 when the source does not exist under this chatbot
            (or the chatbot is not the caller's).
    """
    supabase = get_supabase()
    # One lookup serves both the ownership check and the collection name.
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
    source = supabase.table("url_sources").select("*").eq("id", source_id).eq("chatbot_id", chatbot_id).execute()
    if not source.data:
        raise HTTPException(status_code=404, detail="URL source not found")

    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        try:
            vector_store.delete_by_document_id(collection_name, source_id)
        except Exception as e:
            # Still best-effort, but leave a trace so orphaned vectors can be found.
            logger.warning(f"Failed to delete vectors for url source {source_id}: {e}")
        response_cache.invalidate(collection_name)

    supabase.table("url_sources").delete().eq("id", source_id).execute()
    return SuccessResponse(success=True, message="URL source deleted")
@url_router.post("/{source_id}/refresh", response_model=UrlSourceResponse)
async def refresh_url_source(
    chatbot_id: str,
    source_id: str,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Re-scrape a URL source and rebuild its vectors.

    Drops the source's existing vectors (best-effort), resets the row to
    ``"pending"`` and queues ``_process_url_source``.

    Returns:
        UrlSourceResponse for the reset row.

    Raises:
        HTTPException: 404 when the source does not exist under this chatbot
            (or the chatbot is not the caller's).
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)
    source = supabase.table("url_sources").select("*").eq("id", source_id).eq("chatbot_id", chatbot_id).execute()
    if not source.data:
        raise HTTPException(status_code=404, detail="URL source not found")
    src = source.data[0]
    collection_name = chatbot.get("qdrant_collection_name")

    # Drop existing vectors for this source
    if collection_name:
        try:
            vector_store.delete_by_document_id(collection_name, source_id)
        except Exception as e:
            logger.warning(f"Could not delete old vectors for url source {source_id}: {e}")
        response_cache.invalidate(collection_name)

    # Reset to pending and reprocess.
    # The update is executed straight after .eq(), like every other update in
    # this module; the previous extra .returning("representation") call was
    # the only such use. NOTE(review): it does not appear to be a method of the
    # filter builder, and the row is returned by default — confirm against the
    # installed client version.
    updated = supabase.table("url_sources").update({
        "status": "pending",
        "error_message": None,
        "chunk_count": 0,
    }).eq("id", source_id).execute()

    background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot_id, supabase)

    if updated.data:
        # Prefer the row as stored after the reset.
        return UrlSourceResponse(**updated.data[0])
    # Fallback: overlay the reset fields on the pre-update row, including
    # error_message so a stale error is not echoed back.
    return UrlSourceResponse(**{**src, "status": "pending", "error_message": None, "chunk_count": 0})
async def _process_url_source(source_id: str, url: str, chatbot_id: str, supabase):
    """Background task to scrape a URL and add its content to the vector store.

    Re-reads the chatbot row, marks the source ``"processing"``, scrapes the
    URL, chunks and embeds the text in batches, upserts the vectors and marks
    the source ``"completed"``. Any failure marks the row ``"failed"`` with an
    error message; nothing is raised to the caller.
    """
    # Imported lazily, as in the original; embedding_service and vector_store
    # are already imported at module level and are used from there.
    from app.services.web_scraper import scrape_url
    from app.services.document_processor import chunk_text

    def _mark_failed(reason: str) -> None:
        # Single place that records a failure on the url_sources row.
        supabase.table("url_sources").update({
            "status": "failed",
            "error_message": reason,
        }).eq("id", source_id).execute()

    try:
        # Re-fetch chatbot to guarantee we use the canonical collection and company_id.
        chatbot_row = supabase.table("chatbots").select("company_id, qdrant_collection_name").eq("id", chatbot_id).execute()
        if not chatbot_row.data:
            logger.error(f"Chatbot {chatbot_id} not found during URL source processing")
            _mark_failed("Chatbot not found")
            return
        chatbot = chatbot_row.data[0]
        company_id = chatbot.get("company_id", "")

        # Update status to processing
        supabase.table("url_sources").update({"status": "processing"}).eq("id", source_id).execute()

        # Scrape URL
        scraped = await scrape_url(url)
        if "error" in scraped:
            _mark_failed(scraped["error"])
            return
        text = scraped["text"]
        title = scraped.get("title", url)

        collection_name = chatbot.get("qdrant_collection_name")
        if not collection_name:
            _mark_failed("No vector store configured")
            return

        # Ensure collection exists
        if not vector_store.collection_exists(collection_name):
            vector_store.create_collection(collection_name)

        # Chunk text
        chunks = chunk_text(text)
        if not chunks:
            _mark_failed("No content extracted")
            return

        # Embed and upsert
        all_ids = []
        all_vectors = []
        all_payloads = []
        batch_size = 50
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            vectors = embedding_service.embed_batch(batch)
            # One fresh point id per returned vector.
            all_ids.extend(str(uuid.uuid4()) for _ in vectors)
            all_vectors.extend(vectors)
            all_payloads.extend(
                {
                    "document_id": source_id,
                    "company_id": company_id,
                    "file_name": f"[URL] {title}",
                    # "page_number" is the 1-based batch index, not a real page.
                    "page_number": start // batch_size + 1,
                    "chunk_index": start + offset,
                    "text": chunk,
                    "source_url": url,
                }
                for offset, chunk in enumerate(batch)
            )

        vector_store.upsert_vectors(
            collection_name=collection_name,
            vectors=all_vectors,
            payloads=all_payloads,
            ids=all_ids,
        )
        supabase.table("url_sources").update({
            "status": "completed",
            "page_title": title,
            "chunk_count": len(chunks),
        }).eq("id", source_id).execute()
        response_cache.invalidate(collection_name)
        logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url} → collection='{collection_name}' company='{company_id}'")
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"URL source processing error {source_id}: {e}")
        try:
            _mark_failed(str(e)[:500])
        except Exception:
            # The status write can fail too; log it instead of raising from
            # inside the handler.
            logger.exception(f"Could not mark url source {source_id} as failed")
# Second module-level name bound to the same URL-sources router object.
# NOTE(review): presumably kept for importers that expect `router_url_sources` — confirm.
router_url_sources = url_router