# contexta_be/app/routers/documents.py

from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, BackgroundTasks
from app.models import DocumentResponse, SuccessResponse, UrlSourceCreate, UrlSourceResponse
from app.database import get_supabase
from app.dependencies import get_current_user
from app.services.document_processor import process_document
from app.services.embeddings import embedding_service
from app.services.vector_store import vector_store
from app.services.storage import delete_from_storage, extract_storage_path
from app.services import cache as response_cache
from app.config import settings
from typing import List
import uuid
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Document endpoints, mounted under /chatbots/{chatbot_id}/documents.
router = APIRouter(prefix="/chatbots/{chatbot_id}/documents", tags=["Documents"])
# URL-source endpoints, mounted under /chatbots/{chatbot_id}/url-sources.
url_router = APIRouter(prefix="/chatbots/{chatbot_id}/url-sources", tags=["URL Sources"])

# MIME type -> file extension for the supported upload formats.
# NOTE(review): nothing in the visible part of this file reads ALLOWED_TYPES;
# upload validation is done on the file extension only (see ALLOWED_EXTENSIONS).
ALLOWED_TYPES = {
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "text/csv": ".csv",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "text/plain": ".txt",
    "text/markdown": ".md",
}

# Lower-cased extensions (with leading dot) accepted by the upload endpoint.
ALLOWED_EXTENSIONS = {".pdf", ".docx", ".csv", ".xlsx", ".txt", ".md"}


def _get_user_chatbot(chatbot_id: str, user_id: str, supabase) -> dict:
    """Return the chatbot row ``chatbot_id`` if it belongs to the user's company.

    The first company whose ``owner_id`` equals ``user_id`` is looked up, then
    the chatbot is fetched filtered by both its id and that company's id.

    Raises:
        HTTPException: 404 "Company not found" when the user owns no company,
            404 "Chatbot not found" when no chatbot matches id + company.
    """
    owned_companies = (
        supabase.table("companies")
        .select("id")
        .eq("owner_id", user_id)
        .execute()
    )
    if not owned_companies.data:
        raise HTTPException(status_code=404, detail="Company not found")

    owner_company_id = owned_companies.data[0]["id"]
    matches = (
        supabase.table("chatbots")
        .select("*")
        .eq("id", chatbot_id)
        .eq("company_id", owner_company_id)
        .execute()
    )
    if matches.data:
        return matches.data[0]
    raise HTTPException(status_code=404, detail="Chatbot not found")


@router.post("", response_model=DocumentResponse, status_code=201)
async def upload_document(
    chatbot_id: str,
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    user=Depends(get_current_user),
):
    """Accept a document upload, record it, and queue it for embedding.

    Steps: verify the caller owns the chatbot, validate the filename
    extension against ``ALLOWED_EXTENSIONS``, enforce the configured size
    limit, insert a ``documents`` row with status ``"processing"``, and
    schedule ``_process_document_bg`` as a background task.

    Returns:
        The inserted document row as a ``DocumentResponse`` (HTTP 201).

    Raises:
        HTTPException: 404 if the chatbot is not owned by the caller,
            400 for a missing filename, unsupported extension or empty file,
            413 if the file exceeds ``settings.max_file_size_mb``,
            500 if the insert returns no row.
    """
    supabase = get_supabase()
    # Called only for its ownership check (raises 404); the row is not needed.
    _get_user_chatbot(chatbot_id, user.id, supabase)

    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    _, dot, suffix = file.filename.rpartition(".")
    ext = f".{suffix.lower()}" if dot else ""
    if ext not in ALLOWED_EXTENSIONS:
        # sorted() keeps the message stable; set iteration order is not.
        raise HTTPException(
            status_code=400,
            detail=f"File type not supported. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Read at most one byte past the limit so an oversized upload is rejected
    # without first buffering the entire body in memory.
    max_size = int(settings.max_file_size_mb * 1024 * 1024)
    file_bytes = await file.read(max_size + 1)
    file_size = len(file_bytes)

    if file_size > max_size:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max size: {settings.max_file_size_mb}MB"
        )
    if file_size == 0:
        # Nothing to extract; fail fast instead of creating a record that the
        # background task can only mark as failed.
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Create document record
    # NOTE(review): the bytes are not uploaded to storage here and no
    # "file_url" is written, while retry_document_processing requires one.
    # Confirm file_url is populated elsewhere, otherwise retries cannot work.
    doc_id = str(uuid.uuid4())
    doc_data = {
        "id": doc_id,
        "chatbot_id": chatbot_id,
        "file_name": file.filename,
        "file_type": ext,
        "file_size": file_size,
        "chunk_count": 0,
        "status": "processing",
    }

    result = supabase.table("documents").insert(doc_data).execute()
    if not result.data:
        raise HTTPException(status_code=500, detail="Failed to create document record")

    # Process in background (runs after the response has been sent)
    background_tasks.add_task(
        _process_document_bg,
        file_bytes=file_bytes,
        file_name=file.filename,
        doc_id=doc_id,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )

    return DocumentResponse(**result.data[0])


def _mark_document_failed(supabase, doc_id: str, message: str) -> None:
    """Best-effort: set a document row to ``failed`` with a truncated message."""
    try:
        supabase.table("documents").update({
            "status": "failed",
            "error_message": message[:500],
        }).eq("id", doc_id).execute()
    except Exception:
        # Never let a failed status write escape the background task.
        logger.exception(f"Could not mark document {doc_id} as failed")


async def _process_document_bg(
    file_bytes: bytes,
    file_name: str,
    doc_id: str,
    chatbot_id: str,
    supabase,
):
    """Background task to process and embed a document.

    Re-reads the chatbot row, makes sure its Qdrant collection exists,
    extracts chunks via ``process_document``, embeds them in batches of 50,
    upserts the vectors, and finally marks the document ``completed`` with
    its chunk count. Any failure is logged and recorded on the document row
    as ``failed``; nothing is raised to the caller.
    """
    try:
        # Re-fetch chatbot to guarantee we use the canonical collection and company_id,
        # not a snapshot that could have been captured before an update.
        chatbot_row = supabase.table("chatbots").select("company_id, qdrant_collection_name").eq("id", chatbot_id).execute()
        if not chatbot_row.data:
            logger.error(f"Chatbot {chatbot_id} not found during document processing")
            _mark_document_failed(supabase, doc_id, "Chatbot not found")
            return

        chatbot = chatbot_row.data[0]
        company_id = chatbot.get("company_id", "")
        collection_name = chatbot.get("qdrant_collection_name")

        if not collection_name:
            logger.error(f"No Qdrant collection for chatbot {chatbot_id}")
            _mark_document_failed(supabase, doc_id, "Vector store not configured")
            return

        # Ensure collection exists
        if not vector_store.collection_exists(collection_name):
            vector_store.create_collection(collection_name)

        # Process document
        chunks, payloads = process_document(
            file_bytes=file_bytes,
            file_name=file_name,
            document_id=doc_id,
            company_id=company_id,
        )

        if not chunks:
            raise ValueError("No text extracted from document")
        if len(payloads) != len(chunks):
            # Vectors and payloads are paired by position; refuse to store
            # misaligned data.
            raise ValueError("Chunk and payload counts do not match")

        # Generate embeddings in batches
        batch_size = 50
        all_vectors = []
        for start in range(0, len(chunks), batch_size):
            all_vectors.extend(embedding_service.embed_batch(chunks[start:start + batch_size]))

        if len(all_vectors) != len(chunks):
            raise ValueError("Embedding count does not match chunk count")

        all_ids = [str(uuid.uuid4()) for _ in all_vectors]

        # Upsert to Qdrant
        vector_store.upsert_vectors(
            collection_name=collection_name,
            vectors=all_vectors,
            payloads=list(payloads),
            ids=all_ids,
        )

        # Update document record
        supabase.table("documents").update({
            "status": "completed",
            "chunk_count": len(chunks),
        }).eq("id", doc_id).execute()

        response_cache.invalidate(collection_name)
        logger.info(f"Document {doc_id} processed: {len(chunks)} chunks → collection='{collection_name}' company='{company_id}'")

    except Exception as e:
        # logger.exception keeps the traceback; the status write is guarded
        # inside the helper so this handler cannot raise again.
        logger.exception(f"Document processing error for {doc_id}: {e}")
        _mark_document_failed(supabase, doc_id, str(e))


@router.get("", response_model=List[DocumentResponse])
async def list_documents(chatbot_id: str, user=Depends(get_current_user)):
    """List a chatbot's documents, newest first.

    Raises 404 (via ``_get_user_chatbot``) when the caller does not own the
    chatbot.
    """
    supabase = get_supabase()
    _get_user_chatbot(chatbot_id, user.id, supabase)

    listing = (
        supabase.table("documents")
        .select("*")
        .eq("chatbot_id", chatbot_id)
        .order("created_at", desc=True)
        .execute()
    )
    rows = listing.data
    if not rows:
        return []
    return [DocumentResponse(**row) for row in rows]


@router.delete("/{document_id}", response_model=SuccessResponse)
async def delete_document(chatbot_id: str, document_id: str, user=Depends(get_current_user)):
    """Delete a document, its vectors, and its stored file.

    Vector removal is best-effort (a failure is logged as a warning and the
    deletion continues). The response cache for the chatbot's collection is
    invalidated after the row is removed.

    Raises:
        HTTPException: 404 if the chatbot is not owned by the caller or the
            document does not exist under this chatbot.
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    lookup = (
        supabase.table("documents")
        .select("*")
        .eq("id", document_id)
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    if not lookup.data:
        raise HTTPException(status_code=404, detail="Document not found")
    record = lookup.data[0]

    # Remove vectors from Qdrant
    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        try:
            vector_store.delete_by_document_id(collection_name, document_id)
        except Exception as e:
            logger.warning(f"Failed to delete vectors: {e}")

    # Delete file from Supabase Storage
    stored_url = record.get("file_url")
    if stored_url:
        delete_from_storage(supabase, "documents", stored_url)

    supabase.table("documents").delete().eq("id", document_id).execute()

    if collection_name:
        response_cache.invalidate(collection_name)
    return SuccessResponse(success=True, message="Document deleted")


@router.post("/{document_id}/retry", response_model=DocumentResponse)
async def retry_document_processing(
    chatbot_id: str,
    document_id: str,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Retry processing a failed document.

    Downloads the original file from the ``documents`` storage bucket, resets
    the row to ``processing``, drops any vectors left over from the earlier
    attempt, and re-queues ``_process_document_bg``.

    Returns:
        The updated document row as a ``DocumentResponse``.

    Raises:
        HTTPException: 404 if the chatbot or document is not found,
            400 if the document is not in ``failed`` state, has no stored
            ``file_url``, or the file cannot be retrieved,
            500 if the status reset returns no row.
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    doc = supabase.table("documents").select("*").eq("id", document_id).eq("chatbot_id", chatbot_id).execute()
    if not doc.data:
        raise HTTPException(status_code=404, detail="Document not found")

    document = doc.data[0]
    if document.get("status") != "failed":
        raise HTTPException(status_code=400, detail="Only failed documents can be retried")

    file_url = document.get("file_url")
    if not file_url:
        raise HTTPException(
            status_code=400,
            detail="No file URL stored. Please re-upload this document."
        )

    # Download file from storage (before touching any state, so a download
    # failure leaves the document exactly as it was)
    try:
        path = extract_storage_path(file_url, "documents")
        if not path:
            raise HTTPException(status_code=400, detail="Cannot locate file in storage. Please re-upload.")
        file_bytes = supabase.storage.from_("documents").download(path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to download document {document_id} for retry: {e}")
        raise HTTPException(status_code=400, detail="Cannot retrieve file from storage. Please re-upload.")

    # Reset status to processing
    result = supabase.table("documents").update({
        "status": "processing",
        "error_message": None,
        "chunk_count": 0,
    }).eq("id", document_id).execute()
    if not result.data:
        # Guard the result.data[0] access below.
        raise HTTPException(status_code=500, detail="Failed to reset document status")

    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        # A failed attempt may already have upserted vectors (e.g. the failure
        # happened after the upsert); drop them so the retry cannot duplicate
        # chunks. Mirrors what refresh_url_source does for URL sources.
        try:
            vector_store.delete_by_document_id(collection_name, document_id)
        except Exception as e:
            logger.warning(f"Could not delete old vectors for document {document_id}: {e}")
        # Drop cached responses for this collection before re-processing.
        response_cache.invalidate(collection_name)

    # Re-enqueue background processing
    background_tasks.add_task(
        _process_document_bg,
        file_bytes=file_bytes,
        file_name=document["file_name"],
        doc_id=document_id,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )

    return DocumentResponse(**result.data[0])


# ── URL Sources ───────────────────────────────────────────────────────────────

@url_router.get("", response_model=List[UrlSourceResponse])
async def list_url_sources(chatbot_id: str, user=Depends(get_current_user)):
    """List a chatbot's URL sources, newest first.

    Raises 404 (via ``_get_user_chatbot``) when the caller does not own the
    chatbot. Rows are returned as-is and shaped by the response model.
    """
    supabase = get_supabase()
    _get_user_chatbot(chatbot_id, user.id, supabase)

    rows = (
        supabase.table("url_sources")
        .select("*")
        .eq("chatbot_id", chatbot_id)
        .order("created_at", desc=True)
        .execute()
    ).data
    return rows if rows else []


@url_router.post("", status_code=201)
async def add_url_source(
    chatbot_id: str,
    data: UrlSourceCreate,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Register a URL source for a chatbot and queue it for scraping.

    The caller's active subscription plan (``"free"`` when none is active)
    determines the ``url_sources`` limit from ``PLAN_LIMITS``. A ``pending``
    row is inserted and ``_process_url_source`` is scheduled in the
    background.

    Returns:
        The inserted ``url_sources`` row (HTTP 201).

    Raises:
        HTTPException: 404 if the chatbot is not owned by the caller,
            402 if the plan allows no URL sources or the limit is reached,
            500 if the insert returns no row.
    """
    from app.config import PLAN_LIMITS
    supabase = get_supabase()
    # Called only for its ownership check (raises 404); the row is not needed.
    _get_user_chatbot(chatbot_id, user.id, supabase)

    # Plan check
    sub = supabase.table("subscriptions").select("plan").eq("user_id", user.id).eq("status", "active").execute()
    plan = sub.data[0]["plan"] if sub.data else "free"
    max_sources = PLAN_LIMITS.get(plan, PLAN_LIMITS["free"]).get("url_sources", 0)
    if max_sources == 0:
        raise HTTPException(status_code=402, detail="URL sources require Starter plan or higher")

    # Count existing
    existing = supabase.table("url_sources").select("id", count="exact").eq("chatbot_id", chatbot_id).execute()
    if (existing.count or 0) >= max_sources:
        raise HTTPException(status_code=402, detail=f"URL source limit reached ({max_sources}). Upgrade to add more.")

    # NOTE(review): data.url is user-supplied and is fetched server-side by the
    # background task. Confirm UrlSourceCreate and/or scrape_url restrict the
    # scheme to http(s) and reject internal hosts (SSRF).
    source_id = str(uuid.uuid4())
    source_data = {
        "id": source_id,
        "chatbot_id": chatbot_id,
        "url": data.url,
        "status": "pending",
    }
    result = supabase.table("url_sources").insert(source_data).execute()
    if not result.data:
        raise HTTPException(status_code=500, detail="Failed to create URL source")

    # Process in background (runs after the response has been sent)
    background_tasks.add_task(
        _process_url_source,
        source_id=source_id,
        url=data.url,
        chatbot_id=chatbot_id,
        supabase=supabase,
    )

    return result.data[0]


@url_router.delete("/{source_id}", response_model=SuccessResponse)
async def delete_url_source(chatbot_id: str, source_id: str, user=Depends(get_current_user)):
    """Delete a URL source and the vectors stored for it.

    Vector removal is best-effort: a failure is logged as a warning and the
    row is still deleted. The response cache for the chatbot's collection is
    invalidated.

    Raises:
        HTTPException: 404 if the chatbot is not owned by the caller or the
            URL source does not exist under this chatbot.
    """
    supabase = get_supabase()
    # Single lookup serves both the ownership check and the collection name.
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    source = supabase.table("url_sources").select("*").eq("id", source_id).eq("chatbot_id", chatbot_id).execute()
    if not source.data:
        raise HTTPException(status_code=404, detail="URL source not found")

    collection_name = chatbot.get("qdrant_collection_name")
    if collection_name:
        try:
            # Vectors for a URL source are stored with document_id == source_id.
            vector_store.delete_by_document_id(collection_name, source_id)
        except Exception as e:
            # Still best-effort, but leave a trace instead of swallowing it.
            logger.warning(f"Failed to delete vectors for url source {source_id}: {e}")
        response_cache.invalidate(collection_name)

    supabase.table("url_sources").delete().eq("id", source_id).execute()
    return SuccessResponse(success=True, message="URL source deleted")


@url_router.post("/{source_id}/refresh", response_model=UrlSourceResponse)
async def refresh_url_source(
    chatbot_id: str,
    source_id: str,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Re-scrape a URL source and rebuild its vectors.

    Drops the vectors currently stored for the source (best-effort), resets
    the row to ``pending``, and re-queues ``_process_url_source``.

    Returns:
        The reset row as a ``UrlSourceResponse``.

    Raises:
        HTTPException: 404 if the chatbot is not owned by the caller or the
            URL source does not exist under this chatbot.
    """
    supabase = get_supabase()
    chatbot = _get_user_chatbot(chatbot_id, user.id, supabase)

    source = supabase.table("url_sources").select("*").eq("id", source_id).eq("chatbot_id", chatbot_id).execute()
    if not source.data:
        raise HTTPException(status_code=404, detail="URL source not found")

    src = source.data[0]
    collection_name = chatbot.get("qdrant_collection_name")

    # Drop existing vectors for this source
    if collection_name:
        try:
            vector_store.delete_by_document_id(collection_name, source_id)
        except Exception as e:
            logger.warning(f"Could not delete old vectors for url source {source_id}: {e}")
        response_cache.invalidate(collection_name)

    # Reset to pending and reprocess.
    # No chained .returning(...) here: `returning` is a keyword argument of
    # update(), not a builder method, and the default already returns the row.
    reset_fields = {
        "status": "pending",
        "error_message": None,
        "chunk_count": 0,
    }
    updated = supabase.table("url_sources").update(reset_fields).eq("id", source_id).execute()

    background_tasks.add_task(
        _process_url_source,
        source_id=source_id,
        url=src["url"],
        chatbot_id=chatbot_id,
        supabase=supabase,
    )

    # Prefer the row as stored; otherwise overlay the reset fields on the row
    # read earlier so the response does not carry the stale error message.
    row = updated.data[0] if updated.data else {**src, **reset_fields}
    return UrlSourceResponse(**row)


def _mark_url_source_failed(supabase, source_id: str, message: str) -> None:
    """Best-effort: set a url_sources row to ``failed`` with a truncated message."""
    try:
        supabase.table("url_sources").update({
            "status": "failed",
            "error_message": message[:500],
        }).eq("id", source_id).execute()
    except Exception:
        # Never let a failed status write escape the background task.
        logger.exception(f"Could not mark URL source {source_id} as failed")


async def _process_url_source(source_id: str, url: str, chatbot_id: str, supabase):
    """Background task to scrape a URL and add its content to the vector store.

    Re-reads the chatbot row, marks the source ``processing``, scrapes the
    page, chunks the text, embeds it in batches of 50, upserts the vectors
    (payload ``document_id`` is the source id) and marks the source
    ``completed``. Any failure is logged and recorded on the row as
    ``failed``; nothing is raised to the caller.
    """
    # Local imports kept from the original; embedding_service and vector_store
    # are already imported at module level and are not re-imported here.
    from app.services.web_scraper import scrape_url
    from app.services.document_processor import chunk_text

    try:
        # Re-fetch chatbot to guarantee we use the canonical collection and company_id.
        chatbot_row = supabase.table("chatbots").select("company_id, qdrant_collection_name").eq("id", chatbot_id).execute()
        if not chatbot_row.data:
            logger.error(f"Chatbot {chatbot_id} not found during URL source processing")
            _mark_url_source_failed(supabase, source_id, "Chatbot not found")
            return

        chatbot = chatbot_row.data[0]
        company_id = chatbot.get("company_id", "")

        # Update status to processing
        supabase.table("url_sources").update({"status": "processing"}).eq("id", source_id).execute()

        # Scrape URL
        scraped = await scrape_url(url)
        if "error" in scraped:
            # str() + truncation keeps this consistent with the except path.
            _mark_url_source_failed(supabase, source_id, str(scraped["error"]))
            return

        text = scraped["text"]
        # Fall back to the URL when the title is missing, None or empty.
        title = scraped.get("title") or url
        collection_name = chatbot.get("qdrant_collection_name")

        if not collection_name:
            _mark_url_source_failed(supabase, source_id, "No vector store configured")
            return

        # Ensure collection exists
        if not vector_store.collection_exists(collection_name):
            vector_store.create_collection(collection_name)

        # Chunk text
        chunks = chunk_text(text)
        if not chunks:
            _mark_url_source_failed(supabase, source_id, "No content extracted")
            return

        # Embed and upsert
        all_ids = []
        all_vectors = []
        all_payloads = []
        batch_size = 50

        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            vectors = embedding_service.embed_batch(batch)
            all_ids.extend(str(uuid.uuid4()) for _ in vectors)
            all_vectors.extend(vectors)
            all_payloads.extend({
                "document_id": source_id,
                "company_id": company_id,
                "file_name": f"[URL] {title}",
                # NOTE(review): this is the 1-based embedding batch number,
                # not a real page; kept as-is for payload compatibility.
                "page_number": i // batch_size + 1,
                "chunk_index": i + j,
                "text": chunk,
                "source_url": url,
            } for j, chunk in enumerate(batch))

        vector_store.upsert_vectors(
            collection_name=collection_name,
            vectors=all_vectors,
            payloads=all_payloads,
            ids=all_ids,
        )

        supabase.table("url_sources").update({
            "status": "completed",
            "page_title": title,
            "chunk_count": len(chunks),
        }).eq("id", source_id).execute()

        response_cache.invalidate(collection_name)
        logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url} → collection='{collection_name}' company='{company_id}'")

    except Exception as e:
        # logger.exception keeps the traceback; the status write is guarded
        # inside the helper so this handler cannot raise again.
        logger.exception(f"URL source processing error {source_id}: {e}")
        _mark_url_source_failed(supabase, source_id, str(e))


# Second module-level name bound to the same APIRouter object as url_router.
# Presumably the name the application imports when registering routers — confirm.
router_url_sources = url_router