mirror of
http://88.130.71.182:3000/BlitTech/contexta_be.git
synced 2026-06-12 23:23:21 +00:00
updates Mar6
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, BackgroundTasks
|
||||
from app.models import DocumentResponse, SuccessResponse
|
||||
from app.models import DocumentResponse, SuccessResponse, UrlSourceCreate, UrlSourceResponse
|
||||
from app.database import get_supabase
|
||||
from app.dependencies import get_current_user
|
||||
from app.services.document_processor import process_document
|
||||
@@ -12,6 +12,7 @@ import logging
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
# Router whose routes are mounted under /chatbots/{chatbot_id}/documents.
router = APIRouter(prefix="/chatbots/{chatbot_id}/documents", tags=["Documents"])
|
||||
# Separate router for the URL-source routes, mounted under /chatbots/{chatbot_id}/url-sources.
url_router = APIRouter(prefix="/chatbots/{chatbot_id}/url-sources", tags=["URL Sources"])
|
||||
|
||||
ALLOWED_TYPES = {
|
||||
"application/pdf": ".pdf",
|
||||
@@ -206,3 +207,168 @@ async def delete_document(chatbot_id: str, document_id: str, user=Depends(get_cu
|
||||
|
||||
supabase.table("documents").delete().eq("id", document_id).execute()
|
||||
return SuccessResponse(success=True, message="Document deleted")
|
||||
|
||||
|
||||
# ── URL Sources ───────────────────────────────────────────────────────────────
|
||||
|
||||
@url_router.get("", response_model=List[UrlSourceResponse])
async def list_url_sources(chatbot_id: str, user=Depends(get_current_user)):
    """List every URL source attached to a chatbot, newest first.

    ``_get_user_chatbot`` is called with the chatbot id and the current
    user's id before any rows are read (presumably it rejects chatbots the
    user does not own -- confirm against its definition).

    Returns the ``url_sources`` rows for ``chatbot_id`` ordered by
    ``created_at`` descending, or an empty list when the query yields no data.
    """
    client = get_supabase()
    _get_user_chatbot(chatbot_id, user.id, client)

    # Build the query step by step instead of one backslash-continued chain.
    query = client.table("url_sources").select("*")
    query = query.eq("chatbot_id", chatbot_id)
    query = query.order("created_at", desc=True)

    rows = query.execute().data
    return rows if rows else []
|
||||
|
||||
|
||||
@url_router.post("", status_code=201)
async def add_url_source(
    chatbot_id: str,
    data: UrlSourceCreate,
    background_tasks: BackgroundTasks,
    user=Depends(get_current_user),
):
    """Register a new URL source for a chatbot and schedule its processing.

    Steps, in order:
      1. Look up the chatbot via ``_get_user_chatbot``.
      2. Read the user's active subscription plan (``"free"`` when none is
         found) and its ``url_sources`` limit from ``PLAN_LIMITS``.
      3. Refuse when the limit is 0 or already reached.
      4. Insert a ``pending`` row into ``url_sources``.
      5. Queue ``_process_url_source`` as a background task.

    Returns:
        The inserted ``url_sources`` row as returned by Supabase.

    Raises:
        HTTPException: 402 when the plan's limit is 0 or the chatbot already
            has that many URL sources; 500 when the insert returns no data.

    NOTE(review): the count and the insert are two separate calls, so
    concurrent requests could both pass the limit check -- confirm whether
    that matters here.
    """
    from app.config import PLAN_LIMITS

    db = get_supabase()
    owned_chatbot = _get_user_chatbot(chatbot_id, user.id, db)

    # Resolve the plan name, then that plan's URL-source quota.
    active_subs = (
        db.table("subscriptions")
        .select("plan")
        .eq("user_id", user.id)
        .eq("status", "active")
        .execute()
    )
    plan_name = active_subs.data[0]["plan"] if active_subs.data else "free"
    plan_limits = PLAN_LIMITS.get(plan_name, PLAN_LIMITS["free"])
    max_sources = plan_limits.get("url_sources", 0)
    if max_sources == 0:
        raise HTTPException(status_code=402, detail="URL sources require Starter plan or higher")

    # Compare the number of sources already stored against the quota.
    current = (
        db.table("url_sources")
        .select("id", count="exact")
        .eq("chatbot_id", chatbot_id)
        .execute()
    )
    already_used = current.count or 0
    if already_used >= max_sources:
        raise HTTPException(status_code=402, detail=f"URL source limit reached ({max_sources}). Upgrade to add more.")

    # Create the row in "pending" state; the background task updates it later.
    source_id = str(uuid.uuid4())
    new_row = {
        "id": source_id,
        "chatbot_id": chatbot_id,
        "url": data.url,
        "status": "pending",
    }
    inserted = db.table("url_sources").insert(new_row).execute()
    if not inserted.data:
        raise HTTPException(status_code=500, detail="Failed to create URL source")

    # Scraping and embedding run in the background after the response is sent.
    background_tasks.add_task(
        _process_url_source,
        source_id=source_id,
        url=data.url,
        chatbot=owned_chatbot,
        supabase=db,
    )

    return inserted.data[0]
|
||||
|
||||
|
||||
@url_router.delete("/{source_id}", response_model=SuccessResponse)
async def delete_url_source(chatbot_id: str, source_id: str, user=Depends(get_current_user)):
    """Delete one URL source row belonging to a chatbot.

    The row is first looked up by both ``source_id`` and ``chatbot_id``; only
    when it exists is the delete issued.

    Returns:
        ``SuccessResponse`` with the message "URL source deleted".

    Raises:
        HTTPException: 404 when no matching row exists.

    NOTE(review): only the ``url_sources`` row is removed here; the vectors
    that ``_process_url_source`` upserts for this source are not deleted in
    this function -- confirm whether cleanup happens elsewhere.
    """
    db = get_supabase()
    _get_user_chatbot(chatbot_id, user.id, db)

    lookup = (
        db.table("url_sources")
        .select("*")
        .eq("id", source_id)
        .eq("chatbot_id", chatbot_id)
    )
    found = lookup.execute()

    if found.data:
        db.table("url_sources").delete().eq("id", source_id).execute()
        return SuccessResponse(success=True, message="URL source deleted")

    raise HTTPException(status_code=404, detail="URL source not found")
|
||||
|
||||
|
||||
def _mark_url_source_failed(supabase, source_id: str, message) -> None:
    """Set a url_sources row to ``failed`` and store ``message`` as its error."""
    supabase.table("url_sources").update({
        "status": "failed",
        "error_message": message,
    }).eq("id", source_id).execute()


async def _process_url_source(source_id: str, url: str, chatbot: dict, supabase):
    """Background task to scrape a URL and add its content to the vector store.

    The ``url_sources`` row identified by ``source_id`` moves through
    ``processing`` and ends as ``completed`` (with ``page_title`` and
    ``chunk_count``) or ``failed`` (with ``error_message``).

    Args:
        source_id: Id of the ``url_sources`` row; also written into each
            vector payload as ``document_id``.
        url: Address handed to ``scrape_url``.
        chatbot: Chatbot record; ``qdrant_collection_name`` and
            ``company_id`` are read from it.
        supabase: Supabase client used for every status update.

    This function never raises: any exception is logged with its traceback
    and recorded on the row, and a failure to record it is logged as well.
    """
    from app.services.web_scraper import scrape_url
    from app.services.document_processor import chunk_text
    from app.services.embeddings import embedding_service
    from app.services.vector_store import vector_store

    try:
        # Update status to processing
        supabase.table("url_sources").update({"status": "processing"}).eq("id", source_id).execute()

        # Scrape URL; the scraper reports problems through an "error" key.
        scraped = await scrape_url(url)
        if "error" in scraped:
            _mark_url_source_failed(supabase, source_id, scraped["error"])
            return

        text = scraped["text"]
        # Fall back to the URL when the title is missing, None or empty, so
        # the stored file name and page title never read "None" or blank.
        title = scraped.get("title") or url
        collection_name = chatbot.get("qdrant_collection_name")

        if not collection_name:
            _mark_url_source_failed(supabase, source_id, "No vector store configured")
            return

        # Ensure collection exists
        if not vector_store.collection_exists(collection_name):
            vector_store.create_collection(collection_name)

        # Chunk text
        chunks = chunk_text(text)
        if not chunks:
            _mark_url_source_failed(supabase, source_id, "No content extracted")
            return

        # Embed in batches, then upsert everything in one call.
        all_ids = []
        all_vectors = []
        all_payloads = []
        batch_size = 50

        # These payload fields are identical for every chunk; compute once.
        company_id = chatbot.get("company_id", "")
        file_name = f"[URL] {title}"

        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            vectors = embedding_service.embed_batch(batch)
            ids = [str(uuid.uuid4()) for _ in vectors]
            # A web page has no real pages; the 1-based batch number is
            # stored in the page_number field instead.
            page_number = start // batch_size + 1
            payloads = [{
                "document_id": source_id,
                "company_id": company_id,
                "file_name": file_name,
                "page_number": page_number,
                "chunk_index": start + offset,
                "text": chunk,
                "source_url": url,
            } for offset, chunk in enumerate(batch)]
            all_ids.extend(ids)
            all_vectors.extend(vectors)
            all_payloads.extend(payloads)

        vector_store.upsert_vectors(
            collection_name=collection_name,
            vectors=all_vectors,
            payloads=all_payloads,
            ids=all_ids,
        )

        supabase.table("url_sources").update({
            "status": "completed",
            "page_title": title,
            "chunk_count": len(chunks),
        }).eq("id", source_id).execute()

        logger.info(f"URL source {source_id} processed: {len(chunks)} chunks from {url}")

    except Exception as e:
        # Top-level boundary of a background task: log with traceback and
        # record the failure instead of letting the exception escape.
        logger.exception(f"URL source processing error {source_id}: {e}")
        try:
            _mark_url_source_failed(supabase, source_id, str(e)[:500])
        except Exception:
            # The status update can itself fail (e.g. the database is what
            # broke); keep the task from raising out of its error handler.
            logger.exception(f"Could not record failure status for URL source {source_id}")
|
||||
|
||||
|
||||
# Second module-level name bound to the same router object as url_router.
# NOTE(review): presumably the name the application imports when mounting routers -- confirm.
router_url_sources = url_router
|
||||
|
||||
Reference in New Issue
Block a user