mirror of
http://88.130.71.182:3000/BlitTech/contexta_be.git
synced 2026-06-12 23:23:21 +00:00
fixed storage error
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -73,4 +73,6 @@ alembic/versions/*.pyc
|
|||||||
|
|
||||||
# Temporary files
|
# Temporary files
|
||||||
*.tmp
|
*.tmp
|
||||||
*.temp
|
*.temp
|
||||||
|
|
||||||
|
.claude
|
||||||
23
Dockerfile
23
Dockerfile
@@ -1,15 +1,15 @@
|
|||||||
FROM python:3.12-alpine
|
FROM python:3.12-slim
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install ALL required build tools
|
# System deps for build tools + Playwright/Chromium
|
||||||
RUN apk add --no-cache \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
gcc \
|
gcc g++ make cmake \
|
||||||
g++ \
|
# Playwright Chromium dependencies
|
||||||
musl-dev \
|
libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
|
||||||
python3-dev \
|
libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
|
||||||
cmake \
|
libgbm1 libasound2 libpango-1.0-0 libcairo2 libatspi2.0-0 \
|
||||||
make
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN pip install uv
|
RUN pip install uv
|
||||||
|
|
||||||
@@ -19,8 +19,11 @@ COPY uv.lock .
|
|||||||
RUN uv pip install --system "pydantic[email]"
|
RUN uv pip install --system "pydantic[email]"
|
||||||
RUN uv pip install --system .
|
RUN uv pip install --system .
|
||||||
|
|
||||||
|
# Install Playwright's Chromium browser
|
||||||
|
RUN playwright install chromium
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|||||||
@@ -100,23 +100,17 @@ MODEL_CATALOG = {
|
|||||||
"badge": "Smart",
|
"badge": "Smart",
|
||||||
"description": "Cost-effective and highly capable model",
|
"description": "Cost-effective and highly capable model",
|
||||||
},
|
},
|
||||||
"accounts/fireworks/models/deepseek-v3p2": {
|
"accounts/fireworks/models/kimi-k2-instruct-0905": {
|
||||||
"name": "DeepSeek V3.2",
|
|
||||||
"provider": "Fireworks AI",
|
|
||||||
"badge": "Smart",
|
|
||||||
"description": "Latest DeepSeek — faster and more capable",
|
|
||||||
},
|
|
||||||
"accounts/fireworks/models/kimi-k2-instruct": {
|
|
||||||
"name": "Kimi K2",
|
"name": "Kimi K2",
|
||||||
"provider": "Fireworks AI",
|
"provider": "Fireworks AI",
|
||||||
"badge": "Multilingual",
|
"badge": "Multilingual",
|
||||||
"description": "Strong multilingual and coding capabilities",
|
"description": "Strong multilingual and coding capabilities",
|
||||||
},
|
},
|
||||||
"accounts/fireworks/models/kimi-k2p5-instruct": {
|
"accounts/fireworks/models/gpt-oss-120b": {
|
||||||
"name": "Kimi K2.5",
|
"name": "GPT OSS 120B",
|
||||||
"provider": "Fireworks AI",
|
"provider": "Fireworks AI",
|
||||||
"badge": "Multilingual",
|
"badge": "Powerful",
|
||||||
"description": "Upgraded Kimi — stronger reasoning and multilingual",
|
"description": "Large open-source model with strong reasoning",
|
||||||
},
|
},
|
||||||
|
|
||||||
# ── Pro tier (Premium providers) ───────────────────────────────────────────
|
# ── Pro tier (Premium providers) ───────────────────────────────────────────
|
||||||
@@ -169,9 +163,8 @@ MODEL_PROVIDERS = {
|
|||||||
"accounts/fireworks/models/llama-v3p3-70b-instruct": "fireworks",
|
"accounts/fireworks/models/llama-v3p3-70b-instruct": "fireworks",
|
||||||
"accounts/fireworks/models/qwen3-235b-a22b": "fireworks",
|
"accounts/fireworks/models/qwen3-235b-a22b": "fireworks",
|
||||||
"accounts/fireworks/models/deepseek-v3p1": "fireworks",
|
"accounts/fireworks/models/deepseek-v3p1": "fireworks",
|
||||||
"accounts/fireworks/models/deepseek-v3p2": "fireworks",
|
"accounts/fireworks/models/kimi-k2-instruct-0905": "fireworks",
|
||||||
"accounts/fireworks/models/kimi-k2-instruct": "fireworks",
|
"accounts/fireworks/models/gpt-oss-120b": "fireworks",
|
||||||
"accounts/fireworks/models/kimi-k2p5-instruct": "fireworks",
|
|
||||||
# OpenAI
|
# OpenAI
|
||||||
"gpt-4o": "openai",
|
"gpt-4o": "openai",
|
||||||
"gpt-4o-mini": "openai",
|
"gpt-4o-mini": "openai",
|
||||||
@@ -224,9 +217,8 @@ _ALL_FIREWORKS = [
|
|||||||
"accounts/fireworks/models/llama-v3p3-70b-instruct",
|
"accounts/fireworks/models/llama-v3p3-70b-instruct",
|
||||||
"accounts/fireworks/models/qwen3-235b-a22b",
|
"accounts/fireworks/models/qwen3-235b-a22b",
|
||||||
"accounts/fireworks/models/deepseek-v3p1",
|
"accounts/fireworks/models/deepseek-v3p1",
|
||||||
"accounts/fireworks/models/deepseek-v3p2",
|
"accounts/fireworks/models/kimi-k2-instruct-0905",
|
||||||
"accounts/fireworks/models/kimi-k2-instruct",
|
"accounts/fireworks/models/gpt-oss-120b",
|
||||||
"accounts/fireworks/models/kimi-k2p5-instruct",
|
|
||||||
]
|
]
|
||||||
_ALL_PREMIUM = [
|
_ALL_PREMIUM = [
|
||||||
"gpt-4o", "gpt-4o-mini",
|
"gpt-4o", "gpt-4o-mini",
|
||||||
@@ -236,23 +228,23 @@ _ALL_PREMIUM = [
|
|||||||
|
|
||||||
PLAN_LIMITS = {
|
PLAN_LIMITS = {
|
||||||
# ── Free ─────────────────────────────────────────────────────────────────
|
# ── Free ─────────────────────────────────────────────────────────────────
|
||||||
# Build, test, and go live with one chatbot — no card needed.
|
# Generous enough to validate the product, limited enough to drive upgrades.
|
||||||
"free": {
|
"free": {
|
||||||
"max_chatbots": 999999,
|
"max_chatbots": 999999,
|
||||||
"max_published": 1, # can publish 1 chatbot
|
"max_published": 1,
|
||||||
"max_documents_per_chatbot": 3,
|
"max_documents_per_chatbot": 5,
|
||||||
"max_document_size_mb": 5,
|
"max_document_size_mb": 5,
|
||||||
"models": ["accounts/fireworks/models/llama-v3p3-70b-instruct"],
|
"models": ["accounts/fireworks/models/llama-v3p3-70b-instruct"],
|
||||||
"conversations_limit": 100, # 100 real conversations/month
|
"conversations_limit": 300,
|
||||||
"code_export": False,
|
"code_export": False,
|
||||||
"analytics": False,
|
"analytics": False,
|
||||||
"gap_suggestions": False,
|
"gap_suggestions": False,
|
||||||
"channels": [], # no messaging channels
|
"channels": [],
|
||||||
"url_sources": 0,
|
"url_sources": 2,
|
||||||
"leads_per_month": 0,
|
"leads_per_month": 0,
|
||||||
"inbox_replies": False, # read-only inbox
|
"inbox_replies": False,
|
||||||
"leads_editing": False, # view-only leads
|
"leads_editing": False,
|
||||||
"show_branding": True, # cannot remove badge
|
"show_branding": True,
|
||||||
"appointments": False,
|
"appointments": False,
|
||||||
"appointments_chatbots": 0,
|
"appointments_chatbots": 0,
|
||||||
"campaigns": False,
|
"campaigns": False,
|
||||||
@@ -260,38 +252,38 @@ PLAN_LIMITS = {
|
|||||||
"max_campaign_recipients": 0,
|
"max_campaign_recipients": 0,
|
||||||
},
|
},
|
||||||
# ── Starter $19/mo ───────────────────────────────────────────────────────
|
# ── Starter $19/mo ───────────────────────────────────────────────────────
|
||||||
# For solo operators: live chat, leads, booking, and campaigns.
|
# Complete package for individuals and small businesses.
|
||||||
"starter": {
|
"starter": {
|
||||||
"max_chatbots": 999999,
|
"max_chatbots": 999999,
|
||||||
"max_published": 3,
|
"max_published": 3,
|
||||||
"max_documents_per_chatbot": 10,
|
"max_documents_per_chatbot": 20,
|
||||||
"max_document_size_mb": 10,
|
"max_document_size_mb": 20,
|
||||||
"models": _ALL_FIREWORKS,
|
"models": _ALL_FIREWORKS,
|
||||||
"conversations_limit": 1500,
|
"conversations_limit": 2000,
|
||||||
"code_export": False,
|
"code_export": False,
|
||||||
"analytics": True,
|
"analytics": True,
|
||||||
"gap_suggestions": False,
|
"gap_suggestions": False,
|
||||||
"channels": ["telegram"],
|
"channels": ["telegram"],
|
||||||
"url_sources": 5,
|
"url_sources": 10,
|
||||||
"leads_per_month": 500,
|
"leads_per_month": 999999,
|
||||||
"inbox_replies": True,
|
"inbox_replies": True,
|
||||||
"leads_editing": True,
|
"leads_editing": True,
|
||||||
"show_branding": True, # badge stays on Starter
|
"show_branding": False, # branding removal starts at Starter
|
||||||
"appointments": True,
|
"appointments": True,
|
||||||
"appointments_chatbots": 1, # booking on 1 chatbot
|
"appointments_chatbots": 3,
|
||||||
"campaigns": True,
|
"campaigns": True,
|
||||||
"campaigns_per_month": 3,
|
"campaigns_per_month": 5,
|
||||||
"max_campaign_recipients": 500,
|
"max_campaign_recipients": 1000,
|
||||||
},
|
},
|
||||||
# ── Business $49/mo ──────────────────────────────────────────────────────
|
# ── Business $49/mo ──────────────────────────────────────────────────────
|
||||||
# For growing businesses: premium AI, unlimited booking, full analytics.
|
# Same features as Starter + premium AI models and serious scale.
|
||||||
"business": {
|
"business": {
|
||||||
"max_chatbots": 999999,
|
"max_chatbots": 999999,
|
||||||
"max_published": 10,
|
"max_published": 10,
|
||||||
"max_documents_per_chatbot": 50,
|
"max_documents_per_chatbot": 100,
|
||||||
"max_document_size_mb": 50,
|
"max_document_size_mb": 100,
|
||||||
"models": _ALL_FIREWORKS + _ALL_PREMIUM,
|
"models": _ALL_FIREWORKS + _ALL_PREMIUM,
|
||||||
"conversations_limit": 5000,
|
"conversations_limit": 8000,
|
||||||
"code_export": False,
|
"code_export": False,
|
||||||
"analytics": True,
|
"analytics": True,
|
||||||
"gap_suggestions": True,
|
"gap_suggestions": True,
|
||||||
@@ -300,7 +292,7 @@ PLAN_LIMITS = {
|
|||||||
"leads_per_month": 999999,
|
"leads_per_month": 999999,
|
||||||
"inbox_replies": True,
|
"inbox_replies": True,
|
||||||
"leads_editing": True,
|
"leads_editing": True,
|
||||||
"show_branding": False, # can remove badge
|
"show_branding": False,
|
||||||
"appointments": True,
|
"appointments": True,
|
||||||
"appointments_chatbots": 999999,
|
"appointments_chatbots": 999999,
|
||||||
"campaigns": True,
|
"campaigns": True,
|
||||||
@@ -308,18 +300,18 @@ PLAN_LIMITS = {
|
|||||||
"max_campaign_recipients": 5000,
|
"max_campaign_recipients": 5000,
|
||||||
},
|
},
|
||||||
# ── Agency $99/mo ────────────────────────────────────────────────────────
|
# ── Agency $99/mo ────────────────────────────────────────────────────────
|
||||||
# For agencies: unlimited everything, unlimited campaign recipients.
|
# For heavy users and agencies. Reserved tier for WhatsApp and future channels.
|
||||||
"agency": {
|
"agency": {
|
||||||
"max_chatbots": 999999,
|
"max_chatbots": 999999,
|
||||||
"max_published": 999999,
|
"max_published": 999999,
|
||||||
"max_documents_per_chatbot": 999999,
|
"max_documents_per_chatbot": 999999,
|
||||||
"max_document_size_mb": 200,
|
"max_document_size_mb": 500,
|
||||||
"models": _ALL_FIREWORKS + _ALL_PREMIUM,
|
"models": _ALL_FIREWORKS + _ALL_PREMIUM,
|
||||||
"conversations_limit": 20000,
|
"conversations_limit": 25000,
|
||||||
"code_export": True,
|
"code_export": True,
|
||||||
"analytics": True,
|
"analytics": True,
|
||||||
"gap_suggestions": True,
|
"gap_suggestions": True,
|
||||||
"channels": ["telegram"],
|
"channels": ["telegram"], # whatsapp added here when ready
|
||||||
"url_sources": 999999,
|
"url_sources": 999999,
|
||||||
"leads_per_month": 999999,
|
"leads_per_month": 999999,
|
||||||
"inbox_replies": True,
|
"inbox_replies": True,
|
||||||
|
|||||||
21
app/main.py
21
app/main.py
@@ -1,3 +1,4 @@
|
|||||||
|
import asyncio
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
@@ -24,10 +25,30 @@ async def lifespan(app: FastAPI):
|
|||||||
logger.info("Contexta API starting up...")
|
logger.info("Contexta API starting up...")
|
||||||
logger.info(f"Environment: {settings.app_env}")
|
logger.info(f"Environment: {settings.app_env}")
|
||||||
logger.info(f"Allowed origins: {settings.allowed_origins_list}")
|
logger.info(f"Allowed origins: {settings.allowed_origins_list}")
|
||||||
|
asyncio.create_task(_requeue_pending_url_sources())
|
||||||
yield
|
yield
|
||||||
logger.info("Contexta API shutting down...")
|
logger.info("Contexta API shutting down...")
|
||||||
|
|
||||||
|
|
||||||
|
# Strong references to in-flight re-queue tasks. The event loop itself only
# keeps weak references to tasks, so a fire-and-forget task can be
# garbage-collected before it finishes unless something else holds on to it.
_requeue_tasks: set[asyncio.Task] = set()


async def _requeue_pending_url_sources():
    """Re-queue url_sources left in pending/processing by a previous run.

    Selects the ``id``, ``url`` and ``chatbot_id`` of every ``url_sources``
    row whose status is ``pending`` or ``processing`` and schedules
    ``_process_url_source`` for each one as its own asyncio task. The function
    returns once the tasks are scheduled; it does not wait for them to finish.

    Best-effort: any exception raised while querying or scheduling is logged
    as a warning and swallowed, so a failure here never reaches the caller.
    """
    try:
        # Imported inside the function rather than at module top —
        # presumably to avoid an import cycle at load time; TODO confirm.
        from app.database import get_supabase
        from app.routers.documents import _process_url_source

        supabase = get_supabase()
        # NOTE(review): .execute() is not awaited, so this looks like a
        # synchronous call that runs on the event loop — confirm it is cheap
        # enough to run inline.
        stuck = (
            supabase.table("url_sources")
            .select("id, url, chatbot_id")
            .in_("status", ["pending", "processing"])
            .execute()
        )
        if not stuck.data:
            return

        logger.info(f"Re-queuing {len(stuck.data)} stuck URL source(s) from previous run")
        for src in stuck.data:
            task = asyncio.create_task(
                _process_url_source(src["id"], src["url"], src["chatbot_id"], supabase)
            )
            # Keep the task alive until it completes, then drop the reference
            # so the set does not grow without bound.
            _requeue_tasks.add(task)
            task.add_done_callback(_requeue_tasks.discard)
    except Exception as e:
        logger.warning(f"Failed to re-queue pending URL sources: {e}")
|
||||||
|
|
||||||
|
|
||||||
# ── App ──────────────────────────────────────────────────────────────────────────
|
# ── App ──────────────────────────────────────────────────────────────────────────
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="Contexta API",
|
title="Contexta API",
|
||||||
|
|||||||
@@ -103,8 +103,9 @@ async def stripe_webhook(
|
|||||||
|
|
||||||
if event_type == "checkout.session.completed":
|
if event_type == "checkout.session.completed":
|
||||||
session = event.data.object
|
session = event.data.object
|
||||||
user_id = (session.metadata or {}).get("user_id")
|
metadata = session.metadata
|
||||||
plan = (session.metadata or {}).get("plan", "starter")
|
user_id = getattr(metadata, "user_id", None) if metadata else None
|
||||||
|
plan = getattr(metadata, "plan", "starter") if metadata else "starter"
|
||||||
customer_id = session.customer
|
customer_id = session.customer
|
||||||
subscription_id = session.subscription
|
subscription_id = session.subscription
|
||||||
|
|
||||||
@@ -166,7 +167,7 @@ async def stripe_webhook(
|
|||||||
except HTTPException:
|
except HTTPException:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Webhook error: {e}")
|
logger.exception(f"Webhook error: {e}")
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -404,7 +404,7 @@ async def refresh_url_source(
|
|||||||
"status": "pending",
|
"status": "pending",
|
||||||
"error_message": None,
|
"error_message": None,
|
||||||
"chunk_count": 0,
|
"chunk_count": 0,
|
||||||
}).eq("id", source_id).returning("representation").execute()
|
}).eq("id", source_id).execute()
|
||||||
|
|
||||||
background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot_id, supabase)
|
background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot_id, supabase)
|
||||||
|
|
||||||
|
|||||||
@@ -9,18 +9,13 @@ logger = logging.getLogger(__name__)
|
|||||||
# Fireworks models are used for free/starter plans so they must always be available.
|
# Fireworks models are used for free/starter plans so they must always be available.
|
||||||
# llama-v3p3-70b-instruct is the guaranteed last resort (confirmed working).
|
# llama-v3p3-70b-instruct is the guaranteed last resort (confirmed working).
|
||||||
_FIREWORKS_FALLBACKS = [
|
_FIREWORKS_FALLBACKS = [
|
||||||
"accounts/fireworks/models/kimi-k2p5-instruct",
|
"accounts/fireworks/models/kimi-k2-instruct-0905",
|
||||||
"accounts/fireworks/models/deepseek-v3p2",
|
"accounts/fireworks/models/gpt-oss-120b",
|
||||||
"accounts/fireworks/models/llama-v3p3-70b-instruct",
|
"accounts/fireworks/models/llama-v3p3-70b-instruct",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def _normalize_model(model: str) -> str:
|
def _normalize_model(model: str) -> str:
|
||||||
"""Strip date-based version suffixes from Fireworks model IDs.
|
|
||||||
e.g. 'accounts/fireworks/models/kimi-k2-instruct-0905' → 'accounts/fireworks/models/kimi-k2-instruct'
|
|
||||||
Matches only purely-numeric suffixes (4–8 digits) so names like 'llama-v3p3-70b' are untouched."""
|
|
||||||
if model.startswith("accounts/fireworks/") or model.startswith("fireworks/"):
|
|
||||||
model = re.sub(r"-\d{4,8}$", "", model)
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import httpx
|
import httpx
|
||||||
import logging
|
import logging
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -10,15 +9,51 @@ MAX_TEXT_BYTES = 100 * 1024 # 100KB
|
|||||||
async def scrape_url(url: str) -> dict:
|
async def scrape_url(url: str) -> dict:
|
||||||
"""
|
"""
|
||||||
Fetch a URL and extract clean text content.
|
Fetch a URL and extract clean text content.
|
||||||
|
Uses Playwright (headless Chromium) to handle JS-rendered pages,
|
||||||
|
falls back to direct httpx for simple static pages.
|
||||||
Returns: {title, text, url} or {error, url}
|
Returns: {title, text, url} or {error, url}
|
||||||
"""
|
"""
|
||||||
|
result = await _scrape_via_playwright(url)
|
||||||
|
if "error" not in result:
|
||||||
|
return result
|
||||||
|
logger.warning(f"Playwright scrape failed for {url}: {result['error']} — falling back to direct fetch")
|
||||||
|
return await _scrape_direct(url)
|
||||||
|
|
||||||
|
|
||||||
|
async def _scrape_via_playwright(url: str) -> dict:
    """Scrape *url* with headless Chromium so JS-rendered pages are captured.

    Loads the page, waits for the network to go idle, strips
    nav/header/footer/script/style/noscript/aside elements in the page, and
    passes the remaining ``document.body.innerText`` through ``_clean_text``.

    Args:
        url: Address of the page to load.

    Returns:
        ``{"title", "text", "url"}`` on success, or ``{"error", "url"}`` when
        the page yields no text or any exception occurs (including Playwright
        not being importable). The error message is capped at 200 characters.
        This function never raises.
    """
    try:
        # Imported lazily so a missing Playwright install becomes an error
        # result (handled by the caller's fallback) instead of an import-time
        # crash of the whole module.
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                # Playwright timeouts are expressed in milliseconds.
                await page.goto(url, wait_until="networkidle", timeout=30000)

                title = await page.title()
                text = await page.evaluate("""() => {
                    document.querySelectorAll('nav, header, footer, script, style, noscript, aside').forEach(e => e.remove());
                    return document.body ? document.body.innerText.trim() : '';
                }""")
            finally:
                # Close the browser even when navigation or evaluation fails
                # (e.g. a goto timeout), rather than only on the happy path.
                await browser.close()

        text = _clean_text(text)
        if not text:
            return {"error": "No text content found on page", "url": url}

        logger.info(f"Scraped {url} via Playwright: {len(text)} chars, title='{title}'")
        return {"title": title, "text": text, "url": url}

    except Exception as e:
        return {"error": str(e)[:200], "url": url}
|
||||||
|
|
||||||
|
|
||||||
|
async def _scrape_direct(url: str) -> dict:
|
||||||
|
"""Direct httpx scrape — works for server-rendered pages."""
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
headers = {
|
headers = {"User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)"}
|
||||||
"User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)",
|
|
||||||
}
|
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
||||||
response = await client.get(url, headers=headers)
|
response = await client.get(url, headers=headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -27,46 +62,20 @@ async def scrape_url(url: str) -> dict:
|
|||||||
if "text/html" not in content_type and "text/plain" not in content_type:
|
if "text/html" not in content_type and "text/plain" not in content_type:
|
||||||
return {"error": f"Unsupported content type: {content_type}", "url": url}
|
return {"error": f"Unsupported content type: {content_type}", "url": url}
|
||||||
|
|
||||||
html = response.text
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
|
||||||
|
|
||||||
# Extract title
|
|
||||||
title_tag = soup.find("title")
|
title_tag = soup.find("title")
|
||||||
title = title_tag.get_text(strip=True) if title_tag else ""
|
title = title_tag.get_text(strip=True) if title_tag else ""
|
||||||
|
|
||||||
# Remove unwanted tags
|
for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside"]):
|
||||||
for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside", "advertisement"]):
|
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
|
|
||||||
# Extract main content (prefer article/main/body)
|
|
||||||
main = soup.find("main") or soup.find("article") or soup.find("body") or soup
|
main = soup.find("main") or soup.find("article") or soup.find("body") or soup
|
||||||
text = main.get_text(separator="\n", strip=True)
|
text = _clean_text(main.get_text(separator="\n", strip=True))
|
||||||
|
|
||||||
# Clean up whitespace and filter structural noise
|
|
||||||
seen_lines: set[str] = set()
|
|
||||||
clean_lines = []
|
|
||||||
for line in text.splitlines():
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
# Skip very short lines (nav items, button labels, breadcrumb separators)
|
|
||||||
if len(line) < 15:
|
|
||||||
continue
|
|
||||||
# Skip duplicate lines (nav/footer repeated across sections)
|
|
||||||
if line in seen_lines:
|
|
||||||
continue
|
|
||||||
seen_lines.add(line)
|
|
||||||
clean_lines.append(line)
|
|
||||||
text = "\n".join(clean_lines)
|
|
||||||
|
|
||||||
# Limit size
|
|
||||||
if len(text.encode("utf-8")) > MAX_TEXT_BYTES:
|
|
||||||
text = text[:MAX_TEXT_BYTES].rsplit("\n", 1)[0]
|
|
||||||
|
|
||||||
if not text:
|
if not text:
|
||||||
return {"error": "No text content found on page", "url": url}
|
return {"error": "No text content found on page", "url": url}
|
||||||
|
|
||||||
logger.info(f"Scraped {url}: {len(text)} chars, title='{title}'")
|
logger.info(f"Scraped {url} directly: {len(text)} chars, title='{title}'")
|
||||||
return {"title": title, "text": text, "url": url}
|
return {"title": title, "text": text, "url": url}
|
||||||
|
|
||||||
except httpx.TimeoutException:
|
except httpx.TimeoutException:
|
||||||
@@ -76,3 +85,18 @@ async def scrape_url(url: str) -> dict:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Scrape error for {url}: {e}")
|
logger.error(f"Scrape error for {url}: {e}")
|
||||||
return {"error": str(e)[:200], "url": url}
|
return {"error": str(e)[:200], "url": url}
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_text(text: str) -> str:
    """Normalize scraped text: trim lines, drop noise, cap the UTF-8 size.

    Each line is stripped of surrounding whitespace. Lines shorter than five
    characters (which includes empty lines) and lines already seen earlier in
    the text are dropped; the first occurrence of each line is kept, in
    order. The result is joined with newlines and, if its UTF-8 encoding is
    larger than ``MAX_TEXT_BYTES``, truncated to fit.

    Args:
        text: Raw text extracted from a page.

    Returns:
        The cleaned text, at most ``MAX_TEXT_BYTES`` bytes when UTF-8
        encoded. May be an empty string.
    """
    seen: set[str] = set()
    lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if len(line) < 5 or line in seen:
            continue
        seen.add(line)
        lines.append(line)
    text = "\n".join(lines)

    encoded = text.encode("utf-8")
    if len(encoded) > MAX_TEXT_BYTES:
        # Cut on bytes, not characters: slicing the str by MAX_TEXT_BYTES
        # characters would leave multi-byte text far above the byte limit.
        # errors="ignore" discards a character split in half by the cut.
        truncated = encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
        # Drop the trailing partial line so the text ends on a line boundary.
        text = truncated.rsplit("\n", 1)[0]
    return text
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ dependencies = [
|
|||||||
"supabase>=2.28.0",
|
"supabase>=2.28.0",
|
||||||
"uvicorn>=0.41.0",
|
"uvicorn>=0.41.0",
|
||||||
"beautifulsoup4>=4.12.0",
|
"beautifulsoup4>=4.12.0",
|
||||||
|
"playwright>=1.40.0",
|
||||||
"httpx>=0.27.0",
|
"httpx>=0.27.0",
|
||||||
"anthropic>=0.40.0",
|
"anthropic>=0.40.0",
|
||||||
"google-generativeai>=0.8.0",
|
"google-generativeai>=0.8.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user