Fix storage error; add Playwright scraping, re-queue stuck URL sources, update model catalog and plan limits

2026-06-12 23:23:21 +00:00 · 2026-05-27 23:17:34 +00:00
parent a10c95d8dd
commit fad2b3a9f3
9 changed files with 141 additions and 102 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -73,4 +73,6 @@ alembic/versions/*.pyc

 # Temporary files
 *.tmp
-*.temp
+*.temp
+
+.claude
--- a/23
+++ b/23
@@ -1,15 +1,15 @@
-FROM python:3.12-alpine
+FROM python:3.12-slim

 WORKDIR /app

-# Install ALL required build tools
-RUN apk add --no-cache \
-    gcc \
-    g++ \
-    musl-dev \
-    python3-dev \
-    cmake \
-    make
+# System deps for build tools + Playwright/Chromium
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc g++ make cmake \
+    # Playwright Chromium dependencies
+    libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
+    libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
+    libgbm1 libasound2 libpango-1.0-0 libcairo2 libatspi2.0-0 \
+    && rm -rf /var/lib/apt/lists/*

 RUN pip install uv

@@ -19,8 +19,11 @@ COPY uv.lock .
 RUN uv pip install --system "pydantic[email]"
 RUN uv pip install --system .

+# Install Playwright's Chromium browser
+RUN playwright install chromium
+
 COPY . .

 EXPOSE 8000

-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/app/config.py
+++ b/app/config.py
@@ -100,23 +100,17 @@ MODEL_CATALOG = {
        "badge": "Smart",
        "description": "Cost-effective and highly capable model",
    },
-    "accounts/fireworks/models/deepseek-v3p2": {
-        "name": "DeepSeek V3.2",
-        "provider": "Fireworks AI",
-        "badge": "Smart",
-        "description": "Latest DeepSeek — faster and more capable",
-    },
-    "accounts/fireworks/models/kimi-k2-instruct": {
+    "accounts/fireworks/models/kimi-k2-instruct-0905": {
        "name": "Kimi K2",
        "provider": "Fireworks AI",
        "badge": "Multilingual",
        "description": "Strong multilingual and coding capabilities",
    },
-    "accounts/fireworks/models/kimi-k2p5-instruct": {
-        "name": "Kimi K2.5",
+    "accounts/fireworks/models/gpt-oss-120b": {
+        "name": "GPT OSS 120B",
        "provider": "Fireworks AI",
-        "badge": "Multilingual",
-        "description": "Upgraded Kimi — stronger reasoning and multilingual",
+        "badge": "Powerful",
+        "description": "Large open-source model with strong reasoning",
    },

    # ── Pro tier (Premium providers) ───────────────────────────────────────────
@@ -169,9 +163,8 @@ MODEL_PROVIDERS = {
    "accounts/fireworks/models/llama-v3p3-70b-instruct": "fireworks",
    "accounts/fireworks/models/qwen3-235b-a22b": "fireworks",
    "accounts/fireworks/models/deepseek-v3p1": "fireworks",
-    "accounts/fireworks/models/deepseek-v3p2": "fireworks",
-    "accounts/fireworks/models/kimi-k2-instruct": "fireworks",
-    "accounts/fireworks/models/kimi-k2p5-instruct": "fireworks",
+    "accounts/fireworks/models/kimi-k2-instruct-0905": "fireworks",
+    "accounts/fireworks/models/gpt-oss-120b": "fireworks",
    # OpenAI
    "gpt-4o": "openai",
    "gpt-4o-mini": "openai",
@@ -224,9 +217,8 @@ _ALL_FIREWORKS = [
    "accounts/fireworks/models/llama-v3p3-70b-instruct",
    "accounts/fireworks/models/qwen3-235b-a22b",
    "accounts/fireworks/models/deepseek-v3p1",
-    "accounts/fireworks/models/deepseek-v3p2",
-    "accounts/fireworks/models/kimi-k2-instruct",
-    "accounts/fireworks/models/kimi-k2p5-instruct",
+    "accounts/fireworks/models/kimi-k2-instruct-0905",
+    "accounts/fireworks/models/gpt-oss-120b",
 ]
 _ALL_PREMIUM = [
    "gpt-4o", "gpt-4o-mini",
@@ -236,23 +228,23 @@ _ALL_PREMIUM = [

 PLAN_LIMITS = {
    # ── Free ─────────────────────────────────────────────────────────────────
-    # Build, test, and go live with one chatbot — no card needed.
+    # Generous enough to validate the product, limited enough to drive upgrades.
    "free": {
        "max_chatbots": 999999,
-        "max_published": 1,            # can publish 1 chatbot
-        "max_documents_per_chatbot": 3,
+        "max_published": 1,
+        "max_documents_per_chatbot": 5,
        "max_document_size_mb": 5,
        "models": ["accounts/fireworks/models/llama-v3p3-70b-instruct"],
-        "conversations_limit": 100,    # 100 real conversations/month
+        "conversations_limit": 300,
        "code_export": False,
        "analytics": False,
        "gap_suggestions": False,
-        "channels": [],                # no messaging channels
-        "url_sources": 0,
+        "channels": [],
+        "url_sources": 2,
        "leads_per_month": 0,
-        "inbox_replies": False,        # read-only inbox
-        "leads_editing": False,        # view-only leads
-        "show_branding": True,         # cannot remove badge
+        "inbox_replies": False,
+        "leads_editing": False,
+        "show_branding": True,
        "appointments": False,
        "appointments_chatbots": 0,
        "campaigns": False,
@@ -260,38 +252,38 @@ PLAN_LIMITS = {
        "max_campaign_recipients": 0,
    },
    # ── Starter $19/mo ───────────────────────────────────────────────────────
-    # For solo operators: live chat, leads, booking, and campaigns.
+    # Complete package for individuals and small businesses.
    "starter": {
        "max_chatbots": 999999,
        "max_published": 3,
-        "max_documents_per_chatbot": 10,
-        "max_document_size_mb": 10,
+        "max_documents_per_chatbot": 20,
+        "max_document_size_mb": 20,
        "models": _ALL_FIREWORKS,
-        "conversations_limit": 1500,
+        "conversations_limit": 2000,
        "code_export": False,
        "analytics": True,
        "gap_suggestions": False,
        "channels": ["telegram"],
-        "url_sources": 5,
-        "leads_per_month": 500,
+        "url_sources": 10,
+        "leads_per_month": 999999,
        "inbox_replies": True,
        "leads_editing": True,
-        "show_branding": True,         # badge stays on Starter
+        "show_branding": False,        # branding removal starts at Starter
        "appointments": True,
-        "appointments_chatbots": 1,    # booking on 1 chatbot
+        "appointments_chatbots": 3,
        "campaigns": True,
-        "campaigns_per_month": 3,
-        "max_campaign_recipients": 500,
+        "campaigns_per_month": 5,
+        "max_campaign_recipients": 1000,
    },
    # ── Business $49/mo ──────────────────────────────────────────────────────
-    # For growing businesses: premium AI, unlimited booking, full analytics.
+    # Same features as Starter + premium AI models and serious scale.
    "business": {
        "max_chatbots": 999999,
        "max_published": 10,
-        "max_documents_per_chatbot": 50,
-        "max_document_size_mb": 50,
+        "max_documents_per_chatbot": 100,
+        "max_document_size_mb": 100,
        "models": _ALL_FIREWORKS + _ALL_PREMIUM,
-        "conversations_limit": 5000,
+        "conversations_limit": 8000,
        "code_export": False,
        "analytics": True,
        "gap_suggestions": True,
@@ -300,7 +292,7 @@ PLAN_LIMITS = {
        "leads_per_month": 999999,
        "inbox_replies": True,
        "leads_editing": True,
-        "show_branding": False,        # can remove badge
+        "show_branding": False,
        "appointments": True,
        "appointments_chatbots": 999999,
        "campaigns": True,
@@ -308,18 +300,18 @@ PLAN_LIMITS = {
        "max_campaign_recipients": 5000,
    },
    # ── Agency $99/mo ────────────────────────────────────────────────────────
-    # For agencies: unlimited everything, unlimited campaign recipients.
+    # For heavy users and agencies. Reserved tier for WhatsApp and future channels.
    "agency": {
        "max_chatbots": 999999,
        "max_published": 999999,
        "max_documents_per_chatbot": 999999,
-        "max_document_size_mb": 200,
+        "max_document_size_mb": 500,
        "models": _ALL_FIREWORKS + _ALL_PREMIUM,
-        "conversations_limit": 20000,
+        "conversations_limit": 25000,
        "code_export": True,
        "analytics": True,
        "gap_suggestions": True,
-        "channels": ["telegram"],
+        "channels": ["telegram"],      # whatsapp added here when ready
        "url_sources": 999999,
        "leads_per_month": 999999,
        "inbox_replies": True,
--- a/app/main.py
+++ b/app/main.py
@@ -1,3 +1,4 @@
+import asyncio
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
@@ -24,10 +25,30 @@ async def lifespan(app: FastAPI):
    logger.info("Contexta API starting up...")
    logger.info(f"Environment: {settings.app_env}")
    logger.info(f"Allowed origins: {settings.allowed_origins_list}")
+    asyncio.create_task(_requeue_pending_url_sources())
    yield
    logger.info("Contexta API shutting down...")


+# Strong references to fire-and-forget tasks. The event loop only keeps weak
+# references to tasks, so an unreferenced task may be garbage-collected before
+# it finishes.
+_background_tasks: set[asyncio.Task] = set()
+
+
+async def _requeue_pending_url_sources():
+    """Re-queue any url_sources stuck in pending/processing from a previous crash.
+
+    Selects every ``url_sources`` row whose status is ``pending`` or
+    ``processing`` and schedules ``_process_url_source`` for each one as a
+    background task. Best-effort: any exception is logged as a warning and
+    swallowed, so it never propagates to the caller.
+    """
+    try:
+        # Imported at call time rather than at module import time.
+        from app.database import get_supabase
+        from app.routers.documents import _process_url_source
+        supabase = get_supabase()
+        stuck = supabase.table("url_sources") \
+            .select("id, url, chatbot_id") \
+            .in_("status", ["pending", "processing"]) \
+            .execute()
+        if not stuck.data:
+            return
+        logger.info(f"Re-queuing {len(stuck.data)} stuck URL source(s) from previous run")
+        for src in stuck.data:
+            task = asyncio.create_task(
+                _process_url_source(src["id"], src["url"], src["chatbot_id"], supabase)
+            )
+            # Hold the task until it completes, then drop the reference.
+            _background_tasks.add(task)
+            task.add_done_callback(_background_tasks.discard)
+    except Exception as e:
+        logger.warning(f"Failed to re-queue pending URL sources: {e}")
+
+
 # ── App ──────────────────────────────────────────────────────────────────────────
 app = FastAPI(
    title="Contexta API",
--- a/app/routers/billing.py
+++ b/app/routers/billing.py
@@ -103,8 +103,9 @@ async def stripe_webhook(

        if event_type == "checkout.session.completed":
            session = event.data.object
-            user_id = (session.metadata or {}).get("user_id")
-            plan = (session.metadata or {}).get("plan", "starter")
+            metadata = session.metadata
+            user_id = getattr(metadata, "user_id", None) if metadata else None
+            plan = getattr(metadata, "plan", "starter") if metadata else "starter"
            customer_id = session.customer
            subscription_id = session.subscription

@@ -166,7 +167,7 @@ async def stripe_webhook(
    except HTTPException:
        raise
    except Exception as e:
-        logger.error(f"Webhook error: {e}")
+        logger.exception(f"Webhook error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


--- a/app/routers/documents.py
+++ b/app/routers/documents.py
@@ -404,7 +404,7 @@ async def refresh_url_source(
        "status": "pending",
        "error_message": None,
        "chunk_count": 0,
-    }).eq("id", source_id).returning("representation").execute()
+    }).eq("id", source_id).execute()

    background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot_id, supabase)

--- a/app/services/llm.py
+++ b/app/services/llm.py
@@ -9,18 +9,13 @@ logger = logging.getLogger(__name__)
 # Fireworks models are used for free/starter plans so they must always be available.
 # llama-v3p3-70b-instruct is the guaranteed last resort (confirmed working).
 _FIREWORKS_FALLBACKS = [
-    "accounts/fireworks/models/kimi-k2p5-instruct",
-    "accounts/fireworks/models/deepseek-v3p2",
+    "accounts/fireworks/models/kimi-k2-instruct-0905",
+    "accounts/fireworks/models/gpt-oss-120b",
    "accounts/fireworks/models/llama-v3p3-70b-instruct",
 ]


 def _normalize_model(model: str) -> str:
-    """Strip date-based version suffixes from Fireworks model IDs.
-    e.g. 'accounts/fireworks/models/kimi-k2-instruct-0905' → 'accounts/fireworks/models/kimi-k2-instruct'
-    Matches only purely-numeric suffixes (4–8 digits) so names like 'llama-v3p3-70b' are untouched."""
-    if model.startswith("accounts/fireworks/") or model.startswith("fireworks/"):
-        model = re.sub(r"-\d{4,8}$", "", model)
    return model


--- a/app/services/web_scraper.py
+++ b/app/services/web_scraper.py
@@ -1,6 +1,5 @@
 import httpx
 import logging
-from typing import Optional

 logger = logging.getLogger(__name__)

@@ -10,15 +9,51 @@ MAX_TEXT_BYTES = 100 * 1024  # 100KB
 async def scrape_url(url: str) -> dict:
    """
    Fetch a URL and extract clean text content.
+    Tries Playwright (headless Chromium) first so JS-rendered pages work;
+    if that attempt returns an error, falls back to a direct httpx fetch.
    Returns: {title, text, url} or {error, url}
    """
+    result = await _scrape_via_playwright(url)
+    if "error" not in result:
+        return result
+    logger.warning(f"Playwright scrape failed for {url}: {result['error']} — falling back to direct fetch")
+    return await _scrape_direct(url)
+
+
+async def _scrape_via_playwright(url: str) -> dict:
+    """Headless Chromium scrape — handles JS-rendered SPAs.
+
+    Opens ``url`` in a new headless Chromium page, waits until the network is
+    idle (30 s timeout), removes nav/header/footer/script/style/noscript/aside
+    elements and reads the remaining ``innerText`` of ``<body>``.
+
+    Returns: {title, text, url} on success, or {error, url} when no text is
+    left after cleaning or any exception is raised (message cut to 200 chars).
+    """
+    try:
+        from playwright.async_api import async_playwright
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            try:
+                page = await browser.new_page()
+                await page.goto(url, wait_until="networkidle", timeout=30000)
+
+                title = await page.title()
+                text = await page.evaluate("""() => {
+                document.querySelectorAll('nav, header, footer, script, style, noscript, aside').forEach(e => e.remove());
+                return document.body ? document.body.innerText.trim() : '';
+            }""")
+            finally:
+                # Close the browser even when navigation or evaluation fails.
+                await browser.close()
+
+        text = _clean_text(text)
+        if not text:
+            return {"error": "No text content found on page", "url": url}
+
+        logger.info(f"Scraped {url} via Playwright: {len(text)} chars, title='{title}'")
+        return {"title": title, "text": text, "url": url}
+
+    except Exception as e:
+        return {"error": str(e)[:200], "url": url}
+
+
+async def _scrape_direct(url: str) -> dict:
+    """Direct httpx scrape — works for server-rendered pages."""
    try:
        from bs4 import BeautifulSoup

-        headers = {
-            "User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)",
-        }
-
+        headers = {"User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)"}
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()
@@ -27,46 +62,20 @@ async def scrape_url(url: str) -> dict:
        if "text/html" not in content_type and "text/plain" not in content_type:
            return {"error": f"Unsupported content type: {content_type}", "url": url}

-        html = response.text
-        soup = BeautifulSoup(html, "html.parser")
-
-        # Extract title
+        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else ""

-        # Remove unwanted tags
-        for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside", "advertisement"]):
+        for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside"]):
            tag.decompose()

-        # Extract main content (prefer article/main/body)
        main = soup.find("main") or soup.find("article") or soup.find("body") or soup
-        text = main.get_text(separator="\n", strip=True)
-
-        # Clean up whitespace and filter structural noise
-        seen_lines: set[str] = set()
-        clean_lines = []
-        for line in text.splitlines():
-            line = line.strip()
-            if not line:
-                continue
-            # Skip very short lines (nav items, button labels, breadcrumb separators)
-            if len(line) < 15:
-                continue
-            # Skip duplicate lines (nav/footer repeated across sections)
-            if line in seen_lines:
-                continue
-            seen_lines.add(line)
-            clean_lines.append(line)
-        text = "\n".join(clean_lines)
-
-        # Limit size
-        if len(text.encode("utf-8")) > MAX_TEXT_BYTES:
-            text = text[:MAX_TEXT_BYTES].rsplit("\n", 1)[0]
+        text = _clean_text(main.get_text(separator="\n", strip=True))

        if not text:
            return {"error": "No text content found on page", "url": url}

-        logger.info(f"Scraped {url}: {len(text)} chars, title='{title}'")
+        logger.info(f"Scraped {url} directly: {len(text)} chars, title='{title}'")
        return {"title": title, "text": text, "url": url}

    except httpx.TimeoutException:
@@ -76,3 +85,18 @@ async def scrape_url(url: str) -> dict:
    except Exception as e:
        logger.error(f"Scrape error for {url}: {e}")
        return {"error": str(e)[:200], "url": url}
+
+
+def _clean_text(text: str) -> str:
+    """Strip lines, drop noise and duplicates, and cap the size of scraped text.
+
+    Each line is stripped; lines shorter than 5 characters (blank ones
+    included) and lines already seen are dropped. If the joined result is
+    larger than MAX_TEXT_BYTES once UTF-8 encoded, it is cut to that byte
+    budget and then back to the last complete line.
+    """
+    seen: set[str] = set()
+    lines = []
+    for line in text.splitlines():
+        line = line.strip()
+        if len(line) < 5 or line in seen:
+            continue
+        seen.add(line)
+        lines.append(line)
+    text = "\n".join(lines)
+    encoded = text.encode("utf-8")
+    if len(encoded) > MAX_TEXT_BYTES:
+        # Slice the encoded bytes, not the str: slicing by characters lets
+        # multi-byte text stay well over the byte limit. errors="ignore"
+        # drops a code point that the cut split in half.
+        truncated = encoded[:MAX_TEXT_BYTES].decode("utf-8", errors="ignore")
+        text = truncated.rsplit("\n", 1)[0]
+    return text
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
    "supabase>=2.28.0",
    "uvicorn>=0.41.0",
    "beautifulsoup4>=4.12.0",
+    "playwright>=1.40.0",
    "httpx>=0.27.0",
    "anthropic>=0.40.0",
    "google-generativeai>=0.8.0",