diff --git a/.gitignore b/.gitignore index de81240..773ded9 100644 --- a/.gitignore +++ b/.gitignore @@ -73,4 +73,6 @@ alembic/versions/*.pyc # Temporary files *.tmp -*.temp \ No newline at end of file +*.temp + +.claude \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index ba46823..b3f1d9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,15 +1,15 @@ -FROM python:3.12-alpine +FROM python:3.12-slim WORKDIR /app -# Install ALL required build tools -RUN apk add --no-cache \ - gcc \ - g++ \ - musl-dev \ - python3-dev \ - cmake \ - make +# System deps for build tools + Playwright/Chromium +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc g++ make cmake \ + # Playwright Chromium dependencies + libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \ + libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \ + libgbm1 libasound2 libpango-1.0-0 libcairo2 libatspi2.0-0 \ + && rm -rf /var/lib/apt/lists/* RUN pip install uv @@ -19,8 +19,11 @@ COPY uv.lock . RUN uv pip install --system "pydantic[email]" RUN uv pip install --system . +# Install Playwright's Chromium browser +RUN playwright install chromium + COPY . . EXPOSE 8000 -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/app/config.py b/app/config.py index 6ae3aa1..e44e8a8 100644 --- a/app/config.py +++ b/app/config.py @@ -100,23 +100,17 @@ MODEL_CATALOG = { "badge": "Smart", "description": "Cost-effective and highly capable model", }, - "accounts/fireworks/models/deepseek-v3p2": { - "name": "DeepSeek V3.2", - "provider": "Fireworks AI", - "badge": "Smart", - "description": "Latest DeepSeek — faster and more capable", - }, - "accounts/fireworks/models/kimi-k2-instruct": { + "accounts/fireworks/models/kimi-k2-instruct-0905": { "name": "Kimi K2", "provider": "Fireworks AI", "badge": "Multilingual", "description": "Strong multilingual and coding capabilities", }, - "accounts/fireworks/models/kimi-k2p5-instruct": { - "name": "Kimi K2.5", + "accounts/fireworks/models/gpt-oss-120b": { + "name": "GPT OSS 120B", "provider": "Fireworks AI", - "badge": "Multilingual", - "description": "Upgraded Kimi — stronger reasoning and multilingual", + "badge": "Powerful", + "description": "Large open-source model with strong reasoning", }, # ── Pro tier (Premium providers) ─────────────────────────────────────────── @@ -169,9 +163,8 @@ MODEL_PROVIDERS = { "accounts/fireworks/models/llama-v3p3-70b-instruct": "fireworks", "accounts/fireworks/models/qwen3-235b-a22b": "fireworks", "accounts/fireworks/models/deepseek-v3p1": "fireworks", - "accounts/fireworks/models/deepseek-v3p2": "fireworks", - "accounts/fireworks/models/kimi-k2-instruct": "fireworks", - "accounts/fireworks/models/kimi-k2p5-instruct": "fireworks", + "accounts/fireworks/models/kimi-k2-instruct-0905": "fireworks", + "accounts/fireworks/models/gpt-oss-120b": "fireworks", # OpenAI "gpt-4o": "openai", "gpt-4o-mini": "openai", @@ -224,9 +217,8 @@ _ALL_FIREWORKS = [ "accounts/fireworks/models/llama-v3p3-70b-instruct", "accounts/fireworks/models/qwen3-235b-a22b", "accounts/fireworks/models/deepseek-v3p1", - "accounts/fireworks/models/deepseek-v3p2", - "accounts/fireworks/models/kimi-k2-instruct", - "accounts/fireworks/models/kimi-k2p5-instruct", + "accounts/fireworks/models/kimi-k2-instruct-0905", + "accounts/fireworks/models/gpt-oss-120b", ] _ALL_PREMIUM = [ "gpt-4o", "gpt-4o-mini", @@ -236,23 +228,23 @@ _ALL_PREMIUM = [ PLAN_LIMITS = { # ── Free ───────────────────────────────────────────────────────────────── - # Build, test, and go live with one chatbot — no card needed. + # Generous enough to validate the product, limited enough to drive upgrades. "free": { "max_chatbots": 999999, - "max_published": 1, # can publish 1 chatbot - "max_documents_per_chatbot": 3, + "max_published": 1, + "max_documents_per_chatbot": 5, "max_document_size_mb": 5, "models": ["accounts/fireworks/models/llama-v3p3-70b-instruct"], - "conversations_limit": 100, # 100 real conversations/month + "conversations_limit": 300, "code_export": False, "analytics": False, "gap_suggestions": False, - "channels": [], # no messaging channels - "url_sources": 0, + "channels": [], + "url_sources": 2, "leads_per_month": 0, - "inbox_replies": False, # read-only inbox - "leads_editing": False, # view-only leads - "show_branding": True, # cannot remove badge + "inbox_replies": False, + "leads_editing": False, + "show_branding": True, "appointments": False, "appointments_chatbots": 0, "campaigns": False, @@ -260,38 +252,38 @@ PLAN_LIMITS = { "max_campaign_recipients": 0, }, # ── Starter $19/mo ─────────────────────────────────────────────────────── - # For solo operators: live chat, leads, booking, and campaigns. + # Complete package for individuals and small businesses. "starter": { "max_chatbots": 999999, "max_published": 3, - "max_documents_per_chatbot": 10, - "max_document_size_mb": 10, + "max_documents_per_chatbot": 20, + "max_document_size_mb": 20, "models": _ALL_FIREWORKS, - "conversations_limit": 1500, + "conversations_limit": 2000, "code_export": False, "analytics": True, "gap_suggestions": False, "channels": ["telegram"], - "url_sources": 5, - "leads_per_month": 500, + "url_sources": 10, + "leads_per_month": 999999, "inbox_replies": True, "leads_editing": True, - "show_branding": True, # badge stays on Starter + "show_branding": False, # branding removal starts at Starter "appointments": True, - "appointments_chatbots": 1, # booking on 1 chatbot + "appointments_chatbots": 3, "campaigns": True, - "campaigns_per_month": 3, - "max_campaign_recipients": 500, + "campaigns_per_month": 5, + "max_campaign_recipients": 1000, }, # ── Business $49/mo ────────────────────────────────────────────────────── - # For growing businesses: premium AI, unlimited booking, full analytics. + # Same features as Starter + premium AI models and serious scale. "business": { "max_chatbots": 999999, "max_published": 10, - "max_documents_per_chatbot": 50, - "max_document_size_mb": 50, + "max_documents_per_chatbot": 100, + "max_document_size_mb": 100, "models": _ALL_FIREWORKS + _ALL_PREMIUM, - "conversations_limit": 5000, + "conversations_limit": 8000, "code_export": False, "analytics": True, "gap_suggestions": True, @@ -300,7 +292,7 @@ PLAN_LIMITS = { "leads_per_month": 999999, "inbox_replies": True, "leads_editing": True, - "show_branding": False, # can remove badge + "show_branding": False, "appointments": True, "appointments_chatbots": 999999, "campaigns": True, @@ -308,18 +300,18 @@ PLAN_LIMITS = { "max_campaign_recipients": 5000, }, # ── Agency $99/mo ──────────────────────────────────────────────────────── - # For agencies: unlimited everything, unlimited campaign recipients. + # For heavy users and agencies. Reserved tier for WhatsApp and future channels. "agency": { "max_chatbots": 999999, "max_published": 999999, "max_documents_per_chatbot": 999999, - "max_document_size_mb": 200, + "max_document_size_mb": 500, "models": _ALL_FIREWORKS + _ALL_PREMIUM, - "conversations_limit": 20000, + "conversations_limit": 25000, "code_export": True, "analytics": True, "gap_suggestions": True, - "channels": ["telegram"], + "channels": ["telegram"], # whatsapp added here when ready "url_sources": 999999, "leads_per_month": 999999, "inbox_replies": True, diff --git a/app/main.py b/app/main.py index 314b1e5..6e0595d 100644 --- a/app/main.py +++ b/app/main.py @@ -1,3 +1,4 @@ +import asyncio from contextlib import asynccontextmanager from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware @@ -24,10 +25,30 @@ async def lifespan(app: FastAPI): logger.info("Contexta API starting up...") logger.info(f"Environment: {settings.app_env}") logger.info(f"Allowed origins: {settings.allowed_origins_list}") + asyncio.create_task(_requeue_pending_url_sources()) yield logger.info("Contexta API shutting down...") +async def _requeue_pending_url_sources(): + """Re-queue any url_sources stuck in pending/processing from a previous crash.""" + try: + from app.database import get_supabase + from app.routers.documents import _process_url_source + supabase = get_supabase() + stuck = supabase.table("url_sources") \ + .select("id, url, chatbot_id") \ + .in_("status", ["pending", "processing"]) \ + .execute() + if not stuck.data: + return + logger.info(f"Re-queuing {len(stuck.data)} stuck URL source(s) from previous run") + for src in stuck.data: + asyncio.create_task(_process_url_source(src["id"], src["url"], src["chatbot_id"], supabase)) + except Exception as e: + logger.warning(f"Failed to re-queue pending URL sources: {e}") + + # ── App ────────────────────────────────────────────────────────────────────────── app = FastAPI( title="Contexta API", diff --git a/app/routers/billing.py b/app/routers/billing.py index 6ca296c..3dd9474 100644 --- a/app/routers/billing.py +++ b/app/routers/billing.py @@ -103,8 +103,9 @@ async def stripe_webhook( if event_type == "checkout.session.completed": session = event.data.object - user_id = (session.metadata or {}).get("user_id") - plan = (session.metadata or {}).get("plan", "starter") + metadata = session.metadata + user_id = getattr(metadata, "user_id", None) if metadata else None + plan = getattr(metadata, "plan", "starter") if metadata else "starter" customer_id = session.customer subscription_id = session.subscription @@ -166,7 +167,7 @@ async def stripe_webhook( except HTTPException: raise except Exception as e: - logger.error(f"Webhook error: {e}") + logger.exception(f"Webhook error: {e}") raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/routers/documents.py b/app/routers/documents.py index d53c7cd..3681a56 100644 --- a/app/routers/documents.py +++ b/app/routers/documents.py @@ -404,7 +404,7 @@ async def refresh_url_source( "status": "pending", "error_message": None, "chunk_count": 0, - }).eq("id", source_id).returning("representation").execute() + }).eq("id", source_id).execute() background_tasks.add_task(_process_url_source, source_id, src["url"], chatbot_id, supabase) diff --git a/app/services/llm.py b/app/services/llm.py index 8fb256a..707ba63 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -9,18 +9,13 @@ logger = logging.getLogger(__name__) # Fireworks models are used for free/starter plans so they must always be available. # llama-v3p3-70b-instruct is the guaranteed last resort (confirmed working). _FIREWORKS_FALLBACKS = [ - "accounts/fireworks/models/kimi-k2p5-instruct", - "accounts/fireworks/models/deepseek-v3p2", + "accounts/fireworks/models/kimi-k2-instruct-0905", + "accounts/fireworks/models/gpt-oss-120b", "accounts/fireworks/models/llama-v3p3-70b-instruct", ] def _normalize_model(model: str) -> str: - """Strip date-based version suffixes from Fireworks model IDs. - e.g. 'accounts/fireworks/models/kimi-k2-instruct-0905' → 'accounts/fireworks/models/kimi-k2-instruct' - Matches only purely-numeric suffixes (4–8 digits) so names like 'llama-v3p3-70b' are untouched.""" - if model.startswith("accounts/fireworks/") or model.startswith("fireworks/"): - model = re.sub(r"-\d{4,8}$", "", model) return model diff --git a/app/services/web_scraper.py b/app/services/web_scraper.py index 2a12ffd..7a38cca 100644 --- a/app/services/web_scraper.py +++ b/app/services/web_scraper.py @@ -1,6 +1,5 @@ import httpx import logging -from typing import Optional logger = logging.getLogger(__name__) @@ -10,15 +9,51 @@ MAX_TEXT_BYTES = 100 * 1024 # 100KB async def scrape_url(url: str) -> dict: """ Fetch a URL and extract clean text content. + Uses Playwright (headless Chromium) to handle JS-rendered pages, + falls back to direct httpx for simple static pages. Returns: {title, text, url} or {error, url} """ + result = await _scrape_via_playwright(url) + if "error" not in result: + return result + logger.warning(f"Playwright scrape failed for {url}: {result['error']} — falling back to direct fetch") + return await _scrape_direct(url) + + +async def _scrape_via_playwright(url: str) -> dict: + """Headless Chromium scrape — handles JS-rendered SPAs.""" + try: + from playwright.async_api import async_playwright + + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + await page.goto(url, wait_until="networkidle", timeout=30000) + + title = await page.title() + text = await page.evaluate("""() => { + document.querySelectorAll('nav, header, footer, script, style, noscript, aside').forEach(e => e.remove()); + return document.body ? document.body.innerText.trim() : ''; + }""") + await browser.close() + + text = _clean_text(text) + if not text: + return {"error": "No text content found on page", "url": url} + + logger.info(f"Scraped {url} via Playwright: {len(text)} chars, title='{title}'") + return {"title": title, "text": text, "url": url} + + except Exception as e: + return {"error": str(e)[:200], "url": url} + + +async def _scrape_direct(url: str) -> dict: + """Direct httpx scrape — works for server-rendered pages.""" try: from bs4 import BeautifulSoup - headers = { - "User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)", - } - + headers = {"User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)"} async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client: response = await client.get(url, headers=headers) response.raise_for_status() @@ -27,46 +62,20 @@ async def scrape_url(url: str) -> dict: if "text/html" not in content_type and "text/plain" not in content_type: return {"error": f"Unsupported content type: {content_type}", "url": url} - html = response.text - soup = BeautifulSoup(html, "html.parser") - - # Extract title + soup = BeautifulSoup(response.text, "html.parser") title_tag = soup.find("title") title = title_tag.get_text(strip=True) if title_tag else "" - # Remove unwanted tags - for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside", "advertisement"]): + for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside"]): tag.decompose() - # Extract main content (prefer article/main/body) main = soup.find("main") or soup.find("article") or soup.find("body") or soup - text = main.get_text(separator="\n", strip=True) - - # Clean up whitespace and filter structural noise - seen_lines: set[str] = set() - clean_lines = [] - for line in text.splitlines(): - line = line.strip() - if not line: - continue - # Skip very short lines (nav items, button labels, breadcrumb separators) - if len(line) < 15: - continue - # Skip duplicate lines (nav/footer repeated across sections) - if line in seen_lines: - continue - seen_lines.add(line) - clean_lines.append(line) - text = "\n".join(clean_lines) - - # Limit size - if len(text.encode("utf-8")) > MAX_TEXT_BYTES: - text = text[:MAX_TEXT_BYTES].rsplit("\n", 1)[0] + text = _clean_text(main.get_text(separator="\n", strip=True)) if not text: return {"error": "No text content found on page", "url": url} - logger.info(f"Scraped {url}: {len(text)} chars, title='{title}'") + logger.info(f"Scraped {url} directly: {len(text)} chars, title='{title}'") return {"title": title, "text": text, "url": url} except httpx.TimeoutException: @@ -76,3 +85,18 @@ async def scrape_url(url: str) -> dict: except Exception as e: logger.error(f"Scrape error for {url}: {e}") return {"error": str(e)[:200], "url": url} + + +def _clean_text(text: str) -> str: + seen: set[str] = set() + lines = [] + for line in text.splitlines(): + line = line.strip() + if not line or len(line) < 5 or line in seen: + continue + seen.add(line) + lines.append(line) + text = "\n".join(lines) + if len(text.encode("utf-8")) > MAX_TEXT_BYTES: + text = text[:MAX_TEXT_BYTES].rsplit("\n", 1)[0] + return text diff --git a/pyproject.toml b/pyproject.toml index d71f877..017ceef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "supabase>=2.28.0", "uvicorn>=0.41.0", "beautifulsoup4>=4.12.0", + "playwright>=1.40.0", "httpx>=0.27.0", "anthropic>=0.40.0", "google-generativeai>=0.8.0",