fixed storage error

This commit is contained in:
belviskhoremk
2026-05-27 23:17:34 +00:00
parent a10c95d8dd
commit fad2b3a9f3
9 changed files with 141 additions and 102 deletions

View File

@@ -9,18 +9,13 @@ logger = logging.getLogger(__name__)
# Fireworks models are used for free/starter plans so they must always be available.
# llama-v3p3-70b-instruct is the guaranteed last resort (confirmed working).
# NOTE(review): entries look ordered from most to least preferred, with the
# last-resort model at the end — confirm against the code that consumes this list.
_FIREWORKS_FALLBACKS = [
"accounts/fireworks/models/kimi-k2p5-instruct",
"accounts/fireworks/models/deepseek-v3p2",
"accounts/fireworks/models/kimi-k2-instruct-0905",
"accounts/fireworks/models/gpt-oss-120b",
"accounts/fireworks/models/llama-v3p3-70b-instruct",
]
def _normalize_model(model: str) -> str:
"""Strip date-based version suffixes from Fireworks model IDs.
e.g. 'accounts/fireworks/models/kimi-k2-instruct-0905''accounts/fireworks/models/kimi-k2-instruct'
Matches only purely-numeric suffixes (48 digits) so names like 'llama-v3p3-70b' are untouched."""
if model.startswith("accounts/fireworks/") or model.startswith("fireworks/"):
model = re.sub(r"-\d{4,8}$", "", model)
return model

View File

@@ -1,6 +1,5 @@
import httpx
import logging
from typing import Optional
logger = logging.getLogger(__name__)
@@ -10,15 +9,51 @@ MAX_TEXT_BYTES = 100 * 1024 # 100KB
async def scrape_url(url: str) -> dict:
    """
    Retrieve a page and return its cleaned text content.

    The Playwright (headless Chromium) path is tried first so JS-rendered
    pages are handled; if it reports an error, the direct httpx fetch is
    used instead.

    Returns: {title, text, url} or {error, url}
    """
    rendered = await _scrape_via_playwright(url)
    if "error" in rendered:
        # Browser path failed: record why, then use the plain HTTP fetch.
        failure = rendered["error"]
        logger.warning(f"Playwright scrape failed for {url}: {failure} — falling back to direct fetch")
        return await _scrape_direct(url)
    return rendered
async def _scrape_via_playwright(url: str) -> dict:
    """Headless Chromium scrape — handles JS-rendered SPAs.

    Loads ``url``, waits for the network to go idle (30 s timeout), removes
    nav/header/footer/script/style/noscript/aside elements in the page, and
    returns the remaining body text after ``_clean_text``.

    Returns:
        ``{"title", "text", "url"}`` on success, or ``{"error", "url"}`` when
        Playwright cannot be imported, the browser/navigation fails, or no
        text is left after cleaning. Exceptions are caught and reported in
        the returned dict (message truncated to 200 characters).
    """
    try:
        # Imported inside the try so a missing Playwright install becomes an
        # error dict rather than an import-time failure of this module.
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until="networkidle", timeout=30000)
                title = await page.title()
                text = await page.evaluate("""() => {
                    document.querySelectorAll('nav, header, footer, script, style, noscript, aside').forEach(e => e.remove());
                    return document.body ? document.body.innerText.trim() : '';
                }""")
            finally:
                # Close the browser on every path; previously a failed
                # navigation or evaluation skipped this call.
                await browser.close()

        text = _clean_text(text)
        if not text:
            return {"error": "No text content found on page", "url": url}

        logger.info(f"Scraped {url} via Playwright: {len(text)} chars, title='{title}'")
        return {"title": title, "text": text, "url": url}
    except Exception as e:
        return {"error": str(e)[:200], "url": url}
async def _scrape_direct(url: str) -> dict:
"""Direct httpx scrape — works for server-rendered pages."""
try:
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)",
}
headers = {"User-Agent": "Mozilla/5.0 (compatible; ContextaBot/1.0; +https://contexta.ai)"}
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
response = await client.get(url, headers=headers)
response.raise_for_status()
@@ -27,46 +62,20 @@ async def scrape_url(url: str) -> dict:
if "text/html" not in content_type and "text/plain" not in content_type:
return {"error": f"Unsupported content type: {content_type}", "url": url}
html = response.text
soup = BeautifulSoup(html, "html.parser")
# Extract title
soup = BeautifulSoup(response.text, "html.parser")
title_tag = soup.find("title")
title = title_tag.get_text(strip=True) if title_tag else ""
# Remove unwanted tags
for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside", "advertisement"]):
for tag in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "aside"]):
tag.decompose()
# Extract main content (prefer article/main/body)
main = soup.find("main") or soup.find("article") or soup.find("body") or soup
text = main.get_text(separator="\n", strip=True)
# Clean up whitespace and filter structural noise
seen_lines: set[str] = set()
clean_lines = []
for line in text.splitlines():
line = line.strip()
if not line:
continue
# Skip very short lines (nav items, button labels, breadcrumb separators)
if len(line) < 15:
continue
# Skip duplicate lines (nav/footer repeated across sections)
if line in seen_lines:
continue
seen_lines.add(line)
clean_lines.append(line)
text = "\n".join(clean_lines)
# Limit size
if len(text.encode("utf-8")) > MAX_TEXT_BYTES:
text = text[:MAX_TEXT_BYTES].rsplit("\n", 1)[0]
text = _clean_text(main.get_text(separator="\n", strip=True))
if not text:
return {"error": "No text content found on page", "url": url}
logger.info(f"Scraped {url}: {len(text)} chars, title='{title}'")
logger.info(f"Scraped {url} directly: {len(text)} chars, title='{title}'")
return {"title": title, "text": text, "url": url}
except httpx.TimeoutException:
@@ -76,3 +85,18 @@ async def scrape_url(url: str) -> dict:
except Exception as e:
logger.error(f"Scrape error for {url}: {e}")
return {"error": str(e)[:200], "url": url}
def _clean_text(text: str) -> str:
seen: set[str] = set()
lines = []
for line in text.splitlines():
line = line.strip()
if not line or len(line) < 5 or line in seen:
continue
seen.add(line)
lines.append(line)
text = "\n".join(lines)
if len(text.encode("utf-8")) > MAX_TEXT_BYTES:
text = text[:MAX_TEXT_BYTES].rsplit("\n", 1)[0]
return text