Fixed the RAG issue in the test pipeline

This commit is contained in:
belviskhoremk
2026-04-26 21:43:19 +00:00
parent 78023ae9c5
commit 260a9c6353
9 changed files with 262 additions and 78 deletions

View File

@@ -42,9 +42,22 @@ async def scrape_url(url: str) -> dict:
main = soup.find("main") or soup.find("article") or soup.find("body") or soup
text = main.get_text(separator="\n", strip=True)
# Clean up whitespace
lines = [line.strip() for line in text.splitlines() if line.strip()]
text = "\n".join(lines)
# Clean up whitespace and filter structural noise
seen_lines: set[str] = set()
clean_lines = []
for line in text.splitlines():
line = line.strip()
if not line:
continue
# Skip very short lines (nav items, button labels, breadcrumb separators)
if len(line) < 15:
continue
# Skip duplicate lines (nav/footer repeated across sections)
if line in seen_lines:
continue
seen_lines.add(line)
clean_lines.append(line)
text = "\n".join(clean_lines)
# Limit size
if len(text.encode("utf-8")) > MAX_TEXT_BYTES: