mirror of
http://88.130.71.182:3000/BlitTech/contexta_be.git
synced 2026-06-12 23:23:21 +00:00
fixed the RAG in test pipeline issue
This commit is contained in:
@@ -42,9 +42,22 @@ async def scrape_url(url: str) -> dict:
|
||||
main = soup.find("main") or soup.find("article") or soup.find("body") or soup
|
||||
text = main.get_text(separator="\n", strip=True)
|
||||
|
||||
# Clean up whitespace
|
||||
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
||||
text = "\n".join(lines)
|
||||
# Clean up whitespace and filter structural noise
|
||||
seen_lines: set[str] = set()
|
||||
clean_lines = []
|
||||
for line in text.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
# Skip very short lines (nav items, button labels, breadcrumb separators)
|
||||
if len(line) < 15:
|
||||
continue
|
||||
# Skip duplicate lines (nav/footer repeated across sections)
|
||||
if line in seen_lines:
|
||||
continue
|
||||
seen_lines.add(line)
|
||||
clean_lines.append(line)
|
||||
text = "\n".join(clean_lines)
|
||||
|
||||
# Limit size
|
||||
if len(text.encode("utf-8")) > MAX_TEXT_BYTES:
|
||||
|
||||
Reference in New Issue
Block a user