Files
contexta_be/app/services/document_processor.py
belviskhoremk 5bd496d355 Initial commit
2026-02-22 21:59:37 +00:00

222 lines
6.8 KiB
Python

import io
import logging
from typing import List, Dict, Any, Tuple
from pathlib import Path
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Default chunk length, expressed in approximate tokens. chunk_text() converts
# it to a character budget using the rough estimate of 4 characters per token.
CHUNK_SIZE = 512 # tokens approximate (chars ÷ 4)
# Default overlap between consecutive chunks, in the same approximate-token
# unit as CHUNK_SIZE (chunk_text() also multiplies it by 4).
CHUNK_OVERLAP = 50
def parse_pdf(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Extract the text of every page of a PDF.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        One ``{"text", "page_number"}`` dict per page that yields non-blank
        text. Page numbers are 1-based; pages without text are skipped.

    Raises:
        ValueError: If the import of ``pypdf`` or any parsing step fails.
    """
    try:
        import pypdf

        document = pypdf.PdfReader(io.BytesIO(file_bytes))
        extracted = []
        for number, pdf_page in enumerate(document.pages, start=1):
            # extract_text() may return None; treat that as an empty page.
            content = (pdf_page.extract_text() or "").strip()
            if not content:
                continue
            extracted.append({"text": content, "page_number": number})
        return extracted
    except Exception as e:
        logger.error(f"PDF parse error: {e}")
        raise ValueError(f"Failed to parse PDF: {str(e)}")
def parse_docx(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Split a DOCX document into heading-delimited sections.

    Blank paragraphs are dropped. A paragraph whose style name starts with
    "Heading" closes the section collected so far and becomes the first line
    of the next one. The section counter is stored under ``page_number``.

    Args:
        file_bytes: Raw contents of the DOCX file.

    Returns:
        A list of ``{"text", "page_number"}`` dicts, or a single entry with
        empty text when the document has no non-blank paragraphs.

    Raises:
        ValueError: If the import of ``docx`` or any parsing step fails.
    """
    try:
        from docx import Document

        document = Document(io.BytesIO(file_bytes))
        collected = []
        buffer = []
        section_number = 1
        for paragraph in document.paragraphs:
            line = paragraph.text.strip()
            if line:
                starts_section = paragraph.style.name.startswith("Heading")
                # Flush only when something was gathered, so a heading at the
                # very top of the document does not create an empty section.
                if starts_section and buffer:
                    collected.append(
                        {"text": "\n".join(buffer), "page_number": section_number}
                    )
                    buffer = []
                    section_number += 1
                buffer.append(line)
        if buffer:
            collected.append({"text": "\n".join(buffer), "page_number": section_number})
        if not collected:
            return [{"text": "", "page_number": 1}]
        return collected
    except Exception as e:
        logger.error(f"DOCX parse error: {e}")
        raise ValueError(f"Failed to parse DOCX: {str(e)}")
def parse_csv(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse a CSV file into text chunks of up to ten rows each.

    Every row is rendered as ``"col: value | col: value"`` with missing cells
    omitted; the rows of one batch are joined with newlines.

    Args:
        file_bytes: Raw contents of the CSV file.

    Returns:
        A list of ``{"text", "page_number"}`` dicts, where ``page_number`` is
        the 1-based index of the ten-row batch. Rows with no values and
        batches with no non-empty rows are skipped.

    Raises:
        ValueError: If pandas cannot be imported or the CSV cannot be parsed.
    """
    try:
        import pandas as pd

        df = pd.read_csv(io.BytesIO(file_bytes))
        columns = list(df.columns)
        chunks = []
        batch_size = 10
        for start in range(0, len(df), batch_size):
            batch = df.iloc[start : start + batch_size]
            rows_text = []
            # itertuples() keeps each column's own dtype. iterrows() would
            # upcast a whole row to one dtype, rendering the integer 1 as
            # "1.0" whenever the row also holds a float.
            for row in batch.itertuples(index=False, name=None):
                row_parts = [
                    f"{col}: {val}"
                    for col, val in zip(columns, row)
                    if pd.notna(val)
                ]
                # Skip rows where every cell is missing instead of emitting
                # a blank line.
                if row_parts:
                    rows_text.append(" | ".join(row_parts))
            if rows_text:
                chunks.append(
                    {
                        "text": "\n".join(rows_text),
                        "page_number": (start // batch_size) + 1,
                    }
                )
        return chunks
    except Exception as e:
        logger.error(f"CSV parse error: {e}")
        raise ValueError(f"Failed to parse CSV: {str(e)}") from e
def parse_xlsx(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse an Excel workbook into text chunks of up to ten rows per sheet.

    Each chunk starts with a ``"Sheet: <name>"`` line followed by one
    ``"col: value | col: value"`` line per row, with missing cells omitted.

    Args:
        file_bytes: Raw contents of the workbook.

    Returns:
        A list of ``{"text", "page_number"}`` dicts. ``page_number`` is a
        running 1-based counter across all sheets. Batches without any
        non-empty row produce no chunk.

    Raises:
        ValueError: If pandas cannot be imported or the workbook cannot be
            parsed.
    """
    try:
        import pandas as pd

        chunks = []
        page_num = 1
        batch_size = 10
        # The context manager closes the workbook handle even when a sheet
        # fails to parse.
        with pd.ExcelFile(io.BytesIO(file_bytes)) as xl:
            for sheet_name in xl.sheet_names:
                df = xl.parse(sheet_name)
                columns = list(df.columns)
                for start in range(0, len(df), batch_size):
                    batch = df.iloc[start : start + batch_size]
                    data_rows = []
                    # itertuples() keeps per-column dtypes; iterrows() would
                    # upcast integers to floats in mixed numeric rows.
                    for row in batch.itertuples(index=False, name=None):
                        row_parts = [
                            f"{col}: {val}"
                            for col, val in zip(columns, row)
                            # notna() covers NaN, NaT and None without
                            # dropping cells whose text happens to be "None".
                            if pd.notna(val)
                        ]
                        if row_parts:
                            data_rows.append(" | ".join(row_parts))
                    # Emit only when there is real data: the sheet header
                    # alone must not become a chunk.
                    if data_rows:
                        text = "\n".join([f"Sheet: {sheet_name}", *data_rows])
                        chunks.append({"text": text, "page_number": page_num})
                        page_num += 1
        return chunks
    except Exception as e:
        logger.error(f"XLSX parse error: {e}")
        raise ValueError(f"Failed to parse XLSX: {str(e)}") from e
def parse_txt(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse plain text into blank-line-separated sections.

    Args:
        file_bytes: Raw contents of the text file, decoded as UTF-8.
            Undecodable bytes are dropped.

    Returns:
        One ``{"text", "page_number"}`` dict per non-blank section, numbered
        from 1. Input with no non-blank content yields a single entry with
        empty text.

    Raises:
        ValueError: If the input cannot be decoded or split.
    """
    try:
        # "utf-8-sig" decodes exactly like UTF-8 but drops a leading
        # byte-order mark, which would otherwise stick to the first section
        # (str.strip() does not remove U+FEFF).
        text = file_bytes.decode("utf-8-sig", errors="ignore")
        # Normalise Windows and old-Mac line endings so that "\r\n\r\n"
        # paragraph breaks are recognised by the "\n\n" split below.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        # Split into sections by double newlines
        sections = [s.strip() for s in text.split("\n\n") if s.strip()]
        if not sections:
            # Nothing but whitespace: keep returning one empty section.
            sections = [""]
        return [{"text": s, "page_number": i + 1} for i, s in enumerate(sections)]
    except Exception as e:
        raise ValueError(f"Failed to parse TXT: {str(e)}") from e
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """Split text into overlapping chunks.

    Sizes are given in approximate tokens and converted to characters with
    the estimate of 4 characters per token. Each chunk is cut at the last
    sentence end, newline or space found in the second half of its window,
    when there is one.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in approximate tokens; must be > 0.
        overlap: Approximate tokens shared between consecutive chunks;
            must be >= 0.

    Returns:
        ``[text]`` unchanged when it fits in one chunk, otherwise the list
        of stripped, non-empty chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            negative.
    """
    # A zero-width window can never advance through the text, and a negative
    # overlap would skip characters between chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    # Approximate token count: 1 token ≈ 4 chars
    char_size = chunk_size * 4
    char_overlap = overlap * 4
    text_len = len(text)
    if text_len <= char_size:
        return [text]

    chunks = []
    start = 0
    while start < text_len:
        end = min(start + char_size, text_len)
        # Try to break at sentence boundary
        if end < text_len:
            for sep in (". ", "! ", "? ", "\n", " "):
                pos = text.rfind(sep, start, end)
                # Accept only a break in the second half of the window so
                # chunks do not become needlessly short.
                if pos > start + char_size // 2:
                    end = pos + len(sep)
                    break
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Stop once the end of the text is covered. Rewinding by the overlap
        # here would emit a final chunk that merely repeats the tail of the
        # chunk just produced.
        if end >= text_len:
            break
        # Step back by the overlap, but always make forward progress.
        next_start = end - char_overlap
        start = next_start if next_start > start else end
    return chunks
def process_document(
    file_bytes: bytes,
    file_name: str,
    document_id: str,
    company_id: str,
) -> Tuple[List[str], List[Dict[str, Any]]]:
    """
    Parse a document according to its file extension and chunk the result.

    The lower-cased suffix of ``file_name`` selects the parser. Each parsed
    page is split with chunk_text(), and every chunk gets a payload holding
    the document id, company id, file name, page number, the chunk's index
    within its page, and the chunk text.

    Returns (chunks_text, chunk_payloads), two lists of equal length.
    Raises ValueError for an unsupported extension.
    """
    ext = Path(file_name).suffix.lower()

    # Extension -> parser lookup.
    parsers = {
        ".pdf": parse_pdf,
        ".docx": parse_docx,
        ".csv": parse_csv,
        ".xlsx": parse_xlsx,
        ".xls": parse_xlsx,
        ".txt": parse_txt,
        ".md": parse_txt,
    }
    parser = parsers.get(ext)
    if parser is None:
        raise ValueError(f"Unsupported file type: {ext}")
    pages = parser(file_bytes)

    all_chunks = []
    all_payloads = []
    for page in pages:
        body = page["text"]
        page_num = page.get("page_number", 1)
        pieces = chunk_text(body)
        all_chunks.extend(pieces)
        all_payloads.extend(
            {
                "document_id": document_id,
                "company_id": company_id,
                "file_name": file_name,
                "page_number": page_num,
                "chunk_index": position,
                "text": piece,
            }
            for position, piece in enumerate(pieces)
        )

    logger.info(
        f"Processed {file_name}: {len(pages)} pages → {len(all_chunks)} chunks"
    )
    return all_chunks, all_payloads