"""Parse uploaded documents (PDF, DOCX, CSV, XLSX, TXT/MD) into text chunks."""

import io
import logging
from typing import List, Dict, Any, Tuple
from pathlib import Path

logger = logging.getLogger(__name__)

CHUNK_SIZE = 512  # approximate tokens per chunk (1 token ≈ 4 chars)
CHUNK_OVERLAP = 50  # approximate tokens shared by consecutive chunks

_CHARS_PER_TOKEN = 4  # rough chars-per-token ratio used for sizing
_ROWS_PER_BATCH = 10  # tabular rows grouped into one section


def parse_pdf(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse a PDF into one section per page that contains text.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        A list of ``{"text", "page_number"}`` dicts with 1-based page
        numbers. Pages whose extracted text is empty are omitted.

    Raises:
        ValueError: If the PDF cannot be read or parsed.
    """
    try:
        import pypdf

        reader = pypdf.PdfReader(io.BytesIO(file_bytes))
        pages = []
        for i, page in enumerate(reader.pages):
            text = (page.extract_text() or "").strip()
            if text:
                pages.append({"text": text, "page_number": i + 1})
        return pages
    except Exception as e:
        logger.error("PDF parse error: %s", e)
        raise ValueError(f"Failed to parse PDF: {e}") from e


def parse_docx(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse a DOCX into sections, starting a new section at each heading.

    DOCX has no page concept here, so ``page_number`` is a 1-based section
    counter that advances whenever a heading follows collected text.

    Args:
        file_bytes: Raw bytes of the DOCX file.

    Returns:
        A list of ``{"text", "page_number"}`` dicts. If the document has no
        text, a single placeholder section with empty text is returned.

    Raises:
        ValueError: If the DOCX cannot be read or parsed.
    """
    try:
        from docx import Document

        doc = Document(io.BytesIO(file_bytes))
        sections = []
        current_text = []
        section_idx = 1
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            # Guard against a missing style or unnamed style so a single odd
            # paragraph does not fail the whole document.
            style = para.style
            style_name = (style.name if style is not None else "") or ""
            # New section on headings
            if style_name.startswith("Heading") and current_text:
                sections.append(
                    {"text": "\n".join(current_text), "page_number": section_idx}
                )
                current_text = []
                section_idx += 1
            current_text.append(text)
        if current_text:
            sections.append(
                {"text": "\n".join(current_text), "page_number": section_idx}
            )
        return sections if sections else [{"text": "", "page_number": 1}]
    except Exception as e:
        logger.error("DOCX parse error: %s", e)
        raise ValueError(f"Failed to parse DOCX: {e}") from e


def _format_rows(batch: Any, columns: List[Any]) -> List[str]:
    """Render each row of a DataFrame slice as ``"col: val | col: val"``.

    Missing cells (NaN, NaT, None) are left out, and rows with no remaining
    cells are dropped entirely.
    """
    import pandas as pd

    lines = []
    for _, row in batch.iterrows():
        parts = [
            f"{col}: {val}" for col, val in zip(columns, row) if not pd.isna(val)
        ]
        if parts:
            lines.append(" | ".join(parts))
    return lines


def parse_csv(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse a CSV into sections of up to ten rows each.

    Args:
        file_bytes: Raw bytes of the CSV file.

    Returns:
        A list of ``{"text", "page_number"}`` dicts, one per batch of rows
        that contains data; ``page_number`` is the 1-based batch index.

    Raises:
        ValueError: If the CSV cannot be read or parsed.
    """
    try:
        import pandas as pd

        df = pd.read_csv(io.BytesIO(file_bytes))
        columns = list(df.columns)
        chunks = []
        for start in range(0, len(df), _ROWS_PER_BATCH):
            lines = _format_rows(df.iloc[start : start + _ROWS_PER_BATCH], columns)
            # Skip batches with no data so no empty section is produced.
            if lines:
                chunks.append(
                    {
                        "text": "\n".join(lines),
                        "page_number": (start // _ROWS_PER_BATCH) + 1,
                    }
                )
        return chunks
    except Exception as e:
        logger.error("CSV parse error: %s", e)
        raise ValueError(f"Failed to parse CSV: {e}") from e


def parse_xlsx(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse an Excel workbook into sections of up to ten rows per sheet.

    Every section starts with a ``Sheet: <name>`` header line.

    Args:
        file_bytes: Raw bytes of the workbook.

    Returns:
        A list of ``{"text", "page_number"}`` dicts; ``page_number`` is a
        1-based counter running across all sheets.

    Raises:
        ValueError: If the workbook cannot be read or parsed.
    """
    try:
        import pandas as pd

        xl = pd.ExcelFile(io.BytesIO(file_bytes))
        chunks = []
        page_num = 1
        for sheet_name in xl.sheet_names:
            df = xl.parse(sheet_name)
            columns = list(df.columns)
            for start in range(0, len(df), _ROWS_PER_BATCH):
                lines = _format_rows(
                    df.iloc[start : start + _ROWS_PER_BATCH], columns
                )
                # The header alone is not content: only emit a section when
                # at least one row in the batch carries data.
                if not lines:
                    continue
                text = "\n".join([f"Sheet: {sheet_name}", *lines])
                chunks.append({"text": text, "page_number": page_num})
                page_num += 1
        return chunks
    except Exception as e:
        logger.error("XLSX parse error: %s", e)
        raise ValueError(f"Failed to parse XLSX: {e}") from e


def parse_txt(file_bytes: bytes) -> List[Dict[str, Any]]:
    """Parse plain text into sections separated by blank lines.

    Args:
        file_bytes: Raw bytes of the text file, decoded as UTF-8 with
            undecodable bytes dropped.

    Returns:
        A list of ``{"text", "page_number"}`` dicts with 1-based section
        numbers. Input without any text yields one section with empty text.

    Raises:
        ValueError: If the input cannot be decoded or split.
    """
    try:
        # "utf-8-sig" strips a leading BOM, which str.strip() would not remove.
        text = file_bytes.decode("utf-8-sig", errors="ignore")
        # Normalise CRLF / CR so blank-line splitting works for any platform.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        # Split into sections by double newlines
        sections = [s.strip() for s in text.split("\n\n") if s.strip()]
        if not sections:
            sections = [text.strip()]
        return [{"text": s, "page_number": i + 1} for i, s in enumerate(sections)]
    except Exception as e:
        logger.error("TXT parse error: %s", e)
        raise ValueError(f"Failed to parse TXT: {e}") from e


def chunk_text(
    text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP
) -> List[str]:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Sizes are given in approximate tokens and converted to characters at
    four characters per token.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk size in approximate tokens; must be > 0.
        overlap: Approximate tokens repeated at the start of the next chunk.
            Negative values are treated as 0.

    Returns:
        The chunks in order. Text that fits in one chunk is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A zero-sized window would never advance through the text.
        raise ValueError("chunk_size must be a positive number of tokens")

    char_size = chunk_size * _CHARS_PER_TOKEN
    char_overlap = max(overlap, 0) * _CHARS_PER_TOKEN
    text_len = len(text)
    if text_len <= char_size:
        return [text]

    chunks = []
    start = 0
    while start < text_len:
        end = min(start + char_size, text_len)
        # Try to break at a sentence boundary, but only in the second half
        # of the window so chunks do not become too small.
        if end < text_len:
            min_break = start + char_size // 2
            for sep in (". ", "! ", "? ", "\n", " "):
                pos = text.rfind(sep, start, end)
                if pos > min_break:
                    end = pos + len(sep)
                    break
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= text_len:
            # The text is exhausted; stepping back by the overlap here would
            # only emit a chunk that repeats the tail of the previous one.
            break
        next_start = end - char_overlap
        # Fall back to no overlap when it would prevent forward progress.
        start = next_start if next_start > start else end
    return chunks


# Maps a lower-cased file suffix to the parser that handles it.
_PARSERS = {
    ".pdf": parse_pdf,
    ".docx": parse_docx,
    ".csv": parse_csv,
    ".xlsx": parse_xlsx,
    ".xls": parse_xlsx,
    ".txt": parse_txt,
    ".md": parse_txt,
}


def process_document(
    file_bytes: bytes,
    file_name: str,
    document_id: str,
    company_id: str,
) -> Tuple[List[str], List[Dict[str, Any]]]:
    """Parse and chunk a document (main entry point).

    The parser is selected from the file name's extension, case-insensitively.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        file_name: Original file name; its suffix selects the parser.
        document_id: Identifier copied into every chunk payload.
        company_id: Identifier copied into every chunk payload.

    Returns:
        ``(chunks_text, chunk_payloads)``: two parallel lists. Each payload
        holds ``document_id``, ``company_id``, ``file_name``,
        ``page_number``, ``chunk_index`` (restarting at 0 for every page)
        and ``text``.

    Raises:
        ValueError: If the extension is unsupported or parsing fails.
    """
    ext = Path(file_name).suffix.lower()

    # Parse
    parser = _PARSERS.get(ext)
    if parser is None:
        raise ValueError(f"Unsupported file type: {ext}")
    pages = parser(file_bytes)

    # Chunk
    all_chunks = []
    all_payloads = []
    for page in pages:
        page_num = page.get("page_number", 1)
        # Drop blank chunks (e.g. from an empty file) so no empty text is
        # handed to downstream consumers.
        chunks = [c for c in chunk_text(page["text"]) if c.strip()]
        for idx, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            all_payloads.append(
                {
                    "document_id": document_id,
                    "company_id": company_id,
                    "file_name": file_name,
                    "page_number": page_num,
                    "chunk_index": idx,
                    "text": chunk,
                }
            )

    logger.info(
        "Processed %s: %s pages → %s chunks", file_name, len(pages), len(all_chunks)
    )
    return all_chunks, all_payloads