"""Integration tests covering response caching in the RAG query pipeline."""

from unittest.mock import AsyncMock, patch, MagicMock

import pytest

from app.services import cache as response_cache


def _reset_response_cache():
    """Empty both module-level containers of the response cache."""
    response_cache._store.clear()
    response_cache._index.clear()


def _pipeline_patchers(engine, search_hits, llm_mock):
    """Build (without starting) the patchers that stub the engine's services.

    Returns a 3-tuple of patchers, in the order they should be entered:
    the embedding call, the vector search, and the LLM generation call.
    """
    return (
        patch.object(
            engine.embedding_svc, "embed_text", return_value=[0.1] * 1536
        ),
        patch.object(engine.vector_svc, "search", return_value=search_hits),
        patch.object(engine.llm_svc, "generate", llm_mock),
    )


@pytest.fixture(autouse=True)
def clear_cache():
    """Wipe the response cache before and after every test."""
    _reset_response_cache()
    yield
    _reset_response_cache()


@pytest.fixture
def rag():
    """Provide a fresh ``RAGEngine`` instance."""
    # Imported inside the fixture, exactly as the original module does.
    from app.services.rag import RAGEngine

    engine = RAGEngine()
    return engine


@pytest.fixture
def chatbot_config():
    """Provide the chatbot configuration dict passed to ``process_query``."""
    return dict(
        model="accounts/fireworks/models/llama-v3p3-70b-instruct",
        max_tokens=500,
        temperature=0.7,
        company_name="Test Corp",
        system_prompt="",
    )


@pytest.fixture
def good_search_result():
    """Provide a single search hit with a score of 0.82."""
    payload = {
        "text": "We open 9am–6pm Mon–Fri.",
        "file_name": "faq.pdf",
        "page_number": 1,
    }
    hit = {"payload": payload, "score": 0.82}
    return [hit]


class TestRAGCaching:
    """Checks on how ``RAGEngine.process_query`` interacts with the cache.

    NOTE(review): these coroutine tests carry no asyncio marker, so they
    presumably rely on an async test mode configured elsewhere - confirm.
    """

    async def test_second_identical_query_uses_cache(
        self, rag, chatbot_config, good_search_result
    ):
        """Repeating the same query must not trigger a second LLM call."""
        fake_llm = AsyncMock(
            return_value={"content": "9am to 6pm", "tokens_used": 20, "model": "m"}
        )
        embed_p, search_p, llm_p = _pipeline_patchers(
            rag, good_search_result, fake_llm
        )

        with embed_p, search_p, llm_p:
            for _ in range(2):
                await rag.process_query(
                    "What are your hours?", "col-1", chatbot_config
                )

        # Only the first request is expected to reach the LLM.
        assert fake_llm.call_count == 1

    async def test_cache_not_used_when_conversation_history_present(
        self, rag, chatbot_config, good_search_result
    ):
        """Queries that carry conversation history must go to the LLM each time."""
        fake_llm = AsyncMock(
            return_value={"content": "Yes!", "tokens_used": 10, "model": "m"}
        )
        prior_turns = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello!"},
        ]
        embed_p, search_p, llm_p = _pipeline_patchers(
            rag, good_search_result, fake_llm
        )

        with embed_p, search_p, llm_p:
            for _ in range(2):
                await rag.process_query(
                    "Follow-up question",
                    "col-1",
                    chatbot_config,
                    conversation_history=prior_turns,
                )

        # History makes the request stateful, so neither call may be cached.
        assert fake_llm.call_count == 2

    async def test_different_collections_cached_separately(
        self, rag, chatbot_config, good_search_result
    ):
        """The same question against two collections needs two LLM calls."""
        fake_llm = AsyncMock(
            return_value={"content": "Answer", "tokens_used": 10, "model": "m"}
        )
        embed_p, search_p, llm_p = _pipeline_patchers(
            rag, good_search_result, fake_llm
        )

        with embed_p, search_p, llm_p:
            for collection in ("col-A", "col-B"):
                await rag.process_query(
                    "Same question", collection, chatbot_config
                )

        # A cached answer for one collection must not be served to the other.
        assert fake_llm.call_count == 2

    async def test_confidence_score_returned_from_cache(
        self, rag, chatbot_config, good_search_result
    ):
        """The repeated query returns the same confidence score and answer text."""
        fake_llm = AsyncMock(
            return_value={"content": "Cached answer", "tokens_used": 10, "model": "m"}
        )
        embed_p, search_p, llm_p = _pipeline_patchers(
            rag, good_search_result, fake_llm
        )

        with embed_p, search_p, llm_p:
            initial = await rag.process_query("hours?", "col-1", chatbot_config)
            repeat = await rag.process_query("hours?", "col-1", chatbot_config)

        assert initial["confidence_score"] == repeat["confidence_score"]
        assert repeat["response"] == "Cached answer"