# contexta_be/tests/test_rag_cache.py

"""Tests for RAG response caching integration."""
import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from app.services import cache as response_cache


@pytest.fixture(autouse=True)
def clear_cache():
    """Empty the response cache's ``_store`` and ``_index`` around every test.

    Declared autouse, so each test in this module runs between two wipes and
    cannot observe entries written by another test.
    """
    def _wipe():
        # Same order as before: ``_store`` first, then ``_index``.
        for attr_name in ("_store", "_index"):
            getattr(response_cache, attr_name).clear()

    _wipe()
    yield
    _wipe()


@pytest.fixture
def rag():
    """Provide a newly constructed ``RAGEngine`` for each test that requests it."""
    # The import stays inside the fixture so it only runs when the fixture is used.
    from app.services.rag import RAGEngine as engine_cls

    engine = engine_cls()
    return engine


@pytest.fixture
def chatbot_config():
    """Chatbot configuration dict handed to ``process_query`` by the tests."""
    return dict(
        model="accounts/fireworks/models/llama-v3p3-70b-instruct",
        max_tokens=500,
        temperature=0.7,
        company_name="Test Corp",
        system_prompt="",
    )


@pytest.fixture
def good_search_result():
    """One-element search result used as the patched vector search's return value."""
    payload = {
        "text": "We open 9am–6pm Mon–Fri.",
        "file_name": "faq.pdf",
        "page_number": 1,
    }
    hit = {"payload": payload, "score": 0.82}
    return [hit]


class TestRAGCaching:
    """Checks how ``process_query`` on the ``rag`` engine uses the response cache.

    The embedding, vector-search and LLM services are patched in every test, so
    the LLM mock's call count is the signal for whether a query reached the LLM
    or was answered without it.

    NOTE(review): the coroutine tests carry no explicit asyncio marker —
    presumably the project's pytest configuration enables an async plugin in
    auto mode; confirm.
    """

    @staticmethod
    async def _ask(rag, llm_mock, search_result, chatbot_config, requests, **query_kwargs):
        """Run ``requests`` in order against ``rag`` with its services patched.

        Args:
            rag: Engine under test.
            llm_mock: Replacement for ``rag.llm_svc.generate``.
            search_result: Value returned by the patched ``rag.vector_svc.search``.
            chatbot_config: Config passed to every ``process_query`` call.
            requests: Iterable of ``(question, collection)`` pairs.
            **query_kwargs: Extra keyword arguments forwarded to every call.

        Returns:
            List with one ``process_query`` result per request, in order.
        """
        with patch.object(rag.embedding_svc, "embed_text", return_value=[0.1] * 1536), \
             patch.object(rag.vector_svc, "search", return_value=search_result), \
             patch.object(rag.llm_svc, "generate", llm_mock):
            results = []
            for question, collection in requests:
                results.append(
                    await rag.process_query(question, collection, chatbot_config, **query_kwargs)
                )
        return results

    async def test_second_identical_query_uses_cache(self, rag, chatbot_config, good_search_result):
        """Repeating a history-free query on one collection reaches the LLM once."""
        llm_mock = AsyncMock(return_value={"content": "9am to 6pm", "tokens_used": 20, "model": "m"})

        await self._ask(
            rag, llm_mock, good_search_result, chatbot_config,
            [("What are your hours?", "col-1")] * 2,
        )

        # LLM should only be called once; second call hits cache
        assert llm_mock.call_count == 1

    async def test_cache_not_used_when_conversation_history_present(self, rag, chatbot_config, good_search_result):
        """Queries that carry conversation history reach the LLM every time."""
        llm_mock = AsyncMock(return_value={"content": "Yes!", "tokens_used": 10, "model": "m"})

        history = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]

        await self._ask(
            rag, llm_mock, good_search_result, chatbot_config,
            [("Follow-up question", "col-1")] * 2,
            conversation_history=history,
        )

        # Both calls go to LLM because history makes them stateful
        assert llm_mock.call_count == 2

    async def test_different_collections_cached_separately(self, rag, chatbot_config, good_search_result):
        """The same question asked of two collections reaches the LLM twice."""
        llm_mock = AsyncMock(return_value={"content": "Answer", "tokens_used": 10, "model": "m"})

        await self._ask(
            rag, llm_mock, good_search_result, chatbot_config,
            [("Same question", "col-A"), ("Same question", "col-B")],
        )

        # Different collections → two LLM calls, not one
        assert llm_mock.call_count == 2

    async def test_confidence_score_returned_from_cache(self, rag, chatbot_config, good_search_result):
        """A repeated query returns the same confidence score and response text."""
        llm_mock = AsyncMock(return_value={"content": "Cached answer", "tokens_used": 10, "model": "m"})

        first, second = await self._ask(
            rag, llm_mock, good_search_result, chatbot_config,
            [("hours?", "col-1")] * 2,
        )

        # The mock returns the same payload on every call, so the checks below
        # would also pass if the second query went to the LLM. Pin the call
        # count so they really describe the cached path.
        assert llm_mock.call_count == 1
        assert first["confidence_score"] == second["confidence_score"]
        assert second["response"] == "Cached answer"