mnemosyne/tests/test_helper_llm.py
Joey Yakimowich-Payne d26c56c2f0 feat: add multi-fidelity compression engine
5-level fidelity manager (L0-Full to L4-Evicted) with helper LLM
(Haiku 4.5) for intelligent summarization during degradation.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-03-13 11:40:56 -06:00

572 lines
21 KiB
Python

"""Tests for the Helper LLM client.
All tests use mocked Anthropic API responses — no real API calls.
"""
from __future__ import annotations
import json
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from mnemosyne.helper_llm import (
GoalClassification,
HelperLLM,
SummaryResult,
_L0_TO_L1_PROMPT,
_L1_TO_L2_PROMPT,
_GOAL_CLASSIFICATION_PROMPT,
_MICRO_FAULT_PROMPT,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_api_response(text: str) -> MagicMock:
"""Build a mock Anthropic Messages response with a single text block."""
block = MagicMock()
block.type = "text"
block.text = text
response = MagicMock()
response.content = [block]
return response
@pytest.fixture()
def helper() -> HelperLLM:
    """Provide a HelperLLM whose Anthropic client is replaced by mocks.

    The instance is constructed while ``ANTHROPIC_API_KEY`` is patched into
    the environment. Its ``_client`` is then swapped for a ``MagicMock``
    whose ``messages.create`` is an ``AsyncMock``, which individual tests
    override with the reply (or error) they need.
    """
    fake_env = {"ANTHROPIC_API_KEY": "test-key-000"}
    with patch.dict("os.environ", fake_env):
        instance = HelperLLM()

    fake_client = MagicMock()
    fake_client.messages = MagicMock()
    fake_client.messages.create = AsyncMock()
    instance._client = fake_client
    return instance
# ---------------------------------------------------------------------------
# Dataclass tests
# ---------------------------------------------------------------------------
class TestDataclasses:
    """Default and explicit field values of the result containers."""

    def test_summary_result_defaults(self) -> None:
        """Only ``summary`` is supplied; every list field starts empty."""
        result = SummaryResult(summary="hello")
        assert result.summary == "hello"
        for list_field in (result.losses, result.can_answer, result.key_entities):
            assert list_field == []

    def test_summary_result_with_fields(self) -> None:
        """Explicitly supplied list fields are stored as given."""
        supplied = {
            "summary": "s",
            "losses": ["a"],
            "can_answer": ["b"],
            "key_entities": ["c"],
        }
        result = SummaryResult(**supplied)
        assert result.losses == ["a"]
        assert result.can_answer == ["b"]
        assert result.key_entities == ["c"]

    def test_goal_classification_defaults(self) -> None:
        """Only ``goal`` is supplied; every list field starts empty."""
        classification = GoalClassification(goal="write tests")
        assert classification.goal == "write tests"
        for list_field in (
            classification.relevant_types,
            classification.relevant_tags,
            classification.predicted_needs,
        ):
            assert list_field == []
# ---------------------------------------------------------------------------
# JSON parsing tests (static methods, no API calls)
# ---------------------------------------------------------------------------
class TestParseSummaryJson:
    """``HelperLLM._parse_summary_json`` on well-formed and malformed input."""

    def test_valid_json(self) -> None:
        """A plain JSON object is mapped field-for-field onto the result."""
        payload = {
            "summary": "Auth middleware uses JWT.",
            "losses": ["exact error codes"],
            "can_answer": ["auth approach"],
            "key_entities": ["src/auth/middleware.ts"],
        }
        parsed = HelperLLM._parse_summary_json(json.dumps(payload))
        assert parsed is not None
        assert parsed.summary == "Auth middleware uses JWT."
        assert parsed.losses == ["exact error codes"]
        assert parsed.can_answer == ["auth approach"]
        assert parsed.key_entities == ["src/auth/middleware.ts"]

    def test_json_in_code_fence(self) -> None:
        """JSON wrapped in a ```json fenced block is still parsed."""
        fenced = (
            "```json\n"
            '{"summary": "test", "losses": [], "can_answer": [], "key_entities": []}\n'
            "```"
        )
        parsed = HelperLLM._parse_summary_json(fenced)
        assert parsed is not None
        assert parsed.summary == "test"

    def test_json_with_surrounding_text(self) -> None:
        """Text before and after the JSON object does not prevent parsing."""
        noisy = 'Here is the result:\n{"summary": "ok", "losses": ["x"]}\nDone.'
        parsed = HelperLLM._parse_summary_json(noisy)
        assert parsed is not None
        assert parsed.summary == "ok"
        assert parsed.losses == ["x"]

    def test_invalid_json_returns_none(self) -> None:
        """Input containing no JSON yields ``None``."""
        parsed = HelperLLM._parse_summary_json("not json at all")
        assert parsed is None

    def test_empty_string_returns_none(self) -> None:
        """An empty string yields ``None``."""
        parsed = HelperLLM._parse_summary_json("")
        assert parsed is None

    def test_non_dict_json_returns_none(self) -> None:
        """Valid JSON that is not an object yields ``None``."""
        parsed = HelperLLM._parse_summary_json("[1, 2, 3]")
        assert parsed is None

    def test_missing_fields_default_to_empty(self) -> None:
        """List fields absent from the JSON come back as empty lists."""
        parsed = HelperLLM._parse_summary_json('{"summary": "minimal"}')
        assert parsed is not None
        assert parsed.summary == "minimal"
        for list_field in (parsed.losses, parsed.can_answer, parsed.key_entities):
            assert list_field == []
class TestParseGoalJson:
    """``HelperLLM._parse_goal_json`` on well-formed and malformed input."""

    def test_valid_json(self) -> None:
        """A plain JSON object is parsed into goal, types and tags."""
        payload = {
            "goal": "write auth tests",
            "relevant_types": ["file_context", "design_decision"],
            "relevant_tags": ["auth", "testing"],
            "predicted_needs": ["auth middleware impl"],
        }
        parsed = HelperLLM._parse_goal_json(json.dumps(payload))
        assert parsed is not None
        assert parsed.goal == "write auth tests"
        assert "file_context" in parsed.relevant_types
        assert "auth" in parsed.relevant_tags

    def test_invalid_json_returns_none(self) -> None:
        """Input containing no JSON yields ``None``."""
        parsed = HelperLLM._parse_goal_json("garbage")
        assert parsed is None

    def test_json_in_code_fence(self) -> None:
        """JSON wrapped in a bare ``` fenced block is still parsed."""
        fenced = (
            "```\n"
            '{"goal": "deploy", "relevant_types": [], "relevant_tags": [], "predicted_needs": []}\n'
            "```"
        )
        parsed = HelperLLM._parse_goal_json(fenced)
        assert parsed is not None
        assert parsed.goal == "deploy"
# ---------------------------------------------------------------------------
# Prompt construction tests
# ---------------------------------------------------------------------------
class TestPromptConstruction:
    """Each prompt template renders the values handed to ``.format``."""

    def test_l0_to_l1_prompt_contains_content_and_type(self) -> None:
        """The L0→L1 prompt carries the inputs and its fixed section headers."""
        rendered = _L0_TO_L1_PROMPT.format(
            content="def hello(): pass",
            object_type="file_context",
        )
        expected_fragments = (
            "file_context",
            "def hello(): pass",
            "DECLARED LOSSES",
            "CAN_ANSWER",
            "OUTPUT FORMAT (JSON)",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_l1_to_l2_prompt_includes_losses(self) -> None:
        """The L1→L2 prompt carries the type, the L1 summary and its losses."""
        rendered = _L1_TO_L2_PROMPT.format(
            l1_losses="- exact error codes\n- line numbers",
            l1_summary="Fixed race condition",
            object_type="debugging_session",
        )
        for fragment in ("debugging_session", "Fixed race condition", "exact error codes"):
            assert fragment in rendered

    def test_goal_prompt_includes_message_and_context(self) -> None:
        """The goal prompt carries the user message and the recent context."""
        rendered = _GOAL_CLASSIFICATION_PROMPT.format(
            recent_context="We just implemented auth.",
            user_message="now write tests",
        )
        for fragment in ("now write tests", "We just implemented auth."):
            assert fragment in rendered

    def test_micro_fault_prompt_includes_question_and_context(self) -> None:
        """The micro-fault prompt carries the question and the context."""
        rendered = _MICRO_FAULT_PROMPT.format(
            context="Error 401 unauthorized",
            question="What error code?",
        )
        for fragment in ("What error code?", "Error 401 unauthorized"):
            assert fragment in rendered
# ---------------------------------------------------------------------------
# API call tests (mocked)
# ---------------------------------------------------------------------------
class TestSummarizeL0ToL1:
    """``summarize_l0_to_l1`` driven through a mocked ``messages.create``."""

    async def test_successful_summarization(self, helper: HelperLLM) -> None:
        """A JSON reply is parsed and the request carries model and content."""
        payload = {
            "summary": "Auth middleware validates JWT tokens.",
            "losses": ["exact error codes for token expiry"],
            "can_answer": ["auth approach used"],
            "key_entities": ["src/auth/middleware.ts", "jsonwebtoken"],
        }
        create_mock = AsyncMock(return_value=_make_api_response(json.dumps(payload)))
        helper._client.messages.create = create_mock

        outcome = await helper.summarize_l0_to_l1(
            content="Full auth middleware source code here...",
            object_type="file_context",
        )

        assert isinstance(outcome, SummaryResult)
        assert outcome.summary == "Auth middleware validates JWT tokens."
        assert "exact error codes for token expiry" in outcome.losses
        assert "src/auth/middleware.ts" in outcome.key_entities

        # The request must name the expected model ...
        request = create_mock.call_args.kwargs
        assert request["model"] == "claude-haiku-4-5-20251001"
        # ... and the first message must embed both the type and the content.
        first_message = request["messages"][0]["content"]
        assert "file_context" in first_message
        assert "Full auth middleware source code here..." in first_message

    async def test_json_parse_failure_returns_fallback(self, helper: HelperLLM) -> None:
        """A non-JSON reply falls back to the leading 30% of the content."""
        reply = _make_api_response("This is not valid JSON at all.")
        helper._client.messages.create = AsyncMock(return_value=reply)
        source_text = "A" * 1000

        outcome = await helper.summarize_l0_to_l1(
            content=source_text,
            object_type="file_context",
        )

        assert isinstance(outcome, SummaryResult)
        # 30% of 1000 characters
        assert len(outcome.summary) == 300
        assert outcome.summary == "A" * 300
        assert outcome.losses == []

    async def test_respects_max_summary_tokens(self, helper: HelperLLM) -> None:
        """``max_summary_tokens`` is forwarded to the API as ``max_tokens``."""
        create_mock = AsyncMock(return_value=_make_api_response('{"summary": "ok"}'))
        helper._client.messages.create = create_mock

        await helper.summarize_l0_to_l1(
            content="test",
            object_type="tool_result",
            max_summary_tokens=512,
        )

        assert create_mock.call_args.kwargs["max_tokens"] == 512
class TestCompressL1ToL2:
    """``compress_l1_to_l2`` driven through a mocked ``messages.create``."""

    async def test_successful_compression(self, helper: HelperLLM) -> None:
        """The reply's summary is used and L1 losses precede the new ones."""
        payload = {
            "summary": "Auth uses JWT with refresh tokens.",
            "losses": ["function signatures"],
            "can_answer": ["what was decided"],
            "key_entities": ["middleware.ts"],
        }
        reply = _make_api_response(json.dumps(payload))
        helper._client.messages.create = AsyncMock(return_value=reply)
        prior_losses = ["exact error codes", "line-by-line implementation"]

        outcome = await helper.compress_l1_to_l2(
            l1_summary="Detailed auth summary...",
            l1_losses=prior_losses,
            object_type="file_context",
        )

        assert isinstance(outcome, SummaryResult)
        assert outcome.summary == "Auth uses JWT with refresh tokens."
        # Losses inherited from L1 occupy the front of the list, in order.
        inherited = ("exact error codes", "line-by-line implementation")
        for position, expected in enumerate(inherited):
            assert outcome.losses[position] == expected
        assert "function signatures" in outcome.losses

    async def test_loss_accumulation(self, helper: HelperLLM) -> None:
        """The final loss list is the L1 losses followed by the new loss."""
        payload = {
            "summary": "compact",
            "losses": ["new_loss"],
            "can_answer": [],
            "key_entities": [],
        }
        reply = _make_api_response(json.dumps(payload))
        helper._client.messages.create = AsyncMock(return_value=reply)

        outcome = await helper.compress_l1_to_l2(
            l1_summary="summary",
            l1_losses=["old_loss_1", "old_loss_2"],
            object_type="design_decision",
        )

        assert outcome.losses == ["old_loss_1", "old_loss_2", "new_loss"]

    async def test_fallback_on_parse_failure(self, helper: HelperLLM) -> None:
        """A non-JSON reply truncates the L1 summary and keeps the L1 losses."""
        reply = _make_api_response("broken response")
        helper._client.messages.create = AsyncMock(return_value=reply)
        prior_losses = ["loss_a"]

        outcome = await helper.compress_l1_to_l2(
            l1_summary="A" * 100,
            l1_losses=prior_losses,
            object_type="file_context",
        )

        assert isinstance(outcome, SummaryResult)
        # 30% of the 100-character L1 summary
        assert len(outcome.summary) == 30
        # The inherited losses survive the fallback path.
        assert outcome.losses == ["loss_a"]
class TestGenerateStub:
    """``generate_stub`` driven through a mocked ``messages.create``."""

    async def test_successful_stub(self, helper: HelperLLM) -> None:
        """A single-line reply is returned unchanged."""
        expected_stub = "[debugging_session | 2026-03-13 14:30 | Fixed race condition in auth | 12 related objects]"
        helper._client.messages.create = AsyncMock(
            return_value=_make_api_response(expected_stub)
        )

        stub = await helper.generate_stub(
            l2_summary="Fixed race condition in auth token refresh.",
            object_type="debugging_session",
            timestamp="2026-03-13 14:30",
        )

        assert stub == expected_stub
        assert "debugging_session" in stub
        assert "2026-03-13 14:30" in stub

    async def test_multiline_response_takes_first_line(self, helper: HelperLLM) -> None:
        """Only the first line of a multi-line reply is kept."""
        reply = _make_api_response(
            "[type | ts | desc | 0 related objects]\nExtra line\nAnother"
        )
        helper._client.messages.create = AsyncMock(return_value=reply)

        stub = await helper.generate_stub("summary", "type", "ts")

        assert "\n" not in stub
        assert stub == "[type | ts | desc | 0 related objects]"

    async def test_empty_response_fallback(self, helper: HelperLLM) -> None:
        """An empty reply yields a fallback naming the type and timestamp."""
        helper._client.messages.create = AsyncMock(
            return_value=_make_api_response("")
        )

        stub = await helper.generate_stub("summary", "file_context", "2026-01-01")

        for fragment in ("file_context", "2026-01-01", "summary unavailable"):
            assert fragment in stub
class TestAnswerMicroFault:
    """``answer_micro_fault`` driven through a mocked ``messages.create``."""

    async def test_successful_answer(self, helper: HelperLLM) -> None:
        """The reply text is returned and the prompt embeds question and context."""
        create_mock = AsyncMock(
            return_value=_make_api_response("The error code is 401 UNAUTHORIZED.")
        )
        helper._client.messages.create = create_mock
        snippets = [
            "Auth middleware returns 401 for expired tokens.",
            "Token refresh logic in refresh.ts.",
        ]

        answer = await helper.answer_micro_fault(
            question="What error code does auth return for expired tokens?",
            relevant_contents=snippets,
        )

        assert answer == "The error code is 401 UNAUTHORIZED."
        # Inspect the prompt that was sent as the first message.
        sent_prompt = create_mock.call_args.kwargs["messages"][0]["content"]
        assert "What error code" in sent_prompt
        assert "Auth middleware returns 401" in sent_prompt
        assert "---" in sent_prompt  # separator between contents

    async def test_empty_response_fallback(self, helper: HelperLLM) -> None:
        """A whitespace-only reply yields the fixed "unable to answer" text."""
        helper._client.messages.create = AsyncMock(
            return_value=_make_api_response(" ")
        )

        answer = await helper.answer_micro_fault(
            question="anything",
            relevant_contents=["content"],
        )

        assert answer == "Unable to answer from available context."

    async def test_respects_max_tokens(self, helper: HelperLLM) -> None:
        """``max_tokens`` is forwarded unchanged to the API call."""
        create_mock = AsyncMock(return_value=_make_api_response("answer"))
        helper._client.messages.create = create_mock

        await helper.answer_micro_fault("q", ["c"], max_tokens=150)

        assert create_mock.call_args.kwargs["max_tokens"] == 150
class TestClassifyGoal:
    """``classify_goal`` driven through a mocked ``messages.create``."""

    async def test_successful_classification(self, helper: HelperLLM) -> None:
        """A JSON reply is parsed into a populated ``GoalClassification``."""
        payload = {
            "goal": "write unit tests for auth module",
            "relevant_types": ["file_context", "design_decision"],
            "relevant_tags": ["auth", "testing"],
            "predicted_needs": ["auth middleware implementation", "test patterns"],
        }
        reply = _make_api_response(json.dumps(payload))
        helper._client.messages.create = AsyncMock(return_value=reply)

        classification = await helper.classify_goal(
            user_message="Now write tests for the auth middleware",
            recent_context="We just finished implementing JWT auth.",
        )

        assert isinstance(classification, GoalClassification)
        assert classification.goal == "write unit tests for auth module"
        assert "file_context" in classification.relevant_types
        assert "auth" in classification.relevant_tags
        assert "auth middleware implementation" in classification.predicted_needs

    async def test_fallback_on_parse_failure(self, helper: HelperLLM) -> None:
        """A non-JSON reply falls back to the user message as the goal."""
        reply = _make_api_response("I don't understand the format")
        helper._client.messages.create = AsyncMock(return_value=reply)

        classification = await helper.classify_goal(
            user_message="deploy to production",
            recent_context="context",
        )

        assert isinstance(classification, GoalClassification)
        assert classification.goal == "deploy to production"
        assert classification.relevant_types == []
# ---------------------------------------------------------------------------
# Error handling tests
# ---------------------------------------------------------------------------
class TestErrorHandling:
    """API errors raised by ``messages.create`` lead to fallback results."""

    @staticmethod
    def _timeout_mock() -> AsyncMock:
        """Return an ``AsyncMock`` that raises ``anthropic.APITimeoutError``."""
        import anthropic as anth

        return AsyncMock(side_effect=anth.APITimeoutError(request=MagicMock()))

    async def test_timeout_returns_fallback(self, helper: HelperLLM) -> None:
        """A timeout during summarization yields the truncated-content fallback."""
        helper._client.messages.create = self._timeout_mock()

        outcome = await helper.summarize_l0_to_l1(
            content="some content here",
            object_type="file_context",
        )

        assert isinstance(outcome, SummaryResult)
        # The input is 17 characters; 30% of that is 5.1, giving 5.
        assert len(outcome.summary) == 5

    async def test_api_error_returns_fallback(self, helper: HelperLLM) -> None:
        """A generic ``APIError`` yields the truncated-content fallback."""
        import anthropic as anth

        server_error = anth.APIError(
            message="Internal server error",
            request=MagicMock(),
            body=None,
        )
        helper._client.messages.create = AsyncMock(side_effect=server_error)

        outcome = await helper.summarize_l0_to_l1(
            content="test content",
            object_type="tool_result",
        )

        assert isinstance(outcome, SummaryResult)
        # The input is 12 characters; 30% of that is 3.6, giving 3.
        assert outcome.summary == "tes"

    async def test_timeout_on_micro_fault(self, helper: HelperLLM) -> None:
        """A timeout while answering yields the fixed "unable to answer" text."""
        helper._client.messages.create = self._timeout_mock()

        answer = await helper.answer_micro_fault("question", ["content"])

        assert answer == "Unable to answer from available context."

    async def test_timeout_on_goal_classification(self, helper: HelperLLM) -> None:
        """A timeout while classifying falls back to the user message as goal."""
        helper._client.messages.create = self._timeout_mock()

        classification = await helper.classify_goal("do something", "context")

        assert isinstance(classification, GoalClassification)
        assert classification.goal == "do something"

    async def test_timeout_on_generate_stub(self, helper: HelperLLM) -> None:
        """A timeout while generating a stub yields the fallback stub text."""
        helper._client.messages.create = self._timeout_mock()

        stub = await helper.generate_stub("summary", "file_context", "2026-01-01")

        assert "file_context" in stub
        assert "summary unavailable" in stub
# ---------------------------------------------------------------------------
# Constructor tests
# ---------------------------------------------------------------------------
class TestConstructor:
    """Keyword arguments ``HelperLLM`` passes to ``anthropic.AsyncAnthropic``."""

    # Patch target: the client class as referenced from the module under test.
    _CLIENT_PATH = "mnemosyne.helper_llm.anthropic.AsyncAnthropic"

    def test_uses_provided_api_key(self) -> None:
        """An explicit ``api_key`` argument reaches the client constructor."""
        with patch(self._CLIENT_PATH) as client_cls:
            HelperLLM(api_key="sk-test-123")
        assert client_cls.call_args.kwargs["api_key"] == "sk-test-123"

    def test_falls_back_to_env_var(self) -> None:
        """Without an argument, ``ANTHROPIC_API_KEY`` supplies the key."""
        with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "sk-env-456"}):
            with patch(self._CLIENT_PATH) as client_cls:
                HelperLLM()
        assert client_cls.call_args.kwargs["api_key"] == "sk-env-456"

    def test_custom_model_and_base_url(self) -> None:
        """``base_url`` goes to the client; ``model`` is kept on ``_model``."""
        with patch(self._CLIENT_PATH) as client_cls:
            instance = HelperLLM(
                api_key="key",
                model="claude-3-haiku-20240307",
                base_url="http://localhost:8080",
            )
        assert client_cls.call_args.kwargs["base_url"] == "http://localhost:8080"
        assert instance._model == "claude-3-haiku-20240307"

    def test_default_timeout_and_retries(self) -> None:
        """The client is built with a 10.0 timeout and 2 retries by default."""
        with patch(self._CLIENT_PATH) as client_cls:
            HelperLLM(api_key="key")
        passed = client_cls.call_args.kwargs
        assert passed["timeout"] == 10.0
        assert passed["max_retries"] == 2