mnemosyne/tests/test_summarization_pipeline.py
Joey Yakimowich-Payne d660414ad7 feat: add benchmarking, auth, and utility modules
CLI benchmark command, threshold auto-tuning, OAuth PKCE auth
(same flow as Claude Code), cost tracking, telemetry, and replay.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-03-13 11:41:22 -06:00

558 lines
20 KiB
Python

"""Tests for the HelperLLM summarization → fidelity degradation pipeline.
Verifies that when FidelityManager degrades objects, the gateway calls
HelperLLM to generate compressed summaries instead of using stub text.
All tests use mocked HelperLLM — no real API calls.
"""
from __future__ import annotations
import asyncio
import json
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from mnemosyne.fidelity import (
FidelityLevel,
FidelityManager,
SemanticObject,
make_object,
)
from mnemosyne.gateway import (
Session,
_apply_fidelity,
_auto_stub,
_generate_degradation_summaries,
)
from mnemosyne.helper_llm import HelperLLM, SummaryResult
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_obj(
    content: str = "x" * 800,
    *,
    object_type: str = "file_context",
    turn: int = 0,
    summary_detailed: str | None = None,
    summary_compact: str | None = None,
    stub: str | None = None,
) -> SemanticObject:
    """Build a SemanticObject through ``make_object`` with test-friendly defaults.

    When ``stub`` is falsy, a placeholder built from the first 40 characters
    of ``content`` is passed instead.
    """
    if stub:
        stub_text = stub
    else:
        stub_text = f"[evicted content: {content[:40]}]"

    fields = {
        "object_type": object_type,
        "content_full": content,
        "created_at_turn": turn,
        "summary_detailed": summary_detailed,
        "summary_compact": summary_compact,
        "stub": stub_text,
    }
    return make_object(**fields)
def _make_mock_helper() -> MagicMock:
    """Return a ``MagicMock`` spec'd on HelperLLM whose async methods yield canned results.

    ``summarize_l0_to_l1`` and ``compress_l1_to_l2`` resolve to fixed
    ``SummaryResult`` values; ``generate_stub`` resolves to a fixed string.
    """
    l1_result = SummaryResult(
        summary="LLM-generated L1 summary of the content.",
        losses=["exact error codes"],
        can_answer=["what was implemented"],
        key_entities=["src/main.py"],
    )
    l2_result = SummaryResult(
        summary="LLM-generated compact L2 summary.",
        losses=["exact error codes", "function signatures"],
        can_answer=["what was decided"],
        key_entities=["main.py"],
    )
    stub_text = "[file_context | turn 0 | test stub | 0 related objects]"

    mock_llm = MagicMock(spec=HelperLLM)
    mock_llm.summarize_l0_to_l1 = AsyncMock(return_value=l1_result)
    mock_llm.compress_l1_to_l2 = AsyncMock(return_value=l2_result)
    mock_llm.generate_stub = AsyncMock(return_value=stub_text)
    return mock_llm
def _make_session() -> Session:
    """Create a minimal Session for testing without running ``Session.__init__``.

    The instance is allocated with ``Session.__new__`` and only the attributes
    assigned below are populated: ``id``, ``token_state``,
    ``fidelity_manager``, ``_fidelity_content_map`` and ``_summary_cache``.
    No files or directories are created.
    """
    # Session.__init__ imports many things; bypass it and set only what we need
    session = Session.__new__(Session)
    session.id = "test-sess"
    session.token_state = {"last_effective": 0, "blocked": False, "turn": 5}
    session.fidelity_manager = FidelityManager(window_size=200_000)
    session._fidelity_content_map = {}
    session._summary_cache = {}
    return session
# ---------------------------------------------------------------------------
# _generate_degradation_summaries tests
# ---------------------------------------------------------------------------
class TestGenerateDegradationSummaries:
    """Test the async summarization function directly.

    Each test registers one or more objects with the session's
    FidelityManager, passes a list of ``(object_id, from_level, to_level)``
    transitions to ``_generate_degradation_summaries`` and then inspects the
    mocked helper, the object fields and ``session._summary_cache``.
    """

    async def test_l0_to_l1_calls_summarize(self) -> None:
        """L0→L1 transition calls helper_llm.summarize_l0_to_l1()."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager
        obj = _make_obj(content="Full content for summarization " * 20)
        obj_id = fm.register_object(obj)
        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]

        await _generate_degradation_summaries(transitions, session, helper)

        # Verify LLM was called
        helper.summarize_l0_to_l1.assert_called_once()
        call_kwargs = helper.summarize_l0_to_l1.call_args.kwargs
        assert call_kwargs["object_type"] == "file_context"
        assert call_kwargs["max_summary_tokens"] == 1024

        # Verify summary was stored on the object
        assert obj.summary_detailed == "LLM-generated L1 summary of the content."
        assert obj.losses_l1 == ["exact error codes"]
        assert obj.can_answer == ["what was implemented"]
        assert obj.key_entities == ["src/main.py"]

        # Verify cache was populated
        assert (obj_id, int(FidelityLevel.L1)) in session._summary_cache
        assert session._summary_cache[(obj_id, int(FidelityLevel.L1))] == (
            "LLM-generated L1 summary of the content."
        )

    async def test_l1_to_l2_calls_compress(self) -> None:
        """L1→L2 transition calls helper_llm.compress_l1_to_l2()."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager
        obj = _make_obj(
            summary_detailed="Existing L1 detailed summary of auth middleware.",
        )
        obj.losses_l1 = ["line numbers"]
        obj_id = fm.register_object(obj)
        transitions = [(obj_id, FidelityLevel.L1, FidelityLevel.L2)]

        await _generate_degradation_summaries(transitions, session, helper)

        # Verify LLM was called with L1 text
        helper.compress_l1_to_l2.assert_called_once()
        call_kwargs = helper.compress_l1_to_l2.call_args.kwargs
        assert call_kwargs["l1_summary"] == "Existing L1 detailed summary of auth middleware."
        assert call_kwargs["l1_losses"] == ["line numbers"]
        assert call_kwargs["object_type"] == "file_context"

        # Verify compact summary was stored
        assert obj.summary_compact == "LLM-generated compact L2 summary."
        assert (obj_id, int(FidelityLevel.L2)) in session._summary_cache

    async def test_l1_to_l2_uses_cache_for_l1_text(self) -> None:
        """L1→L2 uses cached L1 summary when obj.summary_detailed is None."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager
        obj = _make_obj()
        obj.summary_detailed = None  # No pre-set L1 summary
        obj_id = fm.register_object(obj)

        # Pre-populate cache with L1 summary
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Cached L1 summary text."
        transitions = [(obj_id, FidelityLevel.L1, FidelityLevel.L2)]

        await _generate_degradation_summaries(transitions, session, helper)

        # Assert the call happened before reading call_args: on an uncalled
        # mock call_args is None, which would otherwise surface as an
        # unhelpful AttributeError instead of a clear assertion failure.
        helper.compress_l1_to_l2.assert_called_once()

        # Should use cached L1 text
        call_kwargs = helper.compress_l1_to_l2.call_args.kwargs
        assert call_kwargs["l1_summary"] == "Cached L1 summary text."

    async def test_l2_to_l3_no_llm_call(self) -> None:
        """L2→L3 generates a simple stub without calling the LLM."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager
        obj = _make_obj(object_type="tool_result")
        obj.created_at_turn = 7
        obj_id = fm.register_object(obj)
        transitions = [(obj_id, FidelityLevel.L2, FidelityLevel.L3)]

        await _generate_degradation_summaries(transitions, session, helper)

        # No LLM calls should have been made
        helper.summarize_l0_to_l1.assert_not_called()
        helper.compress_l1_to_l2.assert_not_called()
        helper.generate_stub.assert_not_called()

        # Stub should be set
        assert obj.stub == "[evicted: tool_result from turn 7]"
        assert (obj_id, int(FidelityLevel.L3)) in session._summary_cache

    async def test_l3_to_l4_no_action(self) -> None:
        """L3→L4 (full eviction) doesn't generate any content."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager
        obj = _make_obj()
        obj_id = fm.register_object(obj)
        transitions = [(obj_id, FidelityLevel.L3, FidelityLevel.L4)]

        await _generate_degradation_summaries(transitions, session, helper)

        # No LLM calls
        helper.summarize_l0_to_l1.assert_not_called()
        helper.compress_l1_to_l2.assert_not_called()

        # No cache entry for L4
        assert (obj_id, int(FidelityLevel.L4)) not in session._summary_cache

    async def test_fallback_on_llm_exception(self) -> None:
        """If HelperLLM raises an exception, fall back silently."""
        session = _make_session()
        helper = _make_mock_helper()
        helper.summarize_l0_to_l1 = AsyncMock(side_effect=RuntimeError("API down"))
        fm = session.fidelity_manager
        obj = _make_obj()
        original_stub = obj.stub
        obj_id = fm.register_object(obj)
        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]

        # Should NOT raise
        await _generate_degradation_summaries(transitions, session, helper)

        # Cache should NOT have an entry (fallback means no summary generated)
        assert (obj_id, int(FidelityLevel.L1)) not in session._summary_cache

        # Original stub should be preserved
        assert obj.stub == original_stub

    async def test_fallback_on_empty_summary(self) -> None:
        """If HelperLLM returns empty summary, don't cache it."""
        session = _make_session()
        helper = _make_mock_helper()
        helper.summarize_l0_to_l1 = AsyncMock(
            return_value=SummaryResult(summary="", losses=[], can_answer=[], key_entities=[])
        )
        fm = session.fidelity_manager
        obj = _make_obj()
        obj_id = fm.register_object(obj)
        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]

        await _generate_degradation_summaries(transitions, session, helper)

        # Empty summary should NOT be cached
        assert (obj_id, int(FidelityLevel.L1)) not in session._summary_cache

        # summary_detailed should NOT be set to empty (None also satisfies this)
        assert obj.summary_detailed != ""

    async def test_summary_cache_hit_skips_llm(self) -> None:
        """If summary is already cached, don't call the LLM again."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager
        obj = _make_obj()
        obj_id = fm.register_object(obj)

        # Pre-populate cache
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Already cached summary."
        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]

        await _generate_degradation_summaries(transitions, session, helper)

        # LLM should NOT have been called
        helper.summarize_l0_to_l1.assert_not_called()

    async def test_none_helper_llm_is_noop(self) -> None:
        """If helper_llm is None, the function is a no-op."""
        session = _make_session()
        fm = session.fidelity_manager
        obj = _make_obj()
        obj_id = fm.register_object(obj)
        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]

        # Should not raise
        await _generate_degradation_summaries(transitions, session, None)

        # No cache entries
        assert len(session._summary_cache) == 0

    async def test_multiple_transitions_processed(self) -> None:
        """Multiple transitions in one batch are all processed."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager
        obj1 = _make_obj(content="Content A " * 50)
        obj2 = _make_obj(content="Content B " * 50, object_type="tool_result")
        obj2.summary_detailed = "L1 summary for obj2"
        obj2.created_at_turn = 3
        id1 = fm.register_object(obj1)
        id2 = fm.register_object(obj2)
        transitions = [
            (id1, FidelityLevel.L0, FidelityLevel.L1),
            (id2, FidelityLevel.L2, FidelityLevel.L3),
        ]

        await _generate_degradation_summaries(transitions, session, helper)

        # obj1: L0→L1 should have called summarize
        helper.summarize_l0_to_l1.assert_called_once()
        assert (id1, int(FidelityLevel.L1)) in session._summary_cache

        # obj2: L2→L3 should have set stub without LLM
        assert obj2.stub == "[evicted: tool_result from turn 3]"
        assert (id2, int(FidelityLevel.L3)) in session._summary_cache
# ---------------------------------------------------------------------------
# _apply_fidelity integration tests
# ---------------------------------------------------------------------------
class TestApplyFidelityWithSummaries:
    """Test that _apply_fidelity uses LLM-generated summaries.

    Each test registers an object, maps a content key to its id in
    ``session._fidelity_content_map``, sets ``current_fidelity`` directly and
    then checks what text ``_apply_fidelity`` leaves in the payload.
    """

    @staticmethod
    def _text_content_key(content: str) -> str:
        """Return the ``text:``-prefixed content-map key these tests use for *content*.

        The suffix is the first 12 hex characters of the SHA-256 digest of the
        first 200 characters of *content*.
        """
        # Local import: hashlib is not among this module's top-level imports.
        import hashlib

        digest = hashlib.sha256(content[:200].encode()).hexdigest()
        return f"text:{digest[:12]}"

    @staticmethod
    def _text_payload(role: str, text: str) -> dict:
        """Build a payload with one message from *role* holding a single text block."""
        return {
            "messages": [
                {
                    "role": role,
                    "content": [{"type": "text", "text": text}],
                }
            ]
        }

    def test_l1_object_uses_summary_from_cache(self) -> None:
        """An L1 object should have its content replaced with the cached summary."""
        session = _make_session()
        fm = session.fidelity_manager

        # Create and register an object
        content = "x" * 800
        obj = _make_obj(content=content)
        obj_id = fm.register_object(obj)

        # Set up content map (simulate previous registration)
        session._fidelity_content_map[self._text_content_key(content)] = obj_id

        # Degrade to L1 and cache a summary
        obj.current_fidelity = FidelityLevel.L1
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Cached L1 summary."

        # Build payload with the original content
        payload = self._text_payload("assistant", content)

        _apply_fidelity(payload, session)

        # Content should be replaced with the cached summary
        assert payload["messages"][0]["content"][0]["text"] == "Cached L1 summary."

    def test_l2_object_uses_compact_summary(self) -> None:
        """An L2 object should have its content replaced with the compact summary."""
        session = _make_session()
        fm = session.fidelity_manager
        content = "y" * 800
        obj = _make_obj(content=content)
        obj.summary_compact = "Pre-set compact summary."
        obj_id = fm.register_object(obj)
        session._fidelity_content_map[self._text_content_key(content)] = obj_id
        obj.current_fidelity = FidelityLevel.L2
        payload = self._text_payload("user", content)

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["text"] == "Pre-set compact summary."

    def test_l1_object_falls_back_to_full_content(self) -> None:
        """If no L1 summary is available, keep full content."""
        session = _make_session()
        fm = session.fidelity_manager
        content = "z" * 800
        obj = _make_obj(content=content)
        obj.summary_detailed = None  # No summary available
        obj_id = fm.register_object(obj)
        session._fidelity_content_map[self._text_content_key(content)] = obj_id
        obj.current_fidelity = FidelityLevel.L1
        payload = self._text_payload("assistant", content)

        _apply_fidelity(payload, session)

        # Should keep original content as fallback
        assert payload["messages"][0]["content"][0]["text"] == content

    def test_l3_object_uses_stub(self) -> None:
        """L3 objects should use stub text (unchanged behavior)."""
        session = _make_session()
        fm = session.fidelity_manager
        content = "w" * 800
        obj = _make_obj(content=content, stub="[evicted content: test stub]")
        obj_id = fm.register_object(obj)
        session._fidelity_content_map[self._text_content_key(content)] = obj_id
        obj.current_fidelity = FidelityLevel.L3
        payload = self._text_payload("assistant", content)

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["text"] == "[evicted content: test stub]"

    def test_l2_cache_takes_priority_over_object_field(self) -> None:
        """Cache entry should take priority over obj.summary_compact."""
        session = _make_session()
        fm = session.fidelity_manager
        content = "a" * 800
        obj = _make_obj(content=content)
        obj.summary_compact = "Old compact summary."
        obj_id = fm.register_object(obj)
        session._fidelity_content_map[self._text_content_key(content)] = obj_id
        obj.current_fidelity = FidelityLevel.L2
        session._summary_cache[(obj_id, int(FidelityLevel.L2))] = "Fresh cached L2 summary."
        payload = self._text_payload("assistant", content)

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["text"] == "Fresh cached L2 summary."

    def test_tool_result_block_uses_summary(self) -> None:
        """Tool result blocks should also get their content replaced."""
        session = _make_session()
        fm = session.fidelity_manager
        content = "tool output " * 100
        obj = _make_obj(content=content, object_type="tool_result")
        obj_id = fm.register_object(obj)

        # Tool results are keyed by tool_use_id here rather than by a content hash.
        content_key = "tool:use_123"
        session._fidelity_content_map[content_key] = obj_id
        obj.current_fidelity = FidelityLevel.L1
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Tool result L1 summary."
        payload = {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": "use_123",
                            "content": content,
                        }
                    ],
                }
            ]
        }

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["content"] == "Tool result L1 summary."
# ---------------------------------------------------------------------------
# Async non-blocking behavior test
# ---------------------------------------------------------------------------
class TestAsyncNonBlocking:
    """Check that summarization can be scheduled as an asyncio task and awaited."""

    async def test_summarization_completes_independently(self) -> None:
        """Run as a task with a slow helper, the summarization finishes and caches its result."""
        session = _make_session()
        helper = _make_mock_helper()
        manager = session.fidelity_manager

        # Wrap the mocked summarizer so every call first sleeps 10ms,
        # standing in for LLM latency.
        mocked_summarize = helper.summarize_l0_to_l1

        async def delayed_summarize(**kwargs):
            await asyncio.sleep(0.01)
            result = await mocked_summarize(**kwargs)
            return result

        helper.summarize_l0_to_l1 = delayed_summarize

        target_id = manager.register_object(_make_obj())
        pending = [(target_id, FidelityLevel.L0, FidelityLevel.L1)]

        # Schedule on the running loop, then wait for it to finish
        summarization = asyncio.create_task(
            _generate_degradation_summaries(pending, session, helper)
        )
        await summarization

        # The L1 summary must be in the cache once the task is done
        cache_key = (target_id, int(FidelityLevel.L1))
        assert cache_key in session._summary_cache