CLI benchmark command, threshold auto-tuning, OAuth PKCE auth (same flow as Claude Code), cost tracking, telemetry, and replay. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
558 lines
20 KiB
Python
558 lines
20 KiB
Python
"""Tests for the HelperLLM summarization → fidelity degradation pipeline.

Verifies that when FidelityManager degrades objects, the gateway calls
HelperLLM to generate compressed summaries instead of using stub text.

All tests use mocked HelperLLM — no real API calls.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from mnemosyne.fidelity import (
|
|
FidelityLevel,
|
|
FidelityManager,
|
|
SemanticObject,
|
|
make_object,
|
|
)
|
|
from mnemosyne.gateway import (
|
|
Session,
|
|
_apply_fidelity,
|
|
_auto_stub,
|
|
_generate_degradation_summaries,
|
|
)
|
|
from mnemosyne.helper_llm import HelperLLM, SummaryResult
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_obj(
    content: str = "x" * 800,
    *,
    object_type: str = "file_context",
    turn: int = 0,
    summary_detailed: str | None = None,
    summary_compact: str | None = None,
    stub: str | None = None,
) -> SemanticObject:
    """Build a SemanticObject for tests via ``make_object``.

    Every argument has a default so a test only spells out what it cares
    about. When ``stub`` is falsy, a placeholder derived from the first 40
    characters of ``content`` is used instead.
    """
    if stub:
        effective_stub = stub
    else:
        effective_stub = f"[evicted content: {content[:40]}]"

    object_fields = {
        "object_type": object_type,
        "content_full": content,
        "created_at_turn": turn,
        "summary_detailed": summary_detailed,
        "summary_compact": summary_compact,
        "stub": effective_stub,
    }
    return make_object(**object_fields)
|
|
|
|
|
|
def _make_mock_helper() -> MagicMock:
    """Return a HelperLLM-spec'd mock whose async methods yield canned results.

    ``summarize_l0_to_l1`` and ``compress_l1_to_l2`` resolve to fixed
    ``SummaryResult`` values; ``generate_stub`` resolves to a fixed string.
    """
    l1_result = SummaryResult(
        summary="LLM-generated L1 summary of the content.",
        losses=["exact error codes"],
        can_answer=["what was implemented"],
        key_entities=["src/main.py"],
    )
    l2_result = SummaryResult(
        summary="LLM-generated compact L2 summary.",
        losses=["exact error codes", "function signatures"],
        can_answer=["what was decided"],
        key_entities=["main.py"],
    )
    stub_text = "[file_context | turn 0 | test stub | 0 related objects]"

    mock_llm = MagicMock(spec=HelperLLM)
    mock_llm.summarize_l0_to_l1 = AsyncMock(return_value=l1_result)
    mock_llm.compress_l1_to_l2 = AsyncMock(return_value=l2_result)
    mock_llm.generate_stub = AsyncMock(return_value=stub_text)
    return mock_llm
|
|
|
|
|
|
def _make_session() -> Session:
    """Create a minimal Session for testing (mocked dependencies).

    The instance is allocated with ``Session.__new__`` so ``Session.__init__``
    never runs; only the attributes assigned below exist on the result:
    ``id``, ``token_state``, ``fidelity_manager``, ``_fidelity_content_map``
    and ``_summary_cache``.

    Returns:
        A bare ``Session`` with a fresh ``FidelityManager`` and empty caches.
    """
    # Session.__init__ imports many things; bypass it and set what we need.
    # No temporary directory is created: nothing assigned here takes a path.
    session = Session.__new__(Session)
    session.id = "test-sess"
    session.token_state = {"last_effective": 0, "blocked": False, "turn": 5}
    session.fidelity_manager = FidelityManager(window_size=200_000)
    session._fidelity_content_map = {}
    session._summary_cache = {}
    return session
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _generate_degradation_summaries tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestGenerateDegradationSummaries:
    """Test the async summarization function directly.

    Each test registers one or more objects with the session's
    FidelityManager, passes a list of ``(object_id, from_level, to_level)``
    transitions to ``_generate_degradation_summaries`` and then inspects the
    mocked helper, the object fields and ``session._summary_cache``.

    NOTE(review): the async tests carry no explicit asyncio marker; presumably
    the project's pytest configuration runs coroutine tests automatically —
    confirm.
    """

    async def test_l0_to_l1_calls_summarize(self) -> None:
        """L0→L1 transition calls helper_llm.summarize_l0_to_l1()."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager

        obj = _make_obj(content="Full content for summarization " * 20)
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]
        await _generate_degradation_summaries(transitions, session, helper)

        # Verify LLM was called
        helper.summarize_l0_to_l1.assert_called_once()
        call_kwargs = helper.summarize_l0_to_l1.call_args.kwargs
        assert call_kwargs["object_type"] == "file_context"
        assert call_kwargs["max_summary_tokens"] == 1024

        # Verify summary was stored on the object
        assert obj.summary_detailed == "LLM-generated L1 summary of the content."
        assert obj.losses_l1 == ["exact error codes"]
        assert obj.can_answer == ["what was implemented"]
        assert obj.key_entities == ["src/main.py"]

        # Verify cache was populated
        assert (obj_id, int(FidelityLevel.L1)) in session._summary_cache
        assert session._summary_cache[(obj_id, int(FidelityLevel.L1))] == (
            "LLM-generated L1 summary of the content."
        )

    async def test_l1_to_l2_calls_compress(self) -> None:
        """L1→L2 transition calls helper_llm.compress_l1_to_l2()."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager

        obj = _make_obj(
            summary_detailed="Existing L1 detailed summary of auth middleware.",
        )
        obj.losses_l1 = ["line numbers"]
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L1, FidelityLevel.L2)]
        await _generate_degradation_summaries(transitions, session, helper)

        # Verify LLM was called with L1 text
        helper.compress_l1_to_l2.assert_called_once()
        call_kwargs = helper.compress_l1_to_l2.call_args.kwargs
        assert call_kwargs["l1_summary"] == "Existing L1 detailed summary of auth middleware."
        assert call_kwargs["l1_losses"] == ["line numbers"]
        assert call_kwargs["object_type"] == "file_context"

        # Verify compact summary was stored
        assert obj.summary_compact == "LLM-generated compact L2 summary."
        assert (obj_id, int(FidelityLevel.L2)) in session._summary_cache

    async def test_l1_to_l2_uses_cache_for_l1_text(self) -> None:
        """L1→L2 uses cached L1 summary when obj.summary_detailed is None."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager

        obj = _make_obj()
        obj.summary_detailed = None  # No pre-set L1 summary
        obj_id = fm.register_object(obj)

        # Pre-populate cache with L1 summary
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Cached L1 summary text."

        transitions = [(obj_id, FidelityLevel.L1, FidelityLevel.L2)]
        await _generate_degradation_summaries(transitions, session, helper)

        # Guard first: without a call, ``call_args`` is None and the kwargs
        # lookup below would fail with an opaque AttributeError.
        helper.compress_l1_to_l2.assert_called_once()

        # Should use cached L1 text
        call_kwargs = helper.compress_l1_to_l2.call_args.kwargs
        assert call_kwargs["l1_summary"] == "Cached L1 summary text."

    async def test_l2_to_l3_no_llm_call(self) -> None:
        """L2→L3 generates a simple stub without calling the LLM."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager

        obj = _make_obj(object_type="tool_result")
        obj.created_at_turn = 7
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L2, FidelityLevel.L3)]
        await _generate_degradation_summaries(transitions, session, helper)

        # No LLM calls should have been made
        helper.summarize_l0_to_l1.assert_not_called()
        helper.compress_l1_to_l2.assert_not_called()
        helper.generate_stub.assert_not_called()

        # Stub should be set
        assert obj.stub == "[evicted: tool_result from turn 7]"
        assert (obj_id, int(FidelityLevel.L3)) in session._summary_cache

    async def test_l3_to_l4_no_action(self) -> None:
        """L3→L4 (full eviction) doesn't generate any content."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager

        obj = _make_obj()
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L3, FidelityLevel.L4)]
        await _generate_degradation_summaries(transitions, session, helper)

        # No LLM calls
        helper.summarize_l0_to_l1.assert_not_called()
        helper.compress_l1_to_l2.assert_not_called()

        # No cache entry for L4
        assert (obj_id, int(FidelityLevel.L4)) not in session._summary_cache

    async def test_fallback_on_llm_exception(self) -> None:
        """If HelperLLM raises an exception, fall back silently."""
        session = _make_session()
        helper = _make_mock_helper()
        helper.summarize_l0_to_l1 = AsyncMock(side_effect=RuntimeError("API down"))
        fm = session.fidelity_manager

        obj = _make_obj()
        original_stub = obj.stub
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]
        # Should NOT raise
        await _generate_degradation_summaries(transitions, session, helper)

        # Cache should NOT have an entry (fallback means no summary generated)
        assert (obj_id, int(FidelityLevel.L1)) not in session._summary_cache
        # Original stub should be preserved
        assert obj.stub == original_stub

    async def test_fallback_on_empty_summary(self) -> None:
        """If HelperLLM returns empty summary, don't cache it."""
        session = _make_session()
        helper = _make_mock_helper()
        helper.summarize_l0_to_l1 = AsyncMock(
            return_value=SummaryResult(summary="", losses=[], can_answer=[], key_entities=[])
        )
        fm = session.fidelity_manager

        obj = _make_obj()
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]
        await _generate_degradation_summaries(transitions, session, helper)

        # Empty summary should NOT be cached
        assert (obj_id, int(FidelityLevel.L1)) not in session._summary_cache
        # summary_detailed should NOT be set to empty (None is acceptable:
        # ``None != ""`` holds, so no separate None check is needed).
        assert obj.summary_detailed != ""

    async def test_summary_cache_hit_skips_llm(self) -> None:
        """If summary is already cached, don't call the LLM again."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager

        obj = _make_obj()
        obj_id = fm.register_object(obj)

        # Pre-populate cache
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Already cached summary."

        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]
        await _generate_degradation_summaries(transitions, session, helper)

        # LLM should NOT have been called
        helper.summarize_l0_to_l1.assert_not_called()

    async def test_none_helper_llm_is_noop(self) -> None:
        """If helper_llm is None, the function is a no-op."""
        session = _make_session()
        fm = session.fidelity_manager

        obj = _make_obj()
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]
        # Should not raise
        await _generate_degradation_summaries(transitions, session, None)

        # No cache entries
        assert len(session._summary_cache) == 0

    async def test_multiple_transitions_processed(self) -> None:
        """Multiple transitions in one batch are all processed."""
        session = _make_session()
        helper = _make_mock_helper()
        fm = session.fidelity_manager

        obj1 = _make_obj(content="Content A " * 50)
        obj2 = _make_obj(content="Content B " * 50, object_type="tool_result")
        obj2.summary_detailed = "L1 summary for obj2"
        obj2.created_at_turn = 3

        id1 = fm.register_object(obj1)
        id2 = fm.register_object(obj2)

        transitions = [
            (id1, FidelityLevel.L0, FidelityLevel.L1),
            (id2, FidelityLevel.L2, FidelityLevel.L3),
        ]
        await _generate_degradation_summaries(transitions, session, helper)

        # obj1: L0→L1 should have called summarize
        helper.summarize_l0_to_l1.assert_called_once()
        assert (id1, int(FidelityLevel.L1)) in session._summary_cache

        # obj2: L2→L3 should have set stub without LLM
        assert obj2.stub == "[evicted: tool_result from turn 3]"
        assert (id2, int(FidelityLevel.L3)) in session._summary_cache
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _apply_fidelity integration tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestApplyFidelityWithSummaries:
    """Test that _apply_fidelity uses LLM-generated summaries.

    Each test registers an object, seeds ``session._fidelity_content_map``
    with a key pointing at it, sets the object's fidelity level (and
    optionally a cached summary), then calls ``_apply_fidelity`` on a payload
    and checks the text that ends up in the payload.
    """

    @staticmethod
    def _text_content_key(content: str) -> str:
        """Return the ``text:<digest>`` map key these tests seed for a text block.

        The digest is the first 12 hex characters of the SHA-256 of the first
        200 characters of ``content``.
        NOTE(review): presumably this mirrors the key the gateway derives for
        text blocks — confirm against ``mnemosyne.gateway``.
        """
        import hashlib

        digest = hashlib.sha256(content[:200].encode()).hexdigest()[:12]
        return f"text:{digest}"

    @staticmethod
    def _single_text_payload(role: str, content: str) -> dict:
        """Build a payload holding one message with a single text block."""
        return {
            "messages": [
                {
                    "role": role,
                    "content": [{"type": "text", "text": content}],
                }
            ]
        }

    def test_l1_object_uses_summary_from_cache(self) -> None:
        """An L1 object should have its content replaced with the cached summary."""
        session = _make_session()
        fm = session.fidelity_manager

        # Create and register an object
        content = "x" * 800
        obj = _make_obj(content=content)
        obj_id = fm.register_object(obj)

        # Set up content map (simulate previous registration)
        session._fidelity_content_map[self._text_content_key(content)] = obj_id

        # Degrade to L1 and cache a summary
        obj.current_fidelity = FidelityLevel.L1
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Cached L1 summary."

        # Build payload with the original content
        payload = self._single_text_payload("assistant", content)

        _apply_fidelity(payload, session)

        # Content should be replaced with the cached summary
        assert payload["messages"][0]["content"][0]["text"] == "Cached L1 summary."

    def test_l2_object_uses_compact_summary(self) -> None:
        """An L2 object should have its content replaced with the compact summary."""
        session = _make_session()
        fm = session.fidelity_manager

        content = "y" * 800
        obj = _make_obj(content=content)
        obj.summary_compact = "Pre-set compact summary."
        obj_id = fm.register_object(obj)

        session._fidelity_content_map[self._text_content_key(content)] = obj_id

        obj.current_fidelity = FidelityLevel.L2

        payload = self._single_text_payload("user", content)

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["text"] == "Pre-set compact summary."

    def test_l1_object_falls_back_to_full_content(self) -> None:
        """If no L1 summary is available, keep full content."""
        session = _make_session()
        fm = session.fidelity_manager

        content = "z" * 800
        obj = _make_obj(content=content)
        obj.summary_detailed = None  # No summary available
        obj_id = fm.register_object(obj)

        session._fidelity_content_map[self._text_content_key(content)] = obj_id

        obj.current_fidelity = FidelityLevel.L1

        payload = self._single_text_payload("assistant", content)

        _apply_fidelity(payload, session)

        # Should keep original content as fallback
        assert payload["messages"][0]["content"][0]["text"] == content

    def test_l3_object_uses_stub(self) -> None:
        """L3 objects should use stub text (unchanged behavior)."""
        session = _make_session()
        fm = session.fidelity_manager

        content = "w" * 800
        obj = _make_obj(content=content, stub="[evicted content: test stub]")
        obj_id = fm.register_object(obj)

        session._fidelity_content_map[self._text_content_key(content)] = obj_id

        obj.current_fidelity = FidelityLevel.L3

        payload = self._single_text_payload("assistant", content)

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["text"] == "[evicted content: test stub]"

    def test_l2_cache_takes_priority_over_object_field(self) -> None:
        """Cache entry should take priority over obj.summary_compact."""
        session = _make_session()
        fm = session.fidelity_manager

        content = "a" * 800
        obj = _make_obj(content=content)
        obj.summary_compact = "Old compact summary."
        obj_id = fm.register_object(obj)

        session._fidelity_content_map[self._text_content_key(content)] = obj_id

        obj.current_fidelity = FidelityLevel.L2
        session._summary_cache[(obj_id, int(FidelityLevel.L2))] = "Fresh cached L2 summary."

        payload = self._single_text_payload("assistant", content)

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["text"] == "Fresh cached L2 summary."

    def test_tool_result_block_uses_summary(self) -> None:
        """Tool result blocks should also get their content replaced."""
        session = _make_session()
        fm = session.fidelity_manager

        content = "tool output " * 100
        obj = _make_obj(content=content, object_type="tool_result")
        obj_id = fm.register_object(obj)

        # Tool results are keyed by the tool_use_id used in the payload below.
        content_key = "tool:use_123"
        session._fidelity_content_map[content_key] = obj_id

        obj.current_fidelity = FidelityLevel.L1
        session._summary_cache[(obj_id, int(FidelityLevel.L1))] = "Tool result L1 summary."

        payload = {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "tool_result",
                            "tool_use_id": "use_123",
                            "content": content,
                        }
                    ],
                }
            ]
        }

        _apply_fidelity(payload, session)

        assert payload["messages"][0]["content"][0]["content"] == "Tool result L1 summary."
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Async non-blocking behavior test
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestAsyncNonBlocking:
    """Verify summarization runs asynchronously without blocking."""

    async def test_summarization_completes_independently(self) -> None:
        """The summarization coroutine completes without blocking the caller.

        The summarizer is wrapped with a short sleep so the task has to
        suspend; the test checks that it regains control while the task is
        still pending, and that the cache is populated once the task is
        awaited.
        """
        session = _make_session()
        helper = _make_mock_helper()

        # Add a small delay to simulate LLM latency
        original_summarize = helper.summarize_l0_to_l1

        async def slow_summarize(**kwargs):
            await asyncio.sleep(0.01)  # 10ms simulated latency
            return await original_summarize(**kwargs)

        helper.summarize_l0_to_l1 = slow_summarize

        fm = session.fidelity_manager
        obj = _make_obj()
        obj_id = fm.register_object(obj)

        transitions = [(obj_id, FidelityLevel.L0, FidelityLevel.L1)]

        # Run as a task — should complete without issues
        task = asyncio.create_task(_generate_degradation_summaries(transitions, session, helper))

        # Yield once so the task gets a chance to start. Control returning
        # here while the task is unfinished is the non-blocking property:
        # the simulated latency cannot have elapsed within a single yield.
        await asyncio.sleep(0)
        assert not task.done()

        await task

        # Summary should be cached after completion
        assert (obj_id, int(FidelityLevel.L1)) in session._summary_cache
|