"""Tests for the Mnemosyne benchmark module.""" from __future__ import annotations import time from mnemosyne.benchmark import ( AdmissionMetrics, BenchmarkCollector, EntropyMetrics, FidelityMetrics, GoalMetrics, LatencyTracker, MicroFaultMetrics, SegmentationMetrics, SessionBenchmark, Timer, TokenMetrics, ) # ── LatencyTracker ────────────────────────────────────────────────────── class TestLatencyTracker: def test_empty(self): t = LatencyTracker() assert t.count == 0 assert t.avg_ms == 0.0 d = t.to_dict() assert d["count"] == 0 assert d["avg_ms"] == 0 def test_record_single(self): t = LatencyTracker() t.record(5.0) assert t.count == 1 assert t.avg_ms == 5.0 assert t.min_ms == 5.0 assert t.max_ms == 5.0 def test_record_multiple(self): t = LatencyTracker() t.record(2.0) t.record(8.0) t.record(5.0) assert t.count == 3 assert t.avg_ms == 5.0 assert t.min_ms == 2.0 assert t.max_ms == 8.0 def test_to_dict(self): t = LatencyTracker() t.record(10.0) t.record(20.0) d = t.to_dict() assert d["count"] == 2 assert d["avg_ms"] == 15.0 assert d["min_ms"] == 10.0 assert d["max_ms"] == 20.0 assert d["total_ms"] == 30.0 # ── Timer ─────────────────────────────────────────────────────────────── class TestTimer: def test_timer_records(self): tracker = LatencyTracker() with Timer(tracker) as t: time.sleep(0.01) # 10ms assert t.elapsed_ms > 5 # at least 5ms assert tracker.count == 1 assert tracker.total_ms > 5 def test_timer_records_on_exception(self): tracker = LatencyTracker() try: with Timer(tracker): raise ValueError("test") except ValueError: pass assert tracker.count == 1 # ── FidelityMetrics ───────────────────────────────────────────────────── class TestFidelityMetrics: def test_empty(self): m = FidelityMetrics() assert m.degradations == 0 assert m.upgrades == 0 def test_record_degradation(self): m = FidelityMetrics() m.record_transition(0, 1) # L0 -> L1 assert m.degradations == 1 assert m.upgrades == 0 assert m.transitions_by_level["L0->L1"] == 1 def test_record_upgrade(self): m = FidelityMetrics() m.record_transition(2, 0) # L2 -> L0 assert m.degradations == 0 assert m.upgrades == 1 assert m.transitions_by_level["L2->L0"] == 1 def test_multiple_transitions(self): m = FidelityMetrics() m.record_transition(0, 1) m.record_transition(0, 1) m.record_transition(1, 2) m.record_transition(2, 0) assert m.degradations == 3 assert m.upgrades == 1 assert m.transitions_by_level["L0->L1"] == 2 def test_snapshot_levels(self): m = FidelityMetrics() m.snapshot_levels({0: 5, 1: 3, 2: 2}, {0: 5000, 1: 1500, 2: 200}) assert m.objects_by_level["L0"] == 5 assert m.tokens_by_level["L1"] == 1500 def test_to_dict(self): m = FidelityMetrics() m.record_transition(0, 1) d = m.to_dict() assert "degradations" in d assert "transitions_by_level" in d # ── AdmissionMetrics ──────────────────────────────────────────────────── class TestAdmissionMetrics: def test_empty(self): m = AdmissionMetrics() assert m.rejection_rate == 0.0 assert m.avg_score == 0.0 def test_record_admitted(self): m = AdmissionMetrics() m.record_decision(True, 0.8, "file_context") assert m.admitted == 1 assert m.rejected == 0 assert m.rejection_rate == 0.0 def test_record_rejected(self): m = AdmissionMetrics() m.record_decision(False, 0.2, "tool_result") assert m.admitted == 0 assert m.rejected == 1 assert m.rejection_rate == 1.0 assert m.rejected_by_type["tool_result"] == 1 def test_mixed_decisions(self): m = AdmissionMetrics() m.record_decision(True, 0.8, "file_context") m.record_decision(True, 0.7, "plan") m.record_decision(False, 0.2, "tool_result") assert m.total_evaluated == 3 assert m.admitted == 2 assert m.rejected == 1 assert abs(m.rejection_rate - 1 / 3) < 0.01 def test_to_dict(self): m = AdmissionMetrics() m.record_decision(True, 0.8, "file_context") d = m.to_dict() assert d["admitted"] == 1 assert d["rejection_rate"] == 0.0 # ── MicroFaultMetrics ─────────────────────────────────────────────────── class TestMicroFaultMetrics: def test_empty(self): m = MicroFaultMetrics() assert m.success_rate == 0.0 assert m.avg_tokens_saved == 0.0 def test_record_fault(self): m = MicroFaultMetrics() m.record_fault(tokens_answer=50, tokens_avoided=3000, success=True) assert m.attempts == 1 assert m.successes == 1 assert m.tokens_saved == 3000 assert m.tokens_used == 50 assert m.success_rate == 1.0 def test_mixed_faults(self): m = MicroFaultMetrics() m.record_fault(50, 3000, success=True) m.record_fault(60, 2000, success=False) assert m.attempts == 2 assert m.successes == 1 assert m.success_rate == 0.5 assert m.tokens_saved == 5000 def test_to_dict_savings_ratio(self): m = MicroFaultMetrics() m.record_fault(100, 900, success=True) d = m.to_dict() assert d["savings_ratio"] == 0.9 # 900 / (900 + 100) # ── SegmentationMetrics ───────────────────────────────────────────────── class TestSegmentationMetrics: def test_empty(self): m = SegmentationMetrics() assert m.total_objects_created == 0 assert m.avg_object_size == 0.0 def test_record_objects(self): m = SegmentationMetrics() m.record_object("file_context", 500) m.record_object("file_context", 1000) m.record_object("tool_result", 200) assert m.total_objects_created == 3 assert m.objects_by_type["file_context"] == 2 assert m.objects_by_type["tool_result"] == 1 assert m.total_tokens_stored == 1700 def test_to_dict_percentiles(self): m = SegmentationMetrics() for i in range(10): m.record_object("test", (i + 1) * 100) d = m.to_dict() assert d["min_object_size_tokens"] == 100 assert d["max_object_size_tokens"] == 1000 assert d["p50_object_size_tokens"] == 600 # nearest-rank: index 5 of [100..1000] # ── GoalMetrics ───────────────────────────────────────────────────────── class TestGoalMetrics: def test_empty(self): m = GoalMetrics() assert m.topic_shifts_detected == 0 def test_record_events(self): m = GoalMetrics() m.record_topic_shift() m.record_reclassification() m.record_promotion(3) d = m.to_dict() assert d["topic_shifts_detected"] == 1 assert d["goal_reclassifications"] == 1 assert d["promotions_triggered"] == 3 # ── EntropyMetrics ────────────────────────────────────────────────────── class TestEntropyMetrics: def test_empty(self): m = EntropyMetrics() assert m.trigger_rate == 0.0 def test_record_checks(self): m = EntropyMetrics() m.record_check(triggered=False) m.record_check(triggered=True, entities_count=2) m.record_check(triggered=False) assert m.checks == 3 assert m.triggers == 1 assert m.entities_faulted == 2 assert abs(m.trigger_rate - 1 / 3) < 0.01 # ── TokenMetrics ──────────────────────────────────────────────────────── class TestTokenMetrics: def test_empty(self): m = TokenMetrics() assert m.turns == 0 assert m.context_reduction_ratio == 1.0 def test_record_turn(self): m = TokenMetrics() m.record_turn( input_tokens=10000, effective_tokens=15000, cache_read=3000, cache_create=2000, incoming_bytes=50000, outgoing_bytes=30000, ) assert m.turns == 1 assert m.total_input_tokens == 10000 assert m.total_effective_tokens == 15000 assert m.total_cache_read == 3000 def test_context_reduction(self): m = TokenMetrics() m.record_turn(0, 0, 0, 0, incoming_bytes=10000, outgoing_bytes=4000) assert m.context_reduction_ratio == 0.4 # 60% reduction d = m.to_dict() assert d["context_reduction_pct"] == 60.0 def test_cache_hit_rate(self): m = TokenMetrics() m.record_turn(0, 0, cache_read=8000, cache_create=2000, incoming_bytes=0, outgoing_bytes=0) assert m.avg_cache_hit_rate == 0.8 def test_multiple_turns(self): m = TokenMetrics() m.record_turn(5000, 7000, 1500, 500, 20000, 15000) m.record_turn(8000, 12000, 3000, 1000, 40000, 20000) assert m.turns == 2 assert m.total_input_tokens == 13000 d = m.to_dict() assert d["peak_input_tokens"] == 8000 # ── SessionBenchmark ──────────────────────────────────────────────────── class TestSessionBenchmark: def test_creation(self): b = SessionBenchmark("test-session") assert b.session_id == "test-session" assert b.elapsed_seconds >= 0 def test_to_dict_has_all_sections(self): b = SessionBenchmark("test-session") d = b.to_dict() assert "session_id" in d assert "tokens" in d assert "fidelity" in d assert "admission" in d assert "micro_faults" in d assert "segmentation" in d assert "goals" in d assert "entropy" in d assert "latency" in d def test_latency_trackers_initialized(self): b = SessionBenchmark("test") assert "preprocess" in b.latency assert "segmentation" in b.latency assert "embedding" in b.latency assert "admission" in b.latency assert "goal_classification" in b.latency assert "entropy_check" in b.latency assert "memory_query" in b.latency assert "helper_llm" in b.latency # ── BenchmarkCollector ────────────────────────────────────────────────── class TestBenchmarkCollector: def test_empty_aggregate(self): c = BenchmarkCollector() result = c.aggregate() assert result["sessions"] == 0 def test_get_session_creates(self): c = BenchmarkCollector() s = c.get_session("s1") assert s.session_id == "s1" # Getting again returns same instance s2 = c.get_session("s1") assert s is s2 def test_aggregate_single_session(self): c = BenchmarkCollector() s = c.get_session("s1") s.tokens.record_turn(5000, 7000, 1500, 500, 20000, 12000) s.tokens.record_turn(8000, 12000, 3000, 1000, 40000, 18000) s.fidelity.record_transition(0, 1) s.admission.record_decision(True, 0.8, "file_context") s.admission.record_decision(False, 0.2, "tool_result") result = c.aggregate() assert result["sessions"] == 1 assert result["total_turns"] == 2 assert result["tokens"]["total_input_tokens"] == 13000 assert result["fidelity"]["total_degradations"] == 1 assert result["admission"]["total_admitted"] == 1 assert result["admission"]["total_rejected"] == 1 def test_aggregate_multiple_sessions(self): c = BenchmarkCollector() s1 = c.get_session("s1") s1.tokens.record_turn(5000, 7000, 1500, 500, 20000, 12000) s1.fidelity.record_transition(0, 1) s2 = c.get_session("s2") s2.tokens.record_turn(3000, 4000, 800, 200, 10000, 7000) s2.fidelity.record_transition(1, 2) result = c.aggregate() assert result["sessions"] == 2 assert result["total_turns"] == 2 assert result["tokens"]["total_input_tokens"] == 8000 assert result["fidelity"]["total_degradations"] == 2 def test_session_report(self): c = BenchmarkCollector() s = c.get_session("s1") s.segmentation.record_object("file_context", 500) report = c.session_report("s1") assert report is not None assert report["segmentation"]["total_objects_created"] == 1 def test_session_report_not_found(self): c = BenchmarkCollector() assert c.session_report("nonexistent") is None def test_all_sessions(self): c = BenchmarkCollector() c.get_session("s1") c.get_session("s2") all_s = c.all_sessions() assert len(all_s) == 2 assert "s1" in all_s assert "s2" in all_s def test_reset(self): c = BenchmarkCollector() c.get_session("s1") c.reset() assert c.aggregate()["sessions"] == 0 def test_context_reduction_calculation(self): c = BenchmarkCollector() s = c.get_session("s1") # 100k incoming, 20k outgoing = 80% reduction s.tokens.record_turn(0, 0, 0, 0, 100000, 20000) result = c.aggregate() assert result["tokens"]["context_reduction_pct"] == 80.0 # ── suggest_thresholds ────────────────────────────────────────────────── from mnemosyne.benchmark import suggest_thresholds class TestSuggestThresholds: def test_high_rejection_rate_suggests_lower_threshold(self): metrics = { "admission": {"rejection_rate": 0.45, "total_admitted": 55, "total_rejected": 45}, "fidelity": {"total_degradations": 10, "total_upgrades": 10}, "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500}, } result = suggest_thresholds(metrics) assert result["admission_threshold"]["status"] == "adjust" assert "lowering" in result["admission_threshold"]["reason"] assert ( result["admission_threshold"]["suggested_value"] < result["admission_threshold"]["current_value"] ) def test_low_rejection_rate_suggests_higher_threshold(self): metrics = { "admission": {"rejection_rate": 0.05, "total_admitted": 95, "total_rejected": 5}, "fidelity": {"total_degradations": 5, "total_upgrades": 5}, "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500}, } result = suggest_thresholds(metrics) assert result["admission_threshold"]["status"] == "adjust" assert "raising" in result["admission_threshold"]["reason"] def test_balanced_metrics_all_ok(self): metrics = { "admission": {"rejection_rate": 0.20, "total_admitted": 80, "total_rejected": 20}, "entropy": {"trigger_rate": 0.12, "checks": 100, "triggers": 12}, "fidelity": {"total_degradations": 10, "total_upgrades": 8}, "micro_faults": { "total_attempts": 20, "success_rate": 0.75, "total_tokens_saved": 1000, }, } result = suggest_thresholds(metrics) assert result["admission_threshold"]["status"] == "ok" assert result["entropy_sensitivity"]["status"] == "ok" assert result["fidelity_pressure"]["status"] == "ok" assert result["micro_fault_quality"]["status"] == "ok" def test_high_entropy_trigger_rate_suggests_adjustment(self): metrics = { "admission": {"rejection_rate": 0.20}, "entropy": {"trigger_rate": 0.35, "checks": 100, "triggers": 35}, "fidelity": {"total_degradations": 5, "total_upgrades": 5}, "micro_faults": {"total_attempts": 0, "success_rate": 0.0}, } result = suggest_thresholds(metrics) assert result["entropy_sensitivity"]["status"] == "adjust" assert "lowering" in result["entropy_sensitivity"]["reason"] def test_low_entropy_trigger_rate_suggests_raising(self): metrics = { "admission": {"rejection_rate": 0.20}, "entropy": {"trigger_rate": 0.02, "checks": 100, "triggers": 2}, "fidelity": {"total_degradations": 5, "total_upgrades": 5}, "micro_faults": {"total_attempts": 0, "success_rate": 0.0}, } result = suggest_thresholds(metrics) assert result["entropy_sensitivity"]["status"] == "adjust" assert "raising" in result["entropy_sensitivity"]["reason"] def test_high_fidelity_pressure_suggests_adjustment(self): metrics = { "admission": {"rejection_rate": 0.20}, "fidelity": {"total_degradations": 60, "total_upgrades": 5}, "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500}, } result = suggest_thresholds(metrics) assert result["fidelity_pressure"]["status"] == "adjust" assert "over-pressured" in result["fidelity_pressure"]["reason"] def test_too_conservative_fidelity(self): metrics = { "admission": {"rejection_rate": 0.20}, "fidelity": {"total_degradations": 2, "total_upgrades": 20}, "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500}, } result = suggest_thresholds(metrics) assert result["fidelity_pressure"]["status"] == "adjust" assert "conservative" in result["fidelity_pressure"]["reason"] def test_low_micro_fault_success_rate(self): metrics = { "admission": {"rejection_rate": 0.20}, "fidelity": {"total_degradations": 5, "total_upgrades": 5}, "micro_faults": {"total_attempts": 20, "success_rate": 0.30, "total_tokens_saved": 100}, } result = suggest_thresholds(metrics) assert result["micro_fault_quality"]["status"] == "adjust" assert "below 50%" in result["micro_fault_quality"]["reason"] def test_no_entropy_checks_is_ok(self): metrics = { "admission": {"rejection_rate": 0.20}, "fidelity": {"total_degradations": 5, "total_upgrades": 5}, "micro_faults": {"total_attempts": 0, "success_rate": 0.0}, } result = suggest_thresholds(metrics) assert result["entropy_sensitivity"]["status"] == "ok" assert "No entropy checks" in result["entropy_sensitivity"]["reason"] def test_no_micro_fault_attempts_is_ok(self): metrics = { "admission": {"rejection_rate": 0.20}, "fidelity": {"total_degradations": 5, "total_upgrades": 5}, "micro_faults": {"total_attempts": 0, "success_rate": 0.0}, } result = suggest_thresholds(metrics) assert result["micro_fault_quality"]["status"] == "ok" def test_empty_metrics(self): result = suggest_thresholds({}) # Should not crash, all should be "ok" with defaults assert result["admission_threshold"]["status"] == "ok" assert result["entropy_sensitivity"]["status"] == "ok" assert result["fidelity_pressure"]["status"] == "ok" assert result["micro_fault_quality"]["status"] == "ok" # ── Benchmark CLI ─────────────────────────────────────────────────────── from unittest.mock import patch, MagicMock import json as json_mod import httpx from mnemosyne.benchmark_cli import ( format_aggregate_report, format_session_report, run_benchmark_cli, _box_top, _box_bottom, _box_sep, _box_line, ) class TestBenchmarkCLIFormatting: def test_box_drawing_characters(self): top = _box_top() assert top.startswith("\u2554") assert top.endswith("\u2557") bottom = _box_bottom() assert bottom.startswith("\u255a") assert bottom.endswith("\u255d") sep = _box_sep() assert sep.startswith("\u2560") assert sep.endswith("\u2563") def test_box_line_pads_correctly(self): line = _box_line("hello") assert line.startswith("\u2551") assert line.endswith("\u2551") assert "hello" in line def test_format_aggregate_report_structure(self): data = { "type": "aggregate", "sessions": 3, "total_turns": 147, "tokens": { "total_input_tokens": 50000, "total_effective_tokens": 70000, "total_cache_read_tokens": 892100, "context_reduction_ratio": 0.268, "context_reduction_pct": 73.2, "avg_input_tokens_per_turn": 12450.0, }, "fidelity": {"total_degradations": 45, "total_upgrades": 12}, "admission": { "total_admitted": 312, "total_rejected": 89, "rejection_rate": 0.222, }, "micro_faults": { "total_attempts": 23, "total_successes": 18, "success_rate": 0.783, "total_tokens_saved": 45200, }, "latency": { "segmentation": {"total_calls": 50, "avg_ms": 12.3, "min_ms": 5.1, "max_ms": 45.2}, "admission": {"total_calls": 40, "avg_ms": 2.1, "min_ms": 0.8, "max_ms": 8.3}, }, } report = format_aggregate_report(data) assert "Mnemosyne Benchmark Report" in report assert "Sessions: 3" in report assert "Total Turns: 147" in report assert "TOKEN SAVINGS" in report assert "73.2%" in report assert "FIDELITY" in report assert "Degradations: 45" in report assert "ADMISSION CONTROL" in report assert "MICRO-FAULTS" in report assert "LATENCY" in report assert "segmentation" in report assert "THRESHOLD SUGGESTIONS" in report # Box drawing chars present assert "\u2554" in report assert "\u255d" in report def test_format_session_report_structure(self): data = { "type": "session", "session_id": "test-abc", "elapsed_seconds": 120.5, "tokens": { "turns": 10, "total_input_tokens": 5000, "context_reduction_pct": 60.0, "avg_input_tokens_per_turn": 500.0, "avg_cache_hit_rate": 0.75, }, "fidelity": {"degradations": 3, "upgrades": 1}, "admission": {"admitted": 20, "rejected": 5, "rejection_rate": 0.2}, "micro_faults": { "attempts": 5, "successes": 4, "success_rate": 0.8, "tokens_saved": 1000, }, "entropy": {"checks": 10, "triggers": 2, "trigger_rate": 0.2}, "latency": {}, } report = format_session_report(data) assert "Session: test-abc" in report assert "Elapsed: 120.5s" in report assert "TOKEN SAVINGS" in report assert "ENTROPY" in report def test_format_aggregate_report_empty_sessions(self): data = {"sessions": 0, "message": "No sessions recorded"} report = format_aggregate_report(data) assert "Sessions: 0" in report class TestBenchmarkCLIExecution: def test_run_benchmark_cli_json_output(self, capsys): mock_data = {"type": "aggregate", "sessions": 2, "total_turns": 50} mock_response = MagicMock() mock_response.json.return_value = mock_data mock_response.raise_for_status = MagicMock() with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response) as mock_get: run_benchmark_cli(port=8080, json_output=True) mock_get.assert_called_once_with("http://127.0.0.1:8080/api/benchmark", timeout=10.0) captured = capsys.readouterr() parsed = json_mod.loads(captured.out) assert parsed["sessions"] == 2 def test_run_benchmark_cli_with_session_id(self, capsys): mock_data = { "type": "session", "session_id": "s1", "elapsed_seconds": 10.0, "tokens": {}, "fidelity": {}, "admission": {}, "micro_faults": {}, "entropy": {}, "latency": {}, } mock_response = MagicMock() mock_response.json.return_value = mock_data mock_response.raise_for_status = MagicMock() with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response) as mock_get: run_benchmark_cli(port=9090, session_id="s1") mock_get.assert_called_once_with( "http://127.0.0.1:9090/api/benchmark?session_id=s1", timeout=10.0 ) captured = capsys.readouterr() assert "Session: s1" in captured.out def test_run_benchmark_cli_pretty_output(self, capsys): mock_data = { "type": "aggregate", "sessions": 1, "total_turns": 10, "tokens": { "context_reduction_pct": 50.0, "avg_input_tokens_per_turn": 1000, "total_cache_read_tokens": 5000, }, "fidelity": {"total_degradations": 2, "total_upgrades": 1}, "admission": {"total_admitted": 10, "total_rejected": 3, "rejection_rate": 0.23}, "micro_faults": { "total_attempts": 5, "total_successes": 4, "success_rate": 0.8, "total_tokens_saved": 2000, }, "latency": {}, } mock_response = MagicMock() mock_response.json.return_value = mock_data mock_response.raise_for_status = MagicMock() with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response): run_benchmark_cli(port=8080) captured = capsys.readouterr() assert "Mnemosyne Benchmark Report" in captured.out assert "\u2554" in captured.out # box top def test_run_benchmark_cli_connection_error(self): with patch("mnemosyne.benchmark_cli.httpx.get", side_effect=httpx.ConnectError("refused")): import pytest with pytest.raises(SystemExit) as exc_info: run_benchmark_cli(port=8080) assert exc_info.value.code == 1 class TestBenchmarkCLIArgParsing: def test_benchmark_flag_parsed(self): """Verify --benchmark is recognized by the argparse parser.""" import argparse # Reconstruct the parser as main() does parser = argparse.ArgumentParser() parser.add_argument("--benchmark", action="store_true") parser.add_argument("--session", type=str, default=None) parser.add_argument("--json-output", action="store_true") parser.add_argument("--port", type=int, default=0) args = parser.parse_args(["--benchmark"]) assert args.benchmark is True assert args.session is None assert args.json_output is False def test_benchmark_with_session_and_json(self): import argparse parser = argparse.ArgumentParser() parser.add_argument("--benchmark", action="store_true") parser.add_argument("--session", type=str, default=None) parser.add_argument("--json-output", action="store_true") parser.add_argument("--port", type=int, default=0) args = parser.parse_args( ["--benchmark", "--session", "abc-123", "--json-output", "--port", "9090"] ) assert args.benchmark is True assert args.session == "abc-123" assert args.json_output is True assert args.port == 9090 def test_benchmark_port_default(self): """When --benchmark is used without --port, port defaults to 0 (gateway treats as 8080).""" import argparse parser = argparse.ArgumentParser() parser.add_argument("--benchmark", action="store_true") parser.add_argument("--port", type=int, default=0) args = parser.parse_args(["--benchmark"]) # The gateway main() maps port=0 to 8080 for benchmark mode benchmark_port = args.port if args.port != 0 else 8080 assert benchmark_port == 8080