"""Tests for the Mnemosyne benchmark module."""

from __future__ import annotations

import time

from mnemosyne.benchmark import (
    AdmissionMetrics,
    BenchmarkCollector,
    EntropyMetrics,
    FidelityMetrics,
    GoalMetrics,
    LatencyTracker,
    MicroFaultMetrics,
    SegmentationMetrics,
    SessionBenchmark,
    Timer,
    TokenMetrics,
)


# ── LatencyTracker ──────────────────────────────────────────────────────


class TestLatencyTracker:
    def test_empty(self):
        t = LatencyTracker()
        assert t.count == 0
        assert t.avg_ms == 0.0
        d = t.to_dict()
        assert d["count"] == 0
        assert d["avg_ms"] == 0

    def test_record_single(self):
        t = LatencyTracker()
        t.record(5.0)
        assert t.count == 1
        assert t.avg_ms == 5.0
        assert t.min_ms == 5.0
        assert t.max_ms == 5.0

    def test_record_multiple(self):
        t = LatencyTracker()
        t.record(2.0)
        t.record(8.0)
        t.record(5.0)
        assert t.count == 3
        assert t.avg_ms == 5.0
        assert t.min_ms == 2.0
        assert t.max_ms == 8.0

    def test_to_dict(self):
        t = LatencyTracker()
        t.record(10.0)
        t.record(20.0)
        d = t.to_dict()
        assert d["count"] == 2
        assert d["avg_ms"] == 15.0
        assert d["min_ms"] == 10.0
        assert d["max_ms"] == 20.0
        assert d["total_ms"] == 30.0


# ── Timer ───────────────────────────────────────────────────────────────


class TestTimer:
    def test_timer_records(self):
        tracker = LatencyTracker()
        with Timer(tracker) as t:
            time.sleep(0.01)  # 10ms
        assert t.elapsed_ms > 5  # at least 5ms
        assert tracker.count == 1
        assert tracker.total_ms > 5

    def test_timer_records_on_exception(self):
        tracker = LatencyTracker()
        try:
            with Timer(tracker):
                raise ValueError("test")
        except ValueError:
            pass
        assert tracker.count == 1
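

# The tests above pin down the observable surface of LatencyTracker and Timer:
# record(ms) / count / avg_ms / min_ms / max_ms / total_ms / to_dict(), plus a
# context manager that records elapsed milliseconds even when the body raises.
# A minimal sketch consistent with that surface is shown below for reference
# only -- it is not the mnemosyne.benchmark implementation, and it is kept in
# a comment so it does not shadow the imported names:
#
#     class LatencyTracker:
#         def __init__(self) -> None:
#             self.count = 0
#             self.total_ms = 0.0
#             self.min_ms = 0.0
#             self.max_ms = 0.0
#
#         def record(self, ms: float) -> None:
#             self.min_ms = ms if self.count == 0 else min(self.min_ms, ms)
#             self.max_ms = max(self.max_ms, ms)
#             self.count += 1
#             self.total_ms += ms
#
#         @property
#         def avg_ms(self) -> float:
#             return self.total_ms / self.count if self.count else 0.0
#
#         def to_dict(self) -> dict:
#             return {
#                 "count": self.count,
#                 "avg_ms": self.avg_ms,
#                 "min_ms": self.min_ms,
#                 "max_ms": self.max_ms,
#                 "total_ms": self.total_ms,
#             }
#
#     class Timer:
#         def __init__(self, tracker: LatencyTracker) -> None:
#             self.tracker = tracker
#             self.elapsed_ms = 0.0
#
#         def __enter__(self) -> "Timer":
#             self._start = time.perf_counter()
#             return self
#
#         def __exit__(self, *exc) -> None:
#             # Record even on exception; returning None does not suppress it.
#             self.elapsed_ms = (time.perf_counter() - self._start) * 1000.0
#             self.tracker.record(self.elapsed_ms)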


# ── FidelityMetrics ─────────────────────────────────────────────────────


class TestFidelityMetrics:
    def test_empty(self):
        m = FidelityMetrics()
        assert m.degradations == 0
        assert m.upgrades == 0

    def test_record_degradation(self):
        m = FidelityMetrics()
        m.record_transition(0, 1)  # L0 -> L1
        assert m.degradations == 1
        assert m.upgrades == 0
        assert m.transitions_by_level["L0->L1"] == 1

    def test_record_upgrade(self):
        m = FidelityMetrics()
        m.record_transition(2, 0)  # L2 -> L0
        assert m.degradations == 0
        assert m.upgrades == 1
        assert m.transitions_by_level["L2->L0"] == 1

    def test_multiple_transitions(self):
        m = FidelityMetrics()
        m.record_transition(0, 1)
        m.record_transition(0, 1)
        m.record_transition(1, 2)
        m.record_transition(2, 0)
        assert m.degradations == 3
        assert m.upgrades == 1
        assert m.transitions_by_level["L0->L1"] == 2

    def test_snapshot_levels(self):
        m = FidelityMetrics()
        m.snapshot_levels({0: 5, 1: 3, 2: 2}, {0: 5000, 1: 1500, 2: 200})
        assert m.objects_by_level["L0"] == 5
        assert m.tokens_by_level["L1"] == 1500

    def test_to_dict(self):
        m = FidelityMetrics()
        m.record_transition(0, 1)
        d = m.to_dict()
        assert "degradations" in d
        assert "transitions_by_level" in d


# ── AdmissionMetrics ────────────────────────────────────────────────────


class TestAdmissionMetrics:
    def test_empty(self):
        m = AdmissionMetrics()
        assert m.rejection_rate == 0.0
        assert m.avg_score == 0.0

    def test_record_admitted(self):
        m = AdmissionMetrics()
        m.record_decision(True, 0.8, "file_context")
        assert m.admitted == 1
        assert m.rejected == 0
        assert m.rejection_rate == 0.0

    def test_record_rejected(self):
        m = AdmissionMetrics()
        m.record_decision(False, 0.2, "tool_result")
        assert m.admitted == 0
        assert m.rejected == 1
        assert m.rejection_rate == 1.0
        assert m.rejected_by_type["tool_result"] == 1

    def test_mixed_decisions(self):
        m = AdmissionMetrics()
        m.record_decision(True, 0.8, "file_context")
        m.record_decision(True, 0.7, "plan")
        m.record_decision(False, 0.2, "tool_result")
        assert m.total_evaluated == 3
        assert m.admitted == 2
        assert m.rejected == 1
        assert abs(m.rejection_rate - 1 / 3) < 0.01

    def test_to_dict(self):
        m = AdmissionMetrics()
        m.record_decision(True, 0.8, "file_context")
        d = m.to_dict()
        assert d["admitted"] == 1
        assert d["rejection_rate"] == 0.0


# ── MicroFaultMetrics ───────────────────────────────────────────────────


class TestMicroFaultMetrics:
    def test_empty(self):
        m = MicroFaultMetrics()
        assert m.success_rate == 0.0
        assert m.avg_tokens_saved == 0.0

    def test_record_fault(self):
        m = MicroFaultMetrics()
        m.record_fault(tokens_answer=50, tokens_avoided=3000, success=True)
        assert m.attempts == 1
        assert m.successes == 1
        assert m.tokens_saved == 3000
        assert m.tokens_used == 50
        assert m.success_rate == 1.0

    def test_mixed_faults(self):
        m = MicroFaultMetrics()
        m.record_fault(50, 3000, success=True)
        m.record_fault(60, 2000, success=False)
        assert m.attempts == 2
        assert m.successes == 1
        assert m.success_rate == 0.5
        assert m.tokens_saved == 5000

    def test_to_dict_savings_ratio(self):
        m = MicroFaultMetrics()
        m.record_fault(100, 900, success=True)
        d = m.to_dict()
        assert d["savings_ratio"] == 0.9  # 900 / (900 + 100)
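

# Editorial note: the savings_ratio assertion above implies the ratio is taken
# over tokens avoided versus tokens spent answering the micro-fault. Inferred
# from the test values, not from the implementation:
#
#     savings_ratio = tokens_saved / (tokens_saved + tokens_used)
#     # 900 / (900 + 100) == 0.9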


# ── SegmentationMetrics ─────────────────────────────────────────────────


class TestSegmentationMetrics:
    def test_empty(self):
        m = SegmentationMetrics()
        assert m.total_objects_created == 0
        assert m.avg_object_size == 0.0

    def test_record_objects(self):
        m = SegmentationMetrics()
        m.record_object("file_context", 500)
        m.record_object("file_context", 1000)
        m.record_object("tool_result", 200)
        assert m.total_objects_created == 3
        assert m.objects_by_type["file_context"] == 2
        assert m.objects_by_type["tool_result"] == 1
        assert m.total_tokens_stored == 1700

    def test_to_dict_percentiles(self):
        m = SegmentationMetrics()
        for i in range(10):
            m.record_object("test", (i + 1) * 100)
        d = m.to_dict()
        assert d["min_object_size_tokens"] == 100
        assert d["max_object_size_tokens"] == 1000
        assert d["p50_object_size_tokens"] == 600  # nearest-rank: index 5 of [100..1000]
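

# Editorial note on the p50 expectation above: with ten sizes 100..1000, a
# value of 600 matches taking the element at position len(sizes) // 2 of the
# sorted list (0-based index 5), e.g.
#
#     sorted([100 * (i + 1) for i in range(10)])[10 // 2]  # -> 600
#
# This is inferred from the asserted value, not from the implementation.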


# ── GoalMetrics ─────────────────────────────────────────────────────────


class TestGoalMetrics:
    def test_empty(self):
        m = GoalMetrics()
        assert m.topic_shifts_detected == 0

    def test_record_events(self):
        m = GoalMetrics()
        m.record_topic_shift()
        m.record_reclassification()
        m.record_promotion(3)
        d = m.to_dict()
        assert d["topic_shifts_detected"] == 1
        assert d["goal_reclassifications"] == 1
        assert d["promotions_triggered"] == 3


# ── EntropyMetrics ──────────────────────────────────────────────────────


class TestEntropyMetrics:
    def test_empty(self):
        m = EntropyMetrics()
        assert m.trigger_rate == 0.0

    def test_record_checks(self):
        m = EntropyMetrics()
        m.record_check(triggered=False)
        m.record_check(triggered=True, entities_count=2)
        m.record_check(triggered=False)
        assert m.checks == 3
        assert m.triggers == 1
        assert m.entities_faulted == 2
        assert abs(m.trigger_rate - 1 / 3) < 0.01


# ── TokenMetrics ────────────────────────────────────────────────────────


class TestTokenMetrics:
    def test_empty(self):
        m = TokenMetrics()
        assert m.turns == 0
        assert m.context_reduction_ratio == 1.0

    def test_record_turn(self):
        m = TokenMetrics()
        m.record_turn(
            input_tokens=10000,
            effective_tokens=15000,
            cache_read=3000,
            cache_create=2000,
            incoming_bytes=50000,
            outgoing_bytes=30000,
        )
        assert m.turns == 1
        assert m.total_input_tokens == 10000
        assert m.total_effective_tokens == 15000
        assert m.total_cache_read == 3000

    def test_context_reduction(self):
        m = TokenMetrics()
        m.record_turn(0, 0, 0, 0, incoming_bytes=10000, outgoing_bytes=4000)
        assert m.context_reduction_ratio == 0.4  # 60% reduction
        d = m.to_dict()
        assert d["context_reduction_pct"] == 60.0

    def test_cache_hit_rate(self):
        m = TokenMetrics()
        m.record_turn(0, 0, cache_read=8000, cache_create=2000, incoming_bytes=0, outgoing_bytes=0)
        assert m.avg_cache_hit_rate == 0.8

    def test_multiple_turns(self):
        m = TokenMetrics()
        m.record_turn(5000, 7000, 1500, 500, 20000, 15000)
        m.record_turn(8000, 12000, 3000, 1000, 40000, 20000)
        assert m.turns == 2
        assert m.total_input_tokens == 13000
        d = m.to_dict()
        assert d["peak_input_tokens"] == 8000
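

# Editorial note: the assertions above imply the following derived ratios,
# inferred from the test values rather than from the implementation:
#
#     context_reduction_ratio = outgoing_bytes / incoming_bytes     # 4000 / 10000 == 0.4
#     context_reduction_pct = (1 - context_reduction_ratio) * 100   # 60.0
#     cache_hit_rate = cache_read / (cache_read + cache_create)     # 8000 / 10000 == 0.8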


# ── SessionBenchmark ────────────────────────────────────────────────────


class TestSessionBenchmark:
    def test_creation(self):
        b = SessionBenchmark("test-session")
        assert b.session_id == "test-session"
        assert b.elapsed_seconds >= 0

    def test_to_dict_has_all_sections(self):
        b = SessionBenchmark("test-session")
        d = b.to_dict()
        assert "session_id" in d
        assert "tokens" in d
        assert "fidelity" in d
        assert "admission" in d
        assert "micro_faults" in d
        assert "segmentation" in d
        assert "goals" in d
        assert "entropy" in d
        assert "latency" in d

    def test_latency_trackers_initialized(self):
        b = SessionBenchmark("test")
        assert "preprocess" in b.latency
        assert "segmentation" in b.latency
        assert "embedding" in b.latency
        assert "admission" in b.latency
        assert "goal_classification" in b.latency
        assert "entropy_check" in b.latency
        assert "memory_query" in b.latency
        assert "helper_llm" in b.latency


# ── BenchmarkCollector ──────────────────────────────────────────────────


class TestBenchmarkCollector:
    def test_empty_aggregate(self):
        c = BenchmarkCollector()
        result = c.aggregate()
        assert result["sessions"] == 0

    def test_get_session_creates(self):
        c = BenchmarkCollector()
        s = c.get_session("s1")
        assert s.session_id == "s1"
        # Getting again returns same instance
        s2 = c.get_session("s1")
        assert s is s2

    def test_aggregate_single_session(self):
        c = BenchmarkCollector()
        s = c.get_session("s1")
        s.tokens.record_turn(5000, 7000, 1500, 500, 20000, 12000)
        s.tokens.record_turn(8000, 12000, 3000, 1000, 40000, 18000)
        s.fidelity.record_transition(0, 1)
        s.admission.record_decision(True, 0.8, "file_context")
        s.admission.record_decision(False, 0.2, "tool_result")

        result = c.aggregate()
        assert result["sessions"] == 1
        assert result["total_turns"] == 2
        assert result["tokens"]["total_input_tokens"] == 13000
        assert result["fidelity"]["total_degradations"] == 1
        assert result["admission"]["total_admitted"] == 1
        assert result["admission"]["total_rejected"] == 1

    def test_aggregate_multiple_sessions(self):
        c = BenchmarkCollector()
        s1 = c.get_session("s1")
        s1.tokens.record_turn(5000, 7000, 1500, 500, 20000, 12000)
        s1.fidelity.record_transition(0, 1)

        s2 = c.get_session("s2")
        s2.tokens.record_turn(3000, 4000, 800, 200, 10000, 7000)
        s2.fidelity.record_transition(1, 2)

        result = c.aggregate()
        assert result["sessions"] == 2
        assert result["total_turns"] == 2
        assert result["tokens"]["total_input_tokens"] == 8000
        assert result["fidelity"]["total_degradations"] == 2

    def test_session_report(self):
        c = BenchmarkCollector()
        s = c.get_session("s1")
        s.segmentation.record_object("file_context", 500)
        report = c.session_report("s1")
        assert report is not None
        assert report["segmentation"]["total_objects_created"] == 1

    def test_session_report_not_found(self):
        c = BenchmarkCollector()
        assert c.session_report("nonexistent") is None

    def test_all_sessions(self):
        c = BenchmarkCollector()
        c.get_session("s1")
        c.get_session("s2")
        all_s = c.all_sessions()
        assert len(all_s) == 2
        assert "s1" in all_s
        assert "s2" in all_s

    def test_reset(self):
        c = BenchmarkCollector()
        c.get_session("s1")
        c.reset()
        assert c.aggregate()["sessions"] == 0

    def test_context_reduction_calculation(self):
        c = BenchmarkCollector()
        s = c.get_session("s1")
        # 100k incoming, 20k outgoing = 80% reduction
        s.tokens.record_turn(0, 0, 0, 0, 100000, 20000)
        result = c.aggregate()
        assert result["tokens"]["context_reduction_pct"] == 80.0


# ── suggest_thresholds ──────────────────────────────────────────────────

from mnemosyne.benchmark import suggest_thresholds


class TestSuggestThresholds:
    def test_high_rejection_rate_suggests_lower_threshold(self):
        metrics = {
            "admission": {"rejection_rate": 0.45, "total_admitted": 55, "total_rejected": 45},
            "fidelity": {"total_degradations": 10, "total_upgrades": 10},
            "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
        }
        result = suggest_thresholds(metrics)
        assert result["admission_threshold"]["status"] == "adjust"
        assert "lowering" in result["admission_threshold"]["reason"]
        assert (
            result["admission_threshold"]["suggested_value"]
            < result["admission_threshold"]["current_value"]
        )

    def test_low_rejection_rate_suggests_higher_threshold(self):
        metrics = {
            "admission": {"rejection_rate": 0.05, "total_admitted": 95, "total_rejected": 5},
            "fidelity": {"total_degradations": 5, "total_upgrades": 5},
            "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
        }
        result = suggest_thresholds(metrics)
        assert result["admission_threshold"]["status"] == "adjust"
        assert "raising" in result["admission_threshold"]["reason"]

    def test_balanced_metrics_all_ok(self):
        metrics = {
            "admission": {"rejection_rate": 0.20, "total_admitted": 80, "total_rejected": 20},
            "entropy": {"trigger_rate": 0.12, "checks": 100, "triggers": 12},
            "fidelity": {"total_degradations": 10, "total_upgrades": 8},
            "micro_faults": {
                "total_attempts": 20,
                "success_rate": 0.75,
                "total_tokens_saved": 1000,
            },
        }
        result = suggest_thresholds(metrics)
        assert result["admission_threshold"]["status"] == "ok"
        assert result["entropy_sensitivity"]["status"] == "ok"
        assert result["fidelity_pressure"]["status"] == "ok"
        assert result["micro_fault_quality"]["status"] == "ok"

    def test_high_entropy_trigger_rate_suggests_adjustment(self):
        metrics = {
            "admission": {"rejection_rate": 0.20},
            "entropy": {"trigger_rate": 0.35, "checks": 100, "triggers": 35},
            "fidelity": {"total_degradations": 5, "total_upgrades": 5},
            "micro_faults": {"total_attempts": 0, "success_rate": 0.0},
        }
        result = suggest_thresholds(metrics)
        assert result["entropy_sensitivity"]["status"] == "adjust"
        assert "lowering" in result["entropy_sensitivity"]["reason"]

    def test_low_entropy_trigger_rate_suggests_raising(self):
        metrics = {
            "admission": {"rejection_rate": 0.20},
            "entropy": {"trigger_rate": 0.02, "checks": 100, "triggers": 2},
            "fidelity": {"total_degradations": 5, "total_upgrades": 5},
            "micro_faults": {"total_attempts": 0, "success_rate": 0.0},
        }
        result = suggest_thresholds(metrics)
        assert result["entropy_sensitivity"]["status"] == "adjust"
        assert "raising" in result["entropy_sensitivity"]["reason"]

    def test_high_fidelity_pressure_suggests_adjustment(self):
        metrics = {
            "admission": {"rejection_rate": 0.20},
            "fidelity": {"total_degradations": 60, "total_upgrades": 5},
            "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
        }
        result = suggest_thresholds(metrics)
        assert result["fidelity_pressure"]["status"] == "adjust"
        assert "over-pressured" in result["fidelity_pressure"]["reason"]

    def test_too_conservative_fidelity(self):
        metrics = {
            "admission": {"rejection_rate": 0.20},
            "fidelity": {"total_degradations": 2, "total_upgrades": 20},
            "micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
        }
        result = suggest_thresholds(metrics)
        assert result["fidelity_pressure"]["status"] == "adjust"
        assert "conservative" in result["fidelity_pressure"]["reason"]

    def test_low_micro_fault_success_rate(self):
        metrics = {
            "admission": {"rejection_rate": 0.20},
            "fidelity": {"total_degradations": 5, "total_upgrades": 5},
            "micro_faults": {"total_attempts": 20, "success_rate": 0.30, "total_tokens_saved": 100},
        }
        result = suggest_thresholds(metrics)
        assert result["micro_fault_quality"]["status"] == "adjust"
        assert "below 50%" in result["micro_fault_quality"]["reason"]

    def test_no_entropy_checks_is_ok(self):
        metrics = {
            "admission": {"rejection_rate": 0.20},
            "fidelity": {"total_degradations": 5, "total_upgrades": 5},
            "micro_faults": {"total_attempts": 0, "success_rate": 0.0},
        }
        result = suggest_thresholds(metrics)
        assert result["entropy_sensitivity"]["status"] == "ok"
        assert "No entropy checks" in result["entropy_sensitivity"]["reason"]

    def test_no_micro_fault_attempts_is_ok(self):
        metrics = {
            "admission": {"rejection_rate": 0.20},
            "fidelity": {"total_degradations": 5, "total_upgrades": 5},
            "micro_faults": {"total_attempts": 0, "success_rate": 0.0},
        }
        result = suggest_thresholds(metrics)
        assert result["micro_fault_quality"]["status"] == "ok"

    def test_empty_metrics(self):
        result = suggest_thresholds({})
        # Should not crash, all should be "ok" with defaults
        assert result["admission_threshold"]["status"] == "ok"
        assert result["entropy_sensitivity"]["status"] == "ok"
        assert result["fidelity_pressure"]["status"] == "ok"
        assert result["micro_fault_quality"]["status"] == "ok"


# ── Benchmark CLI ───────────────────────────────────────────────────────

from unittest.mock import patch, MagicMock
import json as json_mod

import httpx

from mnemosyne.benchmark_cli import (
    format_aggregate_report,
    format_session_report,
    run_benchmark_cli,
    _box_top,
    _box_bottom,
    _box_sep,
    _box_line,
)


class TestBenchmarkCLIFormatting:
    def test_box_drawing_characters(self):
        top = _box_top()
        assert top.startswith("\u2554")
        assert top.endswith("\u2557")
        bottom = _box_bottom()
        assert bottom.startswith("\u255a")
        assert bottom.endswith("\u255d")
        sep = _box_sep()
        assert sep.startswith("\u2560")
        assert sep.endswith("\u2563")

    def test_box_line_pads_correctly(self):
        line = _box_line("hello")
        assert line.startswith("\u2551")
        assert line.endswith("\u2551")
        assert "hello" in line

    def test_format_aggregate_report_structure(self):
        data = {
            "type": "aggregate",
            "sessions": 3,
            "total_turns": 147,
            "tokens": {
                "total_input_tokens": 50000,
                "total_effective_tokens": 70000,
                "total_cache_read_tokens": 892100,
                "context_reduction_ratio": 0.268,
                "context_reduction_pct": 73.2,
                "avg_input_tokens_per_turn": 12450.0,
            },
            "fidelity": {"total_degradations": 45, "total_upgrades": 12},
            "admission": {
                "total_admitted": 312,
                "total_rejected": 89,
                "rejection_rate": 0.222,
            },
            "micro_faults": {
                "total_attempts": 23,
                "total_successes": 18,
                "success_rate": 0.783,
                "total_tokens_saved": 45200,
            },
            "latency": {
                "segmentation": {"total_calls": 50, "avg_ms": 12.3, "min_ms": 5.1, "max_ms": 45.2},
                "admission": {"total_calls": 40, "avg_ms": 2.1, "min_ms": 0.8, "max_ms": 8.3},
            },
        }
        report = format_aggregate_report(data)
        assert "Mnemosyne Benchmark Report" in report
        assert "Sessions: 3" in report
        assert "Total Turns: 147" in report
        assert "TOKEN SAVINGS" in report
        assert "73.2%" in report
        assert "FIDELITY" in report
        assert "Degradations: 45" in report
        assert "ADMISSION CONTROL" in report
        assert "MICRO-FAULTS" in report
        assert "LATENCY" in report
        assert "segmentation" in report
        assert "THRESHOLD SUGGESTIONS" in report
        # Box drawing chars present
        assert "\u2554" in report
        assert "\u255d" in report

    def test_format_session_report_structure(self):
        data = {
            "type": "session",
            "session_id": "test-abc",
            "elapsed_seconds": 120.5,
            "tokens": {
                "turns": 10,
                "total_input_tokens": 5000,
                "context_reduction_pct": 60.0,
                "avg_input_tokens_per_turn": 500.0,
                "avg_cache_hit_rate": 0.75,
            },
            "fidelity": {"degradations": 3, "upgrades": 1},
            "admission": {"admitted": 20, "rejected": 5, "rejection_rate": 0.2},
            "micro_faults": {
                "attempts": 5,
                "successes": 4,
                "success_rate": 0.8,
                "tokens_saved": 1000,
            },
            "entropy": {"checks": 10, "triggers": 2, "trigger_rate": 0.2},
            "latency": {},
        }
        report = format_session_report(data)
        assert "Session: test-abc" in report
        assert "Elapsed: 120.5s" in report
        assert "TOKEN SAVINGS" in report
        assert "ENTROPY" in report

    def test_format_aggregate_report_empty_sessions(self):
        data = {"sessions": 0, "message": "No sessions recorded"}
        report = format_aggregate_report(data)
        assert "Sessions: 0" in report


class TestBenchmarkCLIExecution:
    def test_run_benchmark_cli_json_output(self, capsys):
        mock_data = {"type": "aggregate", "sessions": 2, "total_turns": 50}
        mock_response = MagicMock()
        mock_response.json.return_value = mock_data
        mock_response.raise_for_status = MagicMock()

        with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response) as mock_get:
            run_benchmark_cli(port=8080, json_output=True)
            mock_get.assert_called_once_with("http://127.0.0.1:8080/api/benchmark", timeout=10.0)

        captured = capsys.readouterr()
        parsed = json_mod.loads(captured.out)
        assert parsed["sessions"] == 2

    def test_run_benchmark_cli_with_session_id(self, capsys):
        mock_data = {
            "type": "session",
            "session_id": "s1",
            "elapsed_seconds": 10.0,
            "tokens": {},
            "fidelity": {},
            "admission": {},
            "micro_faults": {},
            "entropy": {},
            "latency": {},
        }
        mock_response = MagicMock()
        mock_response.json.return_value = mock_data
        mock_response.raise_for_status = MagicMock()

        with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response) as mock_get:
            run_benchmark_cli(port=9090, session_id="s1")
            mock_get.assert_called_once_with(
                "http://127.0.0.1:9090/api/benchmark?session_id=s1", timeout=10.0
            )

        captured = capsys.readouterr()
        assert "Session: s1" in captured.out

    def test_run_benchmark_cli_pretty_output(self, capsys):
        mock_data = {
            "type": "aggregate",
            "sessions": 1,
            "total_turns": 10,
            "tokens": {
                "context_reduction_pct": 50.0,
                "avg_input_tokens_per_turn": 1000,
                "total_cache_read_tokens": 5000,
            },
            "fidelity": {"total_degradations": 2, "total_upgrades": 1},
            "admission": {"total_admitted": 10, "total_rejected": 3, "rejection_rate": 0.23},
            "micro_faults": {
                "total_attempts": 5,
                "total_successes": 4,
                "success_rate": 0.8,
                "total_tokens_saved": 2000,
            },
            "latency": {},
        }
        mock_response = MagicMock()
        mock_response.json.return_value = mock_data
        mock_response.raise_for_status = MagicMock()

        with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response):
            run_benchmark_cli(port=8080)

        captured = capsys.readouterr()
        assert "Mnemosyne Benchmark Report" in captured.out
        assert "\u2554" in captured.out  # box top

    def test_run_benchmark_cli_connection_error(self):
        with patch("mnemosyne.benchmark_cli.httpx.get", side_effect=httpx.ConnectError("refused")):
            import pytest

            with pytest.raises(SystemExit) as exc_info:
                run_benchmark_cli(port=8080)
            assert exc_info.value.code == 1


class TestBenchmarkCLIArgParsing:
    def test_benchmark_flag_parsed(self):
        """Verify --benchmark is recognized by the argparse parser."""
        import argparse

        # Reconstruct the parser as main() does
        parser = argparse.ArgumentParser()
        parser.add_argument("--benchmark", action="store_true")
        parser.add_argument("--session", type=str, default=None)
        parser.add_argument("--json-output", action="store_true")
        parser.add_argument("--port", type=int, default=0)

        args = parser.parse_args(["--benchmark"])
        assert args.benchmark is True
        assert args.session is None
        assert args.json_output is False

    def test_benchmark_with_session_and_json(self):
        import argparse

        parser = argparse.ArgumentParser()
        parser.add_argument("--benchmark", action="store_true")
        parser.add_argument("--session", type=str, default=None)
        parser.add_argument("--json-output", action="store_true")
        parser.add_argument("--port", type=int, default=0)

        args = parser.parse_args(
            ["--benchmark", "--session", "abc-123", "--json-output", "--port", "9090"]
        )
        assert args.benchmark is True
        assert args.session == "abc-123"
        assert args.json_output is True
        assert args.port == 9090

    def test_benchmark_port_default(self):
        """When --benchmark is used without --port, port defaults to 0 (gateway treats as 8080)."""
        import argparse

        parser = argparse.ArgumentParser()
        parser.add_argument("--benchmark", action="store_true")
        parser.add_argument("--port", type=int, default=0)

        args = parser.parse_args(["--benchmark"])
        # The gateway main() maps port=0 to 8080 for benchmark mode
        benchmark_port = args.port if args.port != 0 else 8080
        assert benchmark_port == 8080