# mnemosyne/tests/test_benchmark.py

"""Tests for the Mnemosyne benchmark module."""
from __future__ import annotations
import time
from mnemosyne.benchmark import (
AdmissionMetrics,
BenchmarkCollector,
EntropyMetrics,
FidelityMetrics,
GoalMetrics,
LatencyTracker,
MicroFaultMetrics,
SegmentationMetrics,
SessionBenchmark,
Timer,
TokenMetrics,
)
# ── LatencyTracker ──────────────────────────────────────────────────────
class TestLatencyTracker:
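    """LatencyTracker is expected to accumulate recorded durations and expose
    count, avg_ms, min_ms, max_ms, and total_ms (via to_dict)."""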
def test_empty(self):
t = LatencyTracker()
assert t.count == 0
assert t.avg_ms == 0.0
d = t.to_dict()
assert d["count"] == 0
assert d["avg_ms"] == 0
def test_record_single(self):
t = LatencyTracker()
t.record(5.0)
assert t.count == 1
assert t.avg_ms == 5.0
assert t.min_ms == 5.0
assert t.max_ms == 5.0
def test_record_multiple(self):
t = LatencyTracker()
t.record(2.0)
t.record(8.0)
t.record(5.0)
assert t.count == 3
assert t.avg_ms == 5.0
assert t.min_ms == 2.0
assert t.max_ms == 8.0
def test_to_dict(self):
t = LatencyTracker()
t.record(10.0)
t.record(20.0)
d = t.to_dict()
assert d["count"] == 2
assert d["avg_ms"] == 15.0
assert d["min_ms"] == 10.0
assert d["max_ms"] == 20.0
assert d["total_ms"] == 30.0
# ── Timer ───────────────────────────────────────────────────────────────
class TestTimer:
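    """Timer is a context manager that records its elapsed time into a
    LatencyTracker, even when the body raises."""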
def test_timer_records(self):
tracker = LatencyTracker()
with Timer(tracker) as t:
time.sleep(0.01) # 10ms
assert t.elapsed_ms > 5 # at least 5ms
assert tracker.count == 1
assert tracker.total_ms > 5
def test_timer_records_on_exception(self):
tracker = LatencyTracker()
try:
with Timer(tracker):
raise ValueError("test")
except ValueError:
pass
assert tracker.count == 1
# ── FidelityMetrics ─────────────────────────────────────────────────────
class TestFidelityMetrics:
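    """FidelityMetrics counts level transitions: moving to a higher level number
    (e.g. L0->L1) is a degradation, moving lower is an upgrade. It can also
    snapshot object and token counts per level."""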
def test_empty(self):
m = FidelityMetrics()
assert m.degradations == 0
assert m.upgrades == 0
def test_record_degradation(self):
m = FidelityMetrics()
m.record_transition(0, 1) # L0 -> L1
assert m.degradations == 1
assert m.upgrades == 0
assert m.transitions_by_level["L0->L1"] == 1
def test_record_upgrade(self):
m = FidelityMetrics()
m.record_transition(2, 0) # L2 -> L0
assert m.degradations == 0
assert m.upgrades == 1
assert m.transitions_by_level["L2->L0"] == 1
def test_multiple_transitions(self):
m = FidelityMetrics()
m.record_transition(0, 1)
m.record_transition(0, 1)
m.record_transition(1, 2)
m.record_transition(2, 0)
assert m.degradations == 3
assert m.upgrades == 1
assert m.transitions_by_level["L0->L1"] == 2
def test_snapshot_levels(self):
m = FidelityMetrics()
m.snapshot_levels({0: 5, 1: 3, 2: 2}, {0: 5000, 1: 1500, 2: 200})
assert m.objects_by_level["L0"] == 5
assert m.tokens_by_level["L1"] == 1500
def test_to_dict(self):
m = FidelityMetrics()
m.record_transition(0, 1)
d = m.to_dict()
assert "degradations" in d
assert "transitions_by_level" in d
# ── AdmissionMetrics ────────────────────────────────────────────────────
class TestAdmissionMetrics:
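    """AdmissionMetrics tracks admit/reject decisions per content type and
    derives rejection_rate."""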
def test_empty(self):
m = AdmissionMetrics()
assert m.rejection_rate == 0.0
assert m.avg_score == 0.0
def test_record_admitted(self):
m = AdmissionMetrics()
m.record_decision(True, 0.8, "file_context")
assert m.admitted == 1
assert m.rejected == 0
assert m.rejection_rate == 0.0
def test_record_rejected(self):
m = AdmissionMetrics()
m.record_decision(False, 0.2, "tool_result")
assert m.admitted == 0
assert m.rejected == 1
assert m.rejection_rate == 1.0
assert m.rejected_by_type["tool_result"] == 1
def test_mixed_decisions(self):
m = AdmissionMetrics()
m.record_decision(True, 0.8, "file_context")
m.record_decision(True, 0.7, "plan")
m.record_decision(False, 0.2, "tool_result")
assert m.total_evaluated == 3
assert m.admitted == 2
assert m.rejected == 1
assert abs(m.rejection_rate - 1 / 3) < 0.01
def test_to_dict(self):
m = AdmissionMetrics()
m.record_decision(True, 0.8, "file_context")
d = m.to_dict()
assert d["admitted"] == 1
assert d["rejection_rate"] == 0.0
# ── MicroFaultMetrics ───────────────────────────────────────────────────
class TestMicroFaultMetrics:
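    """MicroFaultMetrics records micro-fault attempts, tokens spent on the
    answer vs. tokens avoided, and the resulting success and savings ratios."""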
def test_empty(self):
m = MicroFaultMetrics()
assert m.success_rate == 0.0
assert m.avg_tokens_saved == 0.0
def test_record_fault(self):
m = MicroFaultMetrics()
m.record_fault(tokens_answer=50, tokens_avoided=3000, success=True)
assert m.attempts == 1
assert m.successes == 1
assert m.tokens_saved == 3000
assert m.tokens_used == 50
assert m.success_rate == 1.0
def test_mixed_faults(self):
m = MicroFaultMetrics()
m.record_fault(50, 3000, success=True)
m.record_fault(60, 2000, success=False)
assert m.attempts == 2
assert m.successes == 1
assert m.success_rate == 0.5
assert m.tokens_saved == 5000
def test_to_dict_savings_ratio(self):
m = MicroFaultMetrics()
m.record_fault(100, 900, success=True)
d = m.to_dict()
assert d["savings_ratio"] == 0.9 # 900 / (900 + 100)
# ── SegmentationMetrics ─────────────────────────────────────────────────
class TestSegmentationMetrics:
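    """SegmentationMetrics counts created memory objects by type and summarizes
    their token sizes (totals, min/max, and percentiles)."""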
def test_empty(self):
m = SegmentationMetrics()
assert m.total_objects_created == 0
assert m.avg_object_size == 0.0
def test_record_objects(self):
m = SegmentationMetrics()
m.record_object("file_context", 500)
m.record_object("file_context", 1000)
m.record_object("tool_result", 200)
assert m.total_objects_created == 3
assert m.objects_by_type["file_context"] == 2
assert m.objects_by_type["tool_result"] == 1
assert m.total_tokens_stored == 1700
def test_to_dict_percentiles(self):
m = SegmentationMetrics()
for i in range(10):
m.record_object("test", (i + 1) * 100)
d = m.to_dict()
assert d["min_object_size_tokens"] == 100
assert d["max_object_size_tokens"] == 1000
assert d["p50_object_size_tokens"] == 600 # nearest-rank: index 5 of [100..1000]
# ── GoalMetrics ─────────────────────────────────────────────────────────
class TestGoalMetrics:
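    """GoalMetrics counts topic shifts, goal reclassifications, and promotions."""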
def test_empty(self):
m = GoalMetrics()
assert m.topic_shifts_detected == 0
def test_record_events(self):
m = GoalMetrics()
m.record_topic_shift()
m.record_reclassification()
m.record_promotion(3)
d = m.to_dict()
assert d["topic_shifts_detected"] == 1
assert d["goal_reclassifications"] == 1
assert d["promotions_triggered"] == 3
# ── EntropyMetrics ──────────────────────────────────────────────────────
class TestEntropyMetrics:
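    """EntropyMetrics tracks entropy checks, how often they trigger, and how
    many entities were faulted in."""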
def test_empty(self):
m = EntropyMetrics()
assert m.trigger_rate == 0.0
def test_record_checks(self):
m = EntropyMetrics()
m.record_check(triggered=False)
m.record_check(triggered=True, entities_count=2)
m.record_check(triggered=False)
assert m.checks == 3
assert m.triggers == 1
assert m.entities_faulted == 2
assert abs(m.trigger_rate - 1 / 3) < 0.01
# ── TokenMetrics ────────────────────────────────────────────────────────
class TestTokenMetrics:
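    """TokenMetrics records per-turn token and byte counts and derives the
    context reduction ratio and cache-hit rate."""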
def test_empty(self):
m = TokenMetrics()
assert m.turns == 0
assert m.context_reduction_ratio == 1.0
def test_record_turn(self):
m = TokenMetrics()
m.record_turn(
input_tokens=10000,
effective_tokens=15000,
cache_read=3000,
cache_create=2000,
incoming_bytes=50000,
outgoing_bytes=30000,
)
assert m.turns == 1
assert m.total_input_tokens == 10000
assert m.total_effective_tokens == 15000
assert m.total_cache_read == 3000
def test_context_reduction(self):
m = TokenMetrics()
m.record_turn(0, 0, 0, 0, incoming_bytes=10000, outgoing_bytes=4000)
assert m.context_reduction_ratio == 0.4 # 60% reduction
d = m.to_dict()
assert d["context_reduction_pct"] == 60.0
def test_cache_hit_rate(self):
m = TokenMetrics()
m.record_turn(0, 0, cache_read=8000, cache_create=2000, incoming_bytes=0, outgoing_bytes=0)
assert m.avg_cache_hit_rate == 0.8
def test_multiple_turns(self):
m = TokenMetrics()
m.record_turn(5000, 7000, 1500, 500, 20000, 15000)
m.record_turn(8000, 12000, 3000, 1000, 40000, 20000)
assert m.turns == 2
assert m.total_input_tokens == 13000
d = m.to_dict()
assert d["peak_input_tokens"] == 8000
# ── SessionBenchmark ────────────────────────────────────────────────────
class TestSessionBenchmark:
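    """SessionBenchmark bundles all per-session metric groups plus a set of
    named latency trackers."""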
def test_creation(self):
b = SessionBenchmark("test-session")
assert b.session_id == "test-session"
assert b.elapsed_seconds >= 0
def test_to_dict_has_all_sections(self):
b = SessionBenchmark("test-session")
d = b.to_dict()
assert "session_id" in d
assert "tokens" in d
assert "fidelity" in d
assert "admission" in d
assert "micro_faults" in d
assert "segmentation" in d
assert "goals" in d
assert "entropy" in d
assert "latency" in d
def test_latency_trackers_initialized(self):
b = SessionBenchmark("test")
assert "preprocess" in b.latency
assert "segmentation" in b.latency
assert "embedding" in b.latency
assert "admission" in b.latency
assert "goal_classification" in b.latency
assert "entropy_check" in b.latency
assert "memory_query" in b.latency
assert "helper_llm" in b.latency
# ── BenchmarkCollector ──────────────────────────────────────────────────
class TestBenchmarkCollector:
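    """BenchmarkCollector owns per-session benchmarks (created on first access)
    and aggregates them across sessions."""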
def test_empty_aggregate(self):
c = BenchmarkCollector()
result = c.aggregate()
assert result["sessions"] == 0
def test_get_session_creates(self):
c = BenchmarkCollector()
s = c.get_session("s1")
assert s.session_id == "s1"
# Getting again returns same instance
s2 = c.get_session("s1")
assert s is s2
def test_aggregate_single_session(self):
c = BenchmarkCollector()
s = c.get_session("s1")
s.tokens.record_turn(5000, 7000, 1500, 500, 20000, 12000)
s.tokens.record_turn(8000, 12000, 3000, 1000, 40000, 18000)
s.fidelity.record_transition(0, 1)
s.admission.record_decision(True, 0.8, "file_context")
s.admission.record_decision(False, 0.2, "tool_result")
result = c.aggregate()
assert result["sessions"] == 1
assert result["total_turns"] == 2
assert result["tokens"]["total_input_tokens"] == 13000
assert result["fidelity"]["total_degradations"] == 1
assert result["admission"]["total_admitted"] == 1
assert result["admission"]["total_rejected"] == 1
def test_aggregate_multiple_sessions(self):
c = BenchmarkCollector()
s1 = c.get_session("s1")
s1.tokens.record_turn(5000, 7000, 1500, 500, 20000, 12000)
s1.fidelity.record_transition(0, 1)
s2 = c.get_session("s2")
s2.tokens.record_turn(3000, 4000, 800, 200, 10000, 7000)
s2.fidelity.record_transition(1, 2)
result = c.aggregate()
assert result["sessions"] == 2
assert result["total_turns"] == 2
assert result["tokens"]["total_input_tokens"] == 8000
assert result["fidelity"]["total_degradations"] == 2
def test_session_report(self):
c = BenchmarkCollector()
s = c.get_session("s1")
s.segmentation.record_object("file_context", 500)
report = c.session_report("s1")
assert report is not None
assert report["segmentation"]["total_objects_created"] == 1
def test_session_report_not_found(self):
c = BenchmarkCollector()
assert c.session_report("nonexistent") is None
def test_all_sessions(self):
c = BenchmarkCollector()
c.get_session("s1")
c.get_session("s2")
all_s = c.all_sessions()
assert len(all_s) == 2
assert "s1" in all_s
assert "s2" in all_s
def test_reset(self):
c = BenchmarkCollector()
c.get_session("s1")
c.reset()
assert c.aggregate()["sessions"] == 0
def test_context_reduction_calculation(self):
c = BenchmarkCollector()
s = c.get_session("s1")
# 100k incoming, 20k outgoing = 80% reduction
s.tokens.record_turn(0, 0, 0, 0, 100000, 20000)
result = c.aggregate()
assert result["tokens"]["context_reduction_pct"] == 80.0
# ── suggest_thresholds ──────────────────────────────────────────────────
from mnemosyne.benchmark import suggest_thresholds
class TestSuggestThresholds:
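    """suggest_thresholds inspects aggregate metrics and flags each tuning knob
    as "ok" or "adjust" with an explanatory reason."""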
def test_high_rejection_rate_suggests_lower_threshold(self):
metrics = {
"admission": {"rejection_rate": 0.45, "total_admitted": 55, "total_rejected": 45},
"fidelity": {"total_degradations": 10, "total_upgrades": 10},
"micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
}
result = suggest_thresholds(metrics)
assert result["admission_threshold"]["status"] == "adjust"
assert "lowering" in result["admission_threshold"]["reason"]
assert (
result["admission_threshold"]["suggested_value"]
< result["admission_threshold"]["current_value"]
)
def test_low_rejection_rate_suggests_higher_threshold(self):
metrics = {
"admission": {"rejection_rate": 0.05, "total_admitted": 95, "total_rejected": 5},
"fidelity": {"total_degradations": 5, "total_upgrades": 5},
"micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
}
result = suggest_thresholds(metrics)
assert result["admission_threshold"]["status"] == "adjust"
assert "raising" in result["admission_threshold"]["reason"]
def test_balanced_metrics_all_ok(self):
metrics = {
"admission": {"rejection_rate": 0.20, "total_admitted": 80, "total_rejected": 20},
"entropy": {"trigger_rate": 0.12, "checks": 100, "triggers": 12},
"fidelity": {"total_degradations": 10, "total_upgrades": 8},
"micro_faults": {
"total_attempts": 20,
"success_rate": 0.75,
"total_tokens_saved": 1000,
},
}
result = suggest_thresholds(metrics)
assert result["admission_threshold"]["status"] == "ok"
assert result["entropy_sensitivity"]["status"] == "ok"
assert result["fidelity_pressure"]["status"] == "ok"
assert result["micro_fault_quality"]["status"] == "ok"
def test_high_entropy_trigger_rate_suggests_adjustment(self):
metrics = {
"admission": {"rejection_rate": 0.20},
"entropy": {"trigger_rate": 0.35, "checks": 100, "triggers": 35},
"fidelity": {"total_degradations": 5, "total_upgrades": 5},
"micro_faults": {"total_attempts": 0, "success_rate": 0.0},
}
result = suggest_thresholds(metrics)
assert result["entropy_sensitivity"]["status"] == "adjust"
assert "lowering" in result["entropy_sensitivity"]["reason"]
def test_low_entropy_trigger_rate_suggests_raising(self):
metrics = {
"admission": {"rejection_rate": 0.20},
"entropy": {"trigger_rate": 0.02, "checks": 100, "triggers": 2},
"fidelity": {"total_degradations": 5, "total_upgrades": 5},
"micro_faults": {"total_attempts": 0, "success_rate": 0.0},
}
result = suggest_thresholds(metrics)
assert result["entropy_sensitivity"]["status"] == "adjust"
assert "raising" in result["entropy_sensitivity"]["reason"]
def test_high_fidelity_pressure_suggests_adjustment(self):
metrics = {
"admission": {"rejection_rate": 0.20},
"fidelity": {"total_degradations": 60, "total_upgrades": 5},
"micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
}
result = suggest_thresholds(metrics)
assert result["fidelity_pressure"]["status"] == "adjust"
assert "over-pressured" in result["fidelity_pressure"]["reason"]
def test_too_conservative_fidelity(self):
metrics = {
"admission": {"rejection_rate": 0.20},
"fidelity": {"total_degradations": 2, "total_upgrades": 20},
"micro_faults": {"total_attempts": 10, "success_rate": 0.8, "total_tokens_saved": 500},
}
result = suggest_thresholds(metrics)
assert result["fidelity_pressure"]["status"] == "adjust"
assert "conservative" in result["fidelity_pressure"]["reason"]
def test_low_micro_fault_success_rate(self):
metrics = {
"admission": {"rejection_rate": 0.20},
"fidelity": {"total_degradations": 5, "total_upgrades": 5},
"micro_faults": {"total_attempts": 20, "success_rate": 0.30, "total_tokens_saved": 100},
}
result = suggest_thresholds(metrics)
assert result["micro_fault_quality"]["status"] == "adjust"
assert "below 50%" in result["micro_fault_quality"]["reason"]
def test_no_entropy_checks_is_ok(self):
metrics = {
"admission": {"rejection_rate": 0.20},
"fidelity": {"total_degradations": 5, "total_upgrades": 5},
"micro_faults": {"total_attempts": 0, "success_rate": 0.0},
}
result = suggest_thresholds(metrics)
assert result["entropy_sensitivity"]["status"] == "ok"
assert "No entropy checks" in result["entropy_sensitivity"]["reason"]
def test_no_micro_fault_attempts_is_ok(self):
metrics = {
"admission": {"rejection_rate": 0.20},
"fidelity": {"total_degradations": 5, "total_upgrades": 5},
"micro_faults": {"total_attempts": 0, "success_rate": 0.0},
}
result = suggest_thresholds(metrics)
assert result["micro_fault_quality"]["status"] == "ok"
def test_empty_metrics(self):
result = suggest_thresholds({})
# Should not crash, all should be "ok" with defaults
assert result["admission_threshold"]["status"] == "ok"
assert result["entropy_sensitivity"]["status"] == "ok"
assert result["fidelity_pressure"]["status"] == "ok"
assert result["micro_fault_quality"]["status"] == "ok"
# ── Benchmark CLI ───────────────────────────────────────────────────────
from unittest.mock import patch, MagicMock
import json as json_mod
import httpx
from mnemosyne.benchmark_cli import (
format_aggregate_report,
format_session_report,
run_benchmark_cli,
_box_top,
_box_bottom,
_box_sep,
_box_line,
)
class TestBenchmarkCLIFormatting:
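    """The formatting helpers render benchmark data as box-drawn text reports."""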
def test_box_drawing_characters(self):
top = _box_top()
assert top.startswith("\u2554")
assert top.endswith("\u2557")
bottom = _box_bottom()
assert bottom.startswith("\u255a")
assert bottom.endswith("\u255d")
sep = _box_sep()
assert sep.startswith("\u2560")
assert sep.endswith("\u2563")
def test_box_line_pads_correctly(self):
line = _box_line("hello")
assert line.startswith("\u2551")
assert line.endswith("\u2551")
assert "hello" in line
def test_format_aggregate_report_structure(self):
data = {
"type": "aggregate",
"sessions": 3,
"total_turns": 147,
"tokens": {
"total_input_tokens": 50000,
"total_effective_tokens": 70000,
"total_cache_read_tokens": 892100,
"context_reduction_ratio": 0.268,
"context_reduction_pct": 73.2,
"avg_input_tokens_per_turn": 12450.0,
},
"fidelity": {"total_degradations": 45, "total_upgrades": 12},
"admission": {
"total_admitted": 312,
"total_rejected": 89,
"rejection_rate": 0.222,
},
"micro_faults": {
"total_attempts": 23,
"total_successes": 18,
"success_rate": 0.783,
"total_tokens_saved": 45200,
},
"latency": {
"segmentation": {"total_calls": 50, "avg_ms": 12.3, "min_ms": 5.1, "max_ms": 45.2},
"admission": {"total_calls": 40, "avg_ms": 2.1, "min_ms": 0.8, "max_ms": 8.3},
},
}
report = format_aggregate_report(data)
assert "Mnemosyne Benchmark Report" in report
assert "Sessions: 3" in report
assert "Total Turns: 147" in report
assert "TOKEN SAVINGS" in report
assert "73.2%" in report
assert "FIDELITY" in report
assert "Degradations: 45" in report
assert "ADMISSION CONTROL" in report
assert "MICRO-FAULTS" in report
assert "LATENCY" in report
assert "segmentation" in report
assert "THRESHOLD SUGGESTIONS" in report
# Box drawing chars present
assert "\u2554" in report
assert "\u255d" in report
def test_format_session_report_structure(self):
data = {
"type": "session",
"session_id": "test-abc",
"elapsed_seconds": 120.5,
"tokens": {
"turns": 10,
"total_input_tokens": 5000,
"context_reduction_pct": 60.0,
"avg_input_tokens_per_turn": 500.0,
"avg_cache_hit_rate": 0.75,
},
"fidelity": {"degradations": 3, "upgrades": 1},
"admission": {"admitted": 20, "rejected": 5, "rejection_rate": 0.2},
"micro_faults": {
"attempts": 5,
"successes": 4,
"success_rate": 0.8,
"tokens_saved": 1000,
},
"entropy": {"checks": 10, "triggers": 2, "trigger_rate": 0.2},
"latency": {},
}
report = format_session_report(data)
assert "Session: test-abc" in report
assert "Elapsed: 120.5s" in report
assert "TOKEN SAVINGS" in report
assert "ENTROPY" in report
def test_format_aggregate_report_empty_sessions(self):
data = {"sessions": 0, "message": "No sessions recorded"}
report = format_aggregate_report(data)
assert "Sessions: 0" in report
class TestBenchmarkCLIExecution:
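    """run_benchmark_cli fetches /api/benchmark over HTTP (httpx.get is mocked
    throughout) and prints either raw JSON or the formatted report; it exits
    with code 1 when the gateway is unreachable."""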
def test_run_benchmark_cli_json_output(self, capsys):
mock_data = {"type": "aggregate", "sessions": 2, "total_turns": 50}
mock_response = MagicMock()
mock_response.json.return_value = mock_data
mock_response.raise_for_status = MagicMock()
with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response) as mock_get:
run_benchmark_cli(port=8080, json_output=True)
mock_get.assert_called_once_with("http://127.0.0.1:8080/api/benchmark", timeout=10.0)
captured = capsys.readouterr()
parsed = json_mod.loads(captured.out)
assert parsed["sessions"] == 2
def test_run_benchmark_cli_with_session_id(self, capsys):
mock_data = {
"type": "session",
"session_id": "s1",
"elapsed_seconds": 10.0,
"tokens": {},
"fidelity": {},
"admission": {},
"micro_faults": {},
"entropy": {},
"latency": {},
}
mock_response = MagicMock()
mock_response.json.return_value = mock_data
mock_response.raise_for_status = MagicMock()
with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response) as mock_get:
run_benchmark_cli(port=9090, session_id="s1")
mock_get.assert_called_once_with(
"http://127.0.0.1:9090/api/benchmark?session_id=s1", timeout=10.0
)
captured = capsys.readouterr()
assert "Session: s1" in captured.out
def test_run_benchmark_cli_pretty_output(self, capsys):
mock_data = {
"type": "aggregate",
"sessions": 1,
"total_turns": 10,
"tokens": {
"context_reduction_pct": 50.0,
"avg_input_tokens_per_turn": 1000,
"total_cache_read_tokens": 5000,
},
"fidelity": {"total_degradations": 2, "total_upgrades": 1},
"admission": {"total_admitted": 10, "total_rejected": 3, "rejection_rate": 0.23},
"micro_faults": {
"total_attempts": 5,
"total_successes": 4,
"success_rate": 0.8,
"total_tokens_saved": 2000,
},
"latency": {},
}
mock_response = MagicMock()
mock_response.json.return_value = mock_data
mock_response.raise_for_status = MagicMock()
with patch("mnemosyne.benchmark_cli.httpx.get", return_value=mock_response):
run_benchmark_cli(port=8080)
captured = capsys.readouterr()
assert "Mnemosyne Benchmark Report" in captured.out
assert "\u2554" in captured.out # box top
def test_run_benchmark_cli_connection_error(self):
with patch("mnemosyne.benchmark_cli.httpx.get", side_effect=httpx.ConnectError("refused")):
import pytest
with pytest.raises(SystemExit) as exc_info:
run_benchmark_cli(port=8080)
assert exc_info.value.code == 1
class TestBenchmarkCLIArgParsing:
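    """Argparse handling for the --benchmark, --session, --json-output, and
    --port flags."""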
def test_benchmark_flag_parsed(self):
"""Verify --benchmark is recognized by the argparse parser."""
import argparse
# Reconstruct the parser as main() does
parser = argparse.ArgumentParser()
parser.add_argument("--benchmark", action="store_true")
parser.add_argument("--session", type=str, default=None)
parser.add_argument("--json-output", action="store_true")
parser.add_argument("--port", type=int, default=0)
args = parser.parse_args(["--benchmark"])
assert args.benchmark is True
assert args.session is None
assert args.json_output is False
def test_benchmark_with_session_and_json(self):
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--benchmark", action="store_true")
parser.add_argument("--session", type=str, default=None)
parser.add_argument("--json-output", action="store_true")
parser.add_argument("--port", type=int, default=0)
args = parser.parse_args(
["--benchmark", "--session", "abc-123", "--json-output", "--port", "9090"]
)
assert args.benchmark is True
assert args.session == "abc-123"
assert args.json_output is True
assert args.port == 9090
def test_benchmark_port_default(self):
"""When --benchmark is used without --port, port defaults to 0 (gateway treats as 8080)."""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--benchmark", action="store_true")
parser.add_argument("--port", type=int, default=0)
args = parser.parse_args(["--benchmark"])
# The gateway main() maps port=0 to 8080 for benchmark mode
benchmark_port = args.port if args.port != 0 else 8080
assert benchmark_port == 8080