Skip to content

Commit e593137

Browse files
committed
test: update harness runner loopback bindings and benchmark report
1 parent 32ce0e2 commit e593137

2 files changed

Lines changed: 128 additions & 0 deletions

File tree

benchmark_matrix_report.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
### Benchmark Results (HomeSec-Bench)
2+
3+
| Model | Configuration | Success Rate | Avg TPS | Peak Memory (GB) |
4+
|---|---|---|---|---|
5+
| `gemma-4-26b-a4b-it-4bit` | Dense/Vanilla | N/A | N/A tok/s | N/A GB |

harness_runner.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import os
2+
import subprocess
3+
import time
4+
import urllib.request
5+
import urllib.error
6+
import json
7+
import re
8+
9+
# Model directory names to benchmark, in the order they are run.
MODELS = [
    "gemma-4-e4b-it-8bit",
    "gemma-4-26b-a4b-it-4bit",
    "gemma-4-31b-it-8bit",
]
14+
15+
# Server launch configurations: a display name for the report plus the extra
# command-line flags appended to the SwiftLM invocation.
CONFIGS = [
    dict(name="Dense/Vanilla", flags=[]),
    dict(name="SSD Stream", flags=["--stream-experts"]),
    dict(name="TurboQuant", flags=["--turbo-kv"]),
    dict(name="SSD + TurboQuant", flags=["--stream-experts", "--turbo-kv"]),
]
21+
22+
# Locations used by the harness. Each one can be overridden through an
# environment variable of the same name so the script is not tied to one
# developer's checkout; the defaults are the original hard-coded values.
SWIFTLM_PATH = os.environ.get(
    "SWIFTLM_PATH",
    ".build/arm64-apple-macosx/release/SwiftLM",
)
DEEPCAMERA_SCRIPT = os.environ.get(
    "DEEPCAMERA_SCRIPT",
    "/Users/simba/workspace/DeepCamera/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs",
)
RESULTS_FILE = os.environ.get("RESULTS_FILE", "./benchmark_matrix_report.md")
25+
26+
def poll_health(port=5413, timeout=120, *, interval=2.0, request_timeout=5.0):
    """Wait until ``http://127.0.0.1:<port>/health`` answers with HTTP 200.

    Args:
        port: TCP port on the loopback interface to probe.
        timeout: Overall time budget in seconds before giving up.
        interval: Pause in seconds between unsuccessful probes.
        request_timeout: Upper bound in seconds for a single probe, so a
            server that accepts the connection but never replies cannot
            stall the loop beyond the overall budget.

    Returns:
        True as soon as one probe returns status 200, False once the
        budget is exhausted.
    """
    import http.client  # local import: only needed for the exception type

    url = f"http://127.0.0.1:{port}/health"
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False

        try:
            probe_timeout = min(request_timeout, remaining)
            with urllib.request.urlopen(url, timeout=probe_timeout) as response:
                if response.getcode() == 200:
                    print("Server is healthy!")
                    return True
        except (urllib.error.URLError, OSError, http.client.HTTPException):
            # Connection refused, reset, timed out, non-2xx status or a
            # malformed reply: the server is not ready yet, so keep polling.
            pass

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline.
        time.sleep(min(interval, remaining))
39+
40+
# Port the SwiftLM server is expected to listen on (no port flag is passed to
# the server, so this presumably matches its default -- verify if it changes).
_SERVER_PORT = 5413

_SUCCESS_RATE_RE = re.compile(r"Overall Success Rate:\s*([^\s]+)\s*\((.*?)\)")
_AVG_TPS_RE = re.compile(r"Average Generation Speed:\s*([0-9.]+)")
_PEAK_MEMORY_RE = re.compile(r"Peak Memory:\s*([0-9.]+)\s*GB")


def _start_output_scanner(stream, pattern, matches):
    """Drain *stream* on a background thread, collecting group 1 of *pattern*.

    Draining continuously keeps the child from blocking on a full pipe while
    the benchmark is running. Returns the started thread.
    """
    import threading  # local import: not part of the file's import block

    def _scan():
        for line in stream:
            found = pattern.search(line)
            if found:
                matches.append(found.group(1))

    scanner = threading.Thread(target=_scan, daemon=True)
    scanner.start()
    return scanner


def _stop_process(proc, grace=30):
    """Send SIGTERM to *proc*, escalate to SIGKILL after *grace* seconds, reap it."""
    if proc.poll() is None:
        proc.terminate()
    try:
        proc.wait(timeout=grace)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()


def _run_benchmark(llm_url):
    """Run the DeepCamera benchmark against *llm_url*.

    Returns:
        ``(success_rate, average_tps)`` as strings parsed from the benchmark
        output; either is ``"N/A"`` when the matching line never appears.
    """
    bench_cmd = ["node", DEEPCAMERA_SCRIPT, "--llm", llm_url]
    bench_cwd = os.path.dirname(os.path.dirname(DEEPCAMERA_SCRIPT))

    success_rate = "N/A"
    average_tps = "N/A"

    # The context manager closes the pipe and waits for the process on exit.
    with subprocess.Popen(
        bench_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        cwd=bench_cwd,
    ) as bench_proc:
        for raw_line in bench_proc.stdout:
            line = raw_line.strip()
            if "Overall Success Rate:" in line:
                found = _SUCCESS_RATE_RE.search(line)
                if found:
                    success_rate = found.group(1)
            elif "Average Generation Speed:" in line:
                found = _AVG_TPS_RE.search(line)
                if found:
                    average_tps = found.group(1)

    return success_rate, average_tps


def _benchmark_one(model, config):
    """Start the server for one model/config pair, benchmark it, stop it.

    Returns:
        A result dict with keys ``model``, ``config``, ``success_rate``,
        ``average_tps`` and ``peak_ram``, or None when the server never
        became healthy.
    """
    model_path = f"/Users/simba/.aegis-ai/models/mlx_models/mlx-community/{model}"
    cmd = [SWIFTLM_PATH, "--model", model_path] + config["flags"]
    print(f"Starting SwiftLM: {' '.join(cmd)}")

    server_proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    peak_matches = []
    scanner = _start_output_scanner(server_proc.stdout, _PEAK_MEMORY_RE, peak_matches)

    try:
        if not poll_health(port=_SERVER_PORT):
            print(f"ERR: Server failed to start for {model}")
            return None

        print("Running DeepCamera HomeSec-Bench...")
        success_rate, average_tps = _run_benchmark(f"http://127.0.0.1:{_SERVER_PORT}")
    finally:
        # Always stop and reap the server, even if the benchmark could not be
        # launched, so the next configuration starts from a clean slate.
        _stop_process(server_proc)
        scanner.join(timeout=5)
        if not scanner.is_alive():
            server_proc.stdout.close()

    return {
        "model": model,
        "config": config["name"],
        "success_rate": success_rate,
        "average_tps": average_tps,
        # The last "Peak Memory" line seen in the server output wins.
        "peak_ram": peak_matches[-1] if peak_matches else "N/A",
    }


def _write_report(results):
    """Write *results* to RESULTS_FILE as a markdown table."""
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        f.write("### Benchmark Results (HomeSec-Bench)\n\n")
        f.write("| Model | Configuration | Success Rate | Avg TPS | Peak Memory (GB) |\n")
        f.write("|---|---|---|---|---|\n")
        for r in results:
            f.write(f"| `{r['model']}` | {r['config']} | {r['success_rate']} | {r['average_tps']} tok/s | {r['peak_ram']} GB |\n")


def run_harness():
    """Benchmark every model in MODELS under every applicable entry of CONFIGS.

    For each pair a SwiftLM server is started, the DeepCamera benchmark is run
    against it, and the parsed success rate, generation speed and peak memory
    are collected. Pairs whose server never becomes healthy are skipped. The
    collected rows are written to RESULTS_FILE as a markdown table.
    """
    print("Clearing background SwiftLM processes...")
    # Best effort: killall exits non-zero when nothing is running.
    subprocess.run(["killall", "SwiftLM"], stderr=subprocess.DEVNULL, check=False)
    time.sleep(2)

    results = []

    for model in MODELS:
        is_dense = "e4b" in model or "e2b" in model

        for config in CONFIGS:
            if is_dense and "--stream-experts" in config["flags"]:
                print(f"Skipping {config['name']} for dense model {model}")
                continue

            print("\n=============================================")
            print(f"Testing {model} | Config: {config['name']}")
            print("=============================================")

            res = _benchmark_one(model, config)
            if res is None:
                continue

            results.append(res)
            print(res)
            # Pause between runs before starting the next server.
            time.sleep(5)

    _write_report(results)

    print(f"\n[DONE] Matrix complete. Results saved to {RESULTS_FILE}")
121+
122+
# Run the full benchmark matrix only when executed as a script.
if __name__ == "__main__":
    run_harness()

0 commit comments

Comments
 (0)