|
| 1 | +import os |
| 2 | +import subprocess |
| 3 | +import time |
| 4 | +import urllib.request |
| 5 | +import urllib.error |
| 6 | +import json |
| 7 | +import re |
| 8 | + |
# Quantized model builds to benchmark (names under the mlx-community namespace;
# resolved to local paths in run_harness).
MODELS = [
    "gemma-4-e4b-it-8bit",
    "gemma-4-26b-a4b-it-4bit",
    "gemma-4-31b-it-8bit"
]

# Server feature configurations crossed with every model; `flags` are passed
# verbatim on the SwiftLM command line.
CONFIGS = [
    {"name": "Dense/Vanilla", "flags": []},
    {"name": "SSD Stream", "flags": ["--stream-experts"]},
    {"name": "TurboQuant", "flags": ["--turbo-kv"]},
    {"name": "SSD + TurboQuant", "flags": ["--stream-experts", "--turbo-kv"]}
]

# Release SwiftLM binary (relative path — presumably run from the package root).
SWIFTLM_PATH = ".build/arm64-apple-macosx/release/SwiftLM"
# External Node.js benchmark driver (DeepCamera HomeSec-Bench).
DEEPCAMERA_SCRIPT = "/Users/simba/workspace/DeepCamera/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs"
# Markdown report written at the end of the full matrix run.
RESULTS_FILE = "./benchmark_matrix_report.md"
def poll_health(port=5413, timeout=120):
    """Poll the local server's /health endpoint until it answers 200 OK.

    Args:
        port: TCP port the server listens on (127.0.0.1).
        timeout: Overall deadline in seconds before giving up.

    Returns:
        True as soon as the endpoint returns HTTP 200, False once the
        deadline passes without a healthy response.
    """
    deadline = time.time() + timeout
    url = f"http://127.0.0.1:{port}/health"
    while time.time() < deadline:
        try:
            # Per-request timeout so one hung connection cannot block far
            # past the overall deadline; context manager closes the response.
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.getcode() == 200:
                    print("Server is healthy!")
                    return True
        except OSError:
            # Covers urllib.error.URLError (an OSError subclass) and socket
            # timeouts alike — server simply isn't up yet.
            pass
        time.sleep(2)
    return False
| 39 | + |
def _run_benchmark():
    """Run the DeepCamera HomeSec-Bench Node script against the local server.

    Returns:
        (success_rate, average_tps) scraped from the benchmark's stdout;
        each stays "N/A" if its summary line never appears.
    """
    bench_cmd = ["node", DEEPCAMERA_SCRIPT, "--llm", "http://127.0.0.1:5413"]
    # Run from the benchmark root (two levels above the script) — presumably
    # the .cjs resolves its fixtures relative to that directory.
    bench_cwd = os.path.dirname(os.path.dirname(DEEPCAMERA_SCRIPT))
    proc = subprocess.Popen(
        bench_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        text=True, cwd=bench_cwd,
    )
    success_rate = "N/A"
    average_tps = "N/A"
    # Stream the output live; later matches overwrite earlier ones so the
    # final summary values win.
    for raw in proc.stdout:
        line = raw.strip()
        if "Overall Success Rate:" in line:
            match = re.search(r"Overall Success Rate:\s*([^\s]+)\s*\((.*?)\)", line)
            if match:
                success_rate = match.group(1)
        elif "Average Generation Speed:" in line:
            match = re.search(r"Average Generation Speed:\s*([0-9.]+)", line)
            if match:
                average_tps = match.group(1)
    proc.wait()
    return success_rate, average_tps


def _stop_server_and_read_peak_ram(server_proc):
    """SIGTERM the server, drain its output, and scrape the peak-memory line.

    Returns the last reported figure (GB) as a string, or "N/A".
    """
    # terminate() sends SIGTERM through the documented Popen API (the original
    # reached into subprocess's internal `signal` import for the same effect).
    server_proc.terminate()
    try:
        out, _ = server_proc.communicate(timeout=30)
    except subprocess.TimeoutExpired:
        # Server ignored SIGTERM — don't let one run hang the whole matrix.
        server_proc.kill()
        out, _ = server_proc.communicate()
    peak_ram = "N/A"
    for line in (out or "").splitlines():
        if "Peak Memory" in line:
            match = re.search(r"Peak Memory:\s*([0-9.]+)\s*GB", line)
            if match:
                peak_ram = match.group(1)
    return peak_ram


def _write_report(results):
    """Write the accumulated results as a markdown table to RESULTS_FILE."""
    with open(RESULTS_FILE, "w") as f:
        f.write("### Benchmark Results (HomeSec-Bench)\n\n")
        f.write("| Model | Configuration | Success Rate | Avg TPS | Peak Memory (GB) |\n")
        f.write("|---|---|---|---|---|\n")
        for r in results:
            f.write(f"| `{r['model']}` | {r['config']} | {r['success_rate']} | {r['average_tps']} tok/s | {r['peak_ram']} GB |\n")


def run_harness():
    """Run the full model x configuration benchmark matrix.

    For each (model, config) pair: start SwiftLM with the config's flags,
    wait for /health, run HomeSec-Bench, scrape the metrics, shut the
    server down, and finally write a markdown report to RESULTS_FILE.
    """
    print("Clearing background SwiftLM processes...")
    # Best effort: killall exits nonzero when nothing is running, which is fine.
    subprocess.run(["killall", "SwiftLM"], stderr=subprocess.DEVNULL)
    time.sleep(2)

    results = []

    for model in MODELS:
        # NOTE(review): 'e4b'/'e2b' names are treated as dense, so expert
        # streaming is skipped for them — confirm against the actual
        # model architectures.
        is_dense = "e4b" in model or "e2b" in model

        for config in CONFIGS:
            if is_dense and "--stream-experts" in config["flags"]:
                print(f"Skipping {config['name']} for dense model {model}")
                continue

            print("\n=============================================")
            print(f"Testing {model} | Config: {config['name']}")
            print("=============================================")

            model_path = f"/Users/simba/.aegis-ai/models/mlx_models/mlx-community/{model}"
            cmd = [SWIFTLM_PATH, "--model", model_path] + config["flags"]
            print(f"Starting SwiftLM: {' '.join(cmd)}")

            # NOTE(review): stdout is PIPEd but not drained while the
            # benchmark runs; a very chatty server could fill the pipe
            # buffer and stall — consider a reader thread if that happens.
            server_proc = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
            )

            if not poll_health():
                print(f"ERR: Server failed to start for {model}")
                server_proc.terminate()
                continue

            print("Running DeepCamera HomeSec-Bench...")
            success_rate, average_tps = _run_benchmark()
            peak_ram = _stop_server_and_read_peak_ram(server_proc)

            res = {
                "model": model,
                "config": config["name"],
                "success_rate": success_rate,
                "average_tps": average_tps,
                "peak_ram": peak_ram
            }
            results.append(res)
            print(res)
            # Give the OS a moment to release the port before the next run.
            time.sleep(5)

    _write_report(results)

    print(f"\n[DONE] Matrix complete. Results saved to {RESULTS_FILE}")
| 121 | + |
| 122 | +if __name__ == "__main__": |
| 123 | + run_harness() |
0 commit comments