#!/usr/bin/env python3
"""
Model Tool-Call Loop Detection Script
Tests whether LLM models enter infinite tool-call loops when handling
consecutive tool calls (read → exec → should respond with text).
Usage:
    python3 test-model-tool-loop.py                    # Test all models
    python3 test-model-tool-loop.py kimi-k2.5 glm-5    # Test specific models
    python3 test-model-tool-loop.py --rounds 5         # Max loop detection rounds

Environment variables:
    BAILIAN_API_KEY      - DashScope API key
    SILICONFLOW_API_KEY  - SiliconFlow API key
    LOCAL_API_KEY        - Local model API key
    LOCAL_API_URL        - Local model endpoint (default: http://localhost:8080/v1)
"""
import json, urllib.request, ssl, sys, time, argparse, os
# ── Provider config (keys from env) ──────────────────────────
# Each provider entry holds:
#   url      - chat-completions endpoint the requests are POSTed to
#   key_env  - name of the environment variable that carries the API key
#   models   - model IDs tested for this provider when none is named explicitly
PROVIDERS = {
    "bailian": {
        "url": "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions",
        "key_env": "BAILIAN_API_KEY",
        "models": [
            "kimi-k2.5",
            "qwen3.5-plus",
            "qwen3-coder-plus",
            "qwen3-coder-next",
            "MiniMax-M2.5",
            "glm-5",
        ],
    },
    "siliconflow": {
        "url": "https://api.siliconflow.cn/v1/chat/completions",
        "key_env": "SILICONFLOW_API_KEY",
        "models": ["Pro/moonshotai/Kimi-K2.5"],
    },
    "local": {
        # An unset OR empty LOCAL_API_URL falls back to the default, and
        # trailing slashes are stripped so "http://host/v1/" does not turn
        # into "http://host/v1//chat/completions".
        "url": (os.environ.get("LOCAL_API_URL") or "http://localhost:8080/v1").rstrip("/")
        + "/chat/completions",
        "key_env": "LOCAL_API_KEY",
        "models": ["default"],
    },
}
# ── Tool definitions ─────────────────────────────────────────
# Tool schemas sent with every request in the "tools" field of the payload.
TOOLS = [
    # exec: run a shell command; the only required argument is "command".
    {
        "type": "function",
        "function": {
            "name": "exec",
            "description": "Execute a shell command on the host. Returns stdout/stderr and exit code.",
            "parameters": {
                "type": "object",
                "properties": {
                    "command": {
                        "type": "string",
                        "description": "Shell command to execute",
                    },
                },
                "required": ["command"],
            },
        },
    },
    # read: fetch a file's content; the only required argument is "path".
    {
        "type": "function",
        "function": {
            "name": "read",
            "description": "Read a file from the filesystem. Returns the file content.",
            "parameters": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Absolute file path to read",
                    },
                },
                "required": ["path"],
            },
        },
    },
]
# ── Test fixtures ────────────────────────────────────────────
# System prompt used by the scenarios (two lines, no trailing newline).
SYSTEM_PROMPT = (
    "You are an AI assistant that helps with infrastructure tasks.\n"
    "Rules: Use exec tool to run commands. After getting tool results, "
    "respond to the user with a clear summary."
)

# Skill file text handed back for the scripted `read` tool call; the final
# empty element produces the trailing newline.
SKILL_CONTENT = "\n".join([
    "---",
    "name: grafana",
    "description: Query Grafana dashboards, alerts, datasources via REST API.",
    "---",
    "# Grafana",
    "## Helper Script",
    "Use `grafana-api.sh` for all operations:",
    "Commands:",
    "grafana-api.sh health",
    "grafana-api.sh datasources",
    "grafana-api.sh dashboards [query]",
    "grafana-api.sh alerts",
    "",
])

# Output handed back for the scripted `exec` tool call: three JSON objects,
# one after another, with no trailing newline.
EXEC_RESULT = "\n".join([
    "{",
    '"id": 1, "name": "Prometheus", "type": "prometheus", "url": "http://prometheus:9090"',
    "}",
    "{",
    '"id": 2, "name": "Loki", "type": "loki", "url": "http://loki:3100"',
    "}",
    "{",
    '"id": 3, "name": "Tempo", "type": "tempo", "url": "http://tempo:3200"',
    "}",
])

# Output handed back for the scripted issue-creation command.
JIRA_RESULT = (
    '{"id":"40100","key":"PROJ-42",'
    '"self":"https://jira.example.com/rest/api/2/issue/40100"}'
)
# ── Test scenarios ───────────────────────────────────────────
def _assistant_tool_call(call_id, tool_name, arguments):
    """Build an assistant turn that issues one tool call with JSON-encoded arguments."""
    return {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": call_id,
            "type": "function",
            "function": {"name": tool_name, "arguments": json.dumps(arguments)},
        }],
    }


def _tool_result(call_id, content):
    """Build the tool turn that answers the call identified by *call_id*."""
    return {"role": "tool", "tool_call_id": call_id, "content": content}


# Scenario name -> zero-argument builder. Every builder returns a NEW message
# list on each call, and every history ends with a tool result.
SCENARIOS = {
    "read→exec (skill activation flow)": lambda: [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "List all datasources"},
        _assistant_tool_call("c1", "read", {"path": "/skills/grafana/SKILL.md"}),
        _tool_result("c1", SKILL_CONTENT),
        _assistant_tool_call("c2", "exec", {"command": "grafana-api.sh datasources"}),
        _tool_result("c2", EXEC_RESULT),
    ],
    "direct exec (no read)": lambda: [
        {"role": "system", "content": SYSTEM_PROMPT + "\n\n" + SKILL_CONTENT},
        {"role": "user", "content": "List all datasources"},
        _assistant_tool_call("c1", "exec", {"command": "grafana-api.sh datasources"}),
        _tool_result("c1", EXEC_RESULT),
    ],
    "write op (duplicate risk)": lambda: [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "Create a task: Test Grafana alerts"},
        _assistant_tool_call(
            "c1", "exec", {"command": "jira-cli create PROJ Task \"Test Grafana alerts\""}
        ),
        _tool_result("c1", JIRA_RESULT),
    ],
}
def call_api(api_url, api_key, model, messages, timeout=30):
    """POST one chat-completions request and return the decoded JSON reply.

    The request body carries *model*, *messages*, the module-level TOOLS
    list and a fixed ``max_tokens`` of 2048. *api_key* is sent as a Bearer
    token and *timeout* is handed to ``urlopen``. Nothing is caught here:
    any exception from the request or from JSON decoding reaches the caller.
    """
    body = json.dumps({
        "model": model,
        "messages": messages,
        "tools": TOOLS,
        "max_tokens": 2048,
    }).encode('utf-8')
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json',
    }
    request = urllib.request.Request(api_url, data=body, method='POST', headers=headers)
    tls_context = ssl.create_default_context()
    with urllib.request.urlopen(request, context=tls_context, timeout=timeout) as response:
        raw_reply = response.read()
    return json.loads(raw_reply.decode('utf-8'))
def test_model(provider_name, api_url, api_key, model, max_rounds=3):
    """Run every scenario in SCENARIOS against one model and detect tool-call loops.

    Each scenario history ends with a tool result, so the model is expected
    to reply with text. If it asks for another tool call instead, a tool
    result is replayed and the model is queried again, for at most
    *max_rounds* replies in total.

    Args:
        provider_name: Provider key; "local" gets a 60 s request timeout,
            every other provider 30 s.
        api_url: Endpoint passed through to ``call_api``.
        api_key: API key passed through to ``call_api``.
        model: Model ID to test.
        max_rounds: Maximum number of model replies to inspect per scenario.

    Returns:
        dict mapping scenario name to a dict with keys:
          ``looped``  - True if the last inspected reply still asked for tools
          ``rounds``  - 0 when the first reply was not a tool call, otherwise
                        the number of replies inspected
          ``error``   - exception type and message (max 80 chars) or None;
                        ``looped``/``rounds`` keep whatever was observed
                        before the failure
          ``latency`` - seconds taken by the first request, rounded to 0.1
    """
    tool_finish_reasons = ('tool_calls', 'tool_call')
    timeout = 60 if provider_name == "local" else 30
    results = {}
    for scenario_name, build_messages in SCENARIOS.items():
        messages = build_messages()
        # Replay the scenario's own final tool output so a repeated call gets
        # the "identical result" it would get in a real loop; fall back to
        # EXEC_RESULT when the history does not end with a tool message.
        last_turn = messages[-1] if messages else {}
        replay_content = last_turn.get("content") if last_turn.get("role") == "tool" else None
        if replay_content is None:
            replay_content = EXEC_RESULT
        looped = False
        rounds = 0
        error = None
        latency = 0
        try:
            t0 = time.time()
            r = call_api(api_url, api_key, model, messages, timeout)
            latency = time.time() - t0
            choice = r['choices'][0]
            msg = choice['message']
            tc = msg.get('tool_calls')
            if tc and choice['finish_reason'] in tool_finish_reasons:
                looped = True
                rounds = 1
                for i in range(2, max_rounds + 1):
                    messages.append(msg)
                    # Answer EVERY tool call of the assistant turn: a history
                    # with an unanswered tool_call id is invalid and would
                    # surface as an API error instead of a loop verdict.
                    messages.extend(
                        {"role": "tool", "tool_call_id": call['id'], "content": replay_content}
                        for call in tc
                    )
                    r = call_api(api_url, api_key, model, messages, timeout)
                    choice = r['choices'][0]
                    msg = choice['message']
                    tc = msg.get('tool_calls')
                    rounds = i
                    if not tc or choice['finish_reason'] not in tool_finish_reasons:
                        # The model finally answered without calling a tool.
                        looped = False
                        break
        except Exception as e:
            # Prefix the exception type so the value is never empty: callers
            # test ``error`` for truthiness, and a bare str(e) can be "".
            error = f"{type(e).__name__}: {e}"[:80]
        results[scenario_name] = {
            "looped": looped,
            "rounds": rounds,
            "error": error,
            "latency": round(latency, 1),
        }
    return results
def main():
    """CLI entry point: select models, run all scenarios, print a report.

    Model selection, in priority order:
      1. ``--provider`` together with ``--model``: test exactly that pair.
      2. Positional model IDs: each is looked up in PROVIDERS; IDs that no
         provider lists are reported and skipped.
      3. Otherwise: every model of every provider whose API key is set.

    Exit status is 1 when any model looped in any scenario, when an unknown
    provider or a missing key is given with ``--provider``/``--model``, or
    when nothing can be tested; argparse exits with 2 on invalid arguments;
    0 otherwise (request errors alone do not change the exit status).
    """
    parser = argparse.ArgumentParser(description="Model Tool-Call Loop Detection")
    parser.add_argument("models", nargs="*", help="Model IDs to test (default: all)")
    parser.add_argument("--rounds", type=int, default=3, help="Max loop detection rounds (default: 3)")
    parser.add_argument("--provider", type=str, help="Provider name (with --model)")
    parser.add_argument("--model", type=str, help="Model ID (with --provider)")
    args = parser.parse_args()

    # The two flags only work as a pair; reject a lone one instead of
    # silently ignoring it and testing something else.
    if bool(args.provider) != bool(args.model):
        parser.error("--provider and --model must be used together")

    # Build test list: (provider name, endpoint URL, API key, model ID).
    test_list = []
    if args.provider and args.model:
        p = PROVIDERS.get(args.provider)
        if not p:
            print(f"Unknown provider: {args.provider}")
            sys.exit(1)
        key = os.environ.get(p["key_env"], "")
        if not key:
            print(f"Set {p['key_env']} env var")
            sys.exit(1)
        test_list.append((args.provider, p["url"], key, args.model))
    elif args.models:
        for model_id in args.models:
            for pname, pconf in PROVIDERS.items():
                if model_id not in pconf["models"]:
                    continue
                key = os.environ.get(pconf["key_env"], "")
                if key:
                    test_list.append((pname, pconf["url"], key, model_id))
                else:
                    print(f"Skip {pname}/{model_id}: {pconf['key_env']} not set")
                break  # first provider listing the model wins
            else:
                # No provider lists this ID; say so rather than dropping it.
                print(f"Skip {model_id}: unknown model (not listed under any provider)")
    else:
        for pname, pconf in PROVIDERS.items():
            key = os.environ.get(pconf["key_env"], "")
            if not key:
                print(f"Skip {pname}: {pconf['key_env']} not set")
                continue
            for m in pconf["models"]:
                test_list.append((pname, pconf["url"], key, m))
    if not test_list:
        print("No models to test. Set API key env vars.")
        sys.exit(1)

    print("Model Tool-Call Loop Detection")
    print(f"Rounds: {args.rounds} | Models: {len(test_list)} | Scenarios: {len(SCENARIOS)}")
    print("=" * 75)

    all_results = {}
    for pname, url, key, model in test_list:
        label = f"{pname}/{model}"
        print(f"\nTesting {label} ...", end="", flush=True)
        results = test_model(pname, url, key, model, args.rounds)
        all_results[label] = results
        has_loop = any(r["looped"] for r in results.values())
        has_error = any(r["error"] for r in results.values())
        print(" FAIL (loop)" if has_loop else " ERROR" if has_error else " PASS")

    # Summary: one table per scenario.
    print(f"\n\n{'=' * 75}")
    print("SUMMARY")
    print(f"{'=' * 75}\n")
    for scenario_name in SCENARIOS:
        print(f"Scenario: {scenario_name}")
        print("-" * 60)
        print(f"{'Model':<40} {'Result':<10} {'Rounds':<8} {'Latency'}")
        print("-" * 60)
        for label, results in all_results.items():
            r = results[scenario_name]
            if r["error"]:
                status = "ERROR"
            elif r["looped"]:
                status = "FAIL"
            elif r["rounds"] > 0:
                # Needed extra tool calls but eventually answered.
                status = "WARN"
            else:
                status = "PASS"
            latency = f"{r['latency']}s" if r['latency'] else "-"
            print(f"{label:<40} {status:<10} {r['rounds']:<8} {latency}")
            if r["error"]:
                # Show the collected error text; otherwise the reason for an
                # ERROR row is lost.
                print(f"    error: {r['error']}")
        print()

    loop_models = [
        label for label, res in all_results.items()
        if any(r["looped"] for r in res.values())
    ]
    # Models that never looped but had at least one failed request.
    error_models = [
        label for label, res in all_results.items()
        if label not in loop_models and any(r["error"] for r in res.values())
    ]
    safe_models = [
        label for label, res in all_results.items()
        if not any(r["looped"] for r in res.values())
        and not any(r["error"] for r in res.values())
    ]
    print("=" * 75)
    if loop_models:
        print(f"FAIL: {', '.join(loop_models)}")
    if error_models:
        print(f"ERROR: {', '.join(error_models)}")
    if safe_models:
        print(f"PASS: {', '.join(safe_models)}")
    print("=" * 75)
    sys.exit(1 if loop_models else 0)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Bug: loopDetection does not catch repeated `exec` tool calls

Environment
Description
The `tools.loopDetection` config fires for the `read` tool but does not detect or block repeated `exec` tool calls. When a model enters an infinite tool-call loop (calling the same exec command repeatedly with identical results), OpenClaw executes it indefinitely.

In our case, the model called the same shell command 121 times in one session. Each call returned `exitCode=0` with valid output, but the model kept issuing the same tool call (`stopReason: toolUse`) and never generated text.

Config
Session evidence
All 122 assistant messages from the looping session:
Every exec `toolResult`:

{ "role": "toolResult", "toolCallId": "functions.exec:0", "toolName": "exec", "content": [{"type": "text", "text": "{\"id\":2,\"name\":\"Loki\"...}"}], "details": {"status": "completed", "exitCode": 0, "durationMs": 136}, "isError": false }

Result was correct every time — `exitCode=0`, valid JSON, `isError=false`. The model never produced text output.

Expected behavior
`loopDetection` should detect the same `exec` command repeating with identical results and:
- warn at `warningThreshold` (2) repetitions
- block at `criticalThreshold` (3)
- trip the circuit breaker at `globalCircuitBreakerThreshold` (5)

Actual behavior
No warning, no blocking, no circuit breaker. The loop runs until manual intervention.
Impact
Cross-model verification
We isolated this as a model-specific issue by testing all available models with the same tool call sequence (`read → exec → expect text`). The loopDetection gap affects all models, but only Kimi K2.5 currently triggers it:

Even though only one model loops today, any model could develop this behavior after an update, and loopDetection should catch it regardless.
Suggestion
Extend the `genericRepeat` detector to cover the `exec` tool (compare command strings + output)

Test script
We wrote an automated test to verify this. API keys via env vars, no hardcoded credentials:
test-model-tool-loop.py