Skip to content

Commit 602d153

Browse files
committed
feat(ops): add Cloudflare cost killswitch
Polls CF GraphQL Analytics every 10 minutes. Opens a GitHub issue when daily usage crosses configurable thresholds for Worker requests, D1 row reads, DO requests, or DO active duration. Script exits 0 (ok) / 1 (tripped) / 2 (query failure). The optional auto-disable step only fires on exit=1, so query failures can never take the site offline on their own.
1 parent 2a90bd3 commit 602d153

2 files changed

Lines changed: 286 additions & 0 deletions

File tree

.github/workflows/killswitch.yml

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
name: Cost Killswitch
2+
3+
# Polls Cloudflare billing-relevant counters every 10 minutes. Opens an issue
4+
# on threshold breach. If AUTO_DISABLE=true and CF_WORKER_NAME is set, removes
5+
# the Worker's routes to stop billable traffic.
6+
#
7+
# Required repo secrets:
8+
# CF_API_TOKEN Analytics: Read (always), Workers Scripts: Edit
9+
# and Zone: Edit (only if AUTO_DISABLE=true)
10+
#
11+
# Required repo variables (Settings → Variables → Actions):
12+
# CF_ACCOUNT_ID e.g. 94a89d03c0fc785c8dcbd3c674d6742a
13+
#
14+
# Optional repo variables:
15+
# DAILY_WORKER_REQUESTS default 1000000
16+
# DAILY_D1_ROWS_READ default 10000000
17+
# DAILY_DO_REQUESTS default 500000
18+
# DAILY_DO_DURATION_SEC default 100000
19+
# AUTO_DISABLE "true" to actually stop the Worker; default off
20+
# CF_WORKER_NAME e.g. agent-kanban (for route removal)
21+
# CF_ZONE_ID zone ID hosting the Worker route
22+
23+
# Triggers: a 10-minute cron schedule, plus manual runs via workflow_dispatch.
on:
  workflow_dispatch:
  schedule:
    - cron: "*/10 * * * *"
27+
28+
# All runs share one concurrency group, and an in-progress run is never
# cancelled by a newer one.
concurrency:
  group: cost-killswitch
  cancel-in-progress: false
31+
32+
jobs:
  check:
    runs-on: ubuntu-latest
    # The whole job is a single API poll; keep a hung run from blocking the
    # concurrency group for hours.
    timeout-minutes: 5
    # Declaring any permission sets every unlisted permission to "none", so
    # actions/checkout needs contents: read spelled out (without it the
    # checkout fails on private repositories).
    permissions:
      contents: read
      issues: write
    env:
      CF_ACCOUNT_ID: ${{ vars.CF_ACCOUNT_ID || '94a89d03c0fc785c8dcbd3c674d6742a' }}
      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
      # An unset repo variable expands to an empty string, and Number("") is 0,
      # so pass explicit defaults instead of a blank threshold.
      DAILY_WORKER_REQUESTS: ${{ vars.DAILY_WORKER_REQUESTS || '1000000' }}
      DAILY_D1_ROWS_READ: ${{ vars.DAILY_D1_ROWS_READ || '10000000' }}
      DAILY_DO_REQUESTS: ${{ vars.DAILY_DO_REQUESTS || '500000' }}
      DAILY_DO_DURATION_SEC: ${{ vars.DAILY_DO_DURATION_SEC || '100000' }}
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: 22

      # Runs the script, mirrors its stderr into the step summary, and exposes
      # its exit code and JSON stdout as step outputs. The step itself always
      # succeeds; later steps decide what to do with the exit code.
      - name: Check usage
        id: check
        run: |
          set +e
          OUTPUT=$(node scripts/cost-killswitch.mjs 2> >(tee -a "$GITHUB_STEP_SUMMARY" >&2))
          CODE=$?
          printf '%s\n' "$OUTPUT" > /tmp/killswitch.json
          # Only report "tripped" when the script also printed a JSON verdict
          # saying so. A crash that happens to exit 1 is downgraded to 2
          # (unknown) so it can never reach the auto-disable step.
          if [ "$CODE" -eq 1 ] && ! jq -e '.tripped == true' /tmp/killswitch.json > /dev/null 2>&1; then
            CODE=2
          fi
          {
            echo "exit_code=$CODE"
            echo "result<<EOF"
            cat /tmp/killswitch.json
            echo "EOF"
          } >> "$GITHUB_OUTPUT"
          exit 0

      # Surface "could not determine usage" as an annotation instead of
      # silently passing. This never fails the job and never disables anything.
      - name: Warn when usage could not be determined
        if: steps.check.outputs.exit_code != '0' && steps.check.outputs.exit_code != '1'
        env:
          CHECK_EXIT_CODE: ${{ steps.check.outputs.exit_code }}
        run: |
          echo "::warning title=Cost killswitch::Usage check exited with code $CHECK_EXIT_CODE; thresholds were not evaluated."

      # Comments on the existing open `killswitch` issue if there is one,
      # otherwise opens a new labelled issue.
      - name: Open alert issue
        if: steps.check.outputs.exit_code == '1'
        uses: actions/github-script@v7
        env:
          KILL_RESULT: ${{ steps.check.outputs.result }}
        with:
          script: |
            const r = JSON.parse(process.env.KILL_RESULT);
            const title = `[killswitch] Cost threshold tripped: ${r.reasons.join('; ')}`;
            const body = [
              `Killswitch tripped at \`${new Date().toISOString()}\`.`,
              ``,
              `**Reasons:**`,
              ...r.reasons.map(x => `- ${x}`),
              ``,
              `**Usage window:** \`${r.windowStart}\` → \`${r.windowEnd}\``,
              ``,
              `| Metric | Used | Threshold |`,
              `|---|---:|---:|`,
              `| Worker requests | ${r.usage.workerRequests} | ${r.thresholds.workerRequests} |`,
              `| D1 rows read | ${r.usage.d1RowsRead} | ${r.thresholds.d1RowsRead} |`,
              `| DO requests | ${r.usage.doRequests} | ${r.thresholds.doRequests} |`,
              `| DO duration (s) | ${r.usage.doDurationSec} | ${r.thresholds.doDurationSec} |`,
              ``,
              `Investigate at https://dash.cloudflare.com/${process.env.CF_ACCOUNT_ID || ''}/workers/overview`,
            ].join('\n');
            const { data: open } = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'open',
              labels: 'killswitch',
            });
            if (open.length > 0) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: open[0].number,
                body,
              });
            } else {
              await github.rest.issues.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                title,
                body,
                labels: ['killswitch'],
              });
            }

      # Deletes every route in the zone that points at CF_WORKER_NAME. Runs
      # only on a confirmed trip with AUTO_DISABLE explicitly enabled.
      - name: Auto-disable Worker routes
        if: steps.check.outputs.exit_code == '1' && vars.AUTO_DISABLE == 'true' && vars.CF_ZONE_ID != '' && vars.CF_WORKER_NAME != ''
        env:
          CF_ZONE_ID: ${{ vars.CF_ZONE_ID }}
          CF_WORKER_NAME: ${{ vars.CF_WORKER_NAME }}
        run: |
          set -euo pipefail
          API="https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/workers/routes"
          # --fail-with-body turns HTTP errors into a non-zero exit while still
          # printing the API's error payload.
          ROUTES=$(curl -sS --fail-with-body -H "Authorization: Bearer $CF_API_TOKEN" "$API")
          IDS=$(jq -r --arg name "$CF_WORKER_NAME" \
            '.result[] | select(.script == $name) | .id' <<< "$ROUTES")
          # Attempt every deletion even if one fails, then report failure at
          # the end, so one bad route cannot leave the others serving traffic.
          FAILED=0
          while IFS= read -r id; do
            [ -n "$id" ] || continue
            echo "Removing route $id"
            if ! curl -sS --fail-with-body -X DELETE -H "Authorization: Bearer $CF_API_TOKEN" "$API/$id"; then
              echo "::error::Failed to remove route $id"
              FAILED=1
            fi
          done <<< "$IDS"
          exit "$FAILED"

      # Mark the run red so a trip is visible in the Actions history.
      - name: Fail the job on trip
        if: steps.check.outputs.exit_code == '1'
        run: exit 1

scripts/cost-killswitch.mjs

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env node
2+
/**
3+
* Cost killswitch: queries Cloudflare GraphQL Analytics for today's usage
4+
* and exits non-zero when any threshold is exceeded.
5+
*
6+
* Env:
7+
* CF_ACCOUNT_ID — required
8+
* CF_API_TOKEN — required, needs Analytics: Read
9+
* DAILY_WORKER_REQUESTS — optional, default 1_000_000
10+
* DAILY_D1_ROWS_READ — optional, default 10_000_000
11+
* DAILY_DO_REQUESTS — optional, default 500_000
12+
* DAILY_DO_DURATION_SEC — optional, default 100_000 (active seconds, not GB-sec)
13+
*
14+
* Exit codes:
15+
* 0 — all thresholds ok
16+
* 1 — one or more thresholds tripped
17+
* 2 — query failed (treat as unknown, do NOT auto-disable)
18+
*
19+
* Stdout: a single JSON line with { tripped, windowStart, windowEnd, usage, thresholds, reasons },
* or { tripped: false, error } when the query fails
20+
* Stderr: human-readable diagnostics
21+
*/
22+
23+
// Credentials are read from the environment; both must be non-empty.
const { CF_ACCOUNT_ID: ACCOUNT_ID, CF_API_TOKEN: API_TOKEN } = process.env;

// Without credentials no usage can be fetched, so bail out with exit code 2
// rather than 0 (ok) or 1 (tripped).
if (!(ACCOUNT_ID && API_TOKEN)) {
  console.error("CF_ACCOUNT_ID and CF_API_TOKEN are required");
  process.exit(2);
}
30+
31+
/**
 * Parse one threshold from its raw environment value.
 *
 * An environment variable that is set but blank (which is what a CI system
 * produces for an unset variable expression) must fall back to the default:
 * `Number("")` is 0, and a zero threshold trips on any traffic at all.
 *
 * @param {string | undefined} raw - Raw environment value.
 * @param {number} fallback - Default used when the value is unset or blank.
 * @returns {number} The parsed threshold, or NaN when `raw` is not a
 *   non-negative finite number.
 */
function parseThreshold(raw, fallback) {
  const text = raw?.trim() ?? "";
  if (text === "") {
    return fallback;
  }
  const value = Number(text);
  return Number.isFinite(value) && value >= 0 ? value : Number.NaN;
}

// Daily limits; each one can be overridden through its environment variable.
const thresholds = {
  workerRequests: parseThreshold(process.env.DAILY_WORKER_REQUESTS, 1_000_000),
  d1RowsRead: parseThreshold(process.env.DAILY_D1_ROWS_READ, 10_000_000),
  doRequests: parseThreshold(process.env.DAILY_DO_REQUESTS, 500_000),
  doDurationSec: parseThreshold(process.env.DAILY_DO_DURATION_SEC, 100_000),
};

// A NaN threshold makes every `usage > threshold` comparison false, which
// would silently switch that check off. Treat it as a configuration failure
// and exit 2 (unknown), never 1 (tripped).
const invalidThresholds = Object.entries(thresholds)
  .filter(([, value]) => Number.isNaN(value))
  .map(([name]) => name);
if (invalidThresholds.length > 0) {
  console.error(`Invalid threshold value for: ${invalidThresholds.join(", ")}`);
  process.exit(2);
}
37+
38+
// Measurement window: from the most recent 00:00 UTC up to this instant
// (CF billing resets at 00:00 UTC).
const now = new Date();
const start = new Date(now.getTime());
start.setUTCHours(0, 0, 0, 0);
const [startIso, nowIso] = [start, now].map((instant) => instant.toISOString());
43+
44+
// GraphQL document that callGraphQL() posts to the Cloudflare Analytics API.
// It selects four datasets for a single account, each filtered to the
// half-open window [$start, $end) and capped at 10000 rows, and asks only for
// `sum` aggregates. The visible code reads requests, errors, activeTime,
// rowsRead and rowsWritten; readQueries and writeQueries are fetched but not
// consumed by it.
//
// The literal must contain nothing but GraphQL: every character between the
// backticks is sent to the API verbatim.
const query = `
  query Usage($accountTag: String!, $start: Time!, $end: Time!) {
    viewer {
      accounts(filter: { accountTag: $accountTag }) {
        workersInvocationsAdaptive(
          filter: { datetime_geq: $start, datetime_lt: $end }
          limit: 10000
        ) {
          sum { requests errors }
        }
        durableObjectsInvocationsAdaptiveGroups(
          filter: { datetime_geq: $start, datetime_lt: $end }
          limit: 10000
        ) {
          sum { requests }
        }
        durableObjectsPeriodicGroups(
          filter: { datetime_geq: $start, datetime_lt: $end }
          limit: 10000
        ) {
          sum { activeTime }
        }
        d1AnalyticsAdaptiveGroups(
          filter: { datetime_geq: $start, datetime_lt: $end }
          limit: 10000
        ) {
          sum { readQueries writeQueries rowsRead rowsWritten }
        }
      }
    }
  }
`;
76+
77+
/**
 * POST the usage query to the Cloudflare GraphQL endpoint and return the
 * first account node of the response.
 *
 * Reads the module-level `API_TOKEN`, `ACCOUNT_ID`, `query`, `startIso` and
 * `nowIso`.
 *
 * @returns {Promise<object>} The `viewer.accounts[0]` object of the response.
 * @throws {Error} On a non-2xx HTTP status, on a response carrying GraphQL
 *   `errors`, or when the response contains no account node. Also rejects if
 *   the request is aborted by the 30 s timeout.
 */
async function callGraphQL() {
  const res = await fetch("https://api.cloudflare.com/client/v4/graphql", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${API_TOKEN}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      query,
      variables: { accountTag: ACCOUNT_ID, start: startIso, end: nowIso },
    }),
    // Bound the request so a stalled connection becomes a reported failure
    // instead of a hung process.
    signal: AbortSignal.timeout(30_000),
  });
  if (!res.ok) {
    throw new Error(`GraphQL HTTP ${res.status}: ${await res.text()}`);
  }
  const data = await res.json();
  if (data.errors?.length) {
    throw new Error(`GraphQL errors: ${JSON.stringify(data.errors)}`);
  }
  // A missing account node means usage could not be measured. Returning an
  // empty object here would be summed as zero usage and reported as "ok".
  const account = data.data?.viewer?.accounts?.[0];
  if (!account) {
    throw new Error(`GraphQL response contained no account data for ${ACCOUNT_ID}`);
  }
  return account;
}
98+
99+
/**
 * Add up one field of the `sum` object across a list of result rows.
 *
 * @param {unknown} rows - Expected to be an array of rows shaped like
 *   `{ sum: { [key]: number } }`; anything that is not an array counts as 0.
 * @param {string} key - Name of the field inside each row's `sum` object.
 * @returns {number} The total; rows lacking the field contribute 0.
 */
function sumBy(rows, key) {
  if (!Array.isArray(rows)) {
    return 0;
  }
  let total = 0;
  for (const row of rows) {
    total += row?.sum?.[key] ?? 0;
  }
  return total;
}
103+
104+
/**
 * Fetch the current window's usage, compare it against `thresholds`, print
 * the verdict, and terminate the process.
 *
 * Writes one JSON line to stdout and a human-readable summary to stderr.
 * Exits 0 when nothing is over its threshold, 1 when at least one metric is,
 * and 2 when fetching or processing the usage data throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const acct = await callGraphQL();

    // activeTime is returned in microseconds; convert to seconds for human-friendly threshold
    const doActiveMicros = sumBy(acct.durableObjectsPeriodicGroups, "activeTime");
    const usage = {
      workerRequests: sumBy(acct.workersInvocationsAdaptive, "requests"),
      workerErrors: sumBy(acct.workersInvocationsAdaptive, "errors"),
      doRequests: sumBy(acct.durableObjectsInvocationsAdaptiveGroups, "requests"),
      doDurationSec: Math.round(doActiveMicros / 1_000_000),
      d1RowsRead: sumBy(acct.d1AnalyticsAdaptiveGroups, "rowsRead"),
      d1RowsWritten: sumBy(acct.d1AnalyticsAdaptiveGroups, "rowsWritten"),
    };

    // Metrics that have a threshold, in the order they are checked and reported.
    const guarded = ["workerRequests", "d1RowsRead", "doRequests", "doDurationSec"];
    const reasons = guarded
      .filter((metric) => usage[metric] > thresholds[metric])
      .map((metric) => `${metric} ${usage[metric]} > ${thresholds[metric]}`);

    const tripped = reasons.length > 0;
    const result = { tripped, windowStart: startIso, windowEnd: nowIso, usage, thresholds, reasons };

    // Machine-readable verdict on stdout, diagnostics on stderr.
    console.log(JSON.stringify(result));
    const suffix = tripped ? ` (${reasons.join("; ")})` : "";
    console.error(
      [
        `[killswitch] window=${startIso}..${nowIso}`,
        ...guarded.map((metric) => ` ${metric}: ${usage[metric]} / ${thresholds[metric]}`),
        ` tripped: ${tripped}${suffix}`,
      ].join("\n")
    );

    process.exit(tripped ? 1 : 0);
  } catch (err) {
    console.error(`[killswitch] query failed: ${err.message}`);
    console.log(JSON.stringify({ tripped: false, error: err.message }));
    process.exit(2);
  }
}

main();

0 commit comments

Comments
 (0)