
Commit b8ad46b

Merge master into fix-03562-parallel-replicas-flaky
2 parents d17e968 + 083ff35

835 files changed

Lines changed: 21066 additions & 2546 deletions


.claude/learnings.md

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+# Learnings
+
+- When adding new settings in `Settings.cpp`, you must also add `extern` declarations in every `.cpp` file that uses them via `Setting::name` (e.g., in the `namespace Setting` block in `executeQuery.cpp`, `ClientBase.cpp`).
+- Rust crate build targets in ninja have the form `rust/workspace/cargo-build__ch_rust_<name>`, not just `_ch_rust_<name>`.
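Note: a quick Python sketch of acting on the second learning, e.g. building one Rust crate through ninja (the helper name, build directory, and the "skim" crate are illustrative, not from this commit):

import subprocess

# Build a single Rust crate via its ninja target, using the
# rust/workspace/cargo-build__ch_rust_<name> target form noted above.
def build_rust_crate(build_dir: str, crate: str) -> None:
    target = f"rust/workspace/cargo-build__ch_rust_{crate}"
    subprocess.run(["ninja", "-C", build_dir, target], check=True)

# e.g. build_rust_crate("ci/tmp/fast_build", "skim")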

.github/workflows/retry_infra_failures.yml

Lines changed: 59 additions & 17 deletions
@@ -87,7 +87,56 @@ jobs:
         :
       else
         should_rerun=true
-        echo "    Infrastructure failure detected."
+        echo "    Infrastructure failure detected (job-level heuristic)."
+      fi
+
+      # Fetch run metadata once for all checks below
+      run_data=$(gh api "repos/$GH_REPO/actions/runs/$run_id" \
+        --jq '{pr: .pull_requests[0].number, sha: .head_sha}')
+      pr_number=$(echo "$run_data" | jq -r '.pr // empty')
+      run_sha=$(echo "$run_data" | jq -r '.sha')
+
+      # If the job-level heuristic didn't trigger, check Praktika result JSONs
+      # on S3 for failures at the "Checkout Submodules" sub-step (e.g. transient
+      # DNS failures cloning git submodules).
+      if [ "$should_rerun" = "false" ] && [ -n "$verdicts" ] && [ -n "$pr_number" ] && [ -n "$run_sha" ]; then
+        # Get names of failed jobs (excluding pipeline plumbing)
+        failed_job_names=$(echo "$jobs_raw" | jq -r '
+          .jobs[] | select(.conclusion == "failure") |
+          select(.name != "Config Workflow" and .name != "Finish Workflow") |
+          .name
+        ')
+
+        all_checkout_failures=true
+        while IFS= read -r job_name; do
+          [ -z "$job_name" ] && continue
+          # Normalize job name to match Praktika result file naming:
+          # lowercase, replace non-alphanumeric chars with underscore, collapse runs
+          normalized=$(echo "$job_name" | tr '[:upper:]' '[:lower:]' | \
+            sed 's/[^a-z0-9_]/_/g; s/__*/_/g')
+          result_url="https://s3.amazonaws.com/clickhouse-test-reports/PRs/${pr_number}/${run_sha}/result_${normalized}.json"

+          result_json=$(curl -sf --compressed "$result_url" 2>/dev/null || true)
+          if [ -z "$result_json" ]; then
+            all_checkout_failures=false
+            break
+          fi
+
+          # Check: all failed sub-results must be "Checkout Submodules"
+          has_non_checkout_failure=$(echo "$result_json" | jq -r '
+            [.results[] | select(.status == "failure" or .status == "error") |
+            .name] | map(select(. != "Checkout Submodules")) | length > 0
+          ')
+          if [ "$has_non_checkout_failure" = "true" ]; then
+            all_checkout_failures=false
+            break
+          fi
+        done <<< "$failed_job_names"
+
+        if [ "$all_checkout_failures" = "true" ] && [ -n "$failed_job_names" ]; then
+          should_rerun=true
+          echo "    Infrastructure failure detected (submodule checkout failure in Praktika results)."
+        fi
       fi
 
       # Check if "Config Workflow" failed in its "Run" step (e.g. due to
@@ -103,22 +152,15 @@ jobs:
          ] | first // empty
        ')
 
-      if [ -n "$config_failed_at" ]; then
-        run_data=$(gh api "repos/$GH_REPO/actions/runs/$run_id" \
-          --jq '{pr: .pull_requests[0].number, sha: .head_sha}')
-        pr_number=$(echo "$run_data" | jq -r '.pr // empty')
-        run_sha=$(echo "$run_data" | jq -r '.sha')
-
-        if [ -n "$pr_number" ]; then
-          pr_data=$(gh api "repos/$GH_REPO/pulls/$pr_number" \
-            --jq '{sha: .head.sha, updated: .updated_at}')
-          pr_sha=$(echo "$pr_data" | jq -r '.sha')
-          pr_updated=$(echo "$pr_data" | jq -r '.updated')
-
-          if [ "$run_sha" = "$pr_sha" ] && [[ "$pr_updated" > "$config_failed_at" ]]; then
-            should_rerun=true
-            echo "    Config Workflow failed but PR #$pr_number was updated after — rerunning."
-          fi
+      if [ -n "$config_failed_at" ] && [ -n "$pr_number" ]; then
+        pr_data=$(gh api "repos/$GH_REPO/pulls/$pr_number" \
+          --jq '{sha: .head.sha, updated: .updated_at}')
+        pr_sha=$(echo "$pr_data" | jq -r '.sha')
+        pr_updated=$(echo "$pr_data" | jq -r '.updated')
+
+        if [ "$run_sha" = "$pr_sha" ] && [[ "$pr_updated" > "$config_failed_at" ]]; then
+          should_rerun=true
+          echo "    Config Workflow failed but PR #$pr_number was updated after — rerunning."
         fi
       fi
     fi
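Note: the tr/sed normalization above is what ties GitHub job names to Praktika result files on S3. A minimal Python sketch of the same transformation (the helper name is ours, and the sample values are placeholders):

import re

def praktika_result_url(pr_number: int, sha: str, job_name: str) -> str:
    # Mirror the shell pipeline: lowercase, map non-[a-z0-9_] to "_",
    # then collapse runs of underscores.
    normalized = re.sub(r"_+", "_", re.sub(r"[^a-z0-9_]", "_", job_name.lower()))
    return (
        "https://s3.amazonaws.com/clickhouse-test-reports/"
        f"PRs/{pr_number}/{sha}/result_{normalized}.json"
    )

# praktika_result_url(12345, "abc123", "Stateless tests (amd_tsan)")
# -> .../PRs/12345/abc123/result_stateless_tests_amd_tsan_.json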

ci/defs/defs.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from praktika import Artifact, Docker, Job, Secret
+from praktika import Artifact, Docker, Secret
 from praktika.utils import MetaClasses, Utils
 
 # i.e. "ClickHouse/ci/tmp"

ci/defs/job_configs.py

Lines changed: 17 additions & 1 deletion
@@ -47,6 +47,7 @@
 fast_test_digest_config = Job.CacheDigestConfig(
     include_paths=[
         "./ci/jobs/fast_test.py",
+        "./ci/jobs/scripts/clickhouse_proc.py",
         "./tests/queries/0_stateless/",
         "./tests/config/",
         "./tests/clickhouse-test",
@@ -116,6 +117,7 @@
     include_paths=[
         "./tests/queries/0_stateless/",
         "./ci/jobs/stress_job.py",
+        "./ci/jobs/scripts/clickhouse_proc.py",
         "./ci/jobs/scripts/stress/stress.py",
         "./tests/clickhouse-test",
         "./tests/config",
@@ -183,6 +185,20 @@ class JobConfigs:
         digest_config=fast_test_digest_config,
         result_name_for_cidb="Tests",
     )
+    darwin_fast_test_jobs = Job.Config(
+        name="Darwin fast test",
+        runs_on=None,  # from parametrize()
+        command="python3 ./ci/jobs/fast_test.py --set-status-success",
+        digest_config=fast_test_digest_config,
+        result_name_for_cidb="Darwin tests",
+        allow_merge_on_failure=True,
+    ).parametrize(
+        Job.ParamSet(
+            parameter=BuildTypes.AMD_DARWIN,
+            runs_on=RunnerLabels.MACOS_AMD_SMALL,
+            requires=[ArtifactNames.CH_AMD_DARWIN_BIN],
+        ),
+    )
     smoke_tests_macos = Job.Config(
         name=JobNames.SMOKE_TEST_MACOS,
         runs_on=RunnerLabels.MACOS_AMD_SMALL,
@@ -1323,4 +1339,4 @@ class JobConfigs:
         ),
         timeout=3600,
         enable_gh_auth=True,
-    )
+    )

ci/docker/integration/runner/Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \
 COPY requirements.txt /
 RUN python3.13 -m pip install --no-cache-dir -r requirements.txt
 
+
 RUN curl -fsSL -O https://archive.apache.org/dist/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz \
     && tar xzvf spark-3.5.5-bin-hadoop3.tgz -C / \
     && rm spark-3.5.5-bin-hadoop3.tgz

ci/docker/integration/runner/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -106,4 +106,4 @@ tzlocal==2.1
 websocket-client==1.9.0
 wheel==0.46.3
 filelock==3.25.0
-kazoo @ git+https://github.com/ClickHouse/kazoo.git
+kazoo @ git+https://github.com/ClickHouse/kazoo.git@879e0b0e4ae367a289847b43a531938f68770125

ci/jobs/buzzhouse_job.py

Lines changed: 2 additions & 0 deletions
@@ -176,6 +176,8 @@ def main():
         "max_tables": random.randint(3, 10),
         "max_views": random.randint(0, 10),
         "max_dictionaries": random.randint(0, 10),
+        "max_functions": random.randint(0, 8),
+        "max_policies": random.randint(0, 8),
         "max_columns": random.randint(1, 8),
         "min_nested_rows": min_nested_rows,
         "max_nested_rows": random.randint(min_nested_rows, max_nested_rows),

ci/jobs/fast_test.py

Lines changed: 67 additions & 50 deletions
@@ -1,8 +1,13 @@
 import argparse
 import os
 import time
+import sys
 from pathlib import Path
 
+repo_path = Path(__file__).resolve().parent.parent.parent
+repo_path_normalized = str(repo_path)
+sys.path.append(str(repo_path / "ci"))
+
 from ci.defs.defs import ToolSet, chcache_secret
 from ci.jobs.scripts.clickhouse_proc import ClickHouseProc
 from ci.jobs.scripts.functional_tests_results import FTResultsProcessor
@@ -14,8 +19,7 @@
 current_directory = Utils.cwd()
 build_dir = f"{current_directory}/ci/tmp/fast_build"
 temp_dir = f"{current_directory}/ci/tmp/"
-repo_path_normalized = "/ClickHouse"
-build_dir_normalized = f"{repo_path_normalized}/ci/tmp/fast_build"
+build_dir_normalized = str(repo_path / "ci" / "tmp" / "fast_build")
 
 
 def clone_submodules():
@@ -69,8 +73,8 @@ def clone_submodules():
         # Roll back to 10 if this starts hitting GitHub rate limits.
         command=f"xargs --max-procs={min([Utils.cpu_count(), 20])} --null --no-run-if-empty --max-args=1 git submodule update --depth 1 --single-branch",
         stdin_str="\0".join(submodules_to_update) + "\0",
-        timeout=240,
-        retries=2,
+        timeout=300,
+        retries=3,
         verbose=True,
     )
     # NOTE: the three "git submodule foreach" cleanup commands (reset --hard,
@@ -118,9 +122,9 @@ def parse_args():
     parser = argparse.ArgumentParser(description="ClickHouse Fast Test Job")
     parser.add_argument("--test", help="Optional test_case name to run", default="")
     parser.add_argument("--param", help="Optional custom job start stage", default=None)
+    parser.add_argument("--set-status-success", help="Forcefully set a green status", action="store_true")
     return parser.parse_args()
 
-
 def main():
     args = parse_args()
     stop_watch = Utils.Stopwatch()
@@ -136,6 +140,28 @@ def main():
 
     clickhouse_bin_path = Path(f"{build_dir}/programs/clickhouse")
 
+    for path in [
+        Path(temp_dir) / "clickhouse",
+        clickhouse_bin_path,
+        Path(current_directory) / "clickhouse",
+    ]:
+        if path.is_file():
+            clickhouse_bin_path = path
+            print(f"NOTE: clickhouse binary is found [{clickhouse_bin_path}] - skip the build")
+
+            stages = [JobStages.CONFIG, JobStages.TEST]
+            resolved_clickhouse_bin_path = clickhouse_bin_path.resolve()
+            Utils.link(resolved_clickhouse_bin_path, resolved_clickhouse_bin_path.parent / "clickhouse-server")
+            Utils.link(resolved_clickhouse_bin_path, resolved_clickhouse_bin_path.parent / "clickhouse-client")
+            Utils.link(resolved_clickhouse_bin_path, resolved_clickhouse_bin_path.parent / "clickhouse-local")
+            Shell.check(f"chmod +x {resolved_clickhouse_bin_path}", strict=True)
+
+            break
+    else:
+        print(
+            f"NOTE: clickhouse binary is not found [{clickhouse_bin_path}] - will be built"
+        )
+
     # Global sccache settings for local and CI runs
     os.environ["SCCACHE_DIR"] = f"{temp_dir}/sccache"
     os.environ["SCCACHE_CACHE_SIZE"] = "40G"
@@ -145,36 +171,10 @@
     os.environ["SCCACHE_ERROR_LOG"] = f"{build_dir}/sccache.log"
     os.environ["SCCACHE_LOG"] = "info"
 
-    if Info().is_local_run:
+    info = Info()
+    if info.is_local_run:
+        print("NOTE: It's a local run")
         os.environ["SCCACHE_S3_NO_CREDENTIALS"] = "true"
-        for path in [
-            clickhouse_bin_path,
-            Path(temp_dir) / "clickhouse",
-            Path(current_directory) / "clickhouse",
-        ]:
-            if path.exists():
-                clickhouse_bin_path = path
-                break
-        if clickhouse_bin_path.exists():
-            print(
-                f"NOTE: It's a local run and clickhouse binary is found [{clickhouse_bin_path}] - skip the build"
-            )
-            stages = [JobStages.CONFIG, JobStages.TEST]
-            resolved_clickhouse_bin_path = clickhouse_bin_path.resolve()
-            Shell.check(
-                f"ln -sf {resolved_clickhouse_bin_path} {resolved_clickhouse_bin_path.parent}/clickhouse-server",
-                strict=True,
-            )
-            Shell.check(
-                f"ln -sf {resolved_clickhouse_bin_path} {resolved_clickhouse_bin_path.parent}/clickhouse-client",
-                strict=True,
-            )
-            Shell.check(f"chmod +x {resolved_clickhouse_bin_path}", strict=True)
-        else:
-            print(
-                f"NOTE: It's a local run and clickhouse binary is not found [{clickhouse_bin_path}] - will be built"
-            )
-            time.sleep(5)
     else:
         os.environ["CH_HOSTNAME"] = (
             "https://build-cache.eu-west-1.aws.clickhouse-staging.com"
@@ -258,8 +258,9 @@ def main():
 
     if res and JobStages.CONFIG in stages:
         commands = [
+            f"mkdir -p {temp_dir}/etc/clickhouse-server",
             f"cp ./programs/server/config.xml ./programs/server/users.xml {temp_dir}/etc/clickhouse-server/",
-            f"./tests/config/install.sh /etc/clickhouse-server /etc/clickhouse-client --fast-test",
+            f"./tests/config/install.sh {temp_dir}/etc/clickhouse-server {temp_dir}/etc/clickhouse-client --fast-test",
             # f"cp -a {current_directory}/programs/server/config.d/log_to_console.xml {temp_dir}/etc/clickhouse-server/config.d/",
             f"rm -f {temp_dir}/etc/clickhouse-server/config.d/secure_ports.xml",
             update_path_ch_config,
@@ -272,7 +273,12 @@
         )
         res = results[-1].is_ok()
 
-    CH = ClickHouseProc(fast_test=True)
+    CH = ClickHouseProc(
+        ch_config_dir=f"{temp_dir}/etc/clickhouse-server",
+        ch_var_lib_dir=f"{temp_dir}/var/lib/clickhouse",
+    )
+    CH.install_configs()
+
     attach_debug = False
     if res and JobStages.TEST in stages:
         stop_watch_ = Utils.Stopwatch()
@@ -290,34 +296,45 @@
         stop_watch_ = Utils.Stopwatch()
         step_name = "Tests"
         print(step_name)
-        res = res and CH.run_fast_test(test=args.test or "")
-        if res:
-            results.append(FTResultsProcessor(wd=Settings.OUTPUT_DIR).run())
-            results[-1].set_timing(stopwatch=stop_watch_)
-        else:
-            results.append(
+
+        # Fast test runs lightweight SQL tests that are not CPU-bound,
+        # so we can use more parallelism than the default cpu_count/2.
+        nproc_fast = max(1, int(Utils.cpu_count() * 3 / 4))
+
+        fast_test_command = f"cd {temp_dir} && clickhouse-test --hung-check --trace --capture-client-stacktrace --no-random-settings --no-random-merge-tree-settings --no-long --testname --shard --check-zookeeper-session --order random --report-logs-stats --fast-tests-only --no-stateful --jobs {nproc_fast}"
+        if args.test:
+            fast_test_command += f" -- '{args.test}'"
+
+        res = CH.run_test(fast_test_command)
+
+        test_results = FTResultsProcessor(wd=Settings.OUTPUT_DIR).run()
+        if not res:
+            test_results.results.append(
                 Result.create_from(
-                    name=step_name,
-                    status=Result.Status.ERROR,
-                    stopwatch=stop_watch_,
-                    info="Tests run error",
+                    name="clickhouse-test",
+                    status=Result.StatusExtended.FAIL,
+                    info="clickhouse-test error",
                 )
             )
+            attach_debug = True
+
+        results.append(test_results)
+        results[-1].set_timing(stopwatch=stop_watch_)
        if not results[-1].is_ok():
            attach_debug = True
            job_info = results[-1].info
 
    if attach_debug:
        attach_files += [
            clickhouse_bin_path,
-            f"{temp_dir}/var/log/clickhouse-server/clickhouse-server.err.log",
-            f"{temp_dir}/var/log/clickhouse-server/clickhouse-server.log",
+            *CH.prepare_logs(info=info, all=True),
        ]
 
-    CH.terminate()
+    CH.terminate(force=True)
 
+    status = Result.Status.SUCCESS if args.set_status_success else ""
    Result.create_from(
-        results=results, stopwatch=stop_watch, files=attach_files, info=job_info
+        results=results, status=status, stopwatch=stop_watch, files=attach_files, info=job_info
    ).complete_job()
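Note: `Utils.link` replaces the earlier `Shell.check("ln -sf ...")` calls above. Its implementation is not part of this diff; a stand-in consistent with the call sites might look like this sketch (not the actual praktika code):

import os

def link(src, dst) -> None:
    # Create or replace a symlink, like `ln -sf src dst`: build it under a
    # temporary name, then atomically rename over any existing destination.
    tmp = f"{dst}.tmp"
    os.symlink(src, tmp)
    os.replace(tmp, dst)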
ci/jobs/functional_tests.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,9 @@ def configure_log_export():
431431

432432
if is_flaky_check or is_targeted_check:
433433
commands.append(CH.enable_thread_fuzzer_config)
434+
sanitizers = ("asan", "tsan", "msan", "ubsan")
435+
if any(san in args.options for san in sanitizers):
436+
commands.append(lambda: CH.set_memory_ratio(0.8))
434437

435438
os.environ["MALLOC_CONF"] = (
436439
f"prof_prefix:{temp_dir}/jemalloc_profiles/clickhouse.jemalloc"
