Skip to content

Commit d4acd1a

Browse files
Merge pull request ClickHouse#11179 from excitoon-favorites/s3globfix
Fixed S3 globbing, which could fail in the case of more than 1000 keys or with some backends
2 parents 55a02c3 + 7201018 commit d4acd1a

2 files changed

Lines changed: 34 additions & 5 deletions

File tree

src/Storages/StorageS3.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#include <DataTypes/DataTypeString.h>
2525

2626
#include <aws/s3/S3Client.h>
27-
#include <aws/s3/model/ListObjectsRequest.h>
27+
#include <aws/s3/model/ListObjectsV2Request.h>
2828

2929
#include <Common/parseGlobs.h>
3030
#include <Common/quoteString.h>
@@ -228,18 +228,18 @@ Strings listFilesWithRegexpMatching(Aws::S3::S3Client & client, const S3::URI &
228228
return {globbed_uri.key};
229229
}
230230

231-
Aws::S3::Model::ListObjectsRequest request;
231+
Aws::S3::Model::ListObjectsV2Request request;
232232
request.SetBucket(globbed_uri.bucket);
233233
request.SetPrefix(key_prefix);
234234

235235
re2::RE2 matcher(makeRegexpPatternFromGlobs(globbed_uri.key));
236236
Strings result;
237-
Aws::S3::Model::ListObjectsOutcome outcome;
237+
Aws::S3::Model::ListObjectsV2Outcome outcome;
238238
int page = 0;
239239
do
240240
{
241241
++page;
242-
outcome = client.ListObjects(request);
242+
outcome = client.ListObjectsV2(request);
243243
if (!outcome.IsSuccess())
244244
{
245245
throw Exception("Could not list objects in bucket " + quoteString(request.GetBucket())
@@ -256,7 +256,7 @@ Strings listFilesWithRegexpMatching(Aws::S3::S3Client & client, const S3::URI &
256256
result.emplace_back(std::move(key));
257257
}
258258

259-
request.SetMarker(outcome.GetResult().GetNextMarker());
259+
request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken());
260260
}
261261
while (outcome.GetResult().GetIsTruncated());
262262

tests/integration/test_storage_s3/test.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import logging
33
import random
4+
import threading
45

56
import pytest
67

@@ -278,3 +279,31 @@ def test_wrong_s3_syntax(cluster, s3_storage_args):
278279

279280
query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3({})".format(s3_storage_args)
280281
assert expected_err_msg in instance.query_and_get_error(query)
282+
283+
284+
# https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights
285+
def test_s3_glob_scheherazade(cluster):
286+
bucket = cluster.minio_bucket
287+
instance = cluster.instances["dummy"] # type: ClickHouseInstance
288+
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
289+
max_path = ""
290+
values = "(1, 1, 1)"
291+
nights_per_job = 1001 // 30
292+
jobs = []
293+
for night in range(0, 1001, nights_per_job):
294+
def add_tales(start, end):
295+
for i in range(start, end):
296+
path = "night_{}/tale.csv".format(i)
297+
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
298+
cluster.minio_host, cluster.minio_port, bucket, path, table_format, values)
299+
run_query(instance, query)
300+
301+
jobs.append(threading.Thread(target=add_tales, args=(night, min(night+nights_per_job, 1001))))
302+
jobs[-1].start()
303+
304+
for job in jobs:
305+
job.join()
306+
307+
query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/night_*/tale.csv', 'CSV', '{}')".format(
308+
cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, table_format)
309+
assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"]

0 commit comments

Comments (0)