Skip to content

Commit 857ae92

Browse files
Backport #94095 to 25.12: Fix accuracy of uniqTheta when using UInt8 aggregation keys in parallel
1 parent 387a929 commit 857ae92

5 files changed

Lines changed: 23 additions & 2 deletions

File tree

src/AggregateFunctions/ThetaSketchData.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ template <typename Key>
2020
class ThetaSketchData : private boost::noncopyable
2121
{
2222
private:
23+
/// Used for insertions
2324
std::unique_ptr<datasketches::update_theta_sketch> sk_update;
25+
/// Used for merging
2426
std::unique_ptr<datasketches::theta_union> sk_union;
2527

2628
datasketches::update_theta_sketch * getSkUpdate()
@@ -47,12 +49,26 @@ class ThetaSketchData : private boost::noncopyable
4749
void insertOriginal(std::string_view value)
4850
{
4951
getSkUpdate()->update(value.data(), value.size());
52+
/// In case of optimization for u8 keys (see addBatchLookupTable()) it is possible to have few calls of insert() after merge(),
53+
/// and we should update sk_union as well, note, that there should not be too many, so performance wise it should be OK
54+
if (sk_union)
55+
{
56+
sk_union->update(*sk_update);
57+
sk_update.reset(nullptr);
58+
}
5059
}
5160

5261
/// Note that `datasketches::update_theta_sketch.update` will do the hash again.
5362
void insert(Key value)
5463
{
5564
getSkUpdate()->update(value);
65+
/// In case of optimization for u8 keys (see addBatchLookupTable()) it is possible to have few calls of insert() after merge(),
66+
/// and we should update sk_union as well, note, that there should not be too many, so performance wise it should be OK
67+
if (sk_union)
68+
{
69+
sk_union->update(*sk_update);
70+
sk_update.reset(nullptr);
71+
}
5672
}
5773

5874
UInt64 size() const

tests/queries/0_stateless/01798_uniq_theta_sketch.oldanalyzer.reference

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ uniqTheta many agrs
22
10 10 100 100 1000 1000
33
17 10 10 100 100 610 610 766
44
52 10 10 100 100 608 608 766
5-
5 10 10 100 100 608 608 765
5+
5 10 10 100 100 609 609 765
66
9 10 10 100 100 608 608 765
77
13 10 10 100 100 607 607 765
88
46 10 10 100 100 607 607 765

tests/queries/0_stateless/01798_uniq_theta_sketch.reference

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ uniqTheta many agrs
22
10 10 100 100 1000 1000
33
17 10 10 100 100 610 610 766
44
52 10 10 100 100 608 608 766
5-
5 10 10 100 100 608 608 765
5+
5 10 10 100 100 609 609 765
66
9 10 10 100 100 608 608 765
77
13 10 10 100 100 607 607 765
88
46 10 10 100 100 607 607 765

tests/queries/0_stateless/03790_uniqTheta_error.reference

Whitespace-only changes.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- Tags: no-fasttest
2+
-- - no-fasttest -- compiled w/o datasketches
3+
4+
-- Regression for very high error in uniqTheta() due to optimization for u8 keys
5+
select throwIf(stddevSampStable(theta)/avg(theta)>0.1) from (select number%3 key, uniqTheta(generateUUIDv4(number)) theta FROM numbers_mt(9160000) GROUP BY key with totals SETTINGS max_threads = 8) format Null;

0 commit comments

Comments
 (0)