Skip to content

Commit 4ed5a14

Browse files
authored
GH-43797: [C++] Attach arrow::ArrayStatistics to arrow::ArrayData (#43801)
### Rationale for this change If we can attach associated statistics to an array via `ArrayData`, we can use them in later processes such as query planning. If `ArrayData`, rather than `Array`, holds the statistics, we can also use them in compute kernels. There was a concern that the associated `arrow::ArrayStatistics` may become outdated if `arrow::ArrayData` is mutated after `arrow::ArrayStatistics` is attached. However, `arrow::ArrayData` isn't mutated after it is first populated, so `arrow::ArrayStatistics` will not become outdated. We can require mutators to take responsibility for the statistics. ### What changes are included in this PR? * Add `arrow::ArrayData::statistics` * Add `arrow::Array::statistics()` to get the statistics attached to `arrow::ArrayData` This doesn't provide a new `arrow::ArrayData` constructor (`arrow::ArrayData::Make()`) that accepts `arrow::ArrayStatistics`. We can set `arrow::ArrayData::statistics` after we create `arrow::ArrayData`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. `arrow::Array::statistics()` is a new public API. * GitHub Issue: #43797 Authored-by: Sutou Kouhei <kou@clear-code.com> Signed-off-by: Sutou Kouhei <kou@clear-code.com>
1 parent 589ab7a commit 4ed5a14

File tree

4 files changed

+159
-2
lines changed

4 files changed

+159
-2
lines changed

cpp/src/arrow/array/array_base.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
232232
/// \return DeviceAllocationType
233233
DeviceAllocationType device_type() const { return data_->device_type(); }
234234

235+
/// \brief Return the statistics of this Array
236+
///
237+
/// This just returns the statistics member of the underlying ArrayData
238+
/// object which backs this Array.
239+
///
240+
/// \return std::shared_ptr<ArrayStatistics> (a copy of the pointer held by ArrayData)
241+
std::shared_ptr<ArrayStatistics> statistics() const { return data_->statistics; }
242+
235243
protected:
236244
Array() = default;
237245
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);

cpp/src/arrow/array/array_test.cc

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3709,6 +3709,132 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
37093709
}
37103710
}
37113711

3712+
// Fixture that builds a 4-element int32 ArrayData (one null entry) and
// attaches an ArrayStatistics holding its null count and exact min/max.
class TestArrayDataStatistics : public ::testing::Test {
3713+
public:
3714+
void SetUp() {
3715+
valids_ = {1, 0, 1, 1};
3716+
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
3717+
null_buffer_ = *internal::BytesToBits(valids_);
3718+
values_ = {1, 0, 3, -4};
3719+
min_ = *std::min_element(values_.begin(), values_.end());
3720+
max_ = *std::max_element(values_.begin(), values_.end());
3721+
values_buffer_ = Buffer::FromVector(values_);
3722+
data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
3723+
null_count_);
3724+
data_->statistics = std::make_shared<ArrayStatistics>();
3725+
data_->statistics->null_count = null_count_;
3726+
data_->statistics->min = min_;
3727+
data_->statistics->is_min_exact = true;
3728+
data_->statistics->max = max_;
3729+
data_->statistics->is_max_exact = true;
3730+
}
3731+
3732+
protected:
3733+
std::vector<uint8_t> valids_;
3734+
size_t null_count_;
3735+
std::shared_ptr<Buffer> null_buffer_;
3736+
std::vector<int32_t> values_;
3737+
int64_t min_;
3738+
int64_t max_;
3739+
std::shared_ptr<Buffer> values_buffer_;
3740+
std::shared_ptr<ArrayData> data_;
3741+
};
3742+
3743+
// Move-constructing an ArrayData must carry the attached statistics along.
TEST_F(TestArrayDataStatistics, MoveConstructor) {
3744+
ArrayData copied_data(*data_);
3745+
ArrayData moved_data(std::move(copied_data));
3746+
3747+
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
3748+
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
3749+
3750+
ASSERT_TRUE(moved_data.statistics->min.has_value());
3751+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
3752+
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
3753+
ASSERT_TRUE(moved_data.statistics->is_min_exact);
3754+
3755+
ASSERT_TRUE(moved_data.statistics->max.has_value());
3756+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
3757+
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
3758+
ASSERT_TRUE(moved_data.statistics->is_max_exact);
3759+
}
3760+
3761+
// Copy-constructing an ArrayData must preserve the attached statistics.
TEST_F(TestArrayDataStatistics, CopyConstructor) {
3762+
ArrayData copied_data(*data_);
3763+
3764+
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
3765+
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
3766+
3767+
ASSERT_TRUE(copied_data.statistics->min.has_value());
3768+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
3769+
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
3770+
ASSERT_TRUE(copied_data.statistics->is_min_exact);
3771+
3772+
ASSERT_TRUE(copied_data.statistics->max.has_value());
3773+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
3774+
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
3775+
ASSERT_TRUE(copied_data.statistics->is_max_exact);
3776+
}
3777+
3778+
// Move-assigning into a default-constructed ArrayData must transfer the
// attached statistics.
TEST_F(TestArrayDataStatistics, MoveAssignment) {
3779+
ArrayData copied_data(*data_);
3780+
ArrayData moved_data;
3781+
moved_data = std::move(copied_data);
3782+
3783+
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
3784+
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
3785+
3786+
ASSERT_TRUE(moved_data.statistics->min.has_value());
3787+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
3788+
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
3789+
ASSERT_TRUE(moved_data.statistics->is_min_exact);
3790+
3791+
ASSERT_TRUE(moved_data.statistics->max.has_value());
3792+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
3793+
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
3794+
ASSERT_TRUE(moved_data.statistics->is_max_exact);
3795+
}
3796+
3797+
// Copy-assigning into a default-constructed ArrayData must preserve the
// attached statistics.
TEST_F(TestArrayDataStatistics, CopyAssignment) {
3798+
ArrayData copied_data;
3799+
copied_data = *data_;
3800+
3801+
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
3802+
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
3803+
3804+
ASSERT_TRUE(copied_data.statistics->min.has_value());
3805+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
3806+
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
3807+
ASSERT_TRUE(copied_data.statistics->is_min_exact);
3808+
3809+
ASSERT_TRUE(copied_data.statistics->max.has_value());
3810+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
3811+
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
3812+
ASSERT_TRUE(copied_data.statistics->is_max_exact);
3813+
}
3814+
3815+
// ArrayData::CopyTo() (here targeting the default CPU memory manager) must
// keep the attached statistics on the resulting ArrayData.
TEST_F(TestArrayDataStatistics, CopyTo) {
3816+
ASSERT_OK_AND_ASSIGN(auto copied_data,
3817+
data_->CopyTo(arrow::default_cpu_memory_manager()));
3818+
3819+
ASSERT_TRUE(copied_data->statistics->null_count.has_value());
3820+
ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());
3821+
3822+
ASSERT_TRUE(copied_data->statistics->min.has_value());
3823+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));
3824+
ASSERT_EQ(min_, std::get<int64_t>(copied_data->statistics->min.value()));
3825+
ASSERT_TRUE(copied_data->statistics->is_min_exact);
3826+
3827+
ASSERT_TRUE(copied_data->statistics->max.has_value());
3828+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->max.value()));
3829+
ASSERT_EQ(max_, std::get<int64_t>(copied_data->statistics->max.value()));
3830+
ASSERT_TRUE(copied_data->statistics->is_max_exact);
3831+
}
3832+
3833+
// ArrayData::Slice() must drop the statistics: the sliced ArrayData is
// expected to have a null statistics pointer.
TEST_F(TestArrayDataStatistics, Slice) {
3834+
auto sliced_data = data_->Slice(0, 1);
3835+
ASSERT_FALSE(sliced_data->statistics);
3836+
}
3837+
37123838
template <typename PType>
37133839
class TestPrimitiveArray : public ::testing::Test {
37143840
public:

cpp/src/arrow/array/data.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ Result<std::shared_ptr<ArrayData>> CopyToImpl(const ArrayData& data,
165165
ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn));
166166
}
167167

168+
output->statistics = data.statistics;
169+
168170
return output;
169171
}
170172
} // namespace
@@ -195,6 +197,7 @@ std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
195197
} else {
196198
copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
197199
}
200+
copy->statistics = nullptr;
198201
return copy;
199202
}
200203

cpp/src/arrow/array/data.h

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <utility>
2525
#include <vector>
2626

27+
#include "arrow/array/statistics.h"
2728
#include "arrow/buffer.h"
2829
#include "arrow/result.h"
2930
#include "arrow/type.h"
@@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
152153
offset(other.offset),
153154
buffers(std::move(other.buffers)),
154155
child_data(std::move(other.child_data)),
155-
dictionary(std::move(other.dictionary)) {
156+
dictionary(std::move(other.dictionary)),
157+
statistics(std::move(other.statistics)) {
156158
SetNullCount(other.null_count);
157159
}
158160

@@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
163165
offset(other.offset),
164166
buffers(other.buffers),
165167
child_data(other.child_data),
166-
dictionary(other.dictionary) {
168+
dictionary(other.dictionary),
169+
statistics(other.statistics) {
167170
SetNullCount(other.null_count);
168171
}
169172

@@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
176179
buffers = std::move(other.buffers);
177180
child_data = std::move(other.child_data);
178181
dictionary = std::move(other.dictionary);
182+
statistics = std::move(other.statistics);
179183
return *this;
180184
}
181185

@@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
188192
buffers = other.buffers;
189193
child_data = other.child_data;
190194
dictionary = other.dictionary;
195+
statistics = other.statistics;
191196
return *this;
192197
}
193198

@@ -274,6 +279,18 @@ struct ARROW_EXPORT ArrayData {
274279
}
275280

276281
/// \brief Construct a zero-copy slice of the data with the given offset and length
282+
///
283+
/// The associated `ArrayStatistics` is always discarded in a sliced
284+
/// `ArrayData`, because the `ArrayStatistics` of the original
285+
/// `ArrayData` may be invalid for the sliced `ArrayData`. If you want
286+
/// to reuse the statistics of the original `ArrayData`, you need to do
287+
/// it yourself.
288+
///
289+
/// If the specified slice range covers the same range as the original
290+
/// `ArrayData`, the statistics of the original `ArrayData` would
291+
/// still be valid, because the sliced data is the same as the original
292+
/// data. However, the associated `ArrayStatistics` is discarded
293+
/// in this case too. Use `Copy()` instead for that case.
277294
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
278295

279296
/// \brief Input-checking variant of Slice
@@ -390,6 +407,9 @@ struct ARROW_EXPORT ArrayData {
390407

391408
// The dictionary for this Array, if any. Only used for dictionary type
392409
std::shared_ptr<ArrayData> dictionary;
410+
411+
// The statistics for this Array.
412+
std::shared_ptr<ArrayStatistics> statistics;
393413
};
394414

395415
/// \brief A non-owning Buffer reference

0 commit comments

Comments
 (0)