Skip to content

Commit fde31ed

Browse files
authored
GH-34361: [C++] Fix the handling of logical nulls for types without bitmaps like Unions and Run-End Encoded (#34408)
Bonus: add `ArrayData::IsValid()` to make it consistent with `Array` and `ArraySpan`. ### Rationale for this change This is the proposed fix to #34361 plus the addition of more APIs dealing with validity/nullity. ### What changes are included in this PR? This PR changes the behavior of `IsNull` and `IsValid` in `Array`, `ArrayData`, and `ArraySpan`. It preserves the behavior of `MayHaveNulls`, `GetNullCount` and introduces new member functions to `Array`, `ArrayData`, and `ArraySpan`: - `bool HasValidityBitmap() const` - `bool MayHaveLogicalNulls() const` - `int64_t ComputeLogicalNullCount() const` ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. See above. Breakage with these changes can only happen if users rely on `IsNull(i)` always returning `true` for union types, but we have users reporting that the current behavior or broken #34315. This is why the behavior of `IsNull` and `IsValid` is changing., **This PR contains a "Critical Fix".** * Closes: #34361 Authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com> Signed-off-by: Matt Topol <zotthewizard@gmail.com>
1 parent 84e5430 commit fde31ed

11 files changed

Lines changed: 517 additions & 40 deletions

File tree

cpp/src/arrow/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ set(ARROW_SRCS
230230
util/time.cc
231231
util/tracing.cc
232232
util/trie.cc
233+
util/union_util.cc
233234
util/unreachable.cc
234235
util/uri.cc
235236
util/utf8.cc

cpp/src/arrow/array/array_base.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ class ExtensionArray;
5252

5353
int64_t Array::null_count() const { return data_->GetNullCount(); }
5454

55+
int64_t Array::ComputeLogicalNullCount() const {
56+
return data_->ComputeLogicalNullCount();
57+
}
58+
5559
namespace internal {
5660

5761
struct ScalarFromArraySlotImpl {
@@ -175,6 +179,10 @@ struct ScalarFromArraySlotImpl {
175179
array_.length());
176180
}
177181

182+
// Skip checking for nulls in RUN_END_ENCODED arrays to avoid potentially
183+
// making two O(log n) searches for the physical index of the slot -- one
184+
// here and another in Visit(const RunEndEncodedArray&) in case the values
185+
// is not null.
178186
if (array_.type()->id() != Type::RUN_END_ENCODED && array_.IsNull(index_)) {
179187
auto null = MakeNullScalar(array_.type());
180188
if (is_dictionary(array_.type()->id())) {

cpp/src/arrow/array/array_base.h

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,28 @@ class ARROW_EXPORT Array {
5555
virtual ~Array() = default;
5656

5757
/// \brief Return true if value at index is null. Does not boundscheck
58-
bool IsNull(int64_t i) const {
59-
return null_bitmap_data_ != NULLPTR
60-
? !bit_util::GetBit(null_bitmap_data_, i + data_->offset)
61-
: data_->null_count == data_->length;
62-
}
58+
bool IsNull(int64_t i) const { return !IsValid(i); }
6359

6460
/// \brief Return true if value at index is valid (not null). Does not
6561
/// boundscheck
6662
bool IsValid(int64_t i) const {
67-
return null_bitmap_data_ != NULLPTR
68-
? bit_util::GetBit(null_bitmap_data_, i + data_->offset)
69-
: data_->null_count != data_->length;
63+
if (null_bitmap_data_ != NULLPTR) {
64+
return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
65+
}
66+
// Dispatching with a few conditionals like this makes IsNull more
67+
// efficient for how it is used in practice. Making IsNull virtual
68+
// would add a vtable lookup to every call and prevent inlining +
69+
// a potential inner-branch removal.
70+
if (type_id() == Type::SPARSE_UNION) {
71+
return !internal::IsNullSparseUnion(*data_, i);
72+
}
73+
if (type_id() == Type::DENSE_UNION) {
74+
return !internal::IsNullDenseUnion(*data_, i);
75+
}
76+
if (type_id() == Type::RUN_END_ENCODED) {
77+
return !internal::IsNullRunEndEncoded(*data_, i);
78+
}
79+
return data_->null_count != data_->length;
7080
}
7181

7282
/// \brief Return a Scalar containing the value of this array at i
@@ -85,6 +95,17 @@ class ARROW_EXPORT Array {
8595
/// function
8696
int64_t null_count() const;
8797

98+
/// \brief Computes the logical null count for arrays of all types including
99+
/// those that do not have a validity bitmap like union and run-end encoded
100+
/// arrays
101+
///
102+
/// If the array has a validity bitmap, this function behaves the same as
103+
/// null_count(). For types that have no validity bitmap, this function will
104+
/// recompute the null count every time it is called.
105+
///
106+
/// \see GetNullCount
107+
int64_t ComputeLogicalNullCount() const;
108+
88109
std::shared_ptr<DataType> type() const { return data_->type; }
89110
Type::type type_id() const { return data_->type->id(); }
90111

cpp/src/arrow/array/array_test.cc

Lines changed: 65 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "arrow/array/builder_binary.h"
3838
#include "arrow/array/builder_decimal.h"
3939
#include "arrow/array/builder_dict.h"
40+
#include "arrow/array/builder_run_end.h"
4041
#include "arrow/array/builder_time.h"
4142
#include "arrow/array/data.h"
4243
#include "arrow/array/util.h"
@@ -83,15 +84,48 @@ TEST_F(TestArray, TestNullCount) {
8384
auto data = std::make_shared<Buffer>(nullptr, 0);
8485
auto null_bitmap = std::make_shared<Buffer>(nullptr, 0);
8586

86-
std::unique_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10));
87+
std::shared_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10));
88+
ASSERT_EQ(10, arr->ComputeLogicalNullCount());
8789
ASSERT_EQ(10, arr->null_count());
90+
ASSERT_TRUE(arr->data()->MayHaveNulls());
91+
ASSERT_TRUE(arr->data()->MayHaveLogicalNulls());
8892

89-
std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
93+
std::shared_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
94+
ASSERT_EQ(0, arr_no_nulls->ComputeLogicalNullCount());
9095
ASSERT_EQ(0, arr_no_nulls->null_count());
96+
ASSERT_FALSE(arr_no_nulls->data()->MayHaveNulls());
97+
ASSERT_FALSE(arr_no_nulls->data()->MayHaveLogicalNulls());
9198

92-
std::unique_ptr<Int32Array> arr_default_null_count(
99+
std::shared_ptr<Int32Array> arr_default_null_count(
93100
new Int32Array(100, data, null_bitmap));
94101
ASSERT_EQ(kUnknownNullCount, arr_default_null_count->data()->null_count);
102+
ASSERT_TRUE(arr_default_null_count->data()->MayHaveNulls());
103+
ASSERT_TRUE(arr_default_null_count->data()->MayHaveLogicalNulls());
104+
105+
RunEndEncodedBuilder ree_builder(pool_, std::make_shared<Int32Builder>(pool_),
106+
std::make_shared<Int32Builder>(pool_),
107+
run_end_encoded(int32(), int32()));
108+
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(2), 2));
109+
ASSERT_OK(ree_builder.AppendNull());
110+
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(4), 3));
111+
ASSERT_OK(ree_builder.AppendNulls(2));
112+
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(8), 5));
113+
ASSERT_OK(ree_builder.AppendNulls(7));
114+
ASSERT_OK_AND_ASSIGN(auto ree, ree_builder.Finish());
115+
116+
ASSERT_EQ(0, ree->null_count());
117+
ASSERT_EQ(10, ree->ComputeLogicalNullCount());
118+
ASSERT_FALSE(ree->data()->MayHaveNulls());
119+
ASSERT_TRUE(ree->data()->MayHaveLogicalNulls());
120+
121+
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(2), 2));
122+
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(4), 3));
123+
ASSERT_OK(ree_builder.AppendScalar(*MakeScalar<int32_t>(8), 5));
124+
ASSERT_OK_AND_ASSIGN(auto ree_no_nulls, ree_builder.Finish());
125+
ASSERT_EQ(0, ree_no_nulls->null_count());
126+
ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount());
127+
ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls());
128+
ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls());
95129
}
96130

97131
TEST_F(TestArray, TestSlicePreservesAllNullCount) {
@@ -377,20 +411,23 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
377411
ASSERT_EQ(array->length(), length);
378412
if (is_union(type->id())) {
379413
ASSERT_EQ(array->null_count(), 0);
414+
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
380415
const auto& union_array = checked_cast<const UnionArray&>(*array);
381416
for (int i = 0; i < union_array.num_fields(); ++i) {
382417
ASSERT_EQ(union_array.field(i)->null_count(), union_array.field(i)->length());
383418
}
384419
} else if (type->id() == Type::RUN_END_ENCODED) {
385420
ASSERT_EQ(array->null_count(), 0);
421+
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
386422
const auto& ree_array = checked_cast<const RunEndEncodedArray&>(*array);
387423
ASSERT_EQ(ree_array.values()->null_count(), ree_array.values()->length());
388424
} else {
389425
ASSERT_EQ(array->null_count(), length);
390-
for (int64_t i = 0; i < length; ++i) {
391-
ASSERT_TRUE(array->IsNull(i));
392-
ASSERT_FALSE(array->IsValid(i));
393-
}
426+
ASSERT_EQ(array->ComputeLogicalNullCount(), length);
427+
}
428+
for (int64_t i = 0; i < length; ++i) {
429+
ASSERT_TRUE(array->IsNull(i));
430+
ASSERT_FALSE(array->IsValid(i));
394431
}
395432
}
396433
}
@@ -482,35 +519,45 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr<Scalar>& scalar)
482519
std::unique_ptr<arrow::ArrayBuilder> builder;
483520
auto null_scalar = MakeNullScalar(scalar->type);
484521
ASSERT_OK(MakeBuilderExactIndex(pool, scalar->type, &builder));
485-
ASSERT_OK(builder->AppendScalar(*scalar));
486-
ASSERT_OK(builder->AppendScalar(*scalar));
487-
ASSERT_OK(builder->AppendScalar(*null_scalar));
488-
ASSERT_OK(builder->AppendScalars({scalar, null_scalar}));
489-
ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2));
490-
ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2));
522+
ASSERT_OK(builder->AppendScalar(*scalar)); // [0] = scalar
523+
ASSERT_OK(builder->AppendScalar(*scalar)); // [1] = scalar
524+
ASSERT_OK(builder->AppendScalar(*null_scalar)); // [2] = NULL
525+
ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); // [3, 4] = {scalar, NULL}
526+
ASSERT_OK(
527+
builder->AppendScalar(*scalar, /*n_repeats=*/2)); // [5, 6] = {scalar, scalar}
528+
ASSERT_OK(
529+
builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); // [7, 8] = {NULL, NULL}
491530

492531
std::shared_ptr<Array> out;
493532
FinishAndCheckPadding(builder.get(), &out);
494533
ASSERT_OK(out->ValidateFull());
495534
AssertTypeEqual(scalar->type, out->type());
496535
ASSERT_EQ(out->length(), 9);
497536

498-
const bool can_check_nulls = internal::HasValidityBitmap(out->type()->id());
537+
auto out_type_id = out->type()->id();
538+
const bool has_validity = internal::HasValidityBitmap(out_type_id);
499539
// For a dictionary builder, the output dictionary won't necessarily be the same
500-
const bool can_check_values = !is_dictionary(out->type()->id());
540+
const bool can_check_values = !is_dictionary(out_type_id);
501541

502-
if (can_check_nulls) {
542+
if (has_validity) {
503543
ASSERT_EQ(out->null_count(), 4);
544+
} else {
545+
ASSERT_EQ(out->null_count(), 0);
546+
}
547+
if (scalar->is_valid) {
548+
ASSERT_EQ(out->ComputeLogicalNullCount(), 4);
549+
} else {
550+
ASSERT_EQ(out->ComputeLogicalNullCount(), 9);
504551
}
505552

506553
for (const auto index : {0, 1, 3, 5, 6}) {
507-
ASSERT_FALSE(out->IsNull(index));
554+
ASSERT_NE(out->IsNull(index), scalar->is_valid);
508555
ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
509556
ASSERT_OK(scalar_i->ValidateFull());
510557
if (can_check_values) AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true);
511558
}
512559
for (const auto index : {2, 4, 7, 8}) {
513-
ASSERT_EQ(out->IsNull(index), can_check_nulls);
560+
ASSERT_TRUE(out->IsNull(index));
514561
ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
515562
ASSERT_OK(scalar_i->ValidateFull());
516563
AssertScalarsEqual(*null_scalar, *scalar_i, /*verbose=*/true);

cpp/src/arrow/array/builder_nested.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,13 @@ class BaseListBuilder : public ArrayBuilder {
131131
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
132132
int64_t length) override {
133133
const offset_type* offsets = array.GetValues<offset_type>(1);
134-
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
134+
const bool all_valid = !array.MayHaveLogicalNulls();
135+
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
135136
for (int64_t row = offset; row < offset + length; row++) {
136-
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
137+
const bool is_valid =
138+
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
139+
array.IsValid(row);
140+
if (is_valid) {
137141
ARROW_RETURN_NOT_OK(Append());
138142
int64_t slot_length = offsets[row + 1] - offsets[row];
139143
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0],
@@ -301,9 +305,13 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
301305
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
302306
int64_t length) override {
303307
const int32_t* offsets = array.GetValues<int32_t>(1);
304-
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
308+
const bool all_valid = !array.MayHaveLogicalNulls();
309+
const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
305310
for (int64_t row = offset; row < offset + length; row++) {
306-
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
311+
const bool is_valid =
312+
all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) ||
313+
array.IsValid(row);
314+
if (is_valid) {
307315
ARROW_RETURN_NOT_OK(Append());
308316
const int64_t slot_length = offsets[row + 1] - offsets[row];
309317
// Add together the inner StructArray offset to the Map/List offset

cpp/src/arrow/array/data.cc

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434
#include "arrow/util/bitmap_ops.h"
3535
#include "arrow/util/logging.h"
3636
#include "arrow/util/macros.h"
37+
#include "arrow/util/ree_util.h"
3738
#include "arrow/util/slice_util_internal.h"
39+
#include "arrow/util/union_util.h"
3840

3941
namespace arrow {
4042

@@ -60,6 +62,38 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length,
6062
}
6163
}
6264

65+
namespace internal {
66+
67+
bool IsNullSparseUnion(const ArrayData& data, int64_t i) {
68+
auto* union_type = checked_cast<const SparseUnionType*>(data.type.get());
69+
const auto* types = reinterpret_cast<const int8_t*>(data.buffers[1]->data());
70+
const int child_id = union_type->child_ids()[types[data.offset + i]];
71+
return data.child_data[child_id]->IsNull(i);
72+
}
73+
74+
bool IsNullDenseUnion(const ArrayData& data, int64_t i) {
75+
auto* union_type = checked_cast<const DenseUnionType*>(data.type.get());
76+
const auto* types = reinterpret_cast<const int8_t*>(data.buffers[1]->data());
77+
const int child_id = union_type->child_ids()[types[data.offset + i]];
78+
const auto* offsets = reinterpret_cast<const int32_t*>(data.buffers[2]->data());
79+
const int64_t child_offset = offsets[data.offset + i];
80+
return data.child_data[child_id]->IsNull(child_offset);
81+
}
82+
83+
bool IsNullRunEndEncoded(const ArrayData& data, int64_t i) {
84+
return ArraySpan(data).IsNullRunEndEncoded(i);
85+
}
86+
87+
bool UnionMayHaveLogicalNulls(const ArrayData& data) {
88+
return ArraySpan(data).MayHaveLogicalNulls();
89+
}
90+
91+
bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) {
92+
return ArraySpan(data).MayHaveLogicalNulls();
93+
}
94+
95+
} // namespace internal
96+
6397
std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
6498
std::vector<std::shared_ptr<Buffer>> buffers,
6599
int64_t null_count, int64_t offset) {
@@ -132,6 +166,13 @@ int64_t ArrayData::GetNullCount() const {
132166
return precomputed;
133167
}
134168

169+
int64_t ArrayData::ComputeLogicalNullCount() const {
170+
if (this->buffers[0]) {
171+
return GetNullCount();
172+
}
173+
return ArraySpan(*this).ComputeLogicalNullCount();
174+
}
175+
135176
// ----------------------------------------------------------------------
136177
// Methods for ArraySpan
137178

@@ -407,6 +448,20 @@ int64_t ArraySpan::GetNullCount() const {
407448
return precomputed;
408449
}
409450

451+
int64_t ArraySpan::ComputeLogicalNullCount() const {
452+
const auto t = this->type->id();
453+
if (t == Type::SPARSE_UNION) {
454+
return union_util::LogicalSparseUnionNullCount(*this);
455+
}
456+
if (t == Type::DENSE_UNION) {
457+
return union_util::LogicalDenseUnionNullCount(*this);
458+
}
459+
if (t == Type::RUN_END_ENCODED) {
460+
return ree_util::LogicalNullCount(*this);
461+
}
462+
return GetNullCount();
463+
}
464+
410465
int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); }
411466

412467
std::shared_ptr<ArrayData> ArraySpan::ToArrayData() const {
@@ -445,6 +500,44 @@ std::shared_ptr<Array> ArraySpan::ToArray() const {
445500
return MakeArray(this->ToArrayData());
446501
}
447502

503+
bool ArraySpan::IsNullSparseUnion(int64_t i) const {
504+
auto* union_type = checked_cast<const SparseUnionType*>(this->type);
505+
const auto* types = reinterpret_cast<const int8_t*>(this->buffers[1].data);
506+
const int child_id = union_type->child_ids()[types[this->offset + i]];
507+
return this->child_data[child_id].IsNull(i);
508+
}
509+
510+
bool ArraySpan::IsNullDenseUnion(int64_t i) const {
511+
auto* union_type = checked_cast<const DenseUnionType*>(this->type);
512+
const auto* types = reinterpret_cast<const int8_t*>(this->buffers[1].data);
513+
const auto* offsets = reinterpret_cast<const int32_t*>(this->buffers[2].data);
514+
const int64_t child_id = union_type->child_ids()[types[this->offset + i]];
515+
const int64_t child_offset = offsets[this->offset + i];
516+
return this->child_data[child_id].IsNull(child_offset);
517+
}
518+
519+
bool ArraySpan::IsNullRunEndEncoded(int64_t i) const {
520+
const auto& values = ree_util::ValuesArray(*this);
521+
if (values.MayHaveLogicalNulls()) {
522+
const int64_t physical_offset = ree_util::FindPhysicalIndex(*this, i, this->offset);
523+
return ree_util::ValuesArray(*this).IsNull(physical_offset);
524+
}
525+
return false;
526+
}
527+
528+
bool ArraySpan::UnionMayHaveLogicalNulls() const {
529+
for (auto& child : this->child_data) {
530+
if (child.MayHaveLogicalNulls()) {
531+
return true;
532+
}
533+
}
534+
return false;
535+
}
536+
537+
bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const {
538+
return ree_util::ValuesArray(*this).MayHaveLogicalNulls();
539+
}
540+
448541
// ----------------------------------------------------------------------
449542
// Implement internal::GetArrayView
450543

0 commit comments

Comments
 (0)