Skip to content

Commit f9b2b3c

Browse files
Merge remote-tracking branch 'upstream/main' into gh-41480-pyarrow-build-config
2 parents c3a2136 + 5385926 commit f9b2b3c

31 files changed

Lines changed: 351 additions & 370 deletions

cpp/src/arrow/compute/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ add_arrow_test(internals_test
9090
light_array_test.cc
9191
registry_test.cc
9292
key_hash_test.cc
93-
row/compare_test.cc)
93+
row/compare_test.cc
94+
row/grouper_test.cc)
9495

9596
add_arrow_compute_test(expression_test SOURCES expression_test.cc)
9697

cpp/src/arrow/compute/row/compare_internal.cc

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -36,22 +36,22 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com
3636
const uint32_t* left_to_right_map,
3737
LightContext* ctx, const KeyColumnArray& col,
3838
const RowTableImpl& rows,
39-
uint8_t* match_bytevector,
40-
bool are_cols_in_encoding_order) {
39+
bool are_cols_in_encoding_order,
40+
uint8_t* match_bytevector) {
4141
if (!rows.has_any_nulls(ctx) && !col.data(0)) {
4242
return;
4343
}
4444
uint32_t num_processed = 0;
4545
#if defined(ARROW_HAVE_RUNTIME_AVX2)
4646
if (ctx->has_avx2()) {
47-
num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare,
48-
sel_left_maybe_null, left_to_right_map,
49-
ctx, col, rows, match_bytevector);
47+
num_processed = NullUpdateColumnToRow_avx2(
48+
use_selection, id_col, num_rows_to_compare, sel_left_maybe_null,
49+
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, match_bytevector);
5050
}
5151
#endif
5252

53-
uint32_t null_bit_id =
54-
are_cols_in_encoding_order ? id_col : rows.metadata().pos_after_encoding(id_col);
53+
const uint32_t null_bit_id =
54+
ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order);
5555

5656
if (!col.data(0)) {
5757
// Remove rows from the result for which the column value is a null
@@ -363,10 +363,9 @@ void KeyCompare::CompareColumnsToRows(
363363
continue;
364364
}
365365

366-
uint32_t offset_within_row = rows.metadata().encoded_field_offset(
367-
are_cols_in_encoding_order
368-
? static_cast<uint32_t>(icol)
369-
: rows.metadata().pos_after_encoding(static_cast<uint32_t>(icol)));
366+
uint32_t offset_within_row =
367+
rows.metadata().encoded_field_offset(ColIdInEncodingOrder(
368+
rows, static_cast<uint32_t>(icol), are_cols_in_encoding_order));
370369
if (col.metadata().is_fixed_length) {
371370
if (sel_left_maybe_null) {
372371
CompareBinaryColumnToRow<true>(
@@ -375,9 +374,8 @@ void KeyCompare::CompareColumnsToRows(
375374
is_first_column ? match_bytevector_A : match_bytevector_B);
376375
NullUpdateColumnToRow<true>(
377376
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
378-
left_to_right_map, ctx, col, rows,
379-
is_first_column ? match_bytevector_A : match_bytevector_B,
380-
are_cols_in_encoding_order);
377+
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
378+
is_first_column ? match_bytevector_A : match_bytevector_B);
381379
} else {
382380
// Version without using selection vector
383381
CompareBinaryColumnToRow<false>(
@@ -386,9 +384,8 @@ void KeyCompare::CompareColumnsToRows(
386384
is_first_column ? match_bytevector_A : match_bytevector_B);
387385
NullUpdateColumnToRow<false>(
388386
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
389-
left_to_right_map, ctx, col, rows,
390-
is_first_column ? match_bytevector_A : match_bytevector_B,
391-
are_cols_in_encoding_order);
387+
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
388+
is_first_column ? match_bytevector_A : match_bytevector_B);
392389
}
393390
if (!is_first_column) {
394391
AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B);
@@ -414,9 +411,8 @@ void KeyCompare::CompareColumnsToRows(
414411
}
415412
NullUpdateColumnToRow<true>(
416413
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
417-
left_to_right_map, ctx, col, rows,
418-
is_first_column ? match_bytevector_A : match_bytevector_B,
419-
are_cols_in_encoding_order);
414+
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
415+
is_first_column ? match_bytevector_A : match_bytevector_B);
420416
} else {
421417
if (ivarbinary == 0) {
422418
CompareVarBinaryColumnToRow<false, true>(
@@ -429,9 +425,8 @@ void KeyCompare::CompareColumnsToRows(
429425
}
430426
NullUpdateColumnToRow<false>(
431427
static_cast<uint32_t>(icol), num_rows_to_compare, sel_left_maybe_null,
432-
left_to_right_map, ctx, col, rows,
433-
is_first_column ? match_bytevector_A : match_bytevector_B,
434-
are_cols_in_encoding_order);
428+
left_to_right_map, ctx, col, rows, are_cols_in_encoding_order,
429+
is_first_column ? match_bytevector_A : match_bytevector_B);
435430
}
436431
if (!is_first_column) {
437432
AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B);

cpp/src/arrow/compute/row/compare_internal.h

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,19 @@ class ARROW_EXPORT KeyCompare {
4343
uint8_t* out_match_bitvector_maybe_null = NULLPTR);
4444

4545
private:
46+
static uint32_t ColIdInEncodingOrder(const RowTableImpl& rows, uint32_t id_col,
47+
bool are_cols_in_encoding_order) {
48+
return are_cols_in_encoding_order ? id_col
49+
: rows.metadata().pos_after_encoding(id_col);
50+
}
51+
4652
template <bool use_selection>
4753
static void NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare,
4854
const uint16_t* sel_left_maybe_null,
4955
const uint32_t* left_to_right_map, LightContext* ctx,
5056
const KeyColumnArray& col, const RowTableImpl& rows,
51-
uint8_t* match_bytevector,
52-
bool are_cols_in_encoding_order);
57+
bool are_cols_in_encoding_order,
58+
uint8_t* match_bytevector);
5359

5460
template <bool use_selection, class COMPARE_FN>
5561
static void CompareBinaryColumnToRowHelper(
@@ -92,7 +98,8 @@ class ARROW_EXPORT KeyCompare {
9298
static uint32_t NullUpdateColumnToRowImp_avx2(
9399
uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null,
94100
const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col,
95-
const RowTableImpl& rows, uint8_t* match_bytevector);
101+
const RowTableImpl& rows, bool are_cols_in_encoding_order,
102+
uint8_t* match_bytevector);
96103

97104
template <bool use_selection, class COMPARE8_FN>
98105
static uint32_t CompareBinaryColumnToRowHelper_avx2(
@@ -118,13 +125,11 @@ class ARROW_EXPORT KeyCompare {
118125
static uint32_t AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A,
119126
const uint8_t* bytevector_B);
120127

121-
static uint32_t NullUpdateColumnToRow_avx2(bool use_selection, uint32_t id_col,
122-
uint32_t num_rows_to_compare,
123-
const uint16_t* sel_left_maybe_null,
124-
const uint32_t* left_to_right_map,
125-
LightContext* ctx, const KeyColumnArray& col,
126-
const RowTableImpl& rows,
127-
uint8_t* match_bytevector);
128+
static uint32_t NullUpdateColumnToRow_avx2(
129+
bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare,
130+
const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map,
131+
LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows,
132+
bool are_cols_in_encoding_order, uint8_t* match_bytevector);
128133

129134
static uint32_t CompareBinaryColumnToRow_avx2(
130135
bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare,

cpp/src/arrow/compute/row/compare_internal_avx2.cc

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,14 @@ template <bool use_selection>
3939
uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2(
4040
uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null,
4141
const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col,
42-
const RowTableImpl& rows, uint8_t* match_bytevector) {
42+
const RowTableImpl& rows, bool are_cols_in_encoding_order,
43+
uint8_t* match_bytevector) {
4344
if (!rows.has_any_nulls(ctx) && !col.data(0)) {
4445
return num_rows_to_compare;
4546
}
4647

47-
uint32_t null_bit_id = rows.metadata().pos_after_encoding(id_col);
48+
const uint32_t null_bit_id =
49+
ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order);
4850

4951
if (!col.data(0)) {
5052
// Remove rows from the result for which the column value is a null
@@ -569,7 +571,7 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2(
569571
bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare,
570572
const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map,
571573
LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows,
572-
uint8_t* match_bytevector) {
574+
bool are_cols_in_encoding_order, uint8_t* match_bytevector) {
573575
int64_t num_rows_safe =
574576
TailSkipForSIMD::FixBitAccess(sizeof(uint32_t), col.length(), col.bit_offset(0));
575577
if (sel_left_maybe_null) {
@@ -580,13 +582,13 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2(
580582
}
581583

582584
if (use_selection) {
583-
return NullUpdateColumnToRowImp_avx2<true>(id_col, num_rows_to_compare,
584-
sel_left_maybe_null, left_to_right_map,
585-
ctx, col, rows, match_bytevector);
585+
return NullUpdateColumnToRowImp_avx2<true>(
586+
id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col,
587+
rows, are_cols_in_encoding_order, match_bytevector);
586588
} else {
587-
return NullUpdateColumnToRowImp_avx2<false>(id_col, num_rows_to_compare,
588-
sel_left_maybe_null, left_to_right_map,
589-
ctx, col, rows, match_bytevector);
589+
return NullUpdateColumnToRowImp_avx2<false>(
590+
id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col,
591+
rows, are_cols_in_encoding_order, match_bytevector);
590592
}
591593
}
592594

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <numeric>
19+
20+
#include "arrow/compute/exec.h"
21+
#include "arrow/compute/row/grouper.h"
22+
#include "arrow/testing/gtest_util.h"
23+
#include "arrow/testing/random.h"
24+
25+
namespace arrow {
26+
namespace compute {
27+
28+
// Specialized case for GH-40997
29+
TEST(Grouper, ResortedColumnsWithLargeNullRows) {
30+
const uint64_t num_rows = 1024;
31+
32+
// construct random array with plenty of null values
33+
const int32_t kSeed = 42;
34+
const int32_t min = 0;
35+
const int32_t max = 100;
36+
const double null_probability = 0.3;
37+
const double true_probability = 0.5;
38+
auto rng = random::RandomArrayGenerator(kSeed);
39+
auto b_arr = rng.Boolean(num_rows, true_probability, null_probability);
40+
auto i32_arr = rng.Int32(num_rows, min, max, null_probability);
41+
auto i64_arr = rng.Int64(num_rows, min, max * 10, null_probability);
42+
43+
// construct batches with columns which will be resorted in the grouper make
44+
std::vector<ExecBatch> exec_batches = {ExecBatch({i64_arr, i32_arr, b_arr}, num_rows),
45+
ExecBatch({i32_arr, i64_arr, b_arr}, num_rows),
46+
ExecBatch({i64_arr, b_arr, i32_arr}, num_rows),
47+
ExecBatch({i32_arr, b_arr, i64_arr}, num_rows),
48+
ExecBatch({b_arr, i32_arr, i64_arr}, num_rows),
49+
ExecBatch({b_arr, i64_arr, i32_arr}, num_rows)};
50+
51+
const int num_batches = static_cast<int>(exec_batches.size());
52+
std::vector<uint32_t> group_num_vec;
53+
group_num_vec.reserve(num_batches);
54+
55+
for (const auto& exec_batch : exec_batches) {
56+
ExecSpan span(exec_batch);
57+
ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make(span.GetTypes()));
58+
ASSERT_OK_AND_ASSIGN(Datum group_ids, grouper->Consume(span));
59+
group_num_vec.emplace_back(grouper->num_groups());
60+
}
61+
62+
for (int i = 1; i < num_batches; i++) {
63+
ASSERT_EQ(group_num_vec[i - 1], group_num_vec[i]);
64+
}
65+
}
66+
67+
} // namespace compute
68+
} // namespace arrow

cpp/src/arrow/compute/row/row_internal.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ void RowTableMetadata::FromColumnMetadataVector(
6666
//
6767
// Columns are sorted based on the size in bytes of their fixed-length part.
6868
// For the varying-length column, the fixed-length part is the 32-bit field storing
69-
// cumulative length of varying-length fields.
69+
// cumulative length of varying-length fields. This is to make the memory access of
70+
// each individual column within the encoded row alignment-friendly.
7071
//
7172
// The rules are:
7273
//

cpp/src/arrow/dataset/file_parquet_encryption_test.cc

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -148,17 +148,22 @@ class DatasetEncryptionTestBase : public ::testing::Test {
148148
FileSystemDatasetFactory::Make(file_system_, selector,
149149
file_format, factory_options));
150150

151-
// Read dataset into table
151+
// Create the dataset
152152
ASSERT_OK_AND_ASSIGN(auto dataset, dataset_factory->Finish());
153-
ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
154-
ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
155-
ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable());
156-
157-
// Verify the data was read correctly
158-
ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks());
159-
// Validate the table
160-
ASSERT_OK(combined_table->ValidateFull());
161-
AssertTablesEqual(*combined_table, *table_);
153+
154+
// Reuse the dataset above to scan it twice to make sure decryption works correctly.
155+
for (size_t i = 0; i < 2; ++i) {
156+
// Read dataset into table
157+
ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
158+
ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
159+
ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable());
160+
161+
// Verify the data was read correctly
162+
ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks());
163+
// Validate the table
164+
ASSERT_OK(combined_table->ValidateFull());
165+
AssertTablesEqual(*combined_table, *table_);
166+
}
162167
}
163168

164169
protected:

0 commit comments

Comments
 (0)