Skip to content

Commit 73b4018

Browse files
committed
Merge branch 'main' into codec_options
2 parents b5b9e24 + 366dbe1 commit 73b4018

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+1011
-429
lines changed

.github/CODEOWNERS

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,13 @@
4242
/go/ @zeroshade
4343
/java/ @lidavidm
4444
/js/ @domoritz @trxcllnt
45-
/matlab/ @kevingurney
45+
/matlab/ @kevingurney @kou
4646
/python/ @AlenkaF
4747
/python/pyarrow/_flight.pyx @lidavidm
4848
/python/pyarrow/**/*gandiva* @wjones127
4949
/r/ @paleolimbot @thisisnic
5050
/ruby/ @kou
51+
/swift/ @kou
5152

5253
# Docs
5354
# /docs/

.github/workflows/dev_pr/labeler.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@
4545
"Component: Ruby":
4646
- ruby/**/*
4747

48+
"Component: Swift":
49+
- swift/**/*
50+
4851
"Component: FlightRPC":
4952
- cpp/src/arrow/flight/**/*
5053
- r/R/flight.*

.github/workflows/js.yml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ jobs:
8080
name: AMD64 macOS 11 NodeJS ${{ matrix.node }}
8181
runs-on: macos-latest
8282
if: github.event_name == 'push'
83-
timeout-minutes: 60
83+
timeout-minutes: 90
8484
strategy:
8585
fail-fast: false
8686
matrix:
@@ -90,6 +90,12 @@ jobs:
9090
uses: actions/checkout@v3
9191
with:
9292
fetch-depth: 0
93+
- name: Jest Cache
94+
uses: actions/cache@v3
95+
with:
96+
path: js/.jest-cache
97+
key: js-jest-cache-${{ runner.os }}-${{ hashFiles('js/src/**/*.ts', 'js/test/**/*.ts', 'js/yarn.lock') }}
98+
restore-keys: js-jest-cache-${{ runner.os }}-
9399
- name: Install NodeJS
94100
uses: actions/setup-node@v3
95101
with:
@@ -114,6 +120,12 @@ jobs:
114120
uses: actions/checkout@v3
115121
with:
116122
fetch-depth: 0
123+
- name: Jest Cache
124+
uses: actions/cache@v3
125+
with:
126+
path: js/.jest-cache
127+
key: js-jest-cache-${{ runner.os }}-${{ hashFiles('js/src/**/*.ts', 'js/test/**/*.ts', 'js/yarn.lock') }}
128+
restore-keys: js-jest-cache-${{ runner.os }}-
117129
- name: Install NodeJS
118130
uses: actions/setup-node@v3
119131
with:

cpp/src/arrow/array/array_test.cc

Lines changed: 34 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
#include "arrow/util/bitmap_builders.h"
6060
#include "arrow/util/checked_cast.h"
6161
#include "arrow/util/decimal.h"
62+
#include "arrow/util/key_value_metadata.h"
6263
#include "arrow/util/macros.h"
6364
#include "arrow/util/range.h"
6465
#include "arrow/visit_data_inline.h"
@@ -366,13 +367,12 @@ TEST_F(TestArray, BuildLargeInMemoryArray) {
366367
ASSERT_EQ(length, result->length());
367368
}
368369

369-
TEST_F(TestArray, TestMakeArrayOfNull) {
370+
static std::vector<std::shared_ptr<DataType>> TestArrayUtilitiesAgainstTheseTypes() {
370371
FieldVector union_fields1({field("a", utf8()), field("b", int32())});
371372
FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))});
372373
std::vector<int8_t> union_type_codes{7, 42};
373374

374-
std::shared_ptr<DataType> types[] = {
375-
// clang-format off
375+
return {
376376
null(),
377377
boolean(),
378378
int8(),
@@ -387,7 +387,7 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
387387
utf8(),
388388
large_utf8(),
389389
list(utf8()),
390-
list(int64()), // ARROW-9071
390+
list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull
391391
large_list(large_utf8()),
392392
fixed_size_list(utf8(), 3),
393393
fixed_size_list(int64(), 4),
@@ -397,13 +397,15 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
397397
sparse_union(union_fields2, union_type_codes),
398398
dense_union(union_fields1, union_type_codes),
399399
dense_union(union_fields2, union_type_codes),
400-
smallint(), // extension type
401-
list_extension_type(), // nested extension type
402-
// clang-format on
400+
smallint(), // extension type
401+
list_extension_type(), // nested extension type
402+
run_end_encoded(int16(), utf8()),
403403
};
404+
}
404405

406+
TEST_F(TestArray, TestMakeArrayOfNull) {
405407
for (int64_t length : {0, 1, 16, 133}) {
406-
for (auto type : types) {
408+
for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
407409
ARROW_SCOPED_TRACE("type = ", type->ToString());
408410
ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(type, length));
409411
ASSERT_EQ(array->type(), type);
@@ -716,36 +718,7 @@ void CheckSpanRoundTrip(const Array& array) {
716718
}
717719

718720
TEST_F(TestArray, TestMakeEmptyArray) {
719-
FieldVector union_fields1({field("a", utf8()), field("b", int32())});
720-
FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))});
721-
std::vector<int8_t> union_type_codes{7, 42};
722-
723-
std::shared_ptr<DataType> types[] = {null(),
724-
boolean(),
725-
int8(),
726-
uint16(),
727-
int32(),
728-
uint64(),
729-
float64(),
730-
binary(),
731-
large_binary(),
732-
fixed_size_binary(3),
733-
decimal(16, 4),
734-
utf8(),
735-
large_utf8(),
736-
list(utf8()),
737-
list(int64()),
738-
large_list(large_utf8()),
739-
fixed_size_list(utf8(), 3),
740-
fixed_size_list(int64(), 4),
741-
dictionary(int32(), utf8()),
742-
struct_({field("a", utf8()), field("b", int32())}),
743-
sparse_union(union_fields1, union_type_codes),
744-
sparse_union(union_fields2, union_type_codes),
745-
dense_union(union_fields1, union_type_codes),
746-
dense_union(union_fields2, union_type_codes)};
747-
748-
for (auto type : types) {
721+
for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
749722
ARROW_SCOPED_TRACE("type = ", type->ToString());
750723
ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type));
751724
ASSERT_OK(array->ValidateFull());
@@ -754,6 +727,29 @@ TEST_F(TestArray, TestMakeEmptyArray) {
754727
}
755728
}
756729

730+
TEST_F(TestArray, TestFillFromScalar) {
731+
for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
732+
ARROW_SCOPED_TRACE("type = ", type->ToString());
733+
for (auto seed : {0u, 0xdeadbeef, 42u}) {
734+
ARROW_SCOPED_TRACE("seed = ", seed);
735+
736+
Field field("", type, /*nullable=*/true,
737+
key_value_metadata({{"extension_allow_random_storage", "true"}}));
738+
auto array = random::GenerateArray(field, 1, seed);
739+
740+
ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0));
741+
742+
ArraySpan span(*scalar);
743+
auto roundtripped_array = span.ToArray();
744+
AssertArraysEqual(*array, *roundtripped_array);
745+
746+
ASSERT_OK(roundtripped_array->ValidateFull());
747+
ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0));
748+
AssertScalarsEqual(*scalar, *roundtripped_scalar);
749+
}
750+
}
751+
}
752+
757753
TEST_F(TestArray, ExtensionSpanRoundTrip) {
758754
// Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't
759755
// work for extension types so we check that here

cpp/src/arrow/array/data.cc

Lines changed: 68 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64
130130
}
131131

132132
std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
133-
ARROW_CHECK_LE(off, length) << "Slice offset greater than array length";
133+
ARROW_CHECK_LE(off, length) << "Slice offset (" << off
134+
<< ") greater than array length (" << length << ")";
134135
len = std::min(length - off, len);
135136
off += offset;
136137

@@ -228,22 +229,20 @@ void ArraySpan::SetMembers(const ArrayData& data) {
228229
namespace {
229230

230231
template <typename offset_type>
231-
void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_size,
232-
int buffer_index = 1) {
233-
buffer[0] = 0;
234-
buffer[1] = static_cast<offset_type>(value_size);
235-
span->buffers[buffer_index].data = reinterpret_cast<uint8_t*>(buffer);
236-
span->buffers[buffer_index].size = 2 * sizeof(offset_type);
232+
BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) {
233+
auto* offsets = reinterpret_cast<offset_type*>(scratch_space);
234+
offsets[0] = 0;
235+
offsets[1] = static_cast<offset_type>(value_size);
236+
return {scratch_space, sizeof(offset_type) * 2};
237237
}
238238

239239
int GetNumBuffers(const DataType& type) {
240240
switch (type.id()) {
241241
case Type::NA:
242242
case Type::STRUCT:
243243
case Type::FIXED_SIZE_LIST:
244-
return 1;
245244
case Type::RUN_END_ENCODED:
246-
return 0;
245+
return 1;
247246
case Type::BINARY:
248247
case Type::LARGE_BINARY:
249248
case Type::STRING:
@@ -265,16 +264,19 @@ int GetNumBuffers(const DataType& type) {
265264
namespace internal {
266265

267266
void FillZeroLengthArray(const DataType* type, ArraySpan* span) {
268-
memset(span->scratch_space, 0x00, sizeof(span->scratch_space));
269-
270267
span->type = type;
271268
span->length = 0;
272269
int num_buffers = GetNumBuffers(*type);
273270
for (int i = 0; i < num_buffers; ++i) {
274-
span->buffers[i].data = reinterpret_cast<uint8_t*>(span->scratch_space);
271+
alignas(int64_t) static std::array<uint8_t, sizeof(int64_t) * 2> kZeros{0};
272+
span->buffers[i].data = kZeros.data();
275273
span->buffers[i].size = 0;
276274
}
277275

276+
if (!HasValidityBitmap(type->id())) {
277+
span->buffers[0] = {};
278+
}
279+
278280
for (int i = num_buffers; i < 3; ++i) {
279281
span->buffers[i] = {};
280282
}
@@ -304,9 +306,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
304306

305307
Type::type type_id = value.type->id();
306308

307-
// Populate null count and validity bitmap (only for non-union/null types)
308-
this->null_count = value.is_valid ? 0 : 1;
309-
if (!is_union(type_id) && type_id != Type::NA) {
309+
if (type_id == Type::NA) {
310+
this->null_count = 1;
311+
} else if (!internal::HasValidityBitmap(type_id)) {
312+
this->null_count = 0;
313+
} else {
314+
// Populate null count and validity bitmap
315+
this->null_count = value.is_valid ? 0 : 1;
310316
this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit;
311317
this->buffers[0].size = 1;
312318
}
@@ -329,20 +335,19 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
329335
}
330336
} else if (is_base_binary_like(type_id)) {
331337
const auto& scalar = checked_cast<const BaseBinaryScalar&>(value);
332-
this->buffers[1].data = reinterpret_cast<uint8_t*>(this->scratch_space);
338+
333339
const uint8_t* data_buffer = nullptr;
334340
int64_t data_size = 0;
335341
if (scalar.is_valid) {
336342
data_buffer = scalar.value->data();
337343
data_size = scalar.value->size();
338344
}
339345
if (is_binary_like(type_id)) {
340-
SetOffsetsForScalar<int32_t>(this, reinterpret_cast<int32_t*>(this->scratch_space),
341-
data_size);
346+
this->buffers[1] =
347+
OffsetsForScalar(scalar.scratch_space_, static_cast<int32_t>(data_size));
342348
} else {
343349
// is_large_binary_like
344-
SetOffsetsForScalar<int64_t>(this, reinterpret_cast<int64_t*>(this->scratch_space),
345-
data_size);
350+
this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, data_size);
346351
}
347352
this->buffers[2].data = const_cast<uint8_t*>(data_buffer);
348353
this->buffers[2].size = data_size;
@@ -367,11 +372,10 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
367372
}
368373

369374
if (type_id == Type::LIST || type_id == Type::MAP) {
370-
SetOffsetsForScalar<int32_t>(this, reinterpret_cast<int32_t*>(this->scratch_space),
371-
value_length);
375+
this->buffers[1] =
376+
OffsetsForScalar(scalar.scratch_space_, static_cast<int32_t>(value_length));
372377
} else if (type_id == Type::LARGE_LIST) {
373-
SetOffsetsForScalar<int64_t>(this, reinterpret_cast<int64_t*>(this->scratch_space),
374-
value_length);
378+
this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length);
375379
} else {
376380
// FIXED_SIZE_LIST: does not have a second buffer
377381
this->buffers[1] = {};
@@ -384,26 +388,31 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
384388
this->child_data[i].FillFromScalar(*scalar.value[i]);
385389
}
386390
} else if (is_union(type_id)) {
391+
// Dense union needs scratch space to store both offsets and a type code
392+
struct UnionScratchSpace {
393+
alignas(int64_t) int8_t type_code;
394+
alignas(int64_t) uint8_t offsets[sizeof(int32_t) * 2];
395+
};
396+
static_assert(sizeof(UnionScratchSpace) <= sizeof(UnionScalar::scratch_space_));
397+
auto* union_scratch_space = reinterpret_cast<UnionScratchSpace*>(
398+
&checked_cast<const UnionScalar&>(value).scratch_space_);
399+
387400
// First buffer is kept null since unions have no validity vector
388401
this->buffers[0] = {};
389402

390-
this->buffers[1].data = reinterpret_cast<uint8_t*>(this->scratch_space);
403+
union_scratch_space->type_code = checked_cast<const UnionScalar&>(value).type_code;
404+
this->buffers[1].data = reinterpret_cast<uint8_t*>(&union_scratch_space->type_code);
391405
this->buffers[1].size = 1;
392-
int8_t* type_codes = reinterpret_cast<int8_t*>(this->scratch_space);
393-
type_codes[0] = checked_cast<const UnionScalar&>(value).type_code;
394406

395407
this->child_data.resize(this->type->num_fields());
396408
if (type_id == Type::DENSE_UNION) {
397409
const auto& scalar = checked_cast<const DenseUnionScalar&>(value);
398-
// Has offset; start 4 bytes in so it's aligned to a 32-bit boundaries
399-
SetOffsetsForScalar<int32_t>(this,
400-
reinterpret_cast<int32_t*>(this->scratch_space) + 1, 1,
401-
/*buffer_index=*/2);
410+
this->buffers[2] =
411+
OffsetsForScalar(union_scratch_space->offsets, static_cast<int32_t>(1));
402412
// We can't "see" the other arrays in the union, but we put the "active"
403413
// union array in the right place and fill zero-length arrays for the
404414
// others
405-
const std::vector<int>& child_ids =
406-
checked_cast<const UnionType*>(this->type)->child_ids();
415+
const auto& child_ids = checked_cast<const UnionType*>(this->type)->child_ids();
407416
DCHECK_GE(scalar.type_code, 0);
408417
DCHECK_LT(scalar.type_code, static_cast<int>(child_ids.size()));
409418
for (int i = 0; i < static_cast<int>(this->child_data.size()); ++i) {
@@ -429,6 +438,32 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
429438

430439
// Restore the extension type
431440
this->type = value.type.get();
441+
} else if (type_id == Type::RUN_END_ENCODED) {
442+
const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
443+
this->child_data.resize(2);
444+
445+
auto set_run_end = [&](auto run_end) {
446+
auto& e = this->child_data[0];
447+
e.type = scalar.run_end_type().get();
448+
e.length = 1;
449+
e.null_count = 0;
450+
e.buffers[1].data = scalar.scratch_space_;
451+
e.buffers[1].size = sizeof(run_end);
452+
reinterpret_cast<decltype(run_end)*>(scalar.scratch_space_)[0] = run_end;
453+
};
454+
455+
switch (scalar.run_end_type()->id()) {
456+
case Type::INT16:
457+
set_run_end(static_cast<int16_t>(1));
458+
break;
459+
case Type::INT32:
460+
set_run_end(static_cast<int32_t>(1));
461+
break;
462+
default:
463+
DCHECK_EQ(scalar.run_end_type()->id(), Type::INT64);
464+
set_run_end(static_cast<int64_t>(1));
465+
}
466+
this->child_data[1].FillFromScalar(*scalar.value);
432467
} else {
433468
DCHECK_EQ(Type::NA, type_id) << "should be unreachable: " << *value.type;
434469
}

cpp/src/arrow/array/data.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -372,11 +372,6 @@ struct ARROW_EXPORT ArraySpan {
372372
int64_t offset = 0;
373373
BufferSpan buffers[3];
374374

375-
// 16 bytes of scratch space to enable this ArraySpan to be a view onto
376-
// scalar values including binary scalars (where we need to create a buffer
377-
// that looks like two 32-bit or 64-bit offsets)
378-
uint64_t scratch_space[2];
379-
380375
ArraySpan() = default;
381376

382377
explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {}

0 commit comments

Comments
 (0)