Skip to content

Commit 196fbe5

Browse files
authored
GH-34639: [C++] Support RecordBatch::FromStructArray even if struct array has nulls/offsets (#34691)
### Rationale for this change A struct array can have a validity map and an offset. A record batch cannot. When converting from a struct array to a record batch we were throwing an error if a validity map was present and returning the wrong data if an offset was present. ### What changes are included in this PR? If a validity map or offset are present then StructArray::Flatten is used to push the offset and validity map into the struct array. Note: this means that RecordBatch::FromStructArray will not be zero-copy if it has to push down a validity map. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes, RecordBatch::FromStructArray now takes in a memory pool because it might have to make allocations when pushing validity bitmaps down. * Closes: #34639 Authored-by: Weston Pace <weston.pace@gmail.com> Signed-off-by: Weston Pace <weston.pace@gmail.com>
1 parent 8d1863a commit 196fbe5

File tree

4 files changed

+40
-10
lines changed

4 files changed

+40
-10
lines changed

cpp/src/arrow/json/reader.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,8 @@ class DecodingOperator {
317317

318318
std::shared_ptr<ChunkedArray> chunked;
319319
RETURN_NOT_OK(builder->Finish(&chunked));
320-
ARROW_ASSIGN_OR_RAISE(auto batch, RecordBatch::FromStructArray(chunked->chunk(0)));
320+
ARROW_ASSIGN_OR_RAISE(
321+
auto batch, RecordBatch::FromStructArray(chunked->chunk(0), context_->pool()));
321322

322323
return DecodedBlock{std::move(batch), num_bytes};
323324
}
@@ -552,7 +553,7 @@ Result<std::shared_ptr<RecordBatch>> ParseOne(ParseOptions options,
552553
std::shared_ptr<ChunkedArray> converted_chunked;
553554
RETURN_NOT_OK(builder->Finish(&converted_chunked));
554555

555-
return RecordBatch::FromStructArray(converted_chunked->chunk(0));
556+
return RecordBatch::FromStructArray(converted_chunked->chunk(0), context.pool());
556557
}
557558

558559
} // namespace json

cpp/src/arrow/record_batch.cc

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,14 +198,20 @@ Result<std::shared_ptr<RecordBatch>> RecordBatch::MakeEmpty(
198198
}
199199

200200
Result<std::shared_ptr<RecordBatch>> RecordBatch::FromStructArray(
201-
const std::shared_ptr<Array>& array) {
201+
const std::shared_ptr<Array>& array, MemoryPool* memory_pool) {
202202
if (array->type_id() != Type::STRUCT) {
203203
return Status::TypeError("Cannot construct record batch from array of type ",
204204
*array->type());
205205
}
206-
if (array->null_count() != 0) {
207-
return Status::Invalid(
208-
"Unable to construct record batch from a StructArray with non-zero nulls.");
206+
if (array->null_count() != 0 || array->offset() != 0) {
207+
// If the struct array has a validity map or offset we need to push those into
208+
// the child arrays via Flatten since the RecordBatch doesn't have validity/offset
209+
const std::shared_ptr<StructArray>& struct_array =
210+
internal::checked_pointer_cast<StructArray>(array);
211+
ARROW_ASSIGN_OR_RAISE(std::vector<std::shared_ptr<Array>> fields,
212+
struct_array->Flatten(memory_pool));
213+
return Make(arrow::schema(array->type()->fields()), array->length(),
214+
std::move(fields));
209215
}
210216
return Make(arrow::schema(array->type()->fields()), array->length(),
211217
array->data()->child_data);

cpp/src/arrow/record_batch.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,16 @@ class ARROW_EXPORT RecordBatch {
8282
/// \brief Construct record batch from struct array
8383
///
8484
/// This constructs a record batch using the child arrays of the given
85-
/// array, which must be a struct array. Note that the struct array's own
86-
/// null bitmap is not reflected in the resulting record batch.
85+
/// array, which must be a struct array.
86+
///
87+
/// \param[in] array the source array, must be a StructArray
88+
/// \param[in] pool the memory pool to allocate new validity bitmaps
89+
///
90+
/// This operation will usually be zero-copy. However, if the struct array has an
91+
/// offset or a validity bitmap then these will need to be pushed into the child arrays.
92+
/// Pushing the offset is zero-copy but pushing the validity bitmap is not.
8793
static Result<std::shared_ptr<RecordBatch>> FromStructArray(
88-
const std::shared_ptr<Array>& array);
94+
const std::shared_ptr<Array>& array, MemoryPool* pool = default_memory_pool());
8995

9096
/// \brief Determine if two record batches are exactly equal
9197
///

cpp/src/arrow/record_batch_test.cc

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,19 @@ TEST_F(TestRecordBatch, ToFromEmptyStructArray) {
331331
ASSERT_TRUE(batch1->Equals(*batch2));
332332
}
333333

334+
TEST_F(TestRecordBatch, FromSlicedStructArray) {
335+
static constexpr int64_t kLength = 10;
336+
std::shared_ptr<Array> x_arr = ArrayFromJSON(int64(), "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
337+
StructArray struct_array(struct_({field("x", int64())}), kLength, {x_arr});
338+
std::shared_ptr<Array> sliced = struct_array.Slice(5, 3);
339+
ASSERT_OK_AND_ASSIGN(auto batch, RecordBatch::FromStructArray(sliced));
340+
341+
std::shared_ptr<Array> expected_arr = ArrayFromJSON(int64(), "[5, 6, 7]");
342+
std::shared_ptr<RecordBatch> expected =
343+
RecordBatch::Make(schema({field("x", int64())}), 3, {expected_arr});
344+
AssertBatchesEqual(*expected, *batch);
345+
}
346+
334347
TEST_F(TestRecordBatch, FromStructArrayInvalidType) {
335348
random::RandomArrayGenerator gen(42);
336349
ASSERT_RAISES(TypeError, RecordBatch::FromStructArray(gen.ArrayOf(int32(), 6)));
@@ -339,7 +352,11 @@ TEST_F(TestRecordBatch, FromStructArrayInvalidType) {
339352
TEST_F(TestRecordBatch, FromStructArrayInvalidNullCount) {
340353
auto struct_array =
341354
ArrayFromJSON(struct_({field("f1", int32())}), R"([{"f1": 1}, null])");
342-
ASSERT_RAISES(Invalid, RecordBatch::FromStructArray(struct_array));
355+
ASSERT_OK_AND_ASSIGN(auto batch, RecordBatch::FromStructArray(struct_array));
356+
std::shared_ptr<Array> expected_arr = ArrayFromJSON(int32(), "[1, null]");
357+
std::shared_ptr<RecordBatch> expected =
358+
RecordBatch::Make(schema({field("f1", int32())}), 2, {expected_arr});
359+
AssertBatchesEqual(*expected, *batch);
343360
}
344361

345362
TEST_F(TestRecordBatch, MakeEmpty) {

0 commit comments

Comments
 (0)