Skip to content

Commit 2328b6e

Browse files
authored
GH-15058: [C++][Python] Native support for UUID (#37298)
### Rationale for this change See #15058. The UUID datatype is common throughout the ecosystem, and supporting it as a native type in Arrow would reduce friction. ### What changes are included in this PR? This PR implements the logic for the Arrow canonical UUID extension type in C++ and adds a Python wrapper. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes, a new extension type is added. * Closes: #15058 Authored-by: Rok Mihevc <rok@mihevc.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 51e9f70 commit 2328b6e

29 files changed

Lines changed: 412 additions & 132 deletions

cpp/src/arrow/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ set(ARROW_SRCS
375375
device.cc
376376
extension_type.cc
377377
extension/bool8.cc
378+
extension/uuid.cc
378379
pretty_print.cc
379380
record_batch.cc
380381
result.cc
@@ -1225,6 +1226,7 @@ add_subdirectory(testing)
12251226
add_subdirectory(array)
12261227
add_subdirectory(c)
12271228
add_subdirectory(compute)
1229+
add_subdirectory(extension)
12281230
add_subdirectory(io)
12291231
add_subdirectory(tensor)
12301232
add_subdirectory(util)
@@ -1267,7 +1269,6 @@ endif()
12671269

12681270
if(ARROW_JSON)
12691271
add_subdirectory(json)
1270-
add_subdirectory(extension)
12711272
endif()
12721273

12731274
if(ARROW_ORC)

cpp/src/arrow/acero/hash_join_node_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "arrow/compute/kernels/test_util.h"
3030
#include "arrow/compute/light_array_internal.h"
3131
#include "arrow/compute/row/row_encoder_internal.h"
32+
#include "arrow/extension/uuid.h"
3233
#include "arrow/testing/extension_type.h"
3334
#include "arrow/testing/generator.h"
3435
#include "arrow/testing/gtest_util.h"

cpp/src/arrow/extension/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
set(CANONICAL_EXTENSION_TESTS bool8_test.cc)
18+
set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc)
1919

2020
if(ARROW_JSON)
2121
list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc)

cpp/src/arrow/extension/fixed_shape_tensor_test.cc

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#include "arrow/array/array_primitive.h"
2424
#include "arrow/io/memory.h"
2525
#include "arrow/ipc/reader.h"
26-
#include "arrow/ipc/writer.h"
26+
#include "arrow/ipc/test_common.h"
2727
#include "arrow/record_batch.h"
2828
#include "arrow/tensor.h"
2929
#include "arrow/testing/gtest_util.h"
@@ -33,6 +33,7 @@
3333
namespace arrow {
3434

3535
using FixedShapeTensorType = extension::FixedShapeTensorType;
36+
using arrow::ipc::test::RoundtripBatch;
3637
using extension::fixed_shape_tensor;
3738
using extension::FixedShapeTensorArray;
3839

@@ -71,20 +72,6 @@ class TestExtensionType : public ::testing::Test {
7172
std::string serialized_;
7273
};
7374

74-
auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
75-
std::shared_ptr<RecordBatch>* out) {
76-
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
77-
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
78-
out_stream.get()));
79-
80-
ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
81-
82-
io::BufferReader reader(complete_ipc_stream);
83-
std::shared_ptr<RecordBatchReader> batch_reader;
84-
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
85-
ASSERT_OK(batch_reader->ReadNext(out));
86-
};
87-
8875
TEST_F(TestExtensionType, CheckDummyRegistration) {
8976
// We need a registered dummy type at runtime to allow for IPC deserialization
9077
auto registered_type = GetExtensionType("arrow.fixed_shape_tensor");

cpp/src/arrow/extension/uuid.cc

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <sstream>
19+
20+
#include "arrow/extension_type.h"
21+
#include "arrow/util/logging.h"
22+
23+
#include "arrow/extension/uuid.h"
24+
25+
namespace arrow::extension {
26+
27+
bool UuidType::ExtensionEquals(const ExtensionType& other) const {
28+
return (other.extension_name() == this->extension_name());
29+
}
30+
31+
std::shared_ptr<Array> UuidType::MakeArray(std::shared_ptr<ArrayData> data) const {
32+
DCHECK_EQ(data->type->id(), Type::EXTENSION);
33+
DCHECK_EQ("arrow.uuid",
34+
static_cast<const ExtensionType&>(*data->type).extension_name());
35+
return std::make_shared<UuidArray>(data);
36+
}
37+
38+
// Rebuild a UuidType from a storage type and its serialized metadata.
//
// Returns Status::Invalid when `serialized` is non-empty or when
// `storage_type` is not equal to fixed_size_binary(16); otherwise returns a
// newly created UuidType.
Result<std::shared_ptr<DataType>> UuidType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized) const {
  // The metadata check runs first, so its error wins when both checks fail.
  const bool has_metadata = !serialized.empty();
  if (has_metadata) {
    return Status::Invalid("Unexpected serialized metadata: '", serialized, "'");
  }

  const std::shared_ptr<DataType> expected_storage = fixed_size_binary(16);
  const bool storage_matches = storage_type->Equals(*expected_storage);
  if (!storage_matches) {
    return Status::Invalid("Invalid storage type for UuidType: ",
                           storage_type->ToString());
  }

  return std::make_shared<UuidType>();
}
49+
50+
// Render the type as "extension<NAME>", where NAME is extension_name().
//
// `show_metadata` is accepted to match the overridden signature but is not
// consulted when building the string.
std::string UuidType::ToString(bool show_metadata) const {
  std::string repr = "extension<";
  repr += this->extension_name();
  repr += ">";
  return repr;
}
55+
56+
// Factory for the UUID extension type: every call allocates and returns a new
// UuidType instance.
std::shared_ptr<DataType> uuid() {
  return std::make_shared<UuidType>();
}
57+
58+
} // namespace arrow::extension

cpp/src/arrow/extension/uuid.h

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once
19+
20+
#include "arrow/extension_type.h"
21+
22+
namespace arrow::extension {
23+
24+
/// \brief UuidArray stores array of UUIDs. Underlying storage type is
25+
/// FixedSizeBinary(16).
26+
class ARROW_EXPORT UuidArray : public ExtensionArray {
27+
public:
28+
using ExtensionArray::ExtensionArray;
29+
};
30+
31+
/// \brief UuidType is a canonical arrow extension type for UUIDs.
32+
/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this
33+
/// does not interpret the bytes in any way. Specific UUID version is not
34+
/// required or guaranteed.
35+
class ARROW_EXPORT UuidType : public ExtensionType {
36+
public:
37+
/// \brief Construct a UuidType.
38+
UuidType() : ExtensionType(fixed_size_binary(16)) {}
39+
40+
std::string extension_name() const override { return "arrow.uuid"; }
41+
std::string ToString(bool show_metadata = false) const override;
42+
43+
bool ExtensionEquals(const ExtensionType& other) const override;
44+
45+
/// Create a UuidArray from ArrayData
46+
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
47+
48+
Result<std::shared_ptr<DataType>> Deserialize(
49+
std::shared_ptr<DataType> storage_type,
50+
const std::string& serialized) const override;
51+
52+
std::string Serialize() const override { return ""; }
53+
54+
/// \brief Create a UuidType instance
55+
static Result<std::shared_ptr<DataType>> Make() { return std::make_shared<UuidType>(); }
56+
};
57+
58+
/// \brief Return a UuidType instance.
59+
ARROW_EXPORT std::shared_ptr<DataType> uuid();
60+
61+
} // namespace arrow::extension
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/extension/uuid.h"
19+
20+
#include "arrow/testing/matchers.h"
21+
22+
#include "arrow/io/memory.h"
23+
#include "arrow/ipc/reader.h"
24+
#include "arrow/ipc/test_common.h"
25+
#include "arrow/testing/gtest_util.h"
26+
#include "arrow/util/key_value_metadata.h"
27+
28+
#include "arrow/testing/extension_type.h"
29+
30+
namespace arrow {
31+
32+
using arrow::ipc::test::RoundtripBatch;
33+
34+
// Checks that the canonical UUID extension type survives a
// Serialize()/Deserialize() round trip and stays distinct from its storage type.
TEST(TestUuuidExtensionType, ExtensionTypeTest) {
  // Request the canonical type explicitly. This file is in namespace arrow and
  // has no using-declaration for extension::uuid, so an unqualified uuid()
  // cannot name it.
  // NOTE(review): the unqualified name presumably resolves to the example
  // helper from arrow/testing/extension_type.h -- confirm.
  auto type = extension::uuid();
  ASSERT_EQ(type->id(), Type::EXTENSION);

  const auto& ext_type = static_cast<const ExtensionType&>(*type);
  // Pin the type under test so a wrong factory cannot slip through silently.
  ASSERT_EQ(ext_type.extension_name(), "arrow.uuid");
  std::string serialized = ext_type.Serialize();

  ASSERT_OK_AND_ASSIGN(auto deserialized,
                       ext_type.Deserialize(fixed_size_binary(16), serialized));
  ASSERT_TRUE(deserialized->Equals(*type));
  // The extension type must not compare equal to its bare storage type.
  ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16)));
}
46+
47+
// Checks that a UUID column survives a RoundtripBatch() call, both when the
// batch holds the extension array and when it holds the plain storage array
// annotated with the extension's field metadata.
TEST(TestUuuidExtensionType, RoundtripBatch) {
  auto ext_type = extension::uuid();
  auto exact_ext_type = internal::checked_pointer_cast<extension::UuidType>(ext_type);
  auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])");
  auto ext_arr = ExtensionType::WrapArray(ext_type, arr);

  // Pass the extension array and expect to get an extension array back.
  std::shared_ptr<RecordBatch> read_batch;
  auto ext_field = field(/*name=*/"f0", /*type=*/ext_type);
  auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr});
  RoundtripBatch(batch, &read_batch);
  // Guard the dereference below: this call site cannot observe a failure inside
  // the helper, so a failed roundtrip would otherwise surface as a null deref.
  ASSERT_TRUE(read_batch != nullptr);
  CompareBatch(*batch, *read_batch, /*compare_metadata=*/true);

  // Pass the storage array plus extension metadata and expect to get an
  // extension array back.
  std::shared_ptr<RecordBatch> read_batch2;
  auto ext_metadata =
      key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()},
                          {"ARROW:extension:metadata", ""}});
  ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(),
                    /*nullable=*/true, /*metadata=*/ext_metadata);
  auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr});
  RoundtripBatch(batch2, &read_batch2);
  ASSERT_TRUE(read_batch2 != nullptr);
  // Compared against the first batch on purpose: the result must carry the
  // extension type, not the bare storage type.
  CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true);
}
71+
72+
} // namespace arrow

cpp/src/arrow/extension_type.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "arrow/extension/fixed_shape_tensor.h"
3333
#include "arrow/extension/opaque.h"
3434
#endif
35+
#include "arrow/extension/uuid.h"
3536
#include "arrow/status.h"
3637
#include "arrow/type.h"
3738
#include "arrow/util/checked_cast.h"
@@ -147,14 +148,13 @@ static void CreateGlobalRegistry() {
147148
// Register canonical extension types
148149

149150
g_registry = std::make_shared<ExtensionTypeRegistryImpl>();
150-
std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8()};
151+
std::vector<std::shared_ptr<DataType>> ext_types{extension::bool8(), extension::uuid()};
151152

152153
#ifdef ARROW_JSON
153154
ext_types.push_back(extension::fixed_shape_tensor(int64(), {}));
154155
ext_types.push_back(extension::opaque(null(), "", ""));
155156
#endif
156157

157-
// Register canonical extension types
158158
for (const auto& ext_type : ext_types) {
159159
ARROW_CHECK_OK(
160160
g_registry->RegisterType(checked_pointer_cast<ExtensionType>(ext_type)));

cpp/src/arrow/extension_type_test.cc

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "arrow/io/memory.h"
3131
#include "arrow/ipc/options.h"
3232
#include "arrow/ipc/reader.h"
33+
#include "arrow/ipc/test_common.h"
3334
#include "arrow/ipc/writer.h"
3435
#include "arrow/record_batch.h"
3536
#include "arrow/status.h"
@@ -41,6 +42,8 @@
4142

4243
namespace arrow {
4344

45+
using arrow::ipc::test::RoundtripBatch;
46+
4447
class Parametric1Array : public ExtensionArray {
4548
public:
4649
using ExtensionArray::ExtensionArray;
@@ -178,7 +181,7 @@ class ExtStructType : public ExtensionType {
178181

179182
class TestExtensionType : public ::testing::Test {
180183
public:
181-
void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared<UuidType>())); }
184+
void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared<ExampleUuidType>())); }
182185

183186
void TearDown() {
184187
if (GetExtensionType("uuid")) {
@@ -211,20 +214,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) {
211214
ASSERT_EQ(deserialized->byte_width(), 16);
212215
}
213216

214-
auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
215-
std::shared_ptr<RecordBatch>* out) {
216-
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
217-
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
218-
out_stream.get()));
219-
220-
ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
221-
222-
io::BufferReader reader(complete_ipc_stream);
223-
std::shared_ptr<RecordBatchReader> batch_reader;
224-
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
225-
ASSERT_OK(batch_reader->ReadNext(out));
226-
};
227-
228217
TEST_F(TestExtensionType, IpcRoundtrip) {
229218
auto ext_arr = ExampleUuid();
230219
auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr});

cpp/src/arrow/integration/json_integration_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1046,7 +1046,7 @@ TEST(TestJsonFileReadWrite, JsonExample2) {
10461046

10471047
auto storage_array =
10481048
ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])");
1049-
AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array));
1049+
AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array));
10501050

10511051
AssertArraysEqual(*batch->column(1), NullArray(2));
10521052
}

0 commit comments

Comments
 (0)