Skip to content

Commit 6e7125b

Browse files
lidavidm and westonpace authored
GH-43454: [C++][Python] Add Opaque canonical extension type (#43458)
### Rationale for this change Add the newly ratified extension type. ### What changes are included in this PR? The C++/Python implementation only. ### Are these changes tested? Yes ### Are there any user-facing changes? No. * GitHub Issue: #43454 Lead-authored-by: David Li <li.davidm96@gmail.com> Co-authored-by: Weston Pace <weston.pace@gmail.com> Signed-off-by: David Li <li.davidm96@gmail.com>
1 parent 4d200dc commit 6e7125b

17 files changed

Lines changed: 627 additions & 3 deletions

File tree

cpp/src/arrow/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,7 @@ endif()
907907
if(ARROW_JSON)
908908
arrow_add_object_library(ARROW_JSON
909909
extension/fixed_shape_tensor.cc
910+
extension/opaque.cc
910911
json/options.cc
911912
json/chunked_builder.cc
912913
json/chunker.cc

cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -865,6 +865,25 @@ std::shared_ptr<CastFunction> GetCastToHalfFloat() {
865865
return func;
866866
}
867867

868+
struct NullExtensionTypeMatcher : public TypeMatcher {
869+
~NullExtensionTypeMatcher() override = default;
870+
871+
bool Matches(const DataType& type) const override {
872+
return type.id() == Type::EXTENSION &&
873+
checked_cast<const ExtensionType&>(type).storage_id() == Type::NA;
874+
}
875+
876+
std::string ToString() const override { return "extension<storage_type: null>"; }
877+
878+
bool Equals(const TypeMatcher& other) const override {
879+
if (this == &other) {
880+
return true;
881+
}
882+
auto casted = dynamic_cast<const NullExtensionTypeMatcher*>(&other);
883+
return casted != nullptr;
884+
}
885+
};
886+
868887
} // namespace
869888

870889
std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
@@ -875,6 +894,10 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
875894
auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
876895
DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
877896
OutputAllNull));
897+
// Explicitly allow casting extension type with null backing array to null
898+
DCHECK_OK(cast_null->AddKernel(
899+
Type::EXTENSION, {InputType(std::make_shared<NullExtensionTypeMatcher>())}, null(),
900+
OutputAllNull));
878901
functions.push_back(cast_null);
879902

880903
functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));

cpp/src/arrow/extension/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,10 @@ add_arrow_test(test
2121
PREFIX
2222
"arrow-fixed-shape-tensor")
2323

24+
add_arrow_test(test
25+
SOURCES
26+
opaque_test.cc
27+
PREFIX
28+
"arrow-extension-opaque")
29+
2430
arrow_install_all_headers("arrow/extension")

cpp/src/arrow/extension/opaque.cc

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/extension/opaque.h"
19+
20+
#include <sstream>
21+
22+
#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
23+
#include "arrow/util/logging.h"
24+
25+
#include <rapidjson/document.h>
26+
#include <rapidjson/error/en.h>
27+
#include <rapidjson/writer.h>
28+
29+
namespace arrow::extension {
30+
31+
std::string OpaqueType::ToString(bool show_metadata) const {
32+
std::stringstream ss;
33+
ss << "extension<" << this->extension_name()
34+
<< "[storage_type=" << storage_type_->ToString(show_metadata)
35+
<< ", type_name=" << type_name_ << ", vendor_name=" << vendor_name_ << "]>";
36+
return ss.str();
37+
}
38+
39+
bool OpaqueType::ExtensionEquals(const ExtensionType& other) const {
40+
if (extension_name() != other.extension_name()) {
41+
return false;
42+
}
43+
const auto& opaque = internal::checked_cast<const OpaqueType&>(other);
44+
return storage_type()->Equals(*opaque.storage_type()) &&
45+
type_name() == opaque.type_name() && vendor_name() == opaque.vendor_name();
46+
}
47+
48+
std::string OpaqueType::Serialize() const {
49+
rapidjson::Document document;
50+
document.SetObject();
51+
rapidjson::Document::AllocatorType& allocator = document.GetAllocator();
52+
53+
rapidjson::Value type_name(rapidjson::StringRef(type_name_));
54+
document.AddMember(rapidjson::Value("type_name", allocator), type_name, allocator);
55+
rapidjson::Value vendor_name(rapidjson::StringRef(vendor_name_));
56+
document.AddMember(rapidjson::Value("vendor_name", allocator), vendor_name, allocator);
57+
58+
rapidjson::StringBuffer buffer;
59+
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
60+
document.Accept(writer);
61+
return buffer.GetString();
62+
}
63+
64+
/// \brief Reconstruct an OpaqueType from its serialized JSON parameters.
///
/// \param[in] storage_type The storage type of the resulting extension type.
/// \param[in] serialized_data JSON object with string members "type_name"
/// and "vendor_name", as produced by Serialize().
/// \return The deserialized type, or Status::Invalid if the data is not valid
/// JSON, is not an object, or lacks either member as a string.
Result<std::shared_ptr<DataType>> OpaqueType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
  rapidjson::Document document;
  const auto& parsed = document.Parse(serialized_data.data(), serialized_data.length());
  if (parsed.HasParseError()) {
    return Status::Invalid("Invalid serialized JSON data for OpaqueType: ",
                           rapidjson::GetParseError_En(parsed.GetParseError()), ": ",
                           serialized_data);
  } else if (!document.IsObject()) {
    return Status::Invalid("Invalid serialized JSON data for OpaqueType: not an object");
  }

  // Look each member up once instead of HasMember() followed by operator[].
  const auto type_name_it = document.FindMember("type_name");
  const auto vendor_name_it = document.FindMember("vendor_name");
  if (type_name_it == document.MemberEnd()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: missing type_name");
  } else if (vendor_name_it == document.MemberEnd()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: missing vendor_name");
  }

  const auto& type_name = type_name_it->value;
  const auto& vendor_name = vendor_name_it->value;
  if (!type_name.IsString()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: type_name is not a string");
  } else if (!vendor_name.IsString()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: vendor_name is not a string");
  }

  // Build the strings with their explicit length: JSON strings may contain
  // "\u0000", and constructing from the bare const char* would silently
  // truncate at the first NUL, breaking the Serialize()/Deserialize() round trip.
  return opaque(std::move(storage_type),
                std::string(type_name.GetString(), type_name.GetStringLength()),
                std::string(vendor_name.GetString(), vendor_name.GetStringLength()));
}
95+
96+
std::shared_ptr<Array> OpaqueType::MakeArray(std::shared_ptr<ArrayData> data) const {
97+
DCHECK_EQ(data->type->id(), Type::EXTENSION);
98+
DCHECK_EQ("arrow.opaque",
99+
internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
100+
return std::make_shared<OpaqueArray>(data);
101+
}
102+
103+
std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
104+
std::string type_name, std::string vendor_name) {
105+
return std::make_shared<OpaqueType>(std::move(storage_type), std::move(type_name),
106+
std::move(vendor_name));
107+
}
108+
109+
} // namespace arrow::extension

cpp/src/arrow/extension/opaque.h

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once

#include <memory>
#include <string>
#include <string_view>
#include <utility>

#include "arrow/extension_type.h"
19+
#include "arrow/type.h"
20+
21+
namespace arrow::extension {
22+
23+
/// \brief Opaque is a placeholder for a type from an external (usually
24+
/// non-Arrow) system that could not be interpreted.
25+
class ARROW_EXPORT OpaqueType : public ExtensionType {
26+
public:
27+
/// \brief Construct an OpaqueType.
28+
///
29+
/// \param[in] storage_type The underlying storage type. Should be
30+
/// arrow::null if there is no data.
31+
/// \param[in] type_name The name of the type in the external system.
32+
/// \param[in] vendor_name The name of the external system.
33+
explicit OpaqueType(std::shared_ptr<DataType> storage_type, std::string type_name,
34+
std::string vendor_name)
35+
: ExtensionType(std::move(storage_type)),
36+
type_name_(std::move(type_name)),
37+
vendor_name_(std::move(vendor_name)) {}
38+
39+
std::string extension_name() const override { return "arrow.opaque"; }
40+
std::string ToString(bool show_metadata) const override;
41+
bool ExtensionEquals(const ExtensionType& other) const override;
42+
std::string Serialize() const override;
43+
Result<std::shared_ptr<DataType>> Deserialize(
44+
std::shared_ptr<DataType> storage_type,
45+
const std::string& serialized_data) const override;
46+
/// Create an OpaqueArray from ArrayData
47+
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
48+
49+
std::string_view type_name() const { return type_name_; }
50+
std::string_view vendor_name() const { return vendor_name_; }
51+
52+
private:
53+
std::string type_name_;
54+
std::string vendor_name_;
55+
};
56+
57+
/// \brief Opaque is a wrapper for (usually binary) data from an external
58+
/// (often non-Arrow) system that could not be interpreted.
59+
class ARROW_EXPORT OpaqueArray : public ExtensionArray {
60+
public:
61+
using ExtensionArray::ExtensionArray;
62+
};
63+
64+
/// \brief Return an OpaqueType instance.
65+
ARROW_EXPORT std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
66+
std::string type_name,
67+
std::string vendor_name);
68+
69+
} // namespace arrow::extension

0 commit comments

Comments
 (0)