Skip to content

Commit 322f9ce

Browse files
[Variant] Add unshred_variant support for Binary and LargeBinary types (#9576)
# Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. -->

- Closes #9526

# Rationale for this change

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

`shred_variant` already supports the Binary and LargeBinary types (#9525, #9554), but `unshred_variant` does not handle them. As a result, shredded Binary/LargeBinary columns cannot be converted back to unshredded `VariantArray`s.

# What changes are included in this PR?

Adds `unshred_variant` support for `DataType::Binary` and `DataType::LargeBinary` in `parquet-variant-compute/src/unshred_variant.rs`:

- New enum variants `PrimitiveBinary` and `PrimitiveLargeBinary`
- Match arms in `append_row` and `try_new_opt`
- `AppendToVariantBuilder` impls for `BinaryArray` and `LargeBinaryArray`

# Are these changes tested?

Yes: two new unit tests, `test_unshred_binary_typed_value` and `test_unshred_largebinary_typed_value`, unshred a `typed_value` column of each type and compare the resulting variants with the input bytes.

# Are there any user-facing changes?

No breaking changes.

---------

Signed-off-by: Kunal Singh Dadhwal <kunalsinghdadhwal@gmail.com>
1 parent bc74c71 commit 322f9ce

1 file changed

Lines changed: 58 additions & 4 deletions

File tree

parquet-variant-compute/src/unshred_variant.rs

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
2020
use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
2121
use arrow::array::{
22-
Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, FixedSizeListArray,
23-
GenericListArray, GenericListViewArray, LargeStringArray, ListLikeArray, PrimitiveArray,
24-
StringArray, StringViewArray, StructArray,
22+
Array, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray,
23+
FixedSizeListArray, GenericListArray, GenericListViewArray, LargeBinaryArray, LargeStringArray,
24+
ListLikeArray, PrimitiveArray, StringArray, StringViewArray, StructArray,
2525
};
2626
use arrow::buffer::NullBuffer;
2727
use arrow::datatypes::{
@@ -107,7 +107,9 @@ enum UnshredVariantRowBuilder<'a> {
107107
PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>),
108108
PrimitiveStringView(UnshredPrimitiveRowBuilder<'a, StringViewArray>),
109109
PrimitiveLargeString(UnshredPrimitiveRowBuilder<'a, LargeStringArray>),
110+
PrimitiveBinary(UnshredPrimitiveRowBuilder<'a, BinaryArray>),
110111
PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>),
112+
PrimitiveLargeBinary(UnshredPrimitiveRowBuilder<'a, LargeBinaryArray>),
111113
PrimitiveUuid(UnshredPrimitiveRowBuilder<'a, FixedSizeBinaryArray>),
112114
List(ListUnshredVariantBuilder<'a, GenericListArray<i32>>),
113115
LargeList(ListUnshredVariantBuilder<'a, GenericListArray<i64>>),
@@ -150,7 +152,9 @@ impl<'a> UnshredVariantRowBuilder<'a> {
150152
Self::PrimitiveString(b) => b.append_row(builder, metadata, index),
151153
Self::PrimitiveStringView(b) => b.append_row(builder, metadata, index),
152154
Self::PrimitiveLargeString(b) => b.append_row(builder, metadata, index),
155+
Self::PrimitiveBinary(b) => b.append_row(builder, metadata, index),
153156
Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata, index),
157+
Self::PrimitiveLargeBinary(b) => b.append_row(builder, metadata, index),
154158
Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index),
155159
Self::List(b) => b.append_row(builder, metadata, index),
156160
Self::LargeList(b) => b.append_row(builder, metadata, index),
@@ -232,7 +236,9 @@ impl<'a> UnshredVariantRowBuilder<'a> {
232236
DataType::Utf8 => primitive_builder!(PrimitiveString, as_string),
233237
DataType::Utf8View => primitive_builder!(PrimitiveStringView, as_string_view),
234238
DataType::LargeUtf8 => primitive_builder!(PrimitiveLargeString, as_string),
239+
DataType::Binary => primitive_builder!(PrimitiveBinary, as_binary),
235240
DataType::BinaryView => primitive_builder!(PrimitiveBinaryView, as_binary_view),
241+
DataType::LargeBinary => primitive_builder!(PrimitiveLargeBinary, as_binary),
236242
DataType::FixedSizeBinary(16) => {
237243
primitive_builder!(PrimitiveUuid, as_fixed_size_binary)
238244
}
@@ -413,7 +419,9 @@ impl_append_to_variant_builder!(BooleanArray);
413419
impl_append_to_variant_builder!(StringArray);
414420
impl_append_to_variant_builder!(StringViewArray);
415421
impl_append_to_variant_builder!(LargeStringArray);
422+
impl_append_to_variant_builder!(BinaryArray);
416423
impl_append_to_variant_builder!(BinaryViewArray);
424+
impl_append_to_variant_builder!(LargeBinaryArray);
417425
impl_append_to_variant_builder!(PrimitiveArray<Int8Type>);
418426
impl_append_to_variant_builder!(PrimitiveArray<Int16Type>);
419427
impl_append_to_variant_builder!(PrimitiveArray<Int32Type>);
@@ -675,7 +683,9 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> {
675683
#[cfg(test)]
676684
mod tests {
677685
use crate::VariantArray;
678-
use arrow::array::{BinaryViewArray, LargeStringArray, StringViewArray};
686+
use arrow::array::{
687+
BinaryArray, BinaryViewArray, LargeBinaryArray, LargeStringArray, StringViewArray,
688+
};
679689
use parquet_variant::Variant;
680690

681691
#[test]
@@ -720,4 +730,48 @@ mod tests {
720730
assert_eq!(result.value(1), Variant::from("middle"));
721731
assert_eq!(result.value(2), Variant::from("world"));
722732
}
733+
734+
// Checks that a variant array whose `typed_value` column is a `BinaryArray`
// is unshredded into three variants equal to `Variant::from` of the same
// byte slices that were put in.
#[test]
735+
fn test_unshred_binary_typed_value() {
736+
// The same three metadata bytes are reused for each of the 3 rows.
// NOTE(review): presumably a minimal variant metadata header with an empty
// dictionary; confirm against the Variant encoding spec.
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
737+
let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]);
738+
739+
// Three non-null binary values of different lengths, stored as `BinaryArray`.
let typed_value: arrow::array::ArrayRef =
740+
std::sync::Arc::new(BinaryArray::from_iter_values(vec![
741+
&b"\x00\x01\x02"[..],
742+
&b"\xff\xaa"[..],
743+
&b"\xde\xad\xbe\xef"[..],
744+
]));
745+
746+
// Only metadata and typed_value are supplied; the second and fourth arguments
// are `None` (presumably the `value` column and the null buffer; verify
// against `VariantArray::from_parts`).
let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None);
747+
748+
let result = crate::unshred_variant(&variant_array).unwrap();
749+
750+
// Row count is preserved and each row matches its input bytes.
assert_eq!(result.len(), 3);
751+
assert_eq!(result.value(0), Variant::from(&b"\x00\x01\x02"[..]));
752+
assert_eq!(result.value(1), Variant::from(&b"\xff\xaa"[..]));
753+
assert_eq!(result.value(2), Variant::from(&b"\xde\xad\xbe\xef"[..]));
754+
}
755+
756+
// Checks that a variant array whose `typed_value` column is a
// `LargeBinaryArray` is unshredded into three variants equal to
// `Variant::from` of the same byte slices that were put in.
#[test]
757+
fn test_unshred_largebinary_typed_value() {
758+
// The same three metadata bytes are reused for each of the 3 rows.
// NOTE(review): presumably a minimal variant metadata header with an empty
// dictionary; confirm against the Variant encoding spec.
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
759+
let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]);
760+
761+
// Three non-null binary values of different lengths, stored as `LargeBinaryArray`.
let typed_value: arrow::array::ArrayRef =
762+
std::sync::Arc::new(LargeBinaryArray::from_iter_values(vec![
763+
&b"\x00\x01\x02"[..],
764+
&b"\xff\xaa"[..],
765+
&b"\xde\xad\xbe\xef"[..],
766+
]));
767+
768+
// Only metadata and typed_value are supplied; the second and fourth arguments
// are `None` (presumably the `value` column and the null buffer; verify
// against `VariantArray::from_parts`).
let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None);
769+
770+
let result = crate::unshred_variant(&variant_array).unwrap();
771+
772+
// Row count is preserved and each row matches its input bytes.
assert_eq!(result.len(), 3);
773+
assert_eq!(result.value(0), Variant::from(&b"\x00\x01\x02"[..]));
774+
assert_eq!(result.value(1), Variant::from(&b"\xff\xaa"[..]));
775+
assert_eq!(result.value(2), Variant::from(&b"\xde\xad\xbe\xef"[..]));
776+
}
723777
}

0 commit comments

Comments
 (0)