Skip to content

Commit 8149c39

Browse files
authored
GH-39560: [C++][Parquet] Add integration test for BYTE_STREAM_SPLIT (#39570)
### Rationale for this change

In apache/parquet-testing#45, an integration file for BYTE_STREAM_SPLIT was added to the parquet-testing repo.

### What changes are included in this PR?

Add a test that reads that file and ensures the decoded values are as expected.

### Are these changes tested?

By definition.

### Are there any user-facing changes?

No.

* Closes: #39560

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 6fe7480 commit 8149c39

2 files changed

Lines changed: 51 additions & 4 deletions

File tree

cpp/src/parquet/reader_test.cc

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,27 @@ std::string concatenated_gzip_members() {
120120
return data_file("concatenated_gzip_members.parquet");
121121
}
122122

123+
// Path of the BYTE_STREAM_SPLIT integration file, resolved through data_file().
std::string byte_stream_split() {
  std::string path = data_file("byte_stream_split.zstd.parquet");
  return path;
}
124+
125+
template <typename DType, typename ValueType = typename DType::c_type>
126+
std::vector<ValueType> ReadColumnValues(ParquetFileReader* file_reader, int row_group,
127+
int column, int64_t expected_values_read) {
128+
auto column_reader = checked_pointer_cast<TypedColumnReader<DType>>(
129+
file_reader->RowGroup(row_group)->Column(column));
130+
std::vector<ValueType> values(expected_values_read);
131+
int64_t values_read;
132+
auto levels_read = column_reader->ReadBatch(expected_values_read, nullptr, nullptr,
133+
values.data(), &values_read);
134+
EXPECT_EQ(expected_values_read, levels_read);
135+
EXPECT_EQ(expected_values_read, values_read);
136+
return values;
137+
}
138+
123139
// TODO: Assert on definition and repetition levels
124-
template <typename DType, typename ValueType>
140+
template <typename DType, typename ValueType = typename DType::c_type>
125141
void AssertColumnValues(std::shared_ptr<TypedColumnReader<DType>> col, int64_t batch_size,
126142
int64_t expected_levels_read,
127-
std::vector<ValueType>& expected_values,
143+
const std::vector<ValueType>& expected_values,
128144
int64_t expected_values_read) {
129145
std::vector<ValueType> values(batch_size);
130146
int64_t values_read;
@@ -1412,7 +1428,6 @@ TEST_P(TestCodec, LargeFileValues) {
14121428

14131429
// column 0 ("a")
14141430
auto col = checked_pointer_cast<ByteArrayReader>(group->Column(0));
1415-
14161431
std::vector<ByteArray> values(kNumRows);
14171432
int64_t values_read;
14181433
auto levels_read =
@@ -1474,6 +1489,38 @@ TEST(TestFileReader, TestOverflowInt16PageOrdinal) {
14741489
}
14751490
}
14761491

1492+
#ifdef ARROW_WITH_ZSTD
// Opens the file returned by byte_stream_split() and checks its shape
// (300 rows, 2 columns, 1 row group), then compares the first two and last two
// decoded values of each column against hard-coded expectations.
// Compiled only when ARROW_WITH_ZSTD is defined.
TEST(TestByteStreamSplit, FloatIntegrationFile) {
  const int64_t kNumRows = 300;

  auto file = ParquetFileReader::OpenFile(byte_stream_split());

  // File-level shape checks.
  ASSERT_EQ(kNumRows, file->metadata()->num_rows());
  ASSERT_EQ(2, file->metadata()->num_columns());
  ASSERT_EQ(1, file->metadata()->num_row_groups());

  // column 0 ("f32")
  auto f32_values =
      ReadColumnValues<FloatType>(file.get(), /*row_group=*/0, /*column=*/0, kNumRows);
  ASSERT_EQ(f32_values[0], 1.7640524f);
  ASSERT_EQ(f32_values[1], 0.4001572f);
  ASSERT_EQ(f32_values[kNumRows - 2], -0.39944902f);
  ASSERT_EQ(f32_values[kNumRows - 1], 0.37005588f);

  // column 1 ("f64")
  auto f64_values =
      ReadColumnValues<DoubleType>(file.get(), /*row_group=*/0, /*column=*/1, kNumRows);
  ASSERT_EQ(f64_values[0], -1.3065268517353166);
  ASSERT_EQ(f64_values[1], 1.658130679618188);
  ASSERT_EQ(f64_values[kNumRows - 2], -0.9301565025243212);
  ASSERT_EQ(f64_values[kNumRows - 1], -0.17858909208732915);
}
#endif  // ARROW_WITH_ZSTD
14771524
struct PageIndexReaderParam {
14781525
std::vector<int32_t> row_group_indices;
14791526
std::vector<int32_t> column_indices;

0 commit comments

Comments
 (0)