Skip to content

Commit 552f0ec

Browse files
feat: change default behavior for Parquet PageEncodingStats to bitmask
1 parent 8ed2b52 commit 552f0ec

5 files changed

Lines changed: 112 additions & 9 deletions

File tree

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4433,7 +4433,10 @@ mod tests {
44334433
.unwrap();
44344434

44354435
// check that the read metadata is also correct
4436-
let options = ReadOptionsBuilder::new().with_page_index().build();
4436+
let options = ReadOptionsBuilder::new()
4437+
.with_page_index()
4438+
.with_encoding_stats_as_mask(false)
4439+
.build();
44374440
let reader = SerializedFileReader::new_with_options(file, options).unwrap();
44384441

44394442
let rowgroup = reader.get_row_group(0).expect("row group missing");

parquet/src/file/metadata/mod.rs

Lines changed: 75 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1062,6 +1062,10 @@ impl ColumnChunkMetaData {
10621062

10631063
/// Returns the page encoding statistics, or `None` if no page encoding statistics
10641064
/// are available (or they were converted to a mask).
1065+
///
1066+
/// Note: By default, this crate converts page encoding statistics to a mask for performance
1067+
/// reasons. To get the full statistics, you must set [`ParquetMetaDataOptions::with_encoding_stats_as_mask`]
1068+
/// to `false`.
10651069
pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
10661070
match self.encoding_stats.as_ref() {
10671071
Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
@@ -1072,6 +1076,8 @@ impl ColumnChunkMetaData {
10721076
/// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
10731077
/// not available (or they were left in their original form).
10741078
///
1079+
/// Note: This is the default behavior for this crate.
1080+
///
10751081
/// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
10761082
/// enable fast determination of whether all pages in a column chunk are dictionary encoded
10771083
/// (see <https://github.com/apache/parquet-format/pull/16>).
@@ -1667,7 +1673,9 @@ impl OffsetIndexBuilder {
16671673
mod tests {
16681674
use super::*;
16691675
use crate::basic::{PageType, SortOrder};
1670-
use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};
1676+
use crate::file::metadata::thrift::tests::{
1677+
read_column_chunk, read_column_chunk_with_options, read_row_group,
1678+
};
16711679

16721680
#[test]
16731681
fn test_row_group_metadata_thrift_conversion() {
@@ -1822,7 +1830,72 @@ mod tests {
18221830
let mut buf = Vec::new();
18231831
let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
18241832
col_metadata.write_thrift(&mut writer).unwrap();
1825-
let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1833+
let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap();
1834+
1835+
let expected_metadata = ColumnChunkMetaData::builder(column_descr)
1836+
.set_encodings_mask(EncodingMask::new_from_encodings(
1837+
[Encoding::PLAIN, Encoding::RLE].iter(),
1838+
))
1839+
.set_file_path("file_path".to_owned())
1840+
.set_num_values(1000)
1841+
.set_compression(Compression::SNAPPY)
1842+
.set_total_compressed_size(2000)
1843+
.set_total_uncompressed_size(3000)
1844+
.set_data_page_offset(4000)
1845+
.set_dictionary_page_offset(Some(5000))
1846+
.set_page_encoding_stats_mask(EncodingMask::new_from_encodings(
1847+
[Encoding::PLAIN, Encoding::RLE].iter(),
1848+
))
1849+
.set_bloom_filter_offset(Some(6000))
1850+
.set_bloom_filter_length(Some(25))
1851+
.set_offset_index_offset(Some(7000))
1852+
.set_offset_index_length(Some(25))
1853+
.set_column_index_offset(Some(8000))
1854+
.set_column_index_length(Some(25))
1855+
.set_unencoded_byte_array_data_bytes(Some(2000))
1856+
.set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1857+
.set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1858+
.build()
1859+
.unwrap();
1860+
1861+
assert_eq!(col_chunk_res, expected_metadata);
1862+
}
1863+
1864+
#[test]
1865+
fn test_column_chunk_metadata_thrift_conversion_full_stats() {
1866+
let column_descr = get_test_schema_descr().column(0);
1867+
let stats = vec![
1868+
PageEncodingStats {
1869+
page_type: PageType::DATA_PAGE,
1870+
encoding: Encoding::PLAIN,
1871+
count: 3,
1872+
},
1873+
PageEncodingStats {
1874+
page_type: PageType::DATA_PAGE,
1875+
encoding: Encoding::RLE,
1876+
count: 5,
1877+
},
1878+
];
1879+
let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1880+
.set_encodings_mask(EncodingMask::new_from_encodings(
1881+
[Encoding::PLAIN, Encoding::RLE].iter(),
1882+
))
1883+
.set_num_values(1000)
1884+
.set_compression(Compression::SNAPPY)
1885+
.set_total_compressed_size(2000)
1886+
.set_total_uncompressed_size(3000)
1887+
.set_data_page_offset(4000)
1888+
.set_page_encoding_stats(stats)
1889+
.build()
1890+
.unwrap();
1891+
1892+
let mut buf = Vec::new();
1893+
let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1894+
col_metadata.write_thrift(&mut writer).unwrap();
1895+
1896+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
1897+
let col_chunk_res =
1898+
read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap();
18261899

18271900
assert_eq!(col_chunk_res, col_metadata);
18281901
}

parquet/src/file/metadata/options.rs

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -87,13 +87,23 @@ impl ParquetStatisticsPolicy {
8787
/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
8888
/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
8989
/// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder
90-
#[derive(Default, Debug, Clone)]
90+
#[derive(Debug, Clone)]
9191
pub struct ParquetMetaDataOptions {
9292
schema_descr: Option<SchemaDescPtr>,
9393
encoding_stats_as_mask: bool,
9494
encoding_stats_policy: ParquetStatisticsPolicy,
9595
}
9696

97+
impl Default for ParquetMetaDataOptions {
98+
fn default() -> Self {
99+
Self {
100+
schema_descr: None,
101+
encoding_stats_as_mask: true,
102+
encoding_stats_policy: ParquetStatisticsPolicy::KeepAll,
103+
}
104+
}
105+
}
106+
97107
impl ParquetMetaDataOptions {
98108
/// Return a new default [`ParquetMetaDataOptions`].
99109
pub fn new() -> Self {
@@ -118,7 +128,7 @@ impl ParquetMetaDataOptions {
118128
}
119129

120130
/// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData`
121-
/// as a bitmask (defaults to `false`).
131+
/// as a bitmask (defaults to `true`).
122132
///
123133
/// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
124134
/// might be desirable.
@@ -193,6 +203,12 @@ mod tests {
193203
};
194204
use std::{io::Read, sync::Arc};
195205

206+
#[test]
207+
fn test_options_default() {
208+
let options = ParquetMetaDataOptions::default();
209+
assert!(options.encoding_stats_as_mask());
210+
}
211+
196212
#[test]
197213
fn test_provide_schema() {
198214
let mut buf: Vec<u8> = Vec::new();

parquet/src/file/metadata/thrift/mod.rs

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -410,7 +410,7 @@ fn read_column_metadata<'a>(
410410
let mut seen_mask = 0u16;
411411

412412
let mut skip_pes = false;
413-
let mut pes_mask = false;
413+
let mut pes_mask = true;
414414

415415
if let Some(opts) = options {
416416
skip_pes = opts.skip_encoding_stats(col_index);
@@ -1704,7 +1704,7 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct);
17041704
pub(crate) mod tests {
17051705
use crate::errors::Result;
17061706
use crate::file::metadata::thrift::{BoundingBox, SchemaElement, write_schema};
1707-
use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData};
1707+
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions, RowGroupMetaData};
17081708
use crate::parquet_thrift::tests::test_roundtrip;
17091709
use crate::parquet_thrift::{
17101710
ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, read_thrift_vec,
@@ -1726,9 +1726,17 @@ pub(crate) mod tests {
17261726
pub(crate) fn read_column_chunk(
17271727
buf: &mut [u8],
17281728
column_descr: Arc<ColumnDescriptor>,
1729+
) -> Result<ColumnChunkMetaData> {
1730+
read_column_chunk_with_options(buf, column_descr, None)
1731+
}
1732+
1733+
pub(crate) fn read_column_chunk_with_options(
1734+
buf: &mut [u8],
1735+
column_descr: Arc<ColumnDescriptor>,
1736+
options: Option<&ParquetMetaDataOptions>,
17291737
) -> Result<ColumnChunkMetaData> {
17301738
let mut reader = ThriftSliceInputProtocol::new(buf);
1731-
crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, None)
1739+
crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, options)
17321740
}
17331741

17341742
pub(crate) fn roundtrip_schema(schema: TypePtr) -> Result<TypePtr> {

parquet/src/file/serialized_reader.rs

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1855,7 +1855,10 @@ mod tests {
18551855
fn test_file_reader_optional_metadata() {
18561856
// file with optional metadata: bloom filters, encoding stats, column index and offset index.
18571857
let file = get_test_file("data_index_bloom_encoding_stats.parquet");
1858-
let file_reader = Arc::new(SerializedFileReader::new(file).unwrap());
1858+
let options = ReadOptionsBuilder::new()
1859+
.with_encoding_stats_as_mask(false)
1860+
.build();
1861+
let file_reader = Arc::new(SerializedFileReader::new_with_options(file, options).unwrap());
18591862

18601863
let row_group_metadata = file_reader.metadata.row_group(0);
18611864
let col0_metadata = row_group_metadata.column(0);

0 commit comments

Comments (0)