@@ -1062,6 +1062,10 @@ impl ColumnChunkMetaData {
10621062
10631063 /// Returns the page encoding statistics, or `None` if no page encoding statistics
10641064 /// are available (or they were converted to a mask).
1065+ ///
1066+ /// Note: By default, this crate converts page encoding statistics to a mask for performance
1067+ /// reasons. To get the full statistics, you must call
1068+ /// [`ParquetMetaDataOptions::with_encoding_stats_as_mask`] with `false`.
10651069 pub fn page_encoding_stats ( & self ) -> Option < & Vec < PageEncodingStats > > {
10661070 match self . encoding_stats . as_ref ( ) {
10671071 Some ( ParquetPageEncodingStats :: Full ( stats) ) => Some ( stats) ,
@@ -1072,6 +1076,8 @@ impl ColumnChunkMetaData {
10721076 /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
10731077 /// not available (or they were left in their original form).
10741078 ///
1079+ /// Note: Converting the statistics to a bitmask is the default behavior for this crate.
1080+ ///
10751081 /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
10761082 /// enable fast determination of whether all pages in a column chunk are dictionary encoded
10771083 /// (see <https://github.com/apache/parquet-format/pull/16>).
@@ -1667,7 +1673,9 @@ impl OffsetIndexBuilder {
16671673mod tests {
16681674 use super :: * ;
16691675 use crate :: basic:: { PageType , SortOrder } ;
1670- use crate :: file:: metadata:: thrift:: tests:: { read_column_chunk, read_row_group} ;
1676+ use crate :: file:: metadata:: thrift:: tests:: {
1677+ read_column_chunk, read_column_chunk_with_options, read_row_group,
1678+ } ;
16711679
16721680 #[ test]
16731681 fn test_row_group_metadata_thrift_conversion ( ) {
@@ -1822,7 +1830,72 @@ mod tests {
18221830 let mut buf = Vec :: new ( ) ;
18231831 let mut writer = ThriftCompactOutputProtocol :: new ( & mut buf) ;
18241832 col_metadata. write_thrift ( & mut writer) . unwrap ( ) ;
1825- let col_chunk_res = read_column_chunk ( & mut buf, column_descr) . unwrap ( ) ;
1833+ let col_chunk_res = read_column_chunk ( & mut buf, column_descr. clone ( ) ) . unwrap ( ) ;
1834+
1835+ let expected_metadata = ColumnChunkMetaData :: builder ( column_descr)
1836+ . set_encodings_mask ( EncodingMask :: new_from_encodings (
1837+ [ Encoding :: PLAIN , Encoding :: RLE ] . iter ( ) ,
1838+ ) )
1839+ . set_file_path ( "file_path" . to_owned ( ) )
1840+ . set_num_values ( 1000 )
1841+ . set_compression ( Compression :: SNAPPY )
1842+ . set_total_compressed_size ( 2000 )
1843+ . set_total_uncompressed_size ( 3000 )
1844+ . set_data_page_offset ( 4000 )
1845+ . set_dictionary_page_offset ( Some ( 5000 ) )
1846+ . set_page_encoding_stats_mask ( EncodingMask :: new_from_encodings (
1847+ [ Encoding :: PLAIN , Encoding :: RLE ] . iter ( ) ,
1848+ ) )
1849+ . set_bloom_filter_offset ( Some ( 6000 ) )
1850+ . set_bloom_filter_length ( Some ( 25 ) )
1851+ . set_offset_index_offset ( Some ( 7000 ) )
1852+ . set_offset_index_length ( Some ( 25 ) )
1853+ . set_column_index_offset ( Some ( 8000 ) )
1854+ . set_column_index_length ( Some ( 25 ) )
1855+ . set_unencoded_byte_array_data_bytes ( Some ( 2000 ) )
1856+ . set_repetition_level_histogram ( Some ( LevelHistogram :: from ( vec ! [ 100 , 100 ] ) ) )
1857+ . set_definition_level_histogram ( Some ( LevelHistogram :: from ( vec ! [ 0 , 200 ] ) ) )
1858+ . build ( )
1859+ . unwrap ( ) ;
1860+
1861+ assert_eq ! ( col_chunk_res, expected_metadata) ;
1862+ }
1863+
1864+ #[ test]
      // Round-trip test: builds column chunk metadata that carries full
      // `PageEncodingStats` entries, serializes it with the Thrift compact
      // output protocol, reads it back with mask conversion disabled, and
      // checks that the result equals the original metadata.
1865+ fn test_column_chunk_metadata_thrift_conversion_full_stats ( ) {
1866+ let column_descr = get_test_schema_descr ( ) . column ( 0 ) ;
      // Two data-page entries with different encodings and counts, so the
      // comparison below covers more than a single stats record.
1867+ let stats = vec ! [
1868+ PageEncodingStats {
1869+ page_type: PageType :: DATA_PAGE ,
1870+ encoding: Encoding :: PLAIN ,
1871+ count: 3 ,
1872+ } ,
1873+ PageEncodingStats {
1874+ page_type: PageType :: DATA_PAGE ,
1875+ encoding: Encoding :: RLE ,
1876+ count: 5 ,
1877+ } ,
1878+ ] ;
      // The descriptor is cloned here because it is passed again to the
      // reader further down.
1879+ let col_metadata = ColumnChunkMetaData :: builder ( column_descr. clone ( ) )
1880+ . set_encodings_mask ( EncodingMask :: new_from_encodings (
1881+ [ Encoding :: PLAIN , Encoding :: RLE ] . iter ( ) ,
1882+ ) )
1883+ . set_num_values ( 1000 )
1884+ . set_compression ( Compression :: SNAPPY )
1885+ . set_total_compressed_size ( 2000 )
1886+ . set_total_uncompressed_size ( 3000 )
1887+ . set_data_page_offset ( 4000 )
1888+ . set_page_encoding_stats ( stats)
1889+ . build ( )
1890+ . unwrap ( ) ;
1891+
      // Serialize the metadata into an in-memory buffer.
1892+ let mut buf = Vec :: new ( ) ;
1893+ let mut writer = ThriftCompactOutputProtocol :: new ( & mut buf) ;
1894+ col_metadata. write_thrift ( & mut writer) . unwrap ( ) ;
1895+
      // Pass `false` to `with_encoding_stats_as_mask` so the reader is asked
      // to keep the stats in full form rather than reducing them to a mask.
1896+ let options = ParquetMetaDataOptions :: new ( ) . with_encoding_stats_as_mask ( false ) ;
1897+ let col_chunk_res =
1898+ read_column_chunk_with_options ( & mut buf, column_descr, Some ( & options) ) . unwrap ( ) ;
18261899
      // The decoded metadata must equal what was written.
18271900 assert_eq ! ( col_chunk_res, col_metadata) ;
18281901 }
0 commit comments