Skip to content

Commit 9b58454

Browse files
authored
GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset field and no longer write Metadata at end of Chunk (#43428)
### Rationale for this change See github issue ### What changes are included in this PR? Force `ColumnChunk::file_offset` tobe 0 ### Are these changes tested? No ### Are there any user-facing changes? Might affect user using legacy `ColumnChunk::file_offset` * GitHub Issue: #43427 Authored-by: mwish <maplewish117@gmail.com> Signed-off-by: mwish <maplewish117@gmail.com>
1 parent b852b72 commit 9b58454

5 files changed

Lines changed: 11 additions & 11 deletions

File tree

c_glib/test/parquet/test-column-chunk-metadata.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def setup
7777

7878
test("#file_offset") do
7979
assert do
80-
@metadata.file_offset > 0
80+
@metadata.file_offset == 0
8181
end
8282
end
8383

cpp/src/parquet/column_writer.cc

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -353,8 +353,6 @@ class SerializedPageWriter : public PageWriter {
353353
total_compressed_size_, total_uncompressed_size_, has_dictionary,
354354
fallback, dict_encoding_stats_, data_encoding_stats_,
355355
meta_encryptor_);
356-
// Write metadata at end of column chunk
357-
metadata_->WriteTo(sink_.get());
358356
}
359357

360358
/**
@@ -667,9 +665,6 @@ class BufferedPageWriter : public PageWriter {
667665
has_dictionary, fallback, pager_->dict_encoding_stats_,
668666
pager_->data_encoding_stats_, pager_->meta_encryptor_);
669667

670-
// Write metadata at end of column chunk
671-
metadata_->WriteTo(in_memory_sink_.get());
672-
673668
// Buffered page writer needs to adjust page offsets.
674669
pager_->FinishPageIndexes(final_position);
675670

cpp/src/parquet/metadata.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,10 +1536,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
15361536
const std::shared_ptr<Encryptor>& encryptor) {
15371537
if (dictionary_page_offset > 0) {
15381538
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
1539-
column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
1540-
} else {
1541-
column_chunk_->__set_file_offset(data_page_offset + compressed_size);
15421539
}
1540+
// The `file_offset` field is deprecated and should be set to 0.
1541+
// See https://github.com/apache/parquet-format/pull/440 for detail.
1542+
column_chunk_->__set_file_offset(0);
15431543
column_chunk_->__isset.meta_data = true;
15441544
column_chunk_->meta_data.__set_num_values(num_values);
15451545
if (index_page_offset >= 0) {

cpp/src/parquet/metadata.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,12 @@ class PARQUET_EXPORT ColumnChunkMetaData {
146146

147147
bool Equals(const ColumnChunkMetaData& other) const;
148148

149-
// column chunk
149+
// Byte offset of `ColumnMetaData` in `file_path()`.
150+
//
151+
// Note that the meaning of this field has been inconsistent among implementations
152+
// so its use has since been deprecated in the Parquet specification. Modern
153+
// implementations will set this to `0` to indicate that the `ColumnMetaData` is solely
154+
// contained in the `ColumnChunk` struct.
150155
int64_t file_offset() const;
151156

152157
// parameter is only used when a dataset is spread across multiple files

python/pyarrow/tests/parquet/test_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def test_parquet_metadata_api():
122122
col_meta = rg_meta.column(ncols + 2)
123123

124124
col_meta = rg_meta.column(0)
125-
assert col_meta.file_offset > 0
125+
assert col_meta.file_offset == 0
126126
assert col_meta.file_path == '' # created from BytesIO
127127
assert col_meta.physical_type == 'BOOLEAN'
128128
assert col_meta.num_values == 10000

0 commit comments

Comments
 (0)