Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset field and no longer write Metadata at end of Chunk #43428

Merged
merged 11 commits into from
Aug 7, 2024
2 changes: 1 addition & 1 deletion c_glib/test/parquet/test-column-chunk-metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def setup

test("#file_offset") do
assert do
@metadata.file_offset > 0
@metadata.file_offset == 0
end
end

Expand Down
5 changes: 0 additions & 5 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,6 @@ class SerializedPageWriter : public PageWriter {
total_compressed_size_, total_uncompressed_size_, has_dictionary,
fallback, dict_encoding_stats_, data_encoding_stats_,
meta_encryptor_);
// Write metadata at end of column chunk
metadata_->WriteTo(sink_.get());
}

/**
Expand Down Expand Up @@ -667,9 +665,6 @@ class BufferedPageWriter : public PageWriter {
has_dictionary, fallback, pager_->dict_encoding_stats_,
pager_->data_encoding_stats_, pager_->meta_encryptor_);

// Write metadata at end of column chunk
metadata_->WriteTo(in_memory_sink_.get());

// Buffered page writer needs to adjust page offsets.
pager_->FinishPageIndexes(final_position);

Expand Down
6 changes: 3 additions & 3 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1536,10 +1536,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
const std::shared_ptr<Encryptor>& encryptor) {
if (dictionary_page_offset > 0) {
column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
} else {
column_chunk_->__set_file_offset(data_page_offset + compressed_size);
}
// The `file_offset` field is deprecated and should be set to 0.
// See https://github.com/apache/parquet-format/pull/440 for detail.
column_chunk_->__set_file_offset(0);
column_chunk_->__isset.meta_data = true;
column_chunk_->meta_data.__set_num_values(num_values);
if (index_page_offset >= 0) {
Expand Down
7 changes: 6 additions & 1 deletion cpp/src/parquet/metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,12 @@ class PARQUET_EXPORT ColumnChunkMetaData {

bool Equals(const ColumnChunkMetaData& other) const;

// column chunk
// Byte offset of `ColumnMetaData` in `file_path()`.
//
// Note that the meaning of this field has been inconsistent among implementations
// so its use has since been deprecated in the Parquet specification. Modern
// implementations will set this to `0` to indicate that the `ColumnMetaData` is solely
// contained in the `ColumnChunk` struct.
int64_t file_offset() const;

// parameter is only used when a dataset is spread across multiple files
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def test_parquet_metadata_api():
col_meta = rg_meta.column(ncols + 2)

col_meta = rg_meta.column(0)
assert col_meta.file_offset > 0
assert col_meta.file_offset == 0
assert col_meta.file_path == '' # created from BytesIO
assert col_meta.physical_type == 'BOOLEAN'
assert col_meta.num_values == 10000
Expand Down
Loading