diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 631453e7a9d87..8515411b132b6 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -627,6 +627,9 @@ if(PYARROW_BUILD_PARQUET)
   endif()
   if(PYARROW_BUILD_DATASET)
     list(APPEND CYTHON_EXTENSIONS _dataset_parquet)
+    # include this always, even if PYARROW_BUILD_PARQUET_ENCRYPTION is not
+    # enabled (source file gets replaced with dummies below)
+    list(APPEND CYTHON_EXTENSIONS _dataset_parquet_encryption)
   endif()
 endif()
 
@@ -713,14 +716,6 @@ endif()
 # Error on any warnings not already explicitly ignored.
 set(CYTHON_FLAGS "${CYTHON_FLAGS}" "--warning-errors")
 
-if(PYARROW_BUILD_PARQUET_ENCRYPTION)
-  message(STATUS "Parquet Encryption Enabled")
-  list(APPEND CYTHON_FLAGS "-E" "PARQUET_ENCRYPTION_ENABLED=1")
-else()
-  message(STATUS "Parquet Encryption is NOT Enabled")
-  list(APPEND CYTHON_FLAGS "-E" "PARQUET_ENCRYPTION_ENABLED=0")
-endif()
-
 foreach(module ${CYTHON_EXTENSIONS})
   string(REPLACE "." ";" directories ${module})
   list(GET directories -1 module_name)
@@ -730,6 +725,21 @@ foreach(module ${CYTHON_EXTENSIONS})
   set(module_SRC pyarrow/${module_root}.pyx)
   set_source_files_properties(${module_SRC} PROPERTIES CYTHON_IS_CXX TRUE)
 
+  if("${module_name}" STREQUAL "_dataset_parquet_encryption")
+    # This extension is always built; without encryption support its source
+    # is swapped for a shim that declares the same cdef functions.
+    if(PYARROW_BUILD_PARQUET_ENCRYPTION)
+      message(STATUS "Parquet Encryption Enabled")
+      set(module_SRC pyarrow/_dataset_parquet_encryption.pyx)
+    else()
+      message(STATUS "Parquet Encryption is NOT Enabled")
+      set(module_SRC pyarrow/_dataset_parquet_no_encryption.pyx)
+    endif()
+    # CYTHON_IS_CXX was set above on the default source path only; set it
+    # again so the replacement source carries the property as well.
+    set_source_files_properties(${module_SRC} PROPERTIES CYTHON_IS_CXX TRUE)
+  endif()
+
   cython_add_module(${module_name} ${module_name}_pyx ${module_name}_output ${module_SRC})
 
   if(directories)
diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index 6da4b7efe6bfc..fa1a388204d7a 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -33,13 +33,6 @@ from pyarrow.includes.libarrow_dataset cimport *
 from pyarrow.includes.libarrow_dataset_parquet
cimport * from pyarrow._fs cimport FileSystem -IF PARQUET_ENCRYPTION_ENABLED: - from pyarrow.includes.libarrow_parquet_readwrite_encryption cimport * - from pyarrow._parquet_encryption cimport * -ELSE: - from pyarrow.includes.libarrow_parquet_readwrite cimport * - - from pyarrow._compute cimport Expression, _bind from pyarrow._dataset cimport ( _make_file_source, @@ -53,7 +46,7 @@ from pyarrow._dataset cimport ( PartitioningFactory, WrittenFile ) - +from pyarrow._dataset_parquet_encryption cimport * from pyarrow._parquet cimport ( _create_writer_properties, _create_arrow_writer_properties, @@ -65,134 +58,6 @@ cdef Expression _true = Expression._scalar(True) ctypedef CParquetFileWriter* _CParquetFileWriterPtr -IF PARQUET_ENCRYPTION_ENABLED: - cdef class ParquetEncryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level encryption - within the Parquet framework. - - The ParquetEncryptionConfig class serves as a bridge for passing encryption-related - parameters to the appropriate components within the Parquet library. It maintains references - to objects that define the encryption strategy, Key Management Service (KMS) configuration, - and specific encryption configurations for Parquet data. - - Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for - creating cryptographic components, such as encryptors and decryptors. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration - parameters necessary for connecting to a Key Management Service (KMS). - encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration - Shared pointer to an `EncryptionConfiguration` object. This object defines specific - encryption settings for Parquet data, including the keys assigned to different columns. 
- - Raises - ------ - ValueError - Raised if `encryption_config` is None. - """ - cdef: - shared_ptr[CParquetEncryptionConfig] c_config - - # Avoid mistakenly creating attributes - __slots__ = () - - def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config, - EncryptionConfiguration encryption_config): - - cdef shared_ptr[CEncryptionConfiguration] c_encryption_config - - if crypto_factory is None: - raise ValueError("crypto_factory cannot be None") - - if kms_connection_config is None: - raise ValueError("kms_connection_config cannot be None") - - if encryption_config is None: - raise ValueError("encryption_config cannot be None") - - self.c_config.reset(new CParquetEncryptionConfig()) - - c_encryption_config = pyarrow_unwrap_encryptionconfig( - encryption_config) - - self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory) - self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig( - kms_connection_config) - self.c_config.get().encryption_config = c_encryption_config - - @staticmethod - cdef wrap(shared_ptr[CParquetEncryptionConfig] c_config): - cdef ParquetEncryptionConfig python_config = ParquetEncryptionConfig.__new__(ParquetEncryptionConfig) - python_config.c_config = c_config - return python_config - - cdef shared_ptr[CParquetEncryptionConfig] unwrap(self): - return self.c_config - - cdef class ParquetDecryptionConfig(_Weakrefable): - """ - Core configuration class encapsulating parameters for high-level decryption - within the Parquet framework. - - ParquetDecryptionConfig is designed to pass decryption-related parameters to - the appropriate decryption components within the Parquet library. It holds references to - objects that define the decryption strategy, Key Management Service (KMS) configuration, - and specific decryption configurations for reading encrypted Parquet data. 
- - Parameters - ---------- - crypto_factory : pyarrow.parquet.encryption.CryptoFactory - Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic - components for the decryption process. - kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig - Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary - for connecting to a Key Management Service (KMS) during decryption. - decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration - Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings - for reading encrypted Parquet data. - - Raises - ------ - ValueError - Raised if `decryption_config` is None. - """ - - cdef: - shared_ptr[CParquetDecryptionConfig] c_config - - # Avoid mistakingly creating attributes - __slots__ = () - - def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config, - DecryptionConfiguration decryption_config): - - cdef shared_ptr[CDecryptionConfiguration] c_decryption_config - - if decryption_config is None: - raise ValueError( - "decryption_config cannot be None") - - self.c_config.reset(new CParquetDecryptionConfig()) - - c_decryption_config = pyarrow_unwrap_decryptionconfig( - decryption_config) - - self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory) - self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig( - kms_connection_config) - self.c_config.get().decryption_config = c_decryption_config - - @staticmethod - cdef wrap(shared_ptr[CParquetDecryptionConfig] c_config): - cdef ParquetDecryptionConfig python_config = ParquetDecryptionConfig.__new__(ParquetDecryptionConfig) - python_config.c_config = c_config - return python_config - - cdef shared_ptr[CParquetDecryptionConfig] unwrap(self): - return self.c_config cdef class ParquetFileFormat(FileFormat): """ @@ -707,6 +572,8 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): for name, value in 
kwargs.items():
             if name not in self._properties:
                 raise TypeError("unexpected parquet write option: " + name)
+            if name == "encryption_properties" and not is_encryption_enabled():
+                raise NotImplementedError("Encryption is not enabled, but encryption_properties was provided.")
             self._properties[name] = value
             if name in arrow_fields:
                 setters.add(self._set_arrow_properties)
@@ -825,8 +692,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
     """
 
     cdef CParquetFragmentScanOptions* parquet_options
-    IF PARQUET_ENCRYPTION_ENABLED:
-        cdef ParquetDecryptionConfig _parquet_decryption_config
+    cdef object _parquet_decryption_config
 
     # Avoid mistakingly creating attributes
     __slots__ = ()
@@ -846,14 +712,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
             self.thrift_string_size_limit = thrift_string_size_limit
         if thrift_container_size_limit is not None:
             self.thrift_container_size_limit = thrift_container_size_limit
-
-        IF PARQUET_ENCRYPTION_ENABLED:
-            if decryption_config:
-                self.parquet_decryption_config = decryption_config
-        ELSE:
-            if decryption_config is not None:
-                raise NotImplementedError(
-                    "Encryption is not enabled, but a decryption_config was provided.")
+        if decryption_config is not None:
+            self.parquet_decryption_config = decryption_config
 
     cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp):
         FragmentScanOptions.init(self, sp)
@@ -865,29 +725,20 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
     cdef ArrowReaderProperties* arrow_reader_properties(self):
         return self.parquet_options.arrow_reader_properties.get()
 
-    IF PARQUET_ENCRYPTION_ENABLED:
-        @property
-        def parquet_decryption_config(self):
-            return self._parquet_decryption_config
-
-        @parquet_decryption_config.setter
-        def parquet_decryption_config(self, ParquetDecryptionConfig config):
-            cdef shared_ptr[CParquetDecryptionConfig] c_config
-            if not isinstance(config, ParquetDecryptionConfig):
-                raise ValueError("config must be a ParquetDecryptionConfig")
-            self._parquet_decryption_config = config
-            c_config = config.unwrap()
-            self.parquet_options.parquet_decryption_config = c_config
-    ELSE:
-        @property
-        def parquet_decryption_config(self):
+    @property
+    def parquet_decryption_config(self):
+        if not is_encryption_enabled():
             raise NotImplementedError(
                 "Unable to access encryption features; the code was compiled without the necessary encryption support.")
+        return self._parquet_decryption_config
 
-        @parquet_decryption_config.setter
-        def parquet_decryption_config(self, ParquetDecryptionConfig config):
+    @parquet_decryption_config.setter
+    def parquet_decryption_config(self, config):
+        if not is_encryption_enabled():
             raise NotImplementedError(
                 "Unable to access encryption features; the code was compiled without the necessary encryption support.")
+        set_decryption_config(self.parquet_options, config)
+        self._parquet_decryption_config = config
 
     @property
     def use_buffered_stream(self):
diff --git a/python/pyarrow/includes/libarrow_parquet_readwrite.pxd b/python/pyarrow/_dataset_parquet_encryption.pxd
similarity index 54%
rename from python/pyarrow/includes/libarrow_parquet_readwrite.pxd
rename to python/pyarrow/_dataset_parquet_encryption.pxd
index 1a28a417b7db9..547917e7f4e2b 100644
--- a/python/pyarrow/includes/libarrow_parquet_readwrite.pxd
+++ b/python/pyarrow/_dataset_parquet_encryption.pxd
@@ -15,18 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
-# distutils: language = c++
+# cython: language_level = 3
 
-from pyarrow.includes.libarrow_dataset cimport *
-from pyarrow._parquet cimport *
+"""Dataset support for Parquet encryption."""
 
-cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
-    cdef cppclass CParquetFileWriteOptions \
-            "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions):
-        shared_ptr[WriterProperties] writer_properties
-        shared_ptr[ArrowWriterProperties] arrow_writer_properties
+from pyarrow.includes.libarrow_dataset_parquet cimport *
 
-    cdef cppclass CParquetFragmentScanOptions \
-            "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions):
-        shared_ptr[CReaderProperties] reader_properties
-        shared_ptr[ArrowReaderProperties] arrow_reader_properties
+
+cdef bint is_encryption_enabled()
+cdef set_decryption_config(CParquetFragmentScanOptions * parquet_options, config)
diff --git a/python/pyarrow/_dataset_parquet_encryption.pyx b/python/pyarrow/_dataset_parquet_encryption.pyx
new file mode 100644
index 0000000000000..ef98029ccc780
--- /dev/null
+++ b/python/pyarrow/_dataset_parquet_encryption.pyx
@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: language_level = 3
+
+"""Dataset support for Parquet encryption."""
+
+from pyarrow.includes.libarrow_dataset_parquet cimport *
+from pyarrow._parquet_encryption cimport *
+
+
+cdef bint is_encryption_enabled():
+    """Report that Parquet encryption support is compiled in."""
+    return True
+
+
+cdef class ParquetEncryptionConfig(_Weakrefable):
+    """
+    Core configuration class encapsulating parameters for high-level encryption
+    within the Parquet framework.
+
+    The ParquetEncryptionConfig class serves as a bridge for passing encryption-related
+    parameters to the appropriate components within the Parquet library. It maintains references
+    to objects that define the encryption strategy, Key Management Service (KMS) configuration,
+    and specific encryption configurations for Parquet data.
+
+    Parameters
+    ----------
+    crypto_factory : pyarrow.parquet.encryption.CryptoFactory
+        Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for
+        creating cryptographic components, such as encryptors and decryptors.
+    kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig
+        Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration
+        parameters necessary for connecting to a Key Management Service (KMS).
+    encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration
+        Shared pointer to an `EncryptionConfiguration` object. This object defines specific
+        encryption settings for Parquet data, including the keys assigned to different columns.
+
+    Raises
+    ------
+    ValueError
+        Raised if `crypto_factory`, `kms_connection_config` or `encryption_config` is None.
+    """
+    cdef:
+        shared_ptr[CParquetEncryptionConfig] c_config
+
+    # Avoid mistakenly creating attributes
+    __slots__ = ()
+
+    def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config,
+                  EncryptionConfiguration encryption_config):
+
+        cdef shared_ptr[CEncryptionConfiguration] c_encryption_config
+
+        if crypto_factory is None:
+            raise ValueError("crypto_factory cannot be None")
+
+        if kms_connection_config is None:
+            raise ValueError("kms_connection_config cannot be None")
+
+        if encryption_config is None:
+            raise ValueError("encryption_config cannot be None")
+
+        self.c_config.reset(new CParquetEncryptionConfig())
+
+        c_encryption_config = pyarrow_unwrap_encryptionconfig(
+            encryption_config)
+
+        self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory)
+        self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig(
+            kms_connection_config)
+        self.c_config.get().encryption_config = c_encryption_config
+
+    @staticmethod
+    cdef wrap(shared_ptr[CParquetEncryptionConfig] c_config):
+        cdef ParquetEncryptionConfig python_config = ParquetEncryptionConfig.__new__(ParquetEncryptionConfig)
+        python_config.c_config = c_config
+        return python_config
+
+    cdef shared_ptr[CParquetEncryptionConfig] unwrap(self):
+        return self.c_config
+
+
+cdef class ParquetDecryptionConfig(_Weakrefable):
+    """
+    Core configuration class encapsulating parameters for high-level decryption
+    within the Parquet framework.
+
+    ParquetDecryptionConfig is designed to pass decryption-related parameters to
+    the appropriate decryption components within the Parquet library. It holds references to
+    objects that define the decryption strategy, Key Management Service (KMS) configuration,
+    and specific decryption configurations for reading encrypted Parquet data.
+
+    Parameters
+    ----------
+    crypto_factory : pyarrow.parquet.encryption.CryptoFactory
+        Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic
+        components for the decryption process.
+    kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig
+        Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary
+        for connecting to a Key Management Service (KMS) during decryption.
+    decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration
+        Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings
+        for reading encrypted Parquet data.
+
+    Raises
+    ------
+    ValueError
+        Raised if `decryption_config` is None.
+    """
+
+    cdef:
+        shared_ptr[CParquetDecryptionConfig] c_config
+
+    # Avoid mistakenly creating attributes
+    __slots__ = ()
+
+    def __cinit__(self, CryptoFactory crypto_factory, KmsConnectionConfig kms_connection_config,
+                  DecryptionConfiguration decryption_config):
+
+        cdef shared_ptr[CDecryptionConfiguration] c_decryption_config
+
+        if decryption_config is None:
+            raise ValueError(
+                "decryption_config cannot be None")
+
+        self.c_config.reset(new CParquetDecryptionConfig())
+
+        c_decryption_config = pyarrow_unwrap_decryptionconfig(
+            decryption_config)
+
+        self.c_config.get().crypto_factory = pyarrow_unwrap_cryptofactory(crypto_factory)
+        self.c_config.get().kms_connection_config = pyarrow_unwrap_kmsconnectionconfig(
+            kms_connection_config)
+        self.c_config.get().decryption_config = c_decryption_config
+
+    @staticmethod
+    cdef wrap(shared_ptr[CParquetDecryptionConfig] c_config):
+        cdef ParquetDecryptionConfig python_config = ParquetDecryptionConfig.__new__(ParquetDecryptionConfig)
+        python_config.c_config = c_config
+        return python_config
+
+    cdef shared_ptr[CParquetDecryptionConfig] unwrap(self):
+        return self.c_config
+
+
+cdef set_decryption_config(CParquetFragmentScanOptions * parquet_options, config):
+    """
+    Store the C++ decryption config wrapped by `config` on `parquet_options`.
+
+    Raises ValueError if `config` is not a ParquetDecryptionConfig.
+    """
+    cdef shared_ptr[CParquetDecryptionConfig] c_config
+    if not isinstance(config, ParquetDecryptionConfig):
+        raise ValueError("config must be a ParquetDecryptionConfig")
+    # `config` is declared as a generic object; cast it so the cdef method
+    # `unwrap` is reachable.
+    c_config = (<ParquetDecryptionConfig> config).unwrap()
+    parquet_options.parquet_decryption_config = c_config
diff --git a/python/pyarrow/_dataset_parquet_no_encryption.pxd b/python/pyarrow/_dataset_parquet_no_encryption.pxd
new file mode 100644
index 0000000000000..a960d9d099186
--- /dev/null
+++ b/python/pyarrow/_dataset_parquet_no_encryption.pxd
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: language_level = 3
+
+"""Dataset support for Parquet encryption - shims in case encryption is disabled."""
+
+from pyarrow.includes.libarrow_dataset_parquet cimport *
+
+
+cdef bint is_encryption_enabled()
+cdef set_decryption_config(CParquetFragmentScanOptions * parquet_options, config)
diff --git a/python/pyarrow/_dataset_parquet_no_encryption.pyx b/python/pyarrow/_dataset_parquet_no_encryption.pyx
new file mode 100644
index 0000000000000..550fb9948e68a
--- /dev/null
+++ b/python/pyarrow/_dataset_parquet_no_encryption.pyx
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +"""Dataset support for Parquet encryption - shims in case encryption is disabled.""" + +from pyarrow.includes.libarrow_dataset_parquet cimport * + + +cdef bint is_encryption_enabled(): + return False + + +cdef set_decryption_config(CParquetFragmentScanOptions * parquet_options, config): + pass diff --git a/python/pyarrow/_parquet_encryption.pxd b/python/pyarrow/_parquet_encryption.pxd index 741d414864f34..e90bcdaaeba94 100644 --- a/python/pyarrow/_parquet_encryption.pxd +++ b/python/pyarrow/_parquet_encryption.pxd @@ -19,6 +19,7 @@ # cython: language_level = 3 from pyarrow.includes.common cimport * +from pyarrow.includes.libparquet_encryption cimport * from pyarrow._parquet cimport (ParquetCipher, CFileEncryptionProperties, CFileDecryptionProperties, @@ -47,124 +48,3 @@ cdef class KmsConnectionConfig(_Weakrefable): @staticmethod cdef wrap(const CKmsConnectionConfig& config) - - -cdef extern from "parquet/encryption/kms_client.h" \ - namespace "parquet::encryption" nogil: - cdef cppclass CKmsClient" parquet::encryption::KmsClient": - c_string WrapKey(const c_string& key_bytes, - const c_string& master_key_identifier) except + - c_string UnwrapKey(const c_string& wrapped_key, - const c_string& master_key_identifier) except + - - cdef cppclass 
CKeyAccessToken" parquet::encryption::KeyAccessToken": - CKeyAccessToken(const c_string value) - void Refresh(const c_string& new_value) - const c_string& value() const - - cdef cppclass CKmsConnectionConfig \ - " parquet::encryption::KmsConnectionConfig": - CKmsConnectionConfig() - c_string kms_instance_id - c_string kms_instance_url - shared_ptr[CKeyAccessToken] refreshable_key_access_token - unordered_map[c_string, c_string] custom_kms_conf - -# Callbacks for implementing Python kms clients -# Use typedef to emulate syntax for std::function -ctypedef void CallbackWrapKey( - object, const c_string&, const c_string&, c_string*) -ctypedef void CallbackUnwrapKey( - object, const c_string&, const c_string&, c_string*) - -cdef extern from "parquet/encryption/kms_client_factory.h" \ - namespace "parquet::encryption" nogil: - cdef cppclass CKmsClientFactory" parquet::encryption::KmsClientFactory": - shared_ptr[CKmsClient] CreateKmsClient( - const CKmsConnectionConfig& kms_connection_config) except + - -# Callbacks for implementing Python kms client factories -# Use typedef to emulate syntax for std::function -ctypedef void CallbackCreateKmsClient( - object, - const CKmsConnectionConfig&, shared_ptr[CKmsClient]*) - -cdef extern from "parquet/encryption/crypto_factory.h" \ - namespace "parquet::encryption" nogil: - cdef cppclass CEncryptionConfiguration\ - " parquet::encryption::EncryptionConfiguration": - CEncryptionConfiguration(const c_string& footer_key) except + - c_string footer_key - c_string column_keys - ParquetCipher encryption_algorithm - c_bool plaintext_footer - c_bool double_wrapping - double cache_lifetime_seconds - c_bool internal_key_material - int32_t data_key_length_bits - - cdef cppclass CDecryptionConfiguration\ - " parquet::encryption::DecryptionConfiguration": - CDecryptionConfiguration() except + - double cache_lifetime_seconds - - cdef cppclass CCryptoFactory" parquet::encryption::CryptoFactory": - void RegisterKmsClientFactory( - 
shared_ptr[CKmsClientFactory] kms_client_factory) except + - shared_ptr[CFileEncryptionProperties] GetFileEncryptionProperties( - const CKmsConnectionConfig& kms_connection_config, - const CEncryptionConfiguration& encryption_config) except +* - shared_ptr[CFileDecryptionProperties] GetFileDecryptionProperties( - const CKmsConnectionConfig& kms_connection_config, - const CDecryptionConfiguration& decryption_config) except +* - void RemoveCacheEntriesForToken(const c_string& access_token) except + - void RemoveCacheEntriesForAllTokens() except + - -cdef extern from "arrow/python/parquet_encryption.h" \ - namespace "arrow::py::parquet::encryption" nogil: - cdef cppclass CPyKmsClientVtable \ - " arrow::py::parquet::encryption::PyKmsClientVtable": - CPyKmsClientVtable() - function[CallbackWrapKey] wrap_key - function[CallbackUnwrapKey] unwrap_key - - cdef cppclass CPyKmsClient\ - " arrow::py::parquet::encryption::PyKmsClient"(CKmsClient): - CPyKmsClient(object handler, CPyKmsClientVtable vtable) - - cdef cppclass CPyKmsClientFactoryVtable\ - " arrow::py::parquet::encryption::PyKmsClientFactoryVtable": - CPyKmsClientFactoryVtable() - function[CallbackCreateKmsClient] create_kms_client - - cdef cppclass CPyKmsClientFactory\ - " arrow::py::parquet::encryption::PyKmsClientFactory"( - CKmsClientFactory): - CPyKmsClientFactory(object handler, CPyKmsClientFactoryVtable vtable) - - cdef cppclass CPyCryptoFactory\ - " arrow::py::parquet::encryption::PyCryptoFactory"(CCryptoFactory): - CResult[shared_ptr[CFileEncryptionProperties]] \ - SafeGetFileEncryptionProperties( - const CKmsConnectionConfig& kms_connection_config, - const CEncryptionConfiguration& encryption_config) - CResult[shared_ptr[CFileDecryptionProperties]] \ - SafeGetFileDecryptionProperties( - const CKmsConnectionConfig& kms_connection_config, - const CDecryptionConfiguration& decryption_config) - -cdef extern from "arrow/dataset/parquet_encryption_config.h" namespace "arrow::dataset" nogil: - cdef cppclass 
CParquetEncryptionConfig "arrow::dataset::ParquetEncryptionConfig": - shared_ptr[CCryptoFactory] crypto_factory - shared_ptr[CKmsConnectionConfig] kms_connection_config - shared_ptr[CEncryptionConfiguration] encryption_config - - cdef cppclass CParquetDecryptionConfig "arrow::dataset::ParquetDecryptionConfig": - shared_ptr[CCryptoFactory] crypto_factory - shared_ptr[CKmsConnectionConfig] kms_connection_config - shared_ptr[CDecryptionConfiguration] decryption_config - -cdef shared_ptr[CCryptoFactory] pyarrow_unwrap_cryptofactory(object crypto_factory) except * -cdef shared_ptr[CKmsConnectionConfig] pyarrow_unwrap_kmsconnectionconfig(object kmsconnectionconfig) except * -cdef shared_ptr[CEncryptionConfiguration] pyarrow_unwrap_encryptionconfig(object encryptionconfig) except * -cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object decryptionconfig) except * diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 45f2388da1d2e..adf21814a2c99 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -81,14 +81,6 @@ "format." 
) -try: - from pyarrow._dataset_parquet import ( # noqa - ParquetDecryptionConfig, - ParquetEncryptionConfig, - ) -except ImportError: - pass - try: from pyarrow._dataset_parquet import ( # noqa ParquetDatasetFactory, @@ -105,6 +97,15 @@ pass +try: + from pyarrow._dataset_parquet_encryption import ( # noqa + ParquetDecryptionConfig, + ParquetEncryptionConfig, + ) +except ImportError: + pass + + def __getattr__(name): if name == "OrcFileFormat" and not _orc_available: raise ImportError(_orc_msg) diff --git a/python/pyarrow/includes/libarrow_dataset_parquet.pxd b/python/pyarrow/includes/libarrow_dataset_parquet.pxd index 4461813e8e741..877d5b16b24c2 100644 --- a/python/pyarrow/includes/libarrow_dataset_parquet.pxd +++ b/python/pyarrow/includes/libarrow_dataset_parquet.pxd @@ -18,14 +18,39 @@ # distutils: language = c++ from pyarrow.includes.libarrow_dataset cimport * +from pyarrow.includes.libparquet_encryption cimport * + from pyarrow._parquet cimport * +cdef extern from "arrow/dataset/parquet_encryption_config.h" namespace "arrow::dataset" nogil: + cdef cppclass CParquetEncryptionConfig "arrow::dataset::ParquetEncryptionConfig": + shared_ptr[CCryptoFactory] crypto_factory + shared_ptr[CKmsConnectionConfig] kms_connection_config + shared_ptr[CEncryptionConfiguration] encryption_config + + cdef cppclass CParquetDecryptionConfig "arrow::dataset::ParquetDecryptionConfig": + shared_ptr[CCryptoFactory] crypto_factory + shared_ptr[CKmsConnectionConfig] kms_connection_config + shared_ptr[CDecryptionConfiguration] decryption_config + +cdef shared_ptr[CCryptoFactory] pyarrow_unwrap_cryptofactory(object crypto_factory) except * +cdef shared_ptr[CKmsConnectionConfig] pyarrow_unwrap_kmsconnectionconfig(object kmsconnectionconfig) except * +cdef shared_ptr[CEncryptionConfiguration] pyarrow_unwrap_encryptionconfig(object encryptionconfig) except * +cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object decryptionconfig) except * + + cdef extern from 
"arrow/dataset/api.h" namespace "arrow::dataset" nogil: cdef cppclass CParquetFileWriter \ "arrow::dataset::ParquetFileWriter"(CFileWriter): const shared_ptr[FileWriter]& parquet_writer() const + cdef cppclass CParquetFileWriteOptions \ + "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions): + shared_ptr[WriterProperties] writer_properties + shared_ptr[ArrowWriterProperties] arrow_writer_properties + shared_ptr[CParquetEncryptionConfig] parquet_encryption_config + cdef cppclass CParquetFileFragment "arrow::dataset::ParquetFileFragment"( CFileFragment): const vector[int]& row_groups() const @@ -52,6 +77,12 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: shared_ptr[CSchema] physical_schema, vector[int] row_groups) + cdef cppclass CParquetFragmentScanOptions \ + "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions): + shared_ptr[CReaderProperties] reader_properties + shared_ptr[ArrowReaderProperties] arrow_reader_properties + shared_ptr[CParquetDecryptionConfig] parquet_decryption_config + cdef cppclass CParquetFactoryOptions \ "arrow::dataset::ParquetFactoryOptions": CPartitioningOrFactory partitioning diff --git a/python/pyarrow/includes/libarrow_parquet_readwrite_encryption.pxd b/python/pyarrow/includes/libarrow_parquet_readwrite_encryption.pxd deleted file mode 100644 index 7c9785ee5e9bc..0000000000000 --- a/python/pyarrow/includes/libarrow_parquet_readwrite_encryption.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# distutils: language = c++ - -from pyarrow.includes.libarrow_dataset cimport * -from pyarrow._parquet cimport * -from pyarrow._parquet_encryption cimport * - -cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: - cdef cppclass CParquetFileWriteOptions \ - "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions): - shared_ptr[WriterProperties] writer_properties - shared_ptr[ArrowWriterProperties] arrow_writer_properties - shared_ptr[CParquetEncryptionConfig] parquet_encryption_config - - cdef cppclass CParquetFragmentScanOptions \ - "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions): - shared_ptr[CReaderProperties] reader_properties - shared_ptr[ArrowReaderProperties] arrow_reader_properties - shared_ptr[CParquetDecryptionConfig] parquet_decryption_config diff --git a/python/pyarrow/includes/libparquet_encryption.pxd b/python/pyarrow/includes/libparquet_encryption.pxd new file mode 100644 index 0000000000000..2b40414ce5383 --- /dev/null +++ b/python/pyarrow/includes/libparquet_encryption.pxd @@ -0,0 +1,130 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from pyarrow.includes.common cimport * +from pyarrow._parquet cimport (ParquetCipher, + CFileEncryptionProperties, + CFileDecryptionProperties, + ParquetCipher_AES_GCM_V1, + ParquetCipher_AES_GCM_CTR_V1) + + +cdef extern from "parquet/encryption/kms_client.h" \ + namespace "parquet::encryption" nogil: + cdef cppclass CKmsClient" parquet::encryption::KmsClient": + c_string WrapKey(const c_string& key_bytes, + const c_string& master_key_identifier) except + + c_string UnwrapKey(const c_string& wrapped_key, + const c_string& master_key_identifier) except + + + cdef cppclass CKeyAccessToken" parquet::encryption::KeyAccessToken": + CKeyAccessToken(const c_string value) + void Refresh(const c_string& new_value) + const c_string& value() const + + cdef cppclass CKmsConnectionConfig \ + " parquet::encryption::KmsConnectionConfig": + CKmsConnectionConfig() + c_string kms_instance_id + c_string kms_instance_url + shared_ptr[CKeyAccessToken] refreshable_key_access_token + unordered_map[c_string, c_string] custom_kms_conf + +# Callbacks for implementing Python kms clients +# Use typedef to emulate syntax for std::function +ctypedef void CallbackWrapKey( + object, const c_string&, const c_string&, c_string*) +ctypedef void CallbackUnwrapKey( + object, const c_string&, const c_string&, c_string*) + +cdef extern from "parquet/encryption/kms_client_factory.h" \ + namespace "parquet::encryption" nogil: + cdef cppclass CKmsClientFactory" parquet::encryption::KmsClientFactory": + shared_ptr[CKmsClient] CreateKmsClient( + const 
CKmsConnectionConfig& kms_connection_config) except + + +# Callbacks for implementing Python kms client factories +# Use typedef to emulate syntax for std::function +ctypedef void CallbackCreateKmsClient( + object, + const CKmsConnectionConfig&, shared_ptr[CKmsClient]*) + +cdef extern from "parquet/encryption/crypto_factory.h" \ + namespace "parquet::encryption" nogil: + cdef cppclass CEncryptionConfiguration\ + " parquet::encryption::EncryptionConfiguration": + CEncryptionConfiguration(const c_string& footer_key) except + + c_string footer_key + c_string column_keys + ParquetCipher encryption_algorithm + c_bool plaintext_footer + c_bool double_wrapping + double cache_lifetime_seconds + c_bool internal_key_material + int32_t data_key_length_bits + + cdef cppclass CDecryptionConfiguration\ + " parquet::encryption::DecryptionConfiguration": + CDecryptionConfiguration() except + + double cache_lifetime_seconds + + cdef cppclass CCryptoFactory" parquet::encryption::CryptoFactory": + void RegisterKmsClientFactory( + shared_ptr[CKmsClientFactory] kms_client_factory) except + + shared_ptr[CFileEncryptionProperties] GetFileEncryptionProperties( + const CKmsConnectionConfig& kms_connection_config, + const CEncryptionConfiguration& encryption_config) except +* + shared_ptr[CFileDecryptionProperties] GetFileDecryptionProperties( + const CKmsConnectionConfig& kms_connection_config, + const CDecryptionConfiguration& decryption_config) except +* + void RemoveCacheEntriesForToken(const c_string& access_token) except + + void RemoveCacheEntriesForAllTokens() except + + +cdef extern from "arrow/python/parquet_encryption.h" \ + namespace "arrow::py::parquet::encryption" nogil: + cdef cppclass CPyKmsClientVtable \ + " arrow::py::parquet::encryption::PyKmsClientVtable": + CPyKmsClientVtable() + function[CallbackWrapKey] wrap_key + function[CallbackUnwrapKey] unwrap_key + + cdef cppclass CPyKmsClient\ + " arrow::py::parquet::encryption::PyKmsClient"(CKmsClient): + CPyKmsClient(object 
handler, CPyKmsClientVtable vtable) + + cdef cppclass CPyKmsClientFactoryVtable\ + " arrow::py::parquet::encryption::PyKmsClientFactoryVtable": + CPyKmsClientFactoryVtable() + function[CallbackCreateKmsClient] create_kms_client + + cdef cppclass CPyKmsClientFactory\ + " arrow::py::parquet::encryption::PyKmsClientFactory"( + CKmsClientFactory): + CPyKmsClientFactory(object handler, CPyKmsClientFactoryVtable vtable) + + cdef cppclass CPyCryptoFactory\ + " arrow::py::parquet::encryption::PyCryptoFactory"(CCryptoFactory): + CResult[shared_ptr[CFileEncryptionProperties]] \ + SafeGetFileEncryptionProperties( + const CKmsConnectionConfig& kms_connection_config, + const CEncryptionConfiguration& encryption_config) + CResult[shared_ptr[CFileDecryptionProperties]] \ + SafeGetFileDecryptionProperties( + const CKmsConnectionConfig& kms_connection_config, + const CDecryptionConfiguration& decryption_config) diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index 7a942820a9f42..00e8de6a8f6fa 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -19,9 +19,6 @@ import pyarrow.fs as fs import pyarrow as pa import pytest -import numpy as np -import tempfile -import os encryption_enabled = False @@ -136,32 +133,14 @@ def test_dataset_encryption_decryption(): def test_write_dataset_parquet_without_encryption(): """Test write_dataset with ParquetFileFormat and test if an exception is thrown if you try to set encryption_config using make_write_options""" - table = pa.table( - [ - pa.array(range(20), type="uint32"), - pa.array( - np.arange("2012-01-01", 20, dtype="datetime64[D]").astype( - "datetime64[ns]" - ) - ), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], - ) - - with tempfile.TemporaryDirectory() as tempdir: - base_dir = os.path.join(tempdir, "parquet_dataset") - - # Use a placeholder for encryption configurations - 
encryption_config_placeholder = "test_encryption_config_value" - - # Set the encryption configuration using ParquetFileFormat - # and make_write_options - pformat = pa.dataset.ParquetFileFormat() - with pytest.raises(NotImplementedError): - write_options = pformat.make_write_options( - encryption_config=encryption_config_placeholder - ) + # Set the encryption configuration using ParquetFileFormat + # and make_write_options + pformat = pa.dataset.ParquetFileFormat() - ds.write_dataset(table, base_dir, format=pformat, file_options=write_options) + with pytest.raises(NotImplementedError): + _ = pformat.make_write_options( + # encryption_config=encryption_config_placeholder + # TODO + encryption_properties="some value" + )