From 3bc460d158808b8702ecd8905a43d57ca24ec2b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 8 Aug 2023 20:03:12 +0200 Subject: [PATCH] HDF5: Handle unknown datatypes in datasets (#1469) * HDF5: Throw ReadError at dataset open * Try parent types when a dataset datatype is unknown * Update Sample Download Scripts * Add test for old HDF5-plugin written openPMD dataset from PIConGPU * Remove debugging comments --- share/openPMD/download_samples.ps1 | 5 + share/openPMD/download_samples.sh | 5 +- src/IO/HDF5/HDF5IOHandler.cpp | 241 ++++++++++++++++++++--------- test/SerialIOTest.cpp | 79 ++++++++++ 4 files changed, 257 insertions(+), 73 deletions(-) diff --git a/share/openPMD/download_samples.ps1 b/share/openPMD/download_samples.ps1 index 5fc8987c4d..9880aa891f 100755 --- a/share/openPMD/download_samples.ps1 +++ b/share/openPMD/download_samples.ps1 @@ -15,18 +15,23 @@ New-item -ItemType directory -Name samples\git-sample\3d-bp4\ Invoke-WebRequest https://github.com/openPMD/openPMD-example-datasets/raw/f3b73e43511db96217a153dc3ab3cb2e8f81f7db/example-3d.tar.gz -OutFile example-3d.tar.gz Invoke-WebRequest https://github.com/openPMD/openPMD-example-datasets/raw/f3b73e43511db96217a153dc3ab3cb2e8f81f7db/example-thetaMode.tar.gz -OutFile example-thetaMode.tar.gz Invoke-WebRequest https://github.com/openPMD/openPMD-example-datasets/raw/f3b73e43511db96217a153dc3ab3cb2e8f81f7db/example-3d-bp4.tar.gz -OutFile example-3d-bp4.tar.gz +Invoke-WebRequest https://github.com/openPMD/openPMD-example-datasets/raw/566b356030df38f56049484941baacafef331163/legacy_datasets.tar.gz -OutFile legacy_datasets.tar.gz 7z.exe x -r example-3d.tar.gz 7z.exe x -r example-3d.tar 7z.exe x -r example-thetaMode.tar.gz 7z.exe x -r example-thetaMode.tar 7z.exe x -r example-3d-bp4.tar.gz 7z.exe x -r example-3d-bp4.tar +7z.exe x -r legacy_datasets.tar.gz +7z.exe x -r legacy_datasets.tar Move-Item -Path example-3d\hdf5\* samples\git-sample\ Move-Item -Path example-thetaMode\hdf5\* samples\git-sample\thetaMode\ Move-Item -Path example-3d-bp4\* samples\git-sample\3d-bp4\ +Move-Item -Path legacy_datasets\* samples\git-sample\legacy\ Remove-Item -Recurse -Force example-3d* Remove-Item -Recurse -Force example-thetaMode* Remove-Item -Recurse -Force example-3d-bp4* +Remove-Item -Recurse -Force legacy_datasets* # Ref.: https://github.com/yt-project/yt/pull/1645 New-item -ItemType directory -Name samples\issue-sample\ diff --git a/share/openPMD/download_samples.sh b/share/openPMD/download_samples.sh index aef491ca1f..8691ce47a5 100755 --- a/share/openPMD/download_samples.sh +++ b/share/openPMD/download_samples.sh @@ -15,14 +15,17 @@ mkdir -p samples/git-sample/3d-bp4 curl -sOL https://github.com/openPMD/openPMD-example-datasets/raw/f3b73e43511db96217a153dc3ab3cb2e8f81f7db/example-3d.tar.gz curl -sOL https://github.com/openPMD/openPMD-example-datasets/raw/f3b73e43511db96217a153dc3ab3cb2e8f81f7db/example-thetaMode.tar.gz curl -sOL https://github.com/openPMD/openPMD-example-datasets/raw/f3b73e43511db96217a153dc3ab3cb2e8f81f7db/example-3d-bp4.tar.gz +curl -sOL https://github.com/openPMD/openPMD-example-datasets/raw/566b356030df38f56049484941baacafef331163/legacy_datasets.tar.gz tar -xzf example-3d.tar.gz tar -xzf example-thetaMode.tar.gz tar -xzf example-3d-bp4.tar.gz +tar -xzf legacy_datasets.tar.gz mv example-3d/hdf5/* samples/git-sample/ mv example-thetaMode/hdf5/* samples/git-sample/thetaMode/ mv example-3d-bp4/* samples/git-sample/3d-bp4 +mv legacy_datasets/* samples/git-sample/legacy chmod 777 samples/ -rm -rf 
example-3d.* example-3d example-thetaMode.* example-thetaMode example-3d-bp4 example-3d-bp4.* +rm -rf example-3d.* example-3d example-thetaMode.* example-thetaMode example-3d-bp4 example-3d-bp4.* legacy_datasets legacy_datasets.* # Ref.: https://github.com/yt-project/yt/pull/1645 mkdir -p samples/issue-sample/ diff --git a/src/IO/HDF5/HDF5IOHandler.cpp b/src/IO/HDF5/HDF5IOHandler.cpp index 63b998ad1d..a0faac5fe8 100644 --- a/src/IO/HDF5/HDF5IOHandler.cpp +++ b/src/IO/HDF5/HDF5IOHandler.cpp @@ -991,10 +991,15 @@ void HDF5IOHandlerImpl::openDataset( node_id = H5Gopen( file.id, concrete_h5_file_position(writable->parent).c_str(), gapl); - VERIFY( - node_id >= 0, - "[HDF5] Internal error: Failed to open HDF5 group during dataset " - "opening"); + if (node_id < 0) + { + throw error::ReadError( + error::AffectedObject::Dataset, + error::Reason::NotFound, + "HDF5", + "Internal error: Failed to open HDF5 group during dataset " + "opening"); + } /* Sanitize name */ std::string name = parameters.name; @@ -1004,10 +1009,15 @@ void HDF5IOHandlerImpl::openDataset( name += '/'; dataset_id = H5Dopen(node_id, name.c_str(), H5P_DEFAULT); - VERIFY( - dataset_id >= 0, - "[HDF5] Internal error: Failed to open HDF5 dataset during dataset " - "opening"); + if (dataset_id < 0) + { + throw error::ReadError( + error::AffectedObject::Dataset, + error::Reason::NotFound, + "HDF5", + "Internal error: Failed to open HDF5 dataset during dataset " + "opening"); + } hid_t dataset_type, dataset_space; dataset_type = H5Dget_type(dataset_id); @@ -1020,52 +1030,114 @@ void HDF5IOHandlerImpl::openDataset( if (dataset_class == H5S_SIMPLE || dataset_class == H5S_SCALAR || dataset_class == H5S_NULL) { - if (H5Tequal(dataset_type, H5T_NATIVE_UCHAR)) - d = DT::UCHAR; - else if (H5Tequal(dataset_type, H5T_NATIVE_SCHAR)) - d = DT::SCHAR; - // NOTE: in HDF5, CHAR is actually either UCHAR or SCHAR. - else if (H5Tequal(dataset_type, H5T_NATIVE_CHAR)) - d = DT::CHAR; - else if (H5Tequal(dataset_type, H5T_NATIVE_SHORT)) - d = DT::SHORT; - else if (H5Tequal(dataset_type, H5T_NATIVE_INT)) - d = DT::INT; - else if (H5Tequal(dataset_type, H5T_NATIVE_LONG)) - d = DT::LONG; - else if (H5Tequal(dataset_type, H5T_NATIVE_LLONG)) - d = DT::LONGLONG; - else if (H5Tequal(dataset_type, H5T_NATIVE_FLOAT)) - d = DT::FLOAT; - else if (H5Tequal(dataset_type, H5T_NATIVE_DOUBLE)) - d = DT::DOUBLE; - else if ( - H5Tequal(dataset_type, H5T_NATIVE_LDOUBLE) || - H5Tequal(dataset_type, m_H5T_LONG_DOUBLE_80_LE)) - d = DT::LONG_DOUBLE; - else if (H5Tequal(dataset_type, m_H5T_CFLOAT)) - d = DT::CFLOAT; - else if (H5Tequal(dataset_type, m_H5T_CDOUBLE)) - d = DT::CDOUBLE; - else if ( - H5Tequal(dataset_type, m_H5T_CLONG_DOUBLE) || - H5Tequal(dataset_type, m_H5T_CLONG_DOUBLE_80_LE)) - d = DT::CLONG_DOUBLE; - else if (H5Tequal(dataset_type, H5T_NATIVE_USHORT)) - d = DT::USHORT; - else if (H5Tequal(dataset_type, H5T_NATIVE_UINT)) - d = DT::UINT; - else if (H5Tequal(dataset_type, H5T_NATIVE_ULONG)) - d = DT::ULONG; - else if (H5Tequal(dataset_type, H5T_NATIVE_ULLONG)) - d = DT::ULONGLONG; - else if (H5Tget_class(dataset_type) == H5T_STRING) - d = DT::STRING; - else - throw std::runtime_error("[HDF5] Unknown dataset type"); + constexpr size_t max_retries = 10; + /* + * It happens that an HDF5 file has a type that is not equal to any of + * the native types, but can still be read as its parent type. + * For example an enum (which some applications use to emulate bools) + * can still be read as its parent type, a char. 
+ * Upon not matching any native type, don't give up yet, but check the + * parent type. + * Normally, this procedure should stop at the point where + * H5Tget_super() returns H5I_INVALID_HID, but this is putting a bit + * too much trust in an external library to be the loop's exit + * condition. So, we restrict the loop to a maximum of 10 iterations + * before manually canceling it. + */ + size_t remaining_tries = max_retries; + bool repeat = false; + do + { + repeat = false; + if (H5Tequal(dataset_type, H5T_NATIVE_UCHAR)) + d = DT::UCHAR; + else if (H5Tequal(dataset_type, H5T_NATIVE_SCHAR)) + d = DT::SCHAR; + // NOTE: in HDF5, CHAR is actually either UCHAR or SCHAR. + else if (H5Tequal(dataset_type, H5T_NATIVE_CHAR)) + d = DT::CHAR; + else if (H5Tequal(dataset_type, H5T_NATIVE_SHORT)) + d = DT::SHORT; + else if (H5Tequal(dataset_type, H5T_NATIVE_INT)) + d = DT::INT; + else if (H5Tequal(dataset_type, H5T_NATIVE_LONG)) + d = DT::LONG; + else if (H5Tequal(dataset_type, H5T_NATIVE_LLONG)) + d = DT::LONGLONG; + else if (H5Tequal(dataset_type, H5T_NATIVE_FLOAT)) + d = DT::FLOAT; + else if (H5Tequal(dataset_type, H5T_NATIVE_DOUBLE)) + d = DT::DOUBLE; + else if ( + H5Tequal(dataset_type, H5T_NATIVE_LDOUBLE) || + H5Tequal(dataset_type, m_H5T_LONG_DOUBLE_80_LE)) + d = DT::LONG_DOUBLE; + else if (H5Tequal(dataset_type, m_H5T_CFLOAT)) + d = DT::CFLOAT; + else if (H5Tequal(dataset_type, m_H5T_CDOUBLE)) + d = DT::CDOUBLE; + else if ( + H5Tequal(dataset_type, m_H5T_CLONG_DOUBLE) || + H5Tequal(dataset_type, m_H5T_CLONG_DOUBLE_80_LE)) + d = DT::CLONG_DOUBLE; + else if (H5Tequal(dataset_type, H5T_NATIVE_USHORT)) + d = DT::USHORT; + else if (H5Tequal(dataset_type, H5T_NATIVE_UINT)) + d = DT::UINT; + else if (H5Tequal(dataset_type, H5T_NATIVE_ULONG)) + d = DT::ULONG; + else if (H5Tequal(dataset_type, H5T_NATIVE_ULLONG)) + d = DT::ULONGLONG; + else if (H5Tget_class(dataset_type) == H5T_STRING) + d = DT::STRING; + else + { + auto throw_error = []() { + throw error::ReadError( + error::AffectedObject::Dataset, + error::Reason::UnexpectedContent, + "HDF5", + "Unknown dataset type"); + }; + if (remaining_tries == 0) + { + throw_error(); + } + hid_t next_type = H5Tget_super(dataset_type); + if (next_type == H5I_INVALID_HID) + { + throw_error(); + } + else if (H5Tequal(dataset_type, next_type)) + { + H5Tclose(next_type); + throw_error(); + } + else + { + if (H5Tclose(dataset_type) != 0) + { + throw error::ReadError( + error::AffectedObject::Group, + error::Reason::Other, + "HDF5", + "Internal error: Failed to close HDF5 dataset type " + "during " + "dataset opening"); + } + dataset_type = next_type; + --remaining_tries; + repeat = true; + } + } + } while (repeat); } else - throw std::runtime_error("[HDF5] Unsupported dataset class"); + throw error::ReadError( + error::AffectedObject::Dataset, + error::Reason::UnexpectedContent, + "HDF5", + "Unknown dataset class"); auto dtype = parameters.dtype; *dtype = d; @@ -1083,30 +1155,55 @@ void HDF5IOHandlerImpl::openDataset( herr_t status; status = H5Sclose(dataset_space); - VERIFY( - status == 0, - "[HDF5] Internal error: Failed to close HDF5 dataset space during " - "dataset opening"); + if (status != 0) + { + throw error::ReadError( + error::AffectedObject::Group, + error::Reason::Other, + "HDF5", + "Internal error: Failed to close HDF5 dataset space during " + "dataset opening"); + } status = H5Tclose(dataset_type); - VERIFY( - status == 0, - "[HDF5] Internal error: Failed to close HDF5 dataset type during " - "dataset opening"); + if (status != 0) + { + throw 
error::ReadError(
+            error::AffectedObject::Group,
+            error::Reason::Other,
+            "HDF5",
+            "Internal error: Failed to close HDF5 dataset type during "
+            "dataset opening");
+    }
     status = H5Dclose(dataset_id);
-    VERIFY(
-        status == 0,
-        "[HDF5] Internal error: Failed to close HDF5 dataset during dataset "
-        "opening");
+    if (status != 0)
+    {
+        throw error::ReadError(
+            error::AffectedObject::Group,
+            error::Reason::Other,
+            "HDF5",
+            "Internal error: Failed to close HDF5 dataset during dataset "
+            "opening");
+    }
     status = H5Gclose(node_id);
-    VERIFY(
-        status == 0,
-        "[HDF5] Internal error: Failed to close HDF5 group during dataset "
-        "opening");
+    if (status != 0)
+    {
+        throw error::ReadError(
+            error::AffectedObject::Group,
+            error::Reason::Other,
+            "HDF5",
+            "Internal error: Failed to close HDF5 group during dataset "
+            "opening");
+    }
     status = H5Pclose(gapl);
-    VERIFY(
-        status == 0,
-        "[HDF5] Internal error: Failed to close HDF5 property during dataset "
-        "opening");
+    if (status != 0)
+    {
+        throw error::ReadError(
+            error::AffectedObject::Group,
+            error::Reason::Other,
+            "HDF5",
+            "Internal error: Failed to close HDF5 property during dataset "
+            "opening");
+    }
 
     writable->written = true;
     writable->abstractFilePosition = std::make_shared<HDF5FilePosition>(name);
diff --git a/test/SerialIOTest.cpp b/test/SerialIOTest.cpp
index f6f1bc1b63..a92ae7a700 100644
--- a/test/SerialIOTest.cpp
+++ b/test/SerialIOTest.cpp
@@ -2706,6 +2706,106 @@ TEST_CASE("git_hdf5_sample_structure_test", "[serial][hdf5]")
 #endif
 }
 
+namespace
+{
+struct LoadDataset
+{
+    template <typename T>
+    static void call(RecordComponent &rc)
+    {
+        auto chunk = rc.loadChunk<T>();
+        rc.seriesFlush();
+    }
+
+    static constexpr char const *errorMsg = "LoadDataset";
+};
+} // namespace
+
+TEST_CASE("git_hdf5_legacy_picongpu", "[serial][hdf5]")
+{
+    try
+    {
+        Series o = Series(
+            "../samples/git-sample/legacy/simData_%T.h5", Access::READ_ONLY);
+
+        /*
+         * That dataset was written directly via HDF5 (not the openPMD-api)
+         * and had two issues:
+         *
+         * 1) No unitSI defined for numParticles and numParticlesOffset.
+         *    unitSI does not really make sense there, but the openPMD
+         *    standard is not quite clear on whether it is required, so the
+         *    API used to write and require it. We keep writing it, but no
+         *    longer require it when reading.
+         * 2) A custom enum was used for writing a boolean dataset.
+         *    At the least, the dataset should be skipped in parsing instead
+         *    of failing the entire procedure. Ideally, the custom datatype
+         *    should be upcast to its parent char type and treated as such;
+         *    a rough sketch of that type resolution follows below.
+         */
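+
+        /*
+         * A rough, hypothetical sketch (not what this test runs): with
+         * plain HDF5 calls, resolving such an enum to its parent type
+         * would look like the following, assuming an open dataset handle
+         * `dataset_id`:
+         *
+         *     hid_t type = H5Dget_type(dataset_id);
+         *     while (H5Tget_class(type) == H5T_ENUM)
+         *     {
+         *         // the enum's base type, e.g. a char for emulated bools
+         *         hid_t super = H5Tget_super(type);
+         *         H5Tclose(type);
+         *         type = super;
+         *     }
+         *
+         * The implementation in HDF5IOHandler.cpp above is more general:
+         * it redoes its native-type matching against each parent type
+         * returned by H5Tget_super(), capped at ten attempts.
+         */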
+ */ + + auto radiationMask = + o.iterations[200] + .particles["e"]["radiationMask"][RecordComponent::SCALAR]; + switchNonVectorType( + radiationMask.getDatatype(), radiationMask); + + auto particlePatches = o.iterations[200].particles["e"].particlePatches; + REQUIRE(particlePatches.size() == 4); + for (auto key : {"extent", "offset"}) + { + REQUIRE(particlePatches.contains(key)); + REQUIRE(particlePatches.at(key).size() == 3); + for (auto subkey : {"x", "y", "z"}) + { + REQUIRE(particlePatches.at(key).contains(subkey)); + // unitSI is present in those records + particlePatches.at(key).at(subkey).unitSI(); + } + } + for (auto key : {"numParticles", "numParticlesOffset"}) + { + REQUIRE(particlePatches.contains(key)); + REQUIRE(particlePatches.at(key).contains(RecordComponent::SCALAR)); + // unitSI is not present in those records + REQUIRE_THROWS_AS( + particlePatches.at(key).at(RecordComponent::SCALAR).unitSI(), + no_such_attribute_error); + } + + helper::listSeries(o, true, std::cout); + } + catch (error::ReadError &e) + { + if (e.reason == error::Reason::Inaccessible) + { + std::cerr << "git sample not accessible. (" << e.what() << ")\n"; + return; + } + throw; + } +} + TEST_CASE("git_hdf5_sample_attribute_test", "[serial][hdf5]") { try