From db48d95b392503cf944d367702db67615d5911b0 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>
Date: Sat, 25 Jan 2025 07:16:03 +1100
Subject: [PATCH] [Tensornet] Update observe calculation (#2522)

* Add non path reuse option

Signed-off-by: Thien Nguyen

* Add docs and test

Signed-off-by: Thien Nguyen

* Code review: add a note about observe with tensornet and update code comments

Signed-off-by: Thien Nguyen

---------

Signed-off-by: Thien Nguyen
---
 docs/sphinx/using/backends/simulators.rst     | 13 ++++
 .../cutensornet/simulator_cutensornet.cpp     | 15 ++++
 .../nvqir/cutensornet/simulator_cutensornet.h |  8 +++
 runtime/nvqir/cutensornet/tensornet_state.cpp | 68 ++++++++++++++++++-
 runtime/nvqir/cutensornet/tensornet_state.h   |  5 ++
 unittests/CMakeLists.txt                      | 22 ++++++
 .../integration/observe_result_tester.cpp     |  4 ++
 7 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
index d8880a2fac..30054e36ab 100644
--- a/docs/sphinx/using/backends/simulators.rst
+++ b/docs/sphinx/using/backends/simulators.rst
@@ -482,6 +482,7 @@ Specific aspects of the simulation can be configured by setting the following of
 * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization.
 * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`.
 * **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the number of controlled qubits whereby the full tensor body of the controlled gate is expanded. If the number of controlled qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Default value is 1.
+* **`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=X`**: Set this environment variable to `TRUE` (`ON`) or `FALSE` (`OFF`) to enable or disable contraction path reuse when computing expectation values. Default is `OFF`.
 
 .. note::
 
@@ -489,6 +490,18 @@ Specific aspects of the simulation can be configured by setting the following of
 
   If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies.
 
+.. note::
+
+  When using contraction path reuse (`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=TRUE`), :code:`tensornet` backends perform a single contraction path optimization with an opaque spin operator term. This path is then used to contract all the actual terms in the spin operator, hence saving the path finding time.
+
+  As we use an opaque spin operator term as a placeholder for contraction path optimization, the resulting contraction path is not as optimal as it would be if the actual spin operator were used.
+  For instance, if the spin operator is sparse (only acting on a few qubits), the contraction can be significantly simplified.
+
+.. note::
+
+  :code:`tensornet` backends only return the overall expectation value for a :class:`cudaq.SpinOperator` when using the `cudaq::observe` method.
+  Term-by-term expectation values will not be available in the resulting `ObserveResult` object.
+  If needed, these values can be computed by calling `cudaq::observe` on individual terms instead.
 
 
 Matrix product state
 +++++++++++++++++++++++++++++++++++
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
index 3382ae1586..f99e189fe4 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -24,6 +24,10 @@ SimulatorTensorNetBase::SimulatorTensorNetBase()
   HANDLE_CUTN_ERROR(cutensornetCreate(&m_cutnHandle));
   // The scratch pad must be allocated after we have selected the device.
   scratchPad.allocate();
+
+  // Check whether observe path reuse is enabled.
+  m_reuseContractionPathObserve =
+      cudaq::getEnvBool("CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE", false);
 }
 
 static std::vector<std::complex<double>>
@@ -277,6 +281,17 @@ cudaq::observe_result
 SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
   LOG_API_TIME();
   prepareQubitTensorState();
+  if (!m_reuseContractionPathObserve) {
+    // If contraction path reuse is disabled, convert spin_op to
+    // cutensornetNetworkOperator_t and compute the expectation value.
+    TensorNetworkSpinOp spinOp(ham, m_cutnHandle);
+    std::complex<double> expVal =
+        m_state->computeExpVal(spinOp.getNetworkOperator());
+    expVal += spinOp.getIdentityTermOffset();
+    return cudaq::observe_result(expVal.real(), ham,
+                                 cudaq::sample_result(cudaq::ExecutionResult(
+                                     {}, ham.to_string(false), expVal.real())));
+  }
 
   std::vector<std::string> termStrs;
   std::vector<std::vector<bool>> terms;
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h
index 666411ac11..8d11c88dee 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.h
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.h
@@ -103,6 +103,14 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase<double> {
   // cutensornetStateApplyControlledTensorOperator). Tensornet supports
   // arbitrary values.
   std::size_t m_maxControlledRankForFullTensorExpansion = 1;
+
+  // Flag to enable contraction path reuse when computing the expectation value
+  // (observe).
+  // Default is off (no contraction path reuse).
+  // Reusing the path, while saving the path finding time, prevents lightcone
+  // simplification, e.g., when the spin op is sparse (only acting on a few
+  // qubits).
+  bool m_reuseContractionPathObserve = false;
 };
 
 } // end namespace nvqir
diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
index 94e662345c..c6e2b2d712 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.cpp
+++ b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -681,15 +681,14 @@ std::vector<std::complex<double>> TensorNetState::computeExpVals(
                                  placeHolderArraySize, cudaMemcpyHostToDevice));
 
     std::complex<double> expVal;
-    std::complex<double> stateNorm{0.0, 0.0};
     {
       ScopedTraceWithContext("cutensornetExpectationCompute");
       HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
           m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
-          static_cast<void *>(&stateNorm),
+          /*stateNorm*/ nullptr,
           /*cudaStream*/ 0));
     }
-    allExpVals.emplace_back(expVal / std::abs(stateNorm));
+    allExpVals.emplace_back(expVal);
   }
 }
 
@@ -699,6 +698,69 @@ std::vector<std::complex<double>> TensorNetState::computeExpVals(
   return allExpVals;
 }
 
+std::complex<double> TensorNetState::computeExpVal(
+    cutensornetNetworkOperator_t tensorNetworkOperator) {
+  LOG_API_TIME();
+  cutensornetStateExpectation_t tensorNetworkExpectation;
+  // Step 1: create
+  {
+    ScopedTraceWithContext("cutensornetCreateExpectation");
+    HANDLE_CUTN_ERROR(cutensornetCreateExpectation(m_cutnHandle, m_quantumState,
+                                                   tensorNetworkOperator,
+                                                   &tensorNetworkExpectation));
+  }
+
+  // Step 2: configure
+  const int32_t numHyperSamples =
+      8; // desired number of hyper samples used in the tensor network
+         // contraction path finder
+  {
+    ScopedTraceWithContext("cutensornetExpectationConfigure");
+    HANDLE_CUTN_ERROR(cutensornetExpectationConfigure(
+        m_cutnHandle, tensorNetworkExpectation,
+        CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES, &numHyperSamples,
+        sizeof(numHyperSamples)));
+  }
+
+  // Step 3: Prepare
+  cutensornetWorkspaceDescriptor_t workDesc;
+  HANDLE_CUTN_ERROR(
+      cutensornetCreateWorkspaceDescriptor(m_cutnHandle, &workDesc));
+  {
+    ScopedTraceWithContext("cutensornetExpectationPrepare");
+    HANDLE_CUTN_ERROR(cutensornetExpectationPrepare(
+        m_cutnHandle, tensorNetworkExpectation, scratchPad.scratchSize,
+        workDesc, /*cudaStream*/ 0));
+  }
+
+  // Attach the workspace buffer
+  int64_t worksize{0};
+  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(
+      m_cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
+      CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, &worksize));
+  if (worksize <= static_cast<int64_t>(scratchPad.scratchSize)) {
+    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(
+        m_cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
+        CUTENSORNET_WORKSPACE_SCRATCH, scratchPad.d_scratch, worksize));
+  } else {
+    throw std::runtime_error("ERROR: Insufficient workspace size on Device!");
+  }
+
+  // Step 4: Compute
+  std::complex<double> expVal;
+
+  {
+    ScopedTraceWithContext("cutensornetExpectationCompute");
+    HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
+        m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
+        /*stateNorm*/ nullptr,
+        /*cudaStream*/ 0));
+  }
+  // Step 5: clean up
+  HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(tensorNetworkExpectation));
+  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
+  return expVal;
+}
+
 std::unique_ptr<TensorNetState> TensorNetState::createFromMpsTensors(
     const std::vector<MPSTensor> &in_mpsTensors, ScratchDeviceMem &inScratchPad,
     cutensornetHandle_t handle, std::mt19937 &randomEngine) {
diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
index b543684b53..c680fd5c67 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.h
+++ b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -151,6 +151,11 @@ class TensorNetState {
   std::vector<std::complex<double>>
   computeExpVals(const std::vector<std::vector<bool>> &symplecticRepr);
 
+  /// @brief Evaluate the expectation value of a given
+  /// `cutensornetNetworkOperator_t`
+  std::complex<double>
+  computeExpVal(cutensornetNetworkOperator_t tensorNetworkOperator);
+
   /// @brief Number of qubits that this state represents.
   std::size_t getNumQubits() const { return m_numQubits; }
 
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 9ff26f7510..81b7202ae6 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -221,6 +221,28 @@ if(TARGET nvqir-tensornet)
       endif() # NGPUS
     endif() # NVIDIA_SMI
   endif() # MPI_CXX_FOUND
+
+  # Test CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=ON mode (on a few test cases that have cudaq::observe)
+  add_executable(test_tensornet_observe_path_reuse
+    integration/builder_tester.cpp
+    integration/deuteron_variational_tester.cpp
+    integration/observe_result_tester.cpp
+  )
+  target_include_directories(test_tensornet_observe_path_reuse PRIVATE .)
+  target_compile_definitions(test_tensornet_observe_path_reuse
+                             PRIVATE -DNVQIR_BACKEND_NAME=tensornet)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
+    target_link_options(test_tensornet_observe_path_reuse PRIVATE -Wl,--no-as-needed)
+  endif()
+  target_link_libraries(test_tensornet_observe_path_reuse
+    PRIVATE
+      cudaq
+      cudaq-builder
+      cudaq-platform-default
+      nvqir-tensornet
+      gtest_main)
+  # Run this test with "CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=TRUE"
+  gtest_discover_tests(test_tensornet_observe_path_reuse TEST_SUFFIX _PathReuse PROPERTIES ENVIRONMENT "CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=ON" PROPERTIES LABELS "gpu_required")
 endif()
 
 # Create an executable for SpinOp UnitTests
diff --git a/unittests/integration/observe_result_tester.cpp b/unittests/integration/observe_result_tester.cpp
index 9613f70d29..56e96a8651 100644
--- a/unittests/integration/observe_result_tester.cpp
+++ b/unittests/integration/observe_result_tester.cpp
@@ -83,6 +83,9 @@ CUDAQ_TEST(ObserveResult, checkSimple) {
   EXPECT_TRUE(x0x1Counts.size() == 4);
 }
 
+// By default, tensornet backends only compute the overall expectation value in
+// observe, i.e., no sub-term calculations.
+#ifndef CUDAQ_BACKEND_TENSORNET
 CUDAQ_TEST(ObserveResult, checkExpValBug) {
 
   auto kernel = []() __qpu__ {
@@ -112,3 +115,4 @@ CUDAQ_TEST(ObserveResult, checkExpValBug) {
   EXPECT_NEAR(exp, .79, 1e-1);
 }
 #endif
+#endif
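
For reference, the documentation note added above suggests computing term-by-term expectation values by calling `cudaq::observe` on individual terms when running on a :code:`tensornet` backend. The sketch below illustrates that workflow; it is not part of this patch, and it assumes the deuteron-style ansatz and Hamiltonian used in the CUDA-Q examples (the file name is hypothetical). Compile with `nvq++ --target tensornet` and optionally set `CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=TRUE` at run time to enable contraction path reuse.

// term_by_term_observe.cpp -- illustrative example only, not part of this patch.
#include <cudaq.h>

#include <cstdio>
#include <vector>

// Two-qubit ansatz in the style of the CUDA-Q deuteron examples.
struct ansatz {
  void operator()(double theta) __qpu__ {
    cudaq::qvector q(2);
    x(q[0]);
    ry(theta, q[1]);
    x<cudaq::ctrl>(q[1], q[0]);
  }
};

int main() {
  // Deuteron-style Hamiltonian.
  cudaq::spin_op h = 5.907 - 2.1433 * cudaq::spin::x(0) * cudaq::spin::x(1) -
                     2.1433 * cudaq::spin::y(0) * cudaq::spin::y(1) +
                     0.21829 * cudaq::spin::z(0) - 6.125 * cudaq::spin::z(1);

  // On the tensornet backend this returns only the overall <H>;
  // per-term expectation values are not stored in the ObserveResult.
  auto result = cudaq::observe(ansatz{}, h, 0.59);
  printf("<H> = %.6f\n", result.expectation());

  // If term-by-term values are needed, observe each term individually,
  // as suggested by the documentation note in this patch. (Coefficients and
  // the constant identity offset must be applied separately if reconstructing
  // <H> from these values.)
  std::vector<cudaq::spin_op> terms = {cudaq::spin::x(0) * cudaq::spin::x(1),
                                       cudaq::spin::y(0) * cudaq::spin::y(1),
                                       cudaq::spin::z(0), cudaq::spin::z(1)};
  for (auto &term : terms) {
    auto termResult = cudaq::observe(ansatz{}, term, 0.59);
    printf("<term> = %.6f\n", termResult.expectation());
  }
  return 0;
}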