From db48d95b392503cf944d367702db67615d5911b0 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>
Date: Sat, 25 Jan 2025 07:16:03 +1100
Subject: [PATCH] [Tensornet] Update observe calculation (#2522)

* Add non path reuse option

Signed-off-by: Thien Nguyen

* Add docs and test

Signed-off-by: Thien Nguyen

* Code review: add a note about observe with tensornet and update code comments

Signed-off-by: Thien Nguyen

---------

Signed-off-by: Thien Nguyen
---
 docs/sphinx/using/backends/simulators.rst     | 13 ++++
 .../cutensornet/simulator_cutensornet.cpp     | 15 ++++
 .../nvqir/cutensornet/simulator_cutensornet.h |  8 +++
 runtime/nvqir/cutensornet/tensornet_state.cpp | 68 ++++++++++++++++++-
 runtime/nvqir/cutensornet/tensornet_state.h   |  5 ++
 unittests/CMakeLists.txt                      | 22 ++++++
 .../integration/observe_result_tester.cpp     |  4 ++
 7 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
index d8880a2fac..30054e36ab 100644
--- a/docs/sphinx/using/backends/simulators.rst
+++ b/docs/sphinx/using/backends/simulators.rst
@@ -482,6 +482,7 @@ Specific aspects of the simulation can be configured by setting the following of
 * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization.
 * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`.
 * **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the number of controlled qubits whereby the full tensor body of the controlled gate is expanded. If the number of controlled qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Default value is 1.
+* **`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=X`**: Set this environment variable to `TRUE` (`ON`) or `FALSE` (`OFF`) to enable or disable contraction path reuse when computing expectation values. Default is `OFF`.
 
 .. note::
 
@@ -489,6 +490,18 @@ Specific aspects of the simulation can be configured by setting the following of
 
   If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies.
 
+.. note::
+
+  When using contraction path reuse (`CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=TRUE`), :code:`tensornet` backends perform a single contraction path optimization with an opaque spin operator term. This path is then used to contract all the actual terms in the spin operator, hence saving the path finding time.
+
+  As we use an opaque spin operator term as a placeholder for contraction path optimization, the resulting contraction path is not as optimal as it would be if the actual spin operator were used.
+  For instance, if the spin operator is sparse (only acting on a few qubits), the contraction can be significantly simplified.
+
+.. note::
+
+  :code:`tensornet` backends only return the overall expectation value for a :class:`cudaq.SpinOperator` when using the `cudaq::observe` method.
+  Term-by-term expectation values will not be available in the resulting `ObserveResult` object.
+  If needed, these values can be computed by calling `cudaq::observe` on individual terms instead.
 
 
 Matrix product state
 +++++++++++++++++++++++++++++++++++
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
index 3382ae1586..f99e189fe4 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -24,6 +24,10 @@ SimulatorTensorNetBase::SimulatorTensorNetBase()
   HANDLE_CUTN_ERROR(cutensornetCreate(&m_cutnHandle));
   // The scratch pad must be allocated after we have selected the device.
   scratchPad.allocate();
+
+  // Check whether observe path reuse is enabled.
+  m_reuseContractionPathObserve =
+      cudaq::getEnvBool("CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE", false);
 }
 
 static std::vector<std::complex<double>>
@@ -277,6 +281,17 @@ cudaq::observe_result
 SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
   LOG_API_TIME();
   prepareQubitTensorState();
+  if (!m_reuseContractionPathObserve) {
+    // If contraction path reuse is disabled, convert spin_op to
+    // cutensornetNetworkOperator_t and compute the expectation value.
+    TensorNetworkSpinOp spinOp(ham, m_cutnHandle);
+    std::complex<double> expVal =
+        m_state->computeExpVal(spinOp.getNetworkOperator());
+    expVal += spinOp.getIdentityTermOffset();
+    return cudaq::observe_result(expVal.real(), ham,
+                                 cudaq::sample_result(cudaq::ExecutionResult(
+                                     {}, ham.to_string(false), expVal.real())));
+  }
 
   std::vector<std::string> termStrs;
   std::vector<std::vector<bool>> terms;
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h
index 666411ac11..8d11c88dee 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.h
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.h
@@ -103,6 +103,14 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase<double> {
   // cutensornetStateApplyControlledTensorOperator). Tensornet supports
   // arbitrary values.
   std::size_t m_maxControlledRankForFullTensorExpansion = 1;
+
+  // Flag to enable contraction path reuse when computing the expectation value
+  // (observe).
+  // Default is off (no contraction path reuse).
+  // Reusing the path, while saving the path finding time, prevents lightcone
+  // simplification, e.g., when the spin op is sparse (only acting on a few
+  // qubits).
+  bool m_reuseContractionPathObserve = false;
 };
 
 } // end namespace nvqir
diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
index 94e662345c..c6e2b2d712 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.cpp
+++ b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -681,15 +681,14 @@ std::vector<std::complex<double>> TensorNetState::computeExpVals(
                                  placeHolderArraySize, cudaMemcpyHostToDevice));
 
     std::complex<double> expVal;
-    std::complex<double> stateNorm{0.0, 0.0};
     {
       ScopedTraceWithContext("cutensornetExpectationCompute");
       HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
           m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
-          static_cast<void *>(&stateNorm),
+          /*stateNorm*/ nullptr,
           /*cudaStream*/ 0));
     }
-    allExpVals.emplace_back(expVal / std::abs(stateNorm));
+    allExpVals.emplace_back(expVal);
   }
 }
 
@@ -699,6 +698,69 @@ std::vector<std::complex<double>> TensorNetState::computeExpVals(
   return allExpVals;
 }
 
+std::complex<double> TensorNetState::computeExpVal(
+    cutensornetNetworkOperator_t tensorNetworkOperator) {
+  LOG_API_TIME();
+  cutensornetStateExpectation_t tensorNetworkExpectation;
+  // Step 1: create
+  {
+    ScopedTraceWithContext("cutensornetCreateExpectation");
+    HANDLE_CUTN_ERROR(cutensornetCreateExpectation(m_cutnHandle, m_quantumState,
+                                                   tensorNetworkOperator,
+                                                   &tensorNetworkExpectation));
+  }
+
+  // Step 2: configure
+  const int32_t numHyperSamples =
+      8; // desired number of hyper samples used in the tensor network
+         // contraction path finder
+  {
+    ScopedTraceWithContext("cutensornetExpectationConfigure");
+    HANDLE_CUTN_ERROR(cutensornetExpectationConfigure(
+        m_cutnHandle, tensorNetworkExpectation,
+        CUTENSORNET_EXPECTATION_OPT_NUM_HYPER_SAMPLES, &numHyperSamples,
+        sizeof(numHyperSamples)));
+  }
+
+  // Step 3: Prepare
+  cutensornetWorkspaceDescriptor_t workDesc;
+  HANDLE_CUTN_ERROR(
+      cutensornetCreateWorkspaceDescriptor(m_cutnHandle, &workDesc));
+  {
+    ScopedTraceWithContext("cutensornetExpectationPrepare");
+    HANDLE_CUTN_ERROR(cutensornetExpectationPrepare(
+        m_cutnHandle, tensorNetworkExpectation, scratchPad.scratchSize,
+        workDesc, /*cudaStream*/ 0));
+  }
+
+  // Attach the workspace buffer
+  int64_t worksize{0};
+  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(
+      m_cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
+      CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH, &worksize));
+  if (worksize <= static_cast<int64_t>(scratchPad.scratchSize)) {
+    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(
+        m_cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
+        CUTENSORNET_WORKSPACE_SCRATCH, scratchPad.d_scratch, worksize));
+  } else {
+    throw std::runtime_error("ERROR: Insufficient workspace size on Device!");
+  }
+
+  // Step 4: Compute
+  std::complex<double> expVal;
+
+  {
+    ScopedTraceWithContext("cutensornetExpectationCompute");
+    HANDLE_CUTN_ERROR(cutensornetExpectationCompute(
+        m_cutnHandle, tensorNetworkExpectation, workDesc, &expVal,
+        /*stateNorm*/ nullptr,
+        /*cudaStream*/ 0));
+  }
+  // Step 5: clean up
+  HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(tensorNetworkExpectation));
+  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
+  return expVal;
+}
+
 std::unique_ptr<TensorNetState> TensorNetState::createFromMpsTensors(
     const std::vector<MPSTensor> &in_mpsTensors, ScratchDeviceMem &inScratchPad,
     cutensornetHandle_t handle, std::mt19937 &randomEngine) {
diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
index b543684b53..c680fd5c67 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.h
+++ b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -151,6 +151,11 @@ class TensorNetState {
   std::vector<std::complex<double>>
   computeExpVals(const std::vector<std::vector<bool>> &symplecticRepr);
 
+  /// @brief Evaluate the expectation value of a given
+  /// `cutensornetNetworkOperator_t`
+  std::complex<double>
+  computeExpVal(cutensornetNetworkOperator_t tensorNetworkOperator);
+
   /// @brief Number of qubits that this state represents.
   std::size_t getNumQubits() const { return m_numQubits; }
 
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 9ff26f7510..81b7202ae6 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -221,6 +221,28 @@ if(TARGET nvqir-tensornet)
       endif() # NGPUS
     endif() # NVIDIA_SMI
   endif() # MPI_CXX_FOUND
+
+  # Test CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=ON mode (on a few test cases that have cudaq::observe)
+  add_executable(test_tensornet_observe_path_reuse
+    integration/builder_tester.cpp
+    integration/deuteron_variational_tester.cpp
+    integration/observe_result_tester.cpp
+  )
+  target_include_directories(test_tensornet_observe_path_reuse PRIVATE .)
+  target_compile_definitions(test_tensornet_observe_path_reuse
+                             PRIVATE -DNVQIR_BACKEND_NAME=tensornet)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
+    target_link_options(test_tensornet_observe_path_reuse PRIVATE -Wl,--no-as-needed)
+  endif()
+  target_link_libraries(test_tensornet_observe_path_reuse
+    PRIVATE
+      cudaq
+      cudaq-builder
+      cudaq-platform-default
+      nvqir-tensornet
+      gtest_main)
+  # Run this test with "CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=TRUE"
+  gtest_discover_tests(test_tensornet_observe_path_reuse TEST_SUFFIX _PathReuse PROPERTIES ENVIRONMENT "CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=ON" PROPERTIES LABELS "gpu_required")
 endif()
 
 # Create an executable for SpinOp UnitTests
diff --git a/unittests/integration/observe_result_tester.cpp b/unittests/integration/observe_result_tester.cpp
index 9613f70d29..56e96a8651 100644
--- a/unittests/integration/observe_result_tester.cpp
+++ b/unittests/integration/observe_result_tester.cpp
@@ -83,6 +83,9 @@ CUDAQ_TEST(ObserveResult, checkSimple) {
   EXPECT_TRUE(x0x1Counts.size() == 4);
 }
 
+// By default, tensornet backends only compute the overall expectation value in
+// observe, i.e., no sub-term calculations.
+#ifndef CUDAQ_BACKEND_TENSORNET
 CUDAQ_TEST(ObserveResult, checkExpValBug) {
 
   auto kernel = []() __qpu__ {
@@ -112,3 +115,4 @@ CUDAQ_TEST(ObserveResult, checkExpValBug) {
   EXPECT_NEAR(exp, .79, 1e-1);
 }
 #endif
+#endif
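
For reference, the documentation note added above suggests computing term-by-term expectation values by calling `cudaq::observe` on individual terms when running on a :code:`tensornet` backend. The sketch below illustrates that workflow; it is not part of this patch, and it assumes the deuteron-style ansatz and Hamiltonian used in the CUDA-Q examples (the file name is hypothetical). Compile with `nvq++ --target tensornet` and optionally set `CUDAQ_TENSORNET_OBSERVE_CONTRACT_PATH_REUSE=TRUE` at run time to enable contraction path reuse.

// term_by_term_observe.cpp -- illustrative example only, not part of this patch.
#include <cudaq.h>

#include <cstdio>
#include <vector>

// Two-qubit ansatz in the style of the CUDA-Q deuteron examples.
struct ansatz {
  void operator()(double theta) __qpu__ {
    cudaq::qvector q(2);
    x(q[0]);
    ry(theta, q[1]);
    x<cudaq::ctrl>(q[1], q[0]);
  }
};

int main() {
  // Deuteron-style Hamiltonian.
  cudaq::spin_op h = 5.907 - 2.1433 * cudaq::spin::x(0) * cudaq::spin::x(1) -
                     2.1433 * cudaq::spin::y(0) * cudaq::spin::y(1) +
                     0.21829 * cudaq::spin::z(0) - 6.125 * cudaq::spin::z(1);

  // On the tensornet backend this returns only the overall <H>;
  // per-term expectation values are not stored in the ObserveResult.
  auto result = cudaq::observe(ansatz{}, h, 0.59);
  printf("<H> = %.6f\n", result.expectation());

  // If term-by-term values are needed, observe each term individually,
  // as suggested by the documentation note in this patch. (Coefficients and
  // the constant identity offset must be applied separately if reconstructing
  // <H> from these values.)
  std::vector<cudaq::spin_op> terms = {cudaq::spin::x(0) * cudaq::spin::x(1),
                                       cudaq::spin::y(0) * cudaq::spin::y(1),
                                       cudaq::spin::z(0), cudaq::spin::z(1)};
  for (auto &term : terms) {
    auto termResult = cudaq::observe(ansatz{}, term, 0.59);
    printf("<term> = %.6f\n", termResult.expectation());
  }
  return 0;
}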