Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for AMD GPUs #619

Merged
merged 17 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@ cmake_minimum_required(VERSION 3.11)

execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
if(has_nvcc STREQUAL "")
project(qsim)
execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
if(has_hipcc STREQUAL "")
project(qsim)
else()
project(qsim LANGUAGES CXX HIP)
ADD_SUBDIRECTORY(pybind_interface/hip)
endif()
else()
project(qsim LANGUAGES CXX CUDA)
ADD_SUBDIRECTORY(pybind_interface/cuda)
Expand Down
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ TESTS = run-cxx-tests

CXX=g++
NVCC=nvcc
HIPCC=hipcc

CXXFLAGS = -O3 -fopenmp
ARCHFLAGS = -march=native
NVCCFLAGS = -O3
HIPCCFLAGS = -O3

# CUQUANTUM_ROOT should be set.
CUSTATEVECFLAGS = -I$(CUQUANTUM_ROOT)/include -L${CUQUANTUM_ROOT}/lib -L$(CUQUANTUM_ROOT)/lib64 -lcustatevec -lcublas
Expand All @@ -22,6 +24,8 @@ export ARCHFLAGS
export NVCC
export NVCCFLAGS
export CUSTATEVECFLAGS
export HIPCC
export HIPCCFLAGS

ifeq ($(PYBIND11), true)
TARGETS += pybind
Expand All @@ -43,6 +47,10 @@ qsim-cuda:
qsim-custatevec:
$(MAKE) -C apps/ qsim-custatevec

.PHONY: qsim-hip
qsim-hip:
$(MAKE) -C apps/ qsim-hip

.PHONY: pybind
pybind:
$(MAKE) -C pybind_interface/ pybind
Expand All @@ -59,6 +67,10 @@ cuda-tests:
custatevec-tests:
$(MAKE) -C tests/ custatevec-tests

.PHONY: hip-tests
hip-tests:
$(MAKE) -C tests/ hip-tests

.PHONY: run-cxx-tests
run-cxx-tests: cxx-tests
$(MAKE) -C tests/ run-cxx-tests
Expand All @@ -71,6 +83,10 @@ run-cuda-tests: cuda-tests
run-custatevec-tests: custatevec-tests
$(MAKE) -C tests/ run-custatevec-tests

.PHONY: run-hip-tests
run-hip-tests: hip-tests
$(MAKE) -C tests/ run-hip-tests

PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py')

.PHONY: run-py-tests
Expand Down
9 changes: 9 additions & 0 deletions apps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)

HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)

.PHONY: qsim
qsim: $(CXX_TARGETS)

Expand All @@ -16,6 +19,9 @@ qsim-cuda: $(CUDA_TARGETS)
.PHONY: qsim-custatevec
qsim-custatevec: $(CUSTATEVEC_TARGETS)

.PHONY: qsim-hip
qsim-hip: $(HIP_TARGETS)

%.x: %.cc
$(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS)

Expand All @@ -25,6 +31,9 @@ qsim-custatevec: $(CUSTATEVEC_TARGETS)
%custatevec.x: %custatevec.cu
$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)

%hip.x: %cuda.cu
$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)

.PHONY: clean
clean:
-rm -f ./*.x ./*.a ./*.so ./*.mod
16 changes: 11 additions & 5 deletions apps/make.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,15 @@ g++ -O3 -march=native -fopenmp -o qsim_amplitudes.x qsim_amplitudes.cc
g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc
g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc

nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
if command -v nvcc &>/dev/null; then
nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu

# CUQUANTUM_ROOT should be set.
CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
if [ -n "$CUQUANTUM_ROOT" ]; then
CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
fi
elif command -v hipcc &>/dev/null; then
hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
hipcc -O3 -o qsim_qtrajectory_hip.x qsim_qtrajectory_cuda.cu
fi
2 changes: 2 additions & 0 deletions docs/_book.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ upper_tabs:
path: /qsim/tutorials/q32d14
- title: "Simulate noise"
path: /qsim/tutorials/noisy_qsimcirq
- title: "AMD GPU support"
path: /qsim/tutorials/amd_gpu

- name: "Guides"
contents:
Expand Down
86 changes: 86 additions & 0 deletions docs/tutorials/amd_gpu.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Support for AMD Instinct™ MI Series Accelerators

qsim provides support for AMD Instinct accelerators.
The implementation covers the native GPU support in qsim
by utilizing [AMD HIP SDK](https://rocm.docs.amd.com/projects/HIP)
(Heterogeneous-Compute Interface for Portability).
The cuQuantum implementation is currently not covered.

## Building

Building qsim with support for AMD Instinct accelerators requires installation of
[AMD ROCm™ Open Software Platform](https://www.amd.com/en/developer/resources/rocm-hub.html).
Instructions for installing ROCm are available at https://rocm.docs.amd.com/.

To enable support for AMD GPUs, qsim needs to be built from sources.
This can be done as follows:

```
conda env list
conda create -y -n CirqDevEnv python=3
conda activate CirqDevEnv
pip install pybind11

git clone https://github.com/quantumlib/qsim.git
cd qsim

make -j qsim # to build CPU qsim
make -j qsim-hip # to build HIP qsim
make -j pybind # to build Python bindings
make -j cxx-tests # to build CPU tests
make -j hip-tests # to build HIP tests

pip install .
```

Note: To avoid problems when building qsim with support for AMD GPUs,
make sure to use the latest version of CMake.

## Testing

### Simulator

To test the qsim simulator:

```
make run-cxx-tests # to run CPU tests
make run-hip-tests # to run HIP tests
```

or

```
cd tests
for file in *.x; do ./"$file"; done # to run all tests
for file in *_hip_test.x; do ./"$file"; done # to run HIP tests only
```

### Python Bindings

To test the Python bindings:

```
make run-py-tests
```

or

```
cd qsimcirq_tests
python3 -m pytest -v qsimcirq_test.py
```

## Using

Using qsim on AMD Instinct GPUs is identical to using it on NVIDIA GPUs.
I.e., it is done by passing `use_gpu=True` and `gpu_mode=0` as `qsimcirq.QSimOptions`:

```
simulator = qsimcirq.QSimSimulator(qsim_options=qsimcirq.QSimOptions(
use_gpu=True,
gpu_mode=0,
...
))
```

Note: `gpu_mode` has to be set to zero for AMD GPUs, as cuStateVec is not supported.
61 changes: 61 additions & 0 deletions lib/cuda2hip.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2023 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SIMULATOR_CUDA2HIP_H_
#define SIMULATOR_CUDA2HIP_H_

#define cublasCaxpy hipblasCaxpy
#define cublasCdotc hipblasCdotc
#define cublasCreate hipblasCreate
#define cublasCscal hipblasCscal
#define cublasCsscal hipblasCsscal
#define cublasDestroy hipblasDestroy
#define cublasDznrm2 hipblasDznrm2
#define cublasHandle_t hipblasHandle_t
#define cublasScnrm2 hipblasScnrm2
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define cublasStatus_t hipblasStatus_t
#define cublasZaxpy hipblasZaxpy
#define cublasZdotc hipblasZdotc
#define cublasZdscal hipblasZdscal
#define cublasZscal hipblasZscal
#define cuCimagf hipCimagf
#define cuCimag hipCimag
#define cuComplex hipComplex
#define cuCrealf hipCrealf
#define cuCreal hipCreal
#define CUDA_C_32F HIPBLAS_C_32F
#define CUDA_C_64F HIPBLAS_C_64F
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaFree hipFree
#define cudaGetErrorString hipGetErrorString
#define cudaMalloc hipMalloc
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpy hipMemcpy
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemset hipMemset
#define cudaPeekAtLastError hipPeekAtLastError
#define cudaSuccess hipSuccess
#define cuDoubleComplex hipDoubleComplex

template <typename T>
__device__ __forceinline__ T __shfl_down_sync(
unsigned mask, T var, unsigned int delta, int width = warpSize) {
return __shfl_down(var, delta, width);
}

#endif // SIMULATOR_CUDA2HIP_H_
4 changes: 0 additions & 4 deletions lib/fuser_mqubit.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,8 +561,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
std::vector<GateF*>& orphaned_gates,
std::vector<GateFused>& fused_gates) {
unsigned count = 0;

for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
auto ogate1 = orphaned_gates[i];

Expand All @@ -575,8 +573,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {

if (ogate2->visited == kFinal) continue;

++count;

unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();

if (cur_size <= max_fused_size) {
Expand Down
13 changes: 9 additions & 4 deletions lib/simulator_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@
#ifndef SIMULATOR_CUDA_KERNELS_H_
#define SIMULATOR_CUDA_KERNELS_H_

#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

namespace qsim {

Expand Down
10 changes: 8 additions & 2 deletions lib/statespace_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
#ifndef STATESPACE_CUDA_H_
#define STATESPACE_CUDA_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include <algorithm>
#include <complex>
Expand Down Expand Up @@ -102,7 +107,8 @@ class StateSpaceCUDA :
}

void SetAllZeros(State& state) const {
cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type));
ErrorCheck(cudaMemset(state.get(), 0,
MinSize(state.num_qubits()) * sizeof(fp_type)));
}

// Uniform superposition.
Expand Down
7 changes: 6 additions & 1 deletion lib/statespace_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
#ifndef STATESPACE_CUDA_KERNELS_H_
#define STATESPACE_CUDA_KERNELS_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include "util_cuda.h"

Expand Down
6 changes: 5 additions & 1 deletion lib/util_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@
#ifndef UTIL_CUDA_H_
#define UTIL_CUDA_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#endif

#include <cstdlib>

Expand Down
Loading
Loading