diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..63440e5f3b --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# @global-owner1 and @global-owner2 will be requested for +# review when someone opens a pull request. +* @EmilyBourne @bauom + diff --git a/.github/actions/coverage_collection/action.yml b/.github/actions/coverage_collection/action.yml index 03d1b2cdfd..e32b82b317 100644 --- a/.github/actions/coverage_collection/action.yml +++ b/.github/actions/coverage_collection/action.yml @@ -6,7 +6,6 @@ runs: - name: Coverage collection run: | coverage combine - coverage xml rm ${SITE_DIR}/pyccel_cov.pth shell: bash diff --git a/.github/actions/coverage_install/action.yml b/.github/actions/coverage_install/action.yml index 44ef760dcb..fb55f29c45 100644 --- a/.github/actions/coverage_install/action.yml +++ b/.github/actions/coverage_install/action.yml @@ -10,7 +10,7 @@ runs: - name: Directory Creation run: | INSTALL_DIR=$(cd tests; python -c "import pyccel; print(pyccel.__path__[0])") - SITE_DIR=$(python -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') + SITE_DIR=$(dirname ${INSTALL_DIR}) echo -e "import coverage; coverage.process_startup()" > ${SITE_DIR}/pyccel_cov.pth echo -e "[run]\nparallel = True\nsource = ${INSTALL_DIR}\ndata_file = $(pwd)/.coverage\n[report]\ninclude = ${INSTALL_DIR}/*\n[xml]\noutput = cobertura.xml" > .coveragerc echo "SITE_DIR=${SITE_DIR}" >> $GITHUB_ENV diff --git a/.github/actions/linux_install/action.yml b/.github/actions/linux_install/action.yml index 8fb5cd8505..0ef9a69b8e 100644 --- a/.github/actions/linux_install/action.yml +++ b/.github/actions/linux_install/action.yml @@ -9,22 +9,22 @@ runs: shell: bash - name: Install fortran run: - sudo apt-get install gfortran + sudo apt-get install -y gfortran shell: bash - name: Install LaPack run: - sudo apt-get install 
libblas-dev liblapack-dev + sudo apt-get install -y libblas-dev liblapack-dev shell: bash - name: Install MPI run: | - sudo apt-get install libopenmpi-dev openmpi-bin + sudo apt-get install -y libopenmpi-dev openmpi-bin echo "MPI_OPTS=--oversubscribe" >> $GITHUB_ENV shell: bash - name: Install OpenMP run: - sudo apt-get install libomp-dev libomp5 + sudo apt-get install -y libomp-dev libomp5 shell: bash - name: Install Valgrind run: - sudo apt-get install valgrind + sudo apt-get install -y valgrind shell: bash diff --git a/.github/actions/pytest_run/action.yml b/.github/actions/pytest_run/action.yml index 13f20d7c04..10113c373e 100644 --- a/.github/actions/pytest_run/action.yml +++ b/.github/actions/pytest_run/action.yml @@ -16,17 +16,18 @@ runs: - name: Test with pytest run: | which python - python -m pytest -n auto -rXx -v -m "not (parallel or xdist_incompatible) and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays + python -m pytest -n auto -rXx -v -m "not (parallel or xdist_incompatible) and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays if [ -n "${SITE_DIR}" ]; then echo "Touching" # Test ndarray folder update (requires parallel tests to avoid clean) touch ${SITE_DIR}/pyccel/stdlib/cwrapper/cwrapper.h python -m pytest -n auto -rXx -v -m c -k test_array_int32_1d_scalar epyccel/test_arrays.py fi - python -m pytest -rXx -m "xdist_incompatible and not parallel and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays + + python -m pytest -rXx -m "xdist_incompatible and not parallel and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays pyccel-clean - python -m pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and not (c or python) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays - python -m pytest -rXx -m "xdist_incompatible and not parallel and not (c or python) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays + python -m 
pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and not (c or python or ccuda) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays + python -m pytest -rXx -m "xdist_incompatible and not parallel and not (c or python or ccuda) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays pyccel-clean python -m pytest ndarrays/ -rXx pyccel-clean diff --git a/.github/actions/pytest_run_cuda/action.yml b/.github/actions/pytest_run_cuda/action.yml new file mode 100644 index 0000000000..0c55ccd81a --- /dev/null +++ b/.github/actions/pytest_run_cuda/action.yml @@ -0,0 +1,17 @@ +name: 'Pyccel pytest commands generating Ccuda' +inputs: + shell_cmd: + description: 'Specifies the shell command (different for anaconda)' + required: false + default: "bash" + +runs: + using: "composite" + steps: + - name: Ccuda tests with pytest + run: | + # Catch exit 5 (no tests found) + sh -c 'python -m pytest -n auto -rx -m "not (parallel or xdist_incompatible) and ccuda" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + pyccel-clean + shell: ${{ inputs.shell_cmd }} + working-directory: ./tests diff --git a/.github/actions/pytest_run_python/action.yml b/.github/actions/pytest_run_python/action.yml index 842fd2eaf6..c9595dde1f 100644 --- a/.github/actions/pytest_run_python/action.yml +++ b/.github/actions/pytest_run_python/action.yml @@ -10,8 +10,8 @@ runs: steps: - name: Python tests with pytest run: | - python -m pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and python" --ignore=symbolic --ignore=ndarrays - python -m pytest -rXx -m "xdist_incompatible and not parallel and python" --ignore=symbolic --ignore=ndarrays + python -m pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and python" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays + python -m pytest -rXx -m "xdist_incompatible and not parallel and python" --ignore=symbolic 
--ignore=ndarrays --ignore=cuda_ndarrays pyccel-clean shell: ${{ inputs.shell_cmd }} working-directory: ./tests diff --git a/.github/actions/python_install/action.yml b/.github/actions/python_install/action.yml new file mode 100644 index 0000000000..f9b720e3e1 --- /dev/null +++ b/.github/actions/python_install/action.yml @@ -0,0 +1,17 @@ +name: 'Python installation commands' + +runs: + using: "composite" + steps: + - name: Install python + run: + sudo apt-get -y install python3-dev + shell: bash + - name: python as python3 + run: + sudo apt-get -y install python-is-python3 + shell: bash + - name: Install Pip + run: + sudo apt-get -y install python3-pip + shell: bash diff --git a/.github/workflows/Github_pytest.yml b/.github/workflows/Github_pytest.yml index b8f225a32e..a8ae6da2bc 100644 --- a/.github/workflows/Github_pytest.yml +++ b/.github/workflows/Github_pytest.yml @@ -2,7 +2,7 @@ name: Pull Request on: pull_request: - branches: [ devel ] + branches: [ main, devel ] types: [ opened, reopened, ready_for_review, synchronize, converted_to_draft ] issue_comment: types: [ created ] @@ -16,6 +16,7 @@ jobs: run_windows: ${{ steps.run_bot.outputs.run_windows }} run_macosx: ${{ steps.run_bot.outputs.run_macosx }} run_coverage: ${{ steps.run_bot.outputs.run_coverage }} + run_cuda: ${{ steps.run_bot.outputs.run_cuda }} run_docs: ${{ steps.run_bot.outputs.run_docs }} run_pylint: ${{ steps.run_bot.outputs.run_pylint }} run_lint: ${{ steps.run_bot.outputs.run_lint }} @@ -81,9 +82,28 @@ jobs: python_version: ${{ needs.Bot.outputs.python_version }} ref: ${{ needs.Bot.outputs.REF }} + cuda: + needs: Bot + if: ${{ needs.Bot.outputs.run_cuda == 'True' }} + uses: + ./.github/workflows/cuda.yml + with: + python_version: ${{ (needs.Bot.outputs.python_version == '') && '3.10' || needs.Bot.outputs.python_version }} + ref: ${{ needs.Bot.outputs.REF }} + + coverage_collection: + needs: [Bot, linux, cuda] + if: ${{ always() && needs.Bot.outputs.run_coverage == 'True' && 
needs.linux.result == 'success' && needs.cuda.result != 'failure' }} + uses: + ./.github/workflows/coverage_collect.yml + with: + python_version: ${{ (needs.Bot.outputs.python_version == '') && '3.7' || needs.Bot.outputs.python_version }} + ref: ${{ needs.Bot.outputs.REF }} + cuda_result: ${{ needs.cuda.result }} + coverage: - needs: [Bot, linux] - if: ${{ needs.Bot.outputs.run_coverage == 'True' }} + needs: [Bot, linux, cuda, coverage_collection] + if: ${{ always() && needs.Bot.outputs.run_coverage == 'True' && needs.coverage_collection.result == 'success' && needs.cuda.result != 'failure' }} uses: ./.github/workflows/coverage.yml with: diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index e27379f0cd..b8c551a5f6 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/checkout@v3 with: ref: ${{ inputs.ref }} - - name: Set up Python 3.7 + - name: Set up Python ${{ inputs.python_version }} uses: actions/setup-python@v4 with: python-version: 3.7 @@ -44,7 +44,7 @@ - name: Collect coverage information uses: actions/download-artifact@v3 with: - name: coverage-artifact + name: coverage-artifact-xml - name: Collect diff information run: | git fetch diff --git a/.github/workflows/coverage_collect.yml b/.github/workflows/coverage_collect.yml new file mode 100644 index 0000000000..e0c4a0c3bc --- /dev/null +++ b/.github/workflows/coverage_collect.yml @@ -0,0 +1,67 @@ +name: Unit test coverage collection + +on: + workflow_call: + inputs: + python_version: + required: true + type: string + ref: + required: false + type: string + cuda_result: + required: true + type: string + +jobs: + CoverageChecker: + + runs-on: ubuntu-latest + name: Unit tests + + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ inputs.ref }} + - name: Set up Python ${{ inputs.python_version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python_version }} + - name: Install 
dependencies + uses: ./.github/actions/linux_install + - name: Install coverage + run: | + python -m pip install --upgrade pip + python -m pip install coverage + shell: bash + - name: Collect coverage information + uses: actions/download-artifact@v3 + with: + name: coverage-artifact + - name: Rename coverage file + run: mv .coverage .coverage.linux + - name: Collect coverage information + uses: actions/download-artifact@v3 + if: ${{ inputs.cuda_result == 'success' }} + with: + name: cuda-coverage-artifact + - name: Rename coverage file + if: ${{ inputs.cuda_result == 'success' }} + run: mv .coverage .coverage.cuda + - name: Generate coverage report + run: | + echo -e "[paths]\nsource =\n $(pwd)/pyccel\n */site-packages/pyccel\n[xml]\noutput = cobertura.xml" > .coveragerc + coverage combine + coverage xml + - name: Run codacy-coverage-reporter + uses: codacy/codacy-coverage-reporter-action@master + continue-on-error: True + with: + project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} + coverage-reports: cobertura.xml + - name: Save code coverage xml report + uses: actions/upload-artifact@v3 + with: + name: coverage-artifact-xml + path: cobertura.xml + retention-days: 1 diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml new file mode 100644 index 0000000000..9c9d181190 --- /dev/null +++ b/.github/workflows/cuda.yml @@ -0,0 +1,55 @@ +name: Cuda unit tests + +on: + workflow_call: + inputs: + python_version: + required: true + type: string + ref: + required: false + type: string + +jobs: + Cuda: + + runs-on: ubuntu-20.04 + name: Unit tests + + container: nvidia/cuda:11.7.1-devel-ubuntu20.04 + steps: + - uses: actions/checkout@v3 + - name: Prepare docker + run: | + apt update && apt install sudo + TZ=Europe/France + ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata + shell: bash + - name: CUDA Version + run: nvcc --version # cuda install check + - 
name: Install dependencies + uses: ./.github/actions/linux_install + - name: Install python (setup-python action doesn't work with containers) + uses: ./.github/actions/python_install + - name: Install Pyccel with tests + run: | + PATH=${PATH}:$HOME/.local/bin + echo "PATH=${PATH}" >> $GITHUB_ENV + python -m pip install --upgrade pip + python -m pip install --user .[test] + shell: bash + - name: Coverage install + uses: ./.github/actions/coverage_install + - name: Ccuda tests with pytest + uses: ./.github/actions/pytest_run_cuda + - name: Collect coverage information + continue-on-error: True + uses: ./.github/actions/coverage_collection + - name: Save code coverage report + uses: actions/upload-artifact@v3 + with: + name: cuda-coverage-artifact + path: .coverage + retention-days: 1 + diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index d2367d95b2..faced10877 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -53,15 +53,9 @@ jobs: - name: Collect coverage information continue-on-error: True uses: ./.github/actions/coverage_collection - - name: Run codacy-coverage-reporter - uses: codacy/codacy-coverage-reporter-action@master - continue-on-error: True - with: - project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} - coverage-reports: cobertura.xml - name: Save code coverage report uses: actions/upload-artifact@v3 with: name: coverage-artifact - path: cobertura.xml + path: .coverage retention-days: 1 diff --git a/MANIFEST.in b/MANIFEST.in index 656cdd153a..bf873bea31 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,6 +6,7 @@ include README.md recursive-include pyccel *.tx recursive-include pyccel *.pyh recursive-include pyccel *.c +recursive-include pyccel *.cu recursive-include pyccel *.f90 recursive-include pyccel *.h recursive-include pyccel *.pyccel diff --git a/ci_tools/bot_interaction.py b/ci_tools/bot_interaction.py index e68848cf7d..a5e5c21dac 100644 --- a/ci_tools/bot_interaction.py +++ 
b/ci_tools/bot_interaction.py @@ -13,7 +13,7 @@ senior_reviewer = ['yguclu', 'EmilyBourne'] trusted_reviewers = ['yguclu', 'EmilyBourne', 'ratnania', 'saidctb', 'bauom'] -pr_test_keys = ['linux', 'windows', 'macosx', 'coverage', 'docs', 'pylint', +pr_test_keys = ['linux', 'windows', 'macosx', 'cuda', 'coverage', 'docs', 'pylint', 'lint', 'spelling'] review_labels = ('needs_initial_review', 'Ready_for_review', 'Ready_to_merge') @@ -94,7 +94,9 @@ def run_tests(pr_id, tests, outputs, event): if outputs['run_coverage']: outputs['run_linux'] = True + outputs['run_cuda'] = True running.add('linux') + running.add('cuda') running_tests = bool(running) @@ -434,6 +436,7 @@ def flagged_as_trusted(pr_id, user): outputs = {'run_linux': False, 'run_windows': False, 'run_macosx': False, + 'run_cuda': False, 'run_coverage': False, 'run_docs': False, 'run_pylint': False, diff --git a/ci_tools/bot_messages/show_tests.txt b/ci_tools/bot_messages/show_tests.txt index 79fe6e3478..028d29224e 100644 --- a/ci_tools/bot_messages/show_tests.txt +++ b/ci_tools/bot_messages/show_tests.txt @@ -2,6 +2,7 @@ The following is a list of keywords which can be used to run tests. Tests in bol - **linux** : Runs the unit tests on a Linux system. - **windows** : Runs the unit tests on a Windows system. - **macosx** : Runs the unit tests on a MacOS X system. +- **cuda** : Runs the cuda unit tests on a Linux system. - **coverage** : Runs the unit tests on a Linux system and checks the coverage of the tests. - **docs** : Checks if the documentation follows the numpydoc format. - **pylint** : Runs pylint on files which are too big to be handled by codacy. 
diff --git a/pyccel/ast/class_defs.py b/pyccel/ast/class_defs.py index 1a08f2ab0a..bf25800c2d 100644 --- a/pyccel/ast/class_defs.py +++ b/pyccel/ast/class_defs.py @@ -160,6 +160,13 @@ #======================================================================================= +CudaArrayClass = ClassDef('cuda.ndarray', + methods=[ + FunctionDef('shape',[],[],body=[], + decorators={'property':'property', 'numpy_wrapper':Shape})]) + +#======================================================================================= + literal_classes = { NativeBool() : BooleanClass, NativeInteger() : IntegerClass, diff --git a/pyccel/ast/core.py b/pyccel/ast/core.py index d256fd1a50..512c5b56bd 100644 --- a/pyccel/ast/core.py +++ b/pyccel/ast/core.py @@ -57,9 +57,9 @@ 'Declare', 'Decorator', 'Del', + 'DoConcurrent', 'DottedFunctionCall', 'Duplicate', - 'DoConcurrent', 'EmptyNode', 'ErrorExit', 'Exit', @@ -75,10 +75,11 @@ 'If', 'IfSection', 'Import', - 'InlineFunctionDef', 'InProgram', + 'InlineFunctionDef', 'Interface', 'Iterable', + 'KernelCall', 'Module', 'ModuleHeader', 'Pass', @@ -93,8 +94,8 @@ 'SympyFunction', 'While', 'With', - 'create_variable', 'create_incremented_string', + 'create_variable', 'get_iterable_ranges', 'inline', 'subs' @@ -2270,6 +2271,46 @@ def prefix(self): """ return self._prefix +class KernelCall(FunctionCall): + """ + Represents a kernel call + (e.g. 
a[c, b]()) + + + + Parameters + ========== + func : FunctionDef + The definition of the function being called + args : tuple + The arguments being passed to the function + + numBlocks : NativeInteger + The number of blocks + tpblock : NativeInteger + The number of threads per block + """ + __slots__ = ('_numBlocks','_tpblock', '_func', '_args') + _attribute_nodes = (*FunctionCall._attribute_nodes, '_numBlocks', '_tpblock') + + def __init__(self, func, args, numBlocks, tpblock, current_function=None): + + self._numBlocks = numBlocks + self._tpblock = tpblock + super().__init__(func, args, current_function) + + @property + def numBlocks(self): + """ The number of blocks in which the kernel will run + """ + return self._numBlocks + + @property + def tpblock(self): + """ The number of threads in each block + """ + return self._tpblock + class Return(Basic): """Represents a function return in the code. diff --git a/pyccel/ast/cudaext.py b/pyccel/ast/cudaext.py new file mode 100644 index 0000000000..3f04c272ee --- /dev/null +++ b/pyccel/ast/cudaext.py @@ -0,0 +1,276 @@ +from .basic import PyccelAstNode +from .builtins import (PythonTuple,PythonList) + +from .core import Module, PyccelFunctionDef + +from .datatypes import NativeInteger + +from .internals import PyccelInternalFunction, get_final_precision + +from .literals import LiteralInteger +from .literals import LiteralTrue, LiteralFalse +from .operators import PyccelAdd, PyccelMul +from .variable import (Variable, HomogeneousTupleVariable) + +from .numpyext import process_dtype, process_shape, NumpyNewArray + +#============================================================================== +__all__ = ( + 'CudaArray', + 'CudaBlockDim', + 'CudaBlockIdx', + 'CudaCopy', + 'CudaGrid', + 'CudaGridDim', + 'CudaInternalVar', + 'CudaMemCopy', + 'CudaNewArray', + 'CudaSynchronize', + 'CudaThreadIdx' +) + +#============================================================================== +class CudaNewArray(NumpyNewArray): + """ 
Class from which all Cuda functions which imply a call to Allocate + inherit + """ + __slots__ = () + +#============================================================================== + +#============================================================================== +class CudaArray(CudaNewArray): + """ + Represents a call to cuda.array for code generation. + + arg : list, tuple, PythonList + + """ + __slots__ = ('_arg','_dtype','_precision','_shape','_rank','_order','_memory_location') + _attribute_nodes = ('_arg',) + name = 'array' + + def __init__(self, arg, dtype=None, order='C', memory_location='managed'): + + if not isinstance(arg, (PythonTuple, PythonList, Variable)): + raise TypeError(f"Unknown type of {type(arg)}.") + + is_homogeneous_tuple = isinstance(arg, (PythonTuple, PythonList, HomogeneousTupleVariable)) and arg.is_homogeneous + is_array = isinstance(arg, Variable) and arg.is_ndarray + + # TODO: treat inhomogenous lists and tuples when they have mixed ordering + if not (is_homogeneous_tuple or is_array): + raise TypeError('we only accept homogeneous arguments') + + # Verify dtype and get precision + if dtype is None: + dtype = arg.dtype + prec = get_final_precision(arg) + else: + dtype, prec = process_dtype(dtype) + # ... Determine ordering + order = str(order).strip("\'") + + shape = process_shape(False, arg.shape) + rank = len(shape) + + if rank < 2: + order = None + else: + # ... Determine ordering + order = str(order).strip("\'") + + if order not in ('K', 'A', 'C', 'F'): + raise ValueError(f"Cannot recognize '{order}' order") + + # TODO [YG, 18.02.2020]: set correct order based on input array + if order in ('K', 'A'): + order = 'C' + # ... 
+ #Verify memory location + if memory_location not in ('host', 'device', 'managed'): + raise ValueError("memory_location must be 'host', 'device' or 'managed'") + self._arg = arg + self._shape = shape + self._rank = rank + self._dtype = dtype + self._order = order + self._precision = prec + self._memory_location = memory_location + super().__init__() + + def __str__(self): + return str(self.arg) + + @property + def arg(self): + return self._arg + @property + def memory_location(self): + return self._memory_location + +class CudaSynchronize(PyccelInternalFunction): + "Represents a call to Cuda.deviceSynchronize for code generation." + + __slots__ = () + _attribute_nodes = () + _shape = None + _rank = 0 + _dtype = NativeInteger() + _precision = None + _order = None + def __init__(self): + super().__init__() + +class CudaInternalVar(PyccelAstNode): + """ + Represents a General Class For Cuda internal Variables Used To locate Thread In the GPU architecture" + + Parameters + ---------- + dim : NativeInteger + Represent the dimension where we want to locate our thread. + + """ + __slots__ = ('_dim','_dtype', '_precision') + _attribute_nodes = ('_dim',) + _shape = None + _rank = 0 + _order = None + + def __init__(self, dim=None): + + if isinstance(dim, int): + dim = LiteralInteger(dim) + if not isinstance(dim, LiteralInteger): + raise TypeError("dimension need to be an integer") + if dim not in (0, 1, 2): + raise ValueError("dimension need to be 0, 1 or 2") + #... + self._dim = dim + self._dtype = dim.dtype + self._precision = dim.precision + super().__init__() + + @property + def dim(self): + return self._dim + + +class CudaCopy(CudaNewArray): + """ + Represents a call to cuda.copy for code generation. + + Parameters + ---------- + arg : Variable + + memory_location : str + 'host' the newly created array is allocated on host. + 'device' the newly created array is allocated on device. 
+ + is_async: bool + Indicates whether the copy is asynchronous or not [Default value: False] + + """ + __slots__ = ('_arg','_dtype','_precision','_shape','_rank','_order','_memory_location', '_is_async') + + def __init__(self, arg, memory_location, is_async=False): + + if not isinstance(arg, Variable): + raise TypeError(f"unknown type of {type(arg)}.") + + # Verify the memory_location of src + if arg.memory_location not in ('device', 'host', 'managed'): + raise ValueError("The direction of the copy should be from 'host' or 'device'") + + # Verify the memory_location of dst + if memory_location not in ('device', 'host', 'managed'): + raise ValueError("The direction of the copy should be to 'host' or 'device'") + + # verify the type of is_async + if not isinstance(is_async, (LiteralTrue, LiteralFalse, bool)): + raise TypeError('is_async must be boolean') + + self._arg = arg + self._shape = arg.shape + self._rank = arg.rank + self._dtype = arg.dtype + self._order = arg.order + self._precision = arg.precision + self._memory_location = memory_location + self._is_async = is_async + super().__init__() + + @property + def arg(self): + return self._arg + + @property + def memory_location(self): + return self._memory_location + + @property + def is_async(self): + return self._is_async + +class CudaThreadIdx(CudaInternalVar): + __slots__ = () + pass +class CudaBlockDim(CudaInternalVar): + __slots__ = () + pass +class CudaBlockIdx(CudaInternalVar): + __slots__ = () + pass +class CudaGridDim(CudaInternalVar): + __slots__ = () + pass + +class CudaGrid(PyccelAstNode) : + """ + CudaGrid locates a thread in the GPU architecture using `CudaThreadIdx`, `CudaBlockDim`, `CudaBlockIdx` + to calculate the exact index of the thread automatically. + + Parameters + ---------- + dim : NativeInteger + Represent the dimension where we want to locate our thread. 
+ + """ + __slots__ = () + _attribute_nodes = () + def __new__(cls, dim=LiteralInteger(0)): + if not isinstance(dim, LiteralInteger): + raise TypeError("dimension need to be an integer") + if dim not in (0, 1, 2): + raise ValueError("dimension need to be 0, 1 or 2") + expr = [PyccelAdd(PyccelMul(CudaBlockIdx(d), CudaBlockDim(d)), CudaThreadIdx(d))\ + for d in range(dim.python_value + 1)] + if dim == 0: + return expr[0] + return PythonTuple(*expr) + + + +cuda_funcs = { + 'array' : PyccelFunctionDef('array' , CudaArray), + 'copy' : PyccelFunctionDef('copy' , CudaCopy), + 'synchronize' : PyccelFunctionDef('synchronize' , CudaSynchronize), + 'threadIdx' : PyccelFunctionDef('threadIdx' , CudaThreadIdx), + 'blockDim' : PyccelFunctionDef('blockDim' , CudaBlockDim), + 'blockIdx' : PyccelFunctionDef('blockIdx' , CudaBlockIdx), + 'gridDim' : PyccelFunctionDef('gridDim' , CudaGridDim), + 'grid' : PyccelFunctionDef('grid' , CudaGrid) +} + +cuda_Internal_Var = { + 'CudaThreadIdx' : 'threadIdx', + 'CudaBlockDim' : 'blockDim', + 'CudaBlockIdx' : 'blockIdx', + 'CudaGridDim' : 'gridDim' +} + +cuda_mod = Module('cuda', + variables = [], + funcs = cuda_funcs.values()) \ No newline at end of file diff --git a/pyccel/ast/cupyext.py b/pyccel/ast/cupyext.py new file mode 100644 index 0000000000..e0082cc6f3 --- /dev/null +++ b/pyccel/ast/cupyext.py @@ -0,0 +1,481 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +#------------------------------------------------------------------------------------------# +# This file is part of Pyccel which is released under MIT License. See the LICENSE file or # +# go to https://github.com/pyccel/pyccel/blob/master/LICENSE for full license details. 
# +#------------------------------------------------------------------------------------------# +""" Module containing objects from the cupy module understood by pyccel +""" + +from functools import reduce +import operator + +from pyccel.errors.errors import Errors + +from pyccel.utilities.stage import PyccelStage + +from .basic import PyccelAstNode +from .builtins import (PythonTuple, PythonList) + +from .core import Module, PyccelFunctionDef + +from .datatypes import (default_precision, NativeInteger, + NativeFloat, NativeComplex, NativeBool, str_dtype, + NativeNumeric) + +from .internals import PyccelInternalFunction, max_precision, get_final_precision +from .internals import PyccelArraySize + +from .literals import LiteralInteger, LiteralFloat, LiteralComplex +from .literals import LiteralTrue, LiteralFalse +from .literals import Nil +from .mathext import MathCeil +from .operators import PyccelMinus, PyccelDiv +from .variable import (Variable, HomogeneousTupleVariable) +from .cudaext import CudaNewArray +from .numpyext import process_dtype, process_shape, DtypePrecisionToCastFunction + +errors = Errors() +pyccel_stage = PyccelStage() + +__all__ = ( + 'CupyArange', + 'CupyArray', + 'CupyArraySize', + 'CupyAutoFill', + 'CupyEmpty', + 'CupyEmptyLike', + 'CupyFull', + 'CupyFullLike', + 'CupyNewArray', + 'CupyOnes', + 'CupyOnesLike', + 'CupyZeros', + 'CupyZerosLike', + 'Shape' +) + +#============================================================================== +class CupyNewArray(CudaNewArray): + """ Class from which all Cupy functions which imply a call to Allocate + inherit + """ + __slots__ = () + _memory_location = 'device' + def __init__(self): + super().__init__() + + @property + def memory_location(self): + """ Indicate if the array is allocated on the host, device or has a managed memory + """ + return self._memory_location + +#============================================================================== +class CupyArray(CupyNewArray): + """ + Represents a 
call to cupy.array for code generation. + + arg : list, tuple, PythonList + + """ + __slots__ = ('_arg','_dtype','_precision','_shape','_rank','_order') + _attribute_nodes = ('_arg',) + name = 'array' + + def __init__(self, arg, dtype=None, order='C'): + + if not isinstance(arg, (PythonTuple, PythonList, Variable)): + raise TypeError('Unknown type of %s.' % type(arg)) + + is_homogeneous_tuple = isinstance(arg, (PythonTuple, PythonList, HomogeneousTupleVariable)) and arg.is_homogeneous + is_array = isinstance(arg, Variable) and arg.is_ndarray + + # TODO: treat inhomogenous lists and tuples when they have mixed ordering + if not (is_homogeneous_tuple or is_array): + raise TypeError('we only accept homogeneous arguments') + + # Verify dtype and get precision + if dtype is None: + dtype = arg.dtype + prec = get_final_precision(arg) + else: + dtype, prec = process_dtype(dtype) + # ... Determine ordering + order = str(order).strip("\'") + + shape = process_shape(False, arg.shape) + rank = len(shape) + + if rank < 2: + order = None + else: + # ... Determine ordering + order = str(order).strip("\'") + + if order not in ('K', 'A', 'C', 'F'): + raise ValueError(f"Cannot recognize '{order}' order") + + # TODO [YG, 18.02.2020]: set correct order based on input array + if order in ('K', 'A'): + order = 'C' + # ... + + self._arg = arg + self._shape = shape + self._rank = rank + self._dtype = dtype + self._order = order + self._precision = prec + super().__init__() + + def __str__(self): + return str(self.arg) + + @property + def arg(self): + return self._arg + +#============================================================================== +class CupyArange(CupyNewArray): + """ + Represents a call to cupy.arange for code generation. 
#==============================================================================

class Shape(PyccelInternalFunction):
    """
    Represents a call to cupy.shape for code generation.

    ``__new__`` returns the argument's shape directly (as a PythonTuple),
    so no ``Shape`` node is ever inserted into the AST.
    """
    __slots__ = ()
    name = 'shape'
    def __new__(cls, arg):
        # The shape may already be stored as a PythonTuple; otherwise wrap it
        if isinstance(arg.shape, PythonTuple):
            return arg.shape
        else:
            return PythonTuple(*arg.shape)

#==============================================================================
class CupyFull(CupyNewArray):
    """
    Represents a call to cupy.full for code generation.

    Parameters
    ----------
    shape : int or sequence of ints
        Shape of the new array, e.g., ``(2, 3)`` or ``2``.

    fill_value : scalar
        Fill value.

    dtype : str, DataType
        Datatype for the constructed array.
        The default, `None`, means the dtype is inferred from `fill_value`.

    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous
        (row- or column-wise) order in memory.
    """
    __slots__ = ('_fill_value','_dtype','_precision','_shape','_rank','_order')
    name = 'full'

    def __init__(self, shape, fill_value, dtype=None, order='C'):

        # Convert shape to PythonTuple
        shape = process_shape(False, shape)
        # If there is no dtype, extract it from fill_value
        # TODO: must get dtype from an annotated node
        if dtype is None:
            dtype = fill_value.dtype
            precision = get_final_precision(fill_value)
        else:
            dtype, precision = process_dtype(dtype)

        # Cast fill_value to correct type.
        # NOTE(review): the truthiness test skips the cast for the Nil()
        # placeholder passed by CupyAutoFill — confirm no Literal subclass
        # defines a falsy __bool__.
        if fill_value:
            if fill_value.dtype != dtype or get_final_precision(fill_value) != precision:
                cast_func = DtypePrecisionToCastFunction[dtype.name][precision]
                fill_value = cast_func(fill_value)
        self._shape = shape
        self._rank  = len(self._shape)
        self._dtype = dtype
        self._order = CupyNewArray._process_order(self._rank, order)
        self._precision = precision

        super().__init__(fill_value)

    #--------------------------------------------------------------------------
    @property
    def fill_value(self):
        """ Value written into every element of the new array. """
        return self._args[0]

#==============================================================================
class CupyAutoFill(CupyFull):
    """ Abstract class for all classes which inherit from CupyFull but
    where the fill_value is implicitly specified (empty/zeros/ones).
    """
    __slots__ = ()
    def __init__(self, shape, dtype='float', order='C'):
        if not dtype:
            raise TypeError("Data type must be provided")
        # Nil() marks "no explicit fill value"; subclasses override fill_value
        super().__init__(shape, Nil(), dtype, order)

#==============================================================================
class CupyEmpty(CupyAutoFill):
    """ Represents a call to cupy.empty for code generation.
    """
    __slots__ = ()
    name = 'empty'

    def __init__(self, shape, dtype='float', order='C'):
        # Only triggered when a DataType instance (not a str) is passed:
        # normalise it to the matching cast function at default precision.
        if dtype in NativeNumeric:
            precision = default_precision[str_dtype(dtype)]
            dtype = DtypePrecisionToCastFunction[dtype.name][precision]
        super().__init__(shape, dtype, order)
    @property
    def fill_value(self):
        """ An empty array has no fill value. """
        return None


#==============================================================================
class CupyZeros(CupyAutoFill):
    """ Represents a call to cupy.zeros for code generation.
    """
    __slots__ = ()
    name = 'zeros'
    @property
    def fill_value(self):
        """ Zero literal matching the array dtype and precision. """
        dtype = self.dtype
        if isinstance(dtype, NativeInteger):
            value = LiteralInteger(0, precision = self.precision)
        elif isinstance(dtype, NativeFloat):
            value = LiteralFloat(0, precision = self.precision)
        elif isinstance(dtype, NativeComplex):
            value = LiteralComplex(0., 0., precision = self.precision)
        elif isinstance(dtype, NativeBool):
            value = LiteralFalse(precision = self.precision)
        else:
            raise TypeError('Unknown type')
        return value

#==============================================================================
class CupyOnes(CupyAutoFill):
    """ Represents a call to cupy.ones for code generation.
    """
    __slots__ = ()
    name = 'ones'
    @property
    def fill_value(self):
        """ One literal matching the array dtype and precision. """
        dtype = self.dtype
        if isinstance(dtype, NativeInteger):
            value = LiteralInteger(1, precision = self.precision)
        elif isinstance(dtype, NativeFloat):
            value = LiteralFloat(1., precision = self.precision)
        elif isinstance(dtype, NativeComplex):
            value = LiteralComplex(1., 0., precision = self.precision)
        elif isinstance(dtype, NativeBool):
            value = LiteralTrue(precision = self.precision)
        else:
            raise TypeError('Unknown type')
        return value

#=======================================================================================
class CupyFullLike(PyccelInternalFunction):
    """ Represents a call to cupy.full_like for code generation.

    ``__new__`` never creates a CupyFullLike node: it resolves the dtype,
    order and shape from `a` and returns an equivalent CupyFull.
    """
    __slots__ = ()
    name = 'full_like'
    def __new__(cls, a, fill_value, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape
        return CupyFull(shape, fill_value, dtype, order)

#=======================================================================================
class CupyEmptyLike(PyccelInternalFunction):
    """ Represents a call to cupy.empty_like for code generation.

    ``__new__`` resolves dtype/order/shape from `a` and returns a CupyEmpty.
    """
    __slots__ = ()
    name = 'empty_like'
    def __new__(cls, a, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape

        return CupyEmpty(shape, dtype, order)

#=======================================================================================
class CupyOnesLike(PyccelInternalFunction):
    """ Represents a call to cupy.ones_like for code generation.

    ``__new__`` resolves dtype/order/shape from `a` and returns a CupyOnes.
    """
    __slots__ = ()
    name = 'ones_like'
    def __new__(cls, a, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape

        return CupyOnes(shape, dtype, order)

#=======================================================================================
class CupyZerosLike(PyccelInternalFunction):
    """ Represents a call to cupy.zeros_like for code generation.

    ``__new__`` resolves dtype/order/shape from `a` and returns a CupyZeros.
    """
    __slots__ = ()
    name = 'zeros_like'
    def __new__(cls, a, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape

        return CupyZeros(shape, dtype, order)

#=======================================================================================

class CupyArraySize(PyccelInternalFunction):
    """
    Class representing a call to the cupy size function, which returns
    the total number of elements of an array. When an axis is given the
    call is reduced to a PyccelArraySize along that dimension instead.

    Parameters
    ==========
    a : PyccelAstNode
        The array whose size is requested
    axis : int, optional
        The dimension along which the size is requested
    """
    __slots__ = ('_arg',)
    _attribute_nodes = ('_arg',)
    name = 'size'
    _dtype = NativeInteger()
    _precision = -1
    _rank  = 0
    _shape = None
    _order = None

    def __new__(cls, a, axis = None):
        # With an explicit axis this is just the size along that dimension
        if axis is not None:
            return PyccelArraySize(a, axis)
        elif not isinstance(a, (list,
                                tuple,
                                PyccelAstNode)):
            raise TypeError('Unknown type of %s.' % type(a))
        # Fully-known shape: fold to a literal at translation time
        elif all(isinstance(s, LiteralInteger) for s in a.shape):
            return LiteralInteger(reduce(operator.mul, [s.python_value for s in a.shape]))
        else:
            return super().__new__(cls)

    def __init__(self, a, axis = None):
        self._arg = a
        super().__init__(a)

    @property
    def arg(self):
        """ Object whose size is investigated
        """
        return self._arg

    def __str__(self):
        return f'Size({self.arg})'

#==============================================================================

cupy_funcs = {
    # ... array creation routines
    'full'      : PyccelFunctionDef('full'      , CupyFull),
    'empty'     : PyccelFunctionDef('empty'     , CupyEmpty),
    'zeros'     : PyccelFunctionDef('zeros'     , CupyZeros),
    'ones'      : PyccelFunctionDef('ones'      , CupyOnes),
    'full_like' : PyccelFunctionDef('full_like' , CupyFullLike),
    'empty_like': PyccelFunctionDef('empty_like', CupyEmptyLike),
    'zeros_like': PyccelFunctionDef('zeros_like', CupyZerosLike),
    'ones_like' : PyccelFunctionDef('ones_like' , CupyOnesLike),
    'array'     : PyccelFunctionDef('array'     , CupyArray),
    'arange'    : PyccelFunctionDef('arange'    , CupyArange),
    # ...
    'shape'     : PyccelFunctionDef('shape'     , Shape),
    'size'      : PyccelFunctionDef('size'      , CupyArraySize),
}

cupy_mod = Module('cupy',
    variables = [],
    funcs = cupy_funcs.values())

#==============================================================================

# Maps the *_like constructors to their direct equivalents; used when the
# semantic stage rewrites a *_like call once the template's properties are known.
cupy_target_swap = {
        cupy_funcs['full_like']  : cupy_funcs['full'],
        cupy_funcs['empty_like'] : cupy_funcs['empty'],
        cupy_funcs['zeros_like'] : cupy_funcs['zeros'],
        cupy_funcs['ones_like']  : cupy_funcs['ones']
    }
[ + Import('cupy', cupy_mod), Import('numpy', numpy_mod), Import('scipy', scipy_mod), Import('itertools', itertools_mod), diff --git a/pyccel/ast/variable.py b/pyccel/ast/variable.py index 3c84a7cf2d..550a51e3c1 100644 --- a/pyccel/ast/variable.py +++ b/pyccel/ast/variable.py @@ -55,6 +55,16 @@ class Variable(PyccelAstNode): rank : int, default: 0 The number of dimensions for an array. + memory_handling: str, default: 'stack' + 'heap' is used for arrays, if we need to allocate memory on the heap. + 'stack' if memory should be allocated on the stack, represents stack arrays and scalars. + 'alias' if object allows access to memory stored in another variable. + + memory_location: str, default: 'host' + 'host' the variable can only be accessed by the CPU. + 'device' the variable can only be accessed by the GPU. + 'managed' the variable can be accessed by CPU and GPU and is being managed by the Cuda API (memory transfer is being done implicitly). + memory_handling : str, default: 'stack' 'heap' is used for arrays, if we need to allocate memory on the heap. 'stack' if memory should be allocated on the stack, represents stack arrays and scalars. 
@@ -108,7 +118,7 @@ class Variable(PyccelAstNode): >>> Variable('int', DottedName('matrix', 'n_rows')) matrix.n_rows """ - __slots__ = ('_name', '_alloc_shape', '_memory_handling', '_is_const', + __slots__ = ('_name', '_alloc_shape', '_memory_handling', '_memory_location', '_is_const', '_is_target', '_is_optional', '_allows_negative_indexes', '_cls_base', '_is_argument', '_is_temp','_dtype','_precision', '_rank','_shape','_order','_is_private') @@ -121,6 +131,7 @@ def __init__( *, rank=0, memory_handling='stack', + memory_location='host', is_const=False, is_target=False, is_optional=False, @@ -155,6 +166,10 @@ def __init__( raise ValueError("memory_handling must be 'heap', 'stack' or 'alias'") self._memory_handling = memory_handling + if memory_location not in ('host', 'device', 'managed'): + raise ValueError("memory_location must be 'host', 'device' or 'managed'") + self._memory_location = memory_location + if not isinstance(is_const, bool): raise TypeError('is_const must be a boolean.') self._is_const = is_const @@ -309,6 +324,36 @@ def on_stack(self): """ return self.memory_handling == 'stack' + @property + def memory_location(self): + """ Indicates whether a Variable has a dynamic size + """ + return self._memory_location + + @memory_location.setter + def memory_location(self, memory_location): + if memory_location not in ('host', 'device', 'managed'): + raise ValueError("memory_location must be 'host', 'device' or 'managed'") + self._memory_location = memory_location + + @property + def on_host(self): + """ Indicates if memory is only accessible by the CPU + """ + return self.memory_location == 'host' + + @property + def on_device(self): + """ Indicates if memory is only accessible by the GPU + """ + return self.memory_location == 'device' + + @property + def is_managed(self): + """ Indicates if memory is being managed by CUDA API + """ + return self.memory_location == 'managed' + @property def is_stack_array(self): """ Indicates if the variable is located on 
stack and is an array diff --git a/pyccel/codegen/codegen.py b/pyccel/codegen/codegen.py index 1a5568bf96..149f831ee7 100644 --- a/pyccel/codegen/codegen.py +++ b/pyccel/codegen/codegen.py @@ -8,17 +8,19 @@ from pyccel.codegen.printing.fcode import FCodePrinter from pyccel.codegen.printing.ccode import CCodePrinter +from pyccel.codegen.printing.ccudacode import CCudaCodePrinter from pyccel.codegen.printing.pycode import PythonCodePrinter from pyccel.ast.core import FunctionDef, Interface, ModuleHeader from pyccel.errors.errors import Errors from pyccel.utilities.stage import PyccelStage -_extension_registry = {'fortran': 'f90', 'c':'c', 'python':'py'} -_header_extension_registry = {'fortran': None, 'c':'h', 'python':None} +_extension_registry = {'fortran': 'f90', 'c':'c', 'python':'py', 'ccuda': 'cu'} +_header_extension_registry = {'fortran': None, 'c':'h', 'python':None, 'ccuda': 'h'} printer_registry = { 'fortran':FCodePrinter, 'c':CCodePrinter, + 'ccuda':CCudaCodePrinter, 'python':PythonCodePrinter } @@ -140,7 +142,7 @@ def set_printer(self, **settings): language = settings.pop('language', 'fortran') # Set language - if not language in ['fortran', 'c', 'python']: + if not language in ['fortran', 'c', 'python', 'ccuda']: raise ValueError('{} language is not available'.format(language)) self._language = language diff --git a/pyccel/codegen/compiling/compilers.py b/pyccel/codegen/compiling/compilers.py index 32ccdaf518..62983538f3 100644 --- a/pyccel/codegen/compiling/compilers.py +++ b/pyccel/codegen/compiling/compilers.py @@ -333,7 +333,10 @@ def compile_program(self, compile_obj, output_folder, verbose = False): # Get compile options exec_cmd, includes, libs_flags, libdirs_flags, m_code = \ self._get_compile_components(compile_obj, accelerators) - linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] + if self._info['exec'] in ('nvcc', 'nvc', 'nvfortran'): + linker_libdirs_flags = ['-Xcompiler' if l == '-L' else f'"-Wl,-rpath,{l}"' for 
l in libdirs_flags] + else: + linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] if self._info['language'] == 'fortran': j_code = (self._info['module_output_flag'], output_folder) @@ -385,7 +388,10 @@ def compile_shared_library(self, compile_obj, output_folder, verbose = False, sh # Collect compile information exec_cmd, includes, libs_flags, libdirs_flags, m_code = \ self._get_compile_components(compile_obj, accelerators) - linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] + if self._info['exec'] in ('nvcc', 'nvc', 'nvfortran'): + linker_libdirs_flags = ['-Xcompiler' if l == '-L' else f'"-Wl,-rpath,{l}"' for l in libdirs_flags] + else: + linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] flags.insert(0,"-shared") diff --git a/pyccel/codegen/pipeline.py b/pyccel/codegen/pipeline.py index 99208311f1..79b2e32ef4 100644 --- a/pyccel/codegen/pipeline.py +++ b/pyccel/codegen/pipeline.py @@ -194,9 +194,9 @@ def handle_error(stage): if language is None: language = 'fortran' - # Choose Fortran compiler + # Choose default compiler family if compiler is None: - compiler = 'GNU' + compiler = 'nvidia' if language == 'ccuda' else 'GNU' fflags = [] if fflags is None else fflags.split() wrapper_flags = [] if wrapper_flags is None else wrapper_flags.split() diff --git a/pyccel/codegen/printing/ccode.py b/pyccel/codegen/printing/ccode.py index 01d1b71cc1..3b2159c07c 100644 --- a/pyccel/codegen/printing/ccode.py +++ b/pyccel/codegen/printing/ccode.py @@ -411,12 +411,14 @@ def arrayFill(self, expr): lhs = expr.lhs code_init = '' declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision) - + dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision) + dtype = dtype[3:] + if rhs.fill_value is not None: if isinstance(rhs.fill_value, Literal): - code_init += 'array_fill(({0}){1}, {2});\n'.format(declare_dtype, self._print(rhs.fill_value), 
self._print(lhs)) + code_init += 'array_fill_{0}(({1}){2}, {3});\n'.format(dtype, declare_dtype, self._print(rhs.fill_value), self._print(lhs)) else: - code_init += 'array_fill({0}, {1});\n'.format(self._print(rhs.fill_value), self._print(lhs)) + code_init += 'array_fill_{0}({1}, {2});\n'.format(dtype, self._print(rhs.fill_value), self._print(lhs)) return code_init def _init_stack_array(self, expr): diff --git a/pyccel/codegen/printing/ccudacode.py b/pyccel/codegen/printing/ccudacode.py new file mode 100644 index 0000000000..bfefbdc3dd --- /dev/null +++ b/pyccel/codegen/printing/ccudacode.py @@ -0,0 +1,564 @@ +# coding: utf-8 +#------------------------------------------------------------------------------------------# +# This file is part of Pyccel which is released under MIT License. See the LICENSE file or # +# go to https://github.com/pyccel/pyccel/blob/master/LICENSE for full license details. # +#------------------------------------------------------------------------------------------# +# pylint: disable=missing-function-docstring + + +from pyccel.ast.builtins import PythonTuple + +from pyccel.ast.core import (FunctionCall, Deallocate, FunctionAddress, + FunctionDefArgument, Assign, Import, + AliasAssign, Module) + +from pyccel.ast.datatypes import NativeTuple, datatype +from pyccel.ast.literals import LiteralTrue, Literal, Nil + +from pyccel.ast.numpyext import NumpyFull, NumpyArray, NumpyArange + +from pyccel.ast.cupyext import CupyFull, CupyArray, CupyArange + +from pyccel.ast.cudaext import CudaCopy, cuda_Internal_Var, CudaArray + +from pyccel.ast.variable import Variable + +from pyccel.ast.c_concepts import ObjectAddress + +from pyccel.codegen.printing.ccode import CCodePrinter + +from pyccel.errors.errors import Errors + + +errors = Errors() + +# TODO: add examples + +__all__ = ["CCudaCodePrinter", "ccudacode"] + +# dictionary mapping numpy function to (argument_conditions, C_function). 
# dictionary mapping numpy function to (argument_conditions, C_function).
# Used in CCodePrinter._print_NumpyUfuncBase(self, expr)
# NOTE(review): 'minval'/'maxval' are not standard C library functions —
# presumably supplied by pyccel's own C support headers; confirm.
numpy_ufunc_to_c_float = {
    'NumpyAbs'  : 'fabs',
    'NumpyFabs'  : 'fabs',
    'NumpyMin'  : 'minval',
    'NumpyMax'  : 'maxval',
    'NumpyFloor': 'floor',  # TODO: might require special treatment with casting
    # ---
    'NumpyExp' : 'exp',
    'NumpyLog' : 'log',
    'NumpySqrt': 'sqrt',
    # ---
    'NumpySin'    : 'sin',
    'NumpyCos'    : 'cos',
    'NumpyTan'    : 'tan',
    'NumpyArcsin' : 'asin',
    'NumpyArccos' : 'acos',
    'NumpyArctan' : 'atan',
    'NumpyArctan2': 'atan2',
    'NumpySinh'   : 'sinh',
    'NumpyCosh'   : 'cosh',
    'NumpyTanh'   : 'tanh',
    'NumpyArcsinh': 'asinh',
    'NumpyArccosh': 'acosh',
    'NumpyArctanh': 'atanh',
}

# Complex-valued counterparts of the table above (C99 <complex.h> names).
numpy_ufunc_to_c_complex = {
    'NumpyAbs'  : 'cabs',
    'NumpyMin'  : 'minval',
    'NumpyMax'  : 'maxval',
    # ---
    'NumpyExp' : 'cexp',
    'NumpyLog' : 'clog',
    'NumpySqrt': 'csqrt',
    # ---
    'NumpySin'    : 'csin',
    'NumpyCos'    : 'ccos',
    'NumpyTan'    : 'ctan',
    'NumpyArcsin' : 'casin',
    'NumpyArccos' : 'cacos',
    'NumpyArctan' : 'catan',
    'NumpySinh'   : 'csinh',
    'NumpyCosh'   : 'ccosh',
    'NumpyTanh'   : 'ctanh',
    'NumpyArcsinh': 'casinh',
    'NumpyArccosh': 'cacosh',
    'NumpyArctanh': 'catanh',
}

# dictionary mapping Math function to (argument_conditions, C_function).
# Used in CCodePrinter._print_MathFunctionBase(self, expr)
# Math function ref https://docs.python.org/3/library/math.html
math_function_to_c = {
    # ---------- Number-theoretic and representation functions ------------
    'MathCeil'     : 'ceil',
    # 'MathComb'   : 'com' # TODO
    'MathCopysign': 'copysign',
    'MathFabs'   : 'fabs',
    'MathFloor'    : 'floor',
    # 'MathFmod'   : '???',  # TODO
    # 'MathRexp'   : '???'   TODO requires two output
    # 'MathFsum'   : '???',  # TODO
    # 'MathIsclose' : '???',  # TODO
    'MathIsfinite': 'isfinite', # int isfinite(real-floating x);
    'MathIsinf'   : 'isinf', # int isinf(real-floating x);
    'MathIsnan'   : 'isnan', # int isnan(real-floating x);
    # 'MathIsqrt'  : '???' TODO
    'MathLdexp'   : 'ldexp',
    # 'MathModf'  : '???' TODO return two value
    # 'MathPerm'  : '???' TODO
    # 'MathProd'  : '???' TODO
    'MathRemainder' : 'remainder',
    'MathTrunc'   : 'trunc',

    # ----------------- Power and logarithmic functions -----------------------

    'MathExp'    : 'exp',
    'MathExpm1'  : 'expm1',
    'MathLog'    : 'log',      # take also an option arg [base]
    'MathLog1p'  : 'log1p',
    'MathLog2'  : 'log2',
    'MathLog10'  : 'log10',
    'MathPow'    : 'pow',
    'MathSqrt'   : 'sqrt',

    # --------------------- Trigonometric functions ---------------------------

    'MathAcos'   : 'acos',
    'MathAsin'   : 'asin',
    'MathAtan'   : 'atan',
    'MathAtan2'  : 'atan2',
    'MathCos'    : 'cos',
    # 'MathDist'  : '???', TODO
    'MathHypot'  : 'hypot',
    'MathSin'    : 'sin',
    'MathTan'    : 'tan',


    # -------------------------- Hyperbolic functions -------------------------

    'MathAcosh'  : 'acosh',
    'MathAsinh'  : 'asinh',
    'MathAtanh'  : 'atanh',
    'MathCosh'   : 'cosh',
    'MathSinh'   : 'sinh',
    'MathTanh'   : 'tanh',

    # --------------------------- Special functions ---------------------------

    'MathErf'    : 'erf',
    'MathErfc'   : 'erfc',
    'MathGamma'  : 'tgamma',
    'MathLgamma' : 'lgamma',

    # --------------------------- internal functions --------------------------
    # pyc_* functions come from pyccel's own pyc_math_c support library.
    'MathFactorial' : 'pyc_factorial',
    'MathGcd'       : 'pyc_gcd',
    'MathDegrees'   : 'pyc_degrees',
    'MathRadians'   : 'pyc_radians',
    'MathLcm'       : 'pyc_lcm',
}

# Standard C headers whose names may be emitted as includes.
c_library_headers = (
    "complex",
    "ctype",
    "float",
    "math",
    "stdarg",
    "stdbool",
    "stddef",
    "stdint",
    "stdio",
    "stdlib",
    "string",
    "tgmath",
)

# (python dtype name, precision in bytes) -> C scalar type
dtype_registry = {('float',8)   : 'double',
                  ('float',4)   : 'float',
                  ('complex',8) : 'double complex',
                  ('complex',4) : 'float complex',
                  ('int',4)     : 'int32_t',
                  ('int',8)     : 'int64_t',
                  ('int',2)     : 'int16_t',
                  ('int',1)     : 'int8_t',
                  ('bool',4)    : 'bool'}

# (python dtype name, precision in bytes) -> ndarray type-enum member
ndarray_type_registry = {
                  ('float',8)   : 'nd_double',
                  ('float',4)   : 'nd_float',
                  ('complex',8) : 'nd_cdouble',
                  ('complex',4) : 'nd_cfloat',
                  ('int',8)     : 'nd_int64',
                  ('int',4)     : 'nd_int32',
                  ('int',2)     : 'nd_int16',
                  ('int',1)     : 'nd_int8',
                  ('bool',4)    : 'nd_bool'}
# Maps internal module names to the name used in generated includes.
import_dict = {'omp_lib' : 'omp' }

# Pre-built Import nodes for the C/CUDA support headers.
# NOTE(review): 'math' is listed twice; harmless (the dict comprehension
# de-duplicates) but could be cleaned up.
c_imports = {n : Import(n, Module(n, (), ())) for n in
                ['stdlib',
                 'math',
                 'string',
                 'ndarrays',
                 'cuda_ndarrays',
                 'math',
                 'complex',
                 'stdint',
                 'pyc_math_c',
                 'stdio',
                 'stdbool',
                 'assert']}

class CCudaCodePrinter(CCodePrinter):
    """A printer to convert python expressions to strings of ccuda code"""
    printmethod = "_ccudacode"
    language = "ccuda"

    _default_settings = {
        'tabwidth': 4,
    }

    def __init__(self, filename, prefix_module = None):
        """ Set up printer state; `filename` is only used for error reporting. """

        errors.set_target(filename, 'file')

        super().__init__(filename)
        self.prefix_module = prefix_module
        self._additional_imports = {'stdlib':c_imports['stdlib']}
        self._additional_code = ''
        self._additional_args = []
        self._temporary_args = []
        self._current_module = None
        self._in_header = False
        # Dictionary linking optional variables to their
        # temporary counterparts which provide allocated
        # memory
        # Key is optional variable
        self._optional_partners = {}

    def function_signature(self, expr, print_arg_names = True):
        """
        Get the Ccuda representation of the function signature.

        Extract from the function definition `expr` all the
        information (name, input, output) needed to create the
        function signature and return a string describing the
        function.
        This is not a declaration as the signature does not end
        with a semi-colon.

        Parameters
        ----------
        expr : FunctionDef
            The function definition for which a signature is needed.
        print_arg_names : bool, default : True
            Indicates whether argument names should be printed.

        Returns
        -------
        str
            Signature of the function.
        """
        # Multiple results are returned through extra pointer arguments;
        # the C return type becomes int in that case.
        if len(expr.results) > 1:
            self._additional_args.append(expr.results)
        args = list(expr.arguments)
        if len(expr.results) == 1:
            ret_type = self.get_declare_type(expr.results[0])
        elif len(expr.results) > 1:
            ret_type = self._print(datatype('int'))
            args += [FunctionDefArgument(a) for a in expr.results]
        else:
            ret_type = self._print(datatype('void'))
        name = expr.name
        if not args:
            arg_code = 'void'
        else:
            def get_var_arg(arg, var):
                # Build "const <type> <name>" for one argument
                code = "const " * var.is_const
                code += self.get_declare_type(var) + ' '
                code += arg.name * print_arg_names
                return code

            var_list = [a.var for a in args]
            arg_code_list = [self.function_signature(var, False) if isinstance(var, FunctionAddress)
                        else get_var_arg(arg, var) for arg, var in zip(args, var_list)]
            arg_code = ', '.join(arg_code_list)

        if self._additional_args :
            self._additional_args.pop()

        # extern "C" prevents C++ name mangling; __global__ marks a kernel
        extern_word = 'extern "C"'
        cuda_deco = "__global__" if 'kernel' in expr.decorators else ''

        if isinstance(expr, FunctionAddress):
            return f'{extern_word} {ret_type} (*{name})({arg_code})'
        else:
            return f'{extern_word} {cuda_deco} {ret_type} {name}({arg_code})'

    def _print_Allocate(self, expr):
        """ Print a cuda_array_create call, freeing any prior allocation first. """
        free_code = ''
        #free the array if its already allocated and checking if its not null if the status is unknown
        if  (expr.status == 'unknown'):
            free_code = 'if (%s.shape != NULL)\n' % self._print(expr.variable.name)
            free_code += "{{\n{}}}\n".format(self._print(Deallocate(expr.variable)))
        elif  (expr.status == 'allocated'):
            free_code += self._print(Deallocate(expr.variable))
        shape = ", ".join(self._print(i) for i in expr.shape)
        shape_dtype = self.find_in_dtype_registry('int', 8)
        tmp_shape = self.scope.get_new_name('tmp_shape')
        dtype = self._print(expr.variable.dtype)
        dtype = self.find_in_ndarray_type_registry(dtype, expr.variable.precision)
        shape_Assign = "{} {}[] = {{{}}};".format(shape_dtype, tmp_shape, shape)
        is_view = 'false' if expr.variable.on_heap else 'true'
        self.add_import(c_imports['cuda_ndarrays'])
        # define the memory location for the created cuda array
        memory_location = expr.variable.memory_location
        if memory_location in ('device', 'host'):
            memory_location = 'allocateMemoryOn' + str(memory_location).capitalize()
        else:
            memory_location = 'managedMemory'
        alloc_code = f"{expr.variable} = \
cuda_array_create({len(expr.shape)}, {tmp_shape}, {dtype}, {is_view}, {memory_location});"
        return f"{free_code}\n{shape_Assign}\n{alloc_code}\n"

    def _print_Deallocate(self, expr):
        """ Print the matching cuda_free_* call for the variable's location. """
        var_code = self._print(expr.variable)
        if expr.variable.is_alias:
            return f"cuda_free_pointer({var_code});\n"
        else:
            if expr.variable.memory_location == 'host':
                return f"cuda_free_host({var_code});\n"
            else:
                return f"cuda_free({var_code});\n"

    def _print_KernelCall(self, expr):
        """ Print a kernel launch: name<<<numBlocks, tpblock>>>(args). """
        func = expr.funcdef
        if func.is_inline:
            return self._handle_inline_func_call(expr)
        # Ensure the correct syntax is used for pointers
        args = []
        for a, f in zip(expr.args, func.arguments):
            a = a.value if a else Nil()
            f = f.var
            if self.is_c_pointer(f):
                if isinstance(a, Variable):
                    args.append(ObjectAddress(a))
                elif not self.is_c_pointer(a):
                    # Non-addressable expression: materialise it in a temporary
                    tmp_var = self.scope.get_temporary_variable(f.dtype)
                    assign = Assign(tmp_var, a)
                    self._additional_code += self._print(assign)
                    args.append(ObjectAddress(tmp_var))
                else:
                    args.append(a)
            else :
                args.append(a)

        args += self._temporary_args
        self._temporary_args = []
        args = ', '.join(['{}'.format(self._print(a)) for a in args])
        # TODO: need to raise error in semantic if we have result , kernel can't return
        # NOTE(review): when func.results is non-empty this method falls
        # through and returns None, which will break the caller — confirm
        # the semantic stage guarantees kernels have no results.
        if not func.results:
            return '{}<<<{},{}>>>({});\n'.format(func.name, expr.numBlocks, expr.tpblock,args)

    def _print_Assign(self, expr):
        """ Dispatch array-constructor RHS nodes to their specialised
        printers; fall back to plain "lhs = rhs;". """
        prefix_code = ''
        lhs = expr.lhs
        rhs = expr.rhs
        if isinstance(lhs, Variable) and lhs.is_optional:
            if lhs in self._optional_partners:
                # Collect temporary variable which provides
                # allocated memory space for this optional variable
                tmp_var = self._optional_partners[lhs]
            else:
                # Create temporary variable to provide allocated
                # memory space before assigning to the pointer value
                # (may be NULL)
                tmp_var = self.scope.get_temporary_variable(lhs,
                        is_optional = False)
                self._optional_partners[lhs] = tmp_var
            # Point optional variable at an allocated memory space
            prefix_code = self._print(AliasAssign(lhs, tmp_var))
        if isinstance(rhs, FunctionCall) and isinstance(rhs.dtype, NativeTuple):
            self._temporary_args = [ObjectAddress(a) for a in lhs]
            return prefix_code+'{};\n'.format(self._print(rhs))
        # Inhomogenous tuples are unravelled and therefore do not exist in the c printer

        if isinstance(rhs, (CupyFull)):
            return prefix_code+self.cuda_arrayFill(expr)
        if isinstance(rhs, CupyArange):
            return prefix_code+self.cuda_Arange(expr)
        if isinstance(rhs, (CudaArray, CupyArray)):
            return prefix_code+self.copy_CudaArray_Data(expr)
        if isinstance(rhs, (NumpyArray, PythonTuple)):
            return prefix_code+self.copy_NumpyArray_Data(expr)
        if isinstance(rhs, (NumpyFull)):
            return prefix_code+self.arrayFill(expr)
        if isinstance(rhs, NumpyArange):
            return prefix_code+self.fill_NumpyArange(rhs, lhs)
        if isinstance(rhs, CudaCopy):
            return prefix_code+self.cudaCopy(lhs, rhs)
        lhs = self._print(expr.lhs)
        rhs = self._print(expr.rhs)
        return prefix_code+'{} = {};\n'.format(lhs, rhs)

    def arrayFill(self, expr):
        """ print the assignment of a NdArray (host-side NumpyFull)

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains a call to the C function array_fill using Cuda api,
        """
        rhs = expr.rhs
        lhs = expr.lhs
        code_init = ''
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        # Strip the 'nd_' prefix to get the per-dtype function suffix
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        dtype = dtype[3:]

        if rhs.fill_value is not None:
            if isinstance(rhs.fill_value, Literal):
                code_init += 'cuda_array_fill_{0}(({1}){2}, {3});\n'.format(dtype, declare_dtype, self._print(rhs.fill_value), self._print(lhs))
            else:
                code_init += 'cuda_array_fill_{0}({1}, {2});\n'.format(dtype, self._print(rhs.fill_value), self._print(lhs))
        return code_init

    def cuda_Arange(self, expr):
        """ print the assignment of a CupyArange as a kernel launch

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains a call to the C function array_arange using Cuda api,
        """
        rhs = expr.rhs
        lhs = expr.lhs
        code_init = ''
        # NOTE(review): declare_dtype is computed but unused here.
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        dtype = dtype[3:]

        #TODO: calculate best thread number to run the kernel
        code_init += 'cuda_array_arange_{0}<<<1,32>>>({1}, {2});\n'.format(dtype, self._print(lhs), self._print(rhs.start))
        return code_init

    def cuda_arrayFill(self, expr):
        """ print the assignment of a CupyFull as a kernel launch

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains a call to the C function array_fill using Cuda api,
        """
        rhs = expr.rhs
        lhs = expr.lhs
        code_init = ''
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        dtype = dtype[3:]

        if rhs.fill_value is not None:
            if isinstance(rhs.fill_value, Literal):
                code_init += 'cuda_array_fill_{0}<<<1,1>>>(({1}){2}, {3});\n'.format(dtype, declare_dtype, self._print(rhs.fill_value), self._print(lhs))
            else:
                code_init += 'cuda_array_fill_{0}<<<1,1>>>({1}, {2});\n'.format(dtype, self._print(rhs.fill_value), self._print(lhs))
        return code_init

    def copy_CudaArray_Data(self, expr):
        """ print the assignment of a Cuda NdArray

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains the declaration of a dummy data_buffer
            and a call to an operator which copies it to a Cuda NdArray struct
            if the ndarray is a stack_array the str will contain the initialization
        """
        rhs = expr.rhs
        lhs = expr.lhs
        if rhs.rank == 0:
            raise NotImplementedError(str(expr))
        dummy_array_name = self.scope.get_new_name('cuda_array_dummy')
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        arg = rhs.arg if isinstance(rhs, (CudaArray, CupyArray)) else rhs
        if rhs.rank > 1:
            # flattening the args to use them in C initialization.
            arg = self._flatten_list(arg)

        self.add_import(c_imports['string'])
        if isinstance(arg, Variable):
            arg = self._print(arg)
            cpy_data = "cudaMemcpy({0}.raw_data, {1}.{2}, {0}.buffer_size, cudaMemcpyHostToDevice);".format(lhs, arg, dtype)
            return '%s\n' % (cpy_data)
        else :
            arg = ', '.join(self._print(i) for i in arg)
            dummy_array = "%s %s[] = {%s};\n" % (declare_dtype, dummy_array_name, arg)
            # NOTE(review): the format string only uses {0}/{1}; the trailing
            # `dtype` argument is unused — harmless but likely leftover.
            cpy_data = "cudaMemcpy({0}.raw_data, {1}, {0}.buffer_size, cudaMemcpyHostToDevice);".format(self._print(lhs), dummy_array_name, dtype)
            return '%s%s\n' % (dummy_array, cpy_data)

    def _print_CudaSynchronize(self, expr):
        # NOTE(review): no trailing ';\n' unlike the other statement printers —
        # confirm the caller appends the terminator.
        return 'cudaDeviceSynchronize()'

    def _print_CudaInternalVar(self, expr):
        """ Print threadIdx/blockIdx/... with its .x/.y/.z component. """
        var_name = type(expr).__name__
        var_name = cuda_Internal_Var[var_name]
        dim_c = ('x', 'y', 'z')[expr.dim]
        return '{}.{}'.format(var_name, dim_c)

    def cudaCopy(self, lhs, rhs):
        """ Print a cudaMemcpy/cudaMemcpyAsync whose direction is derived
        from the source and destination memory locations. """
        from_location = 'Host'
        to_location = 'Host'
        if rhs.arg.memory_location in ('device', 'managed'):
            from_location = 'Device'
        if rhs.memory_location in ('device', 'managed'):
            to_location = 'Device'
        transfer_type = 'cudaMemcpy{0}To{1}'.format(from_location, to_location)
        if isinstance(rhs.is_async, LiteralTrue):
            cpy_data = "cudaMemcpyAsync({0}.raw_data, {1}.raw_data, {0}.buffer_size, {2}, 0);".format(lhs, rhs.arg, transfer_type)
        else:
            cpy_data = "cudaMemcpy({0}.raw_data, {1}.raw_data, {0}.buffer_size, {2});".format(lhs, rhs.arg, transfer_type)
        return '%s\n' % (cpy_data)

def ccudacode(expr, filename, assign_to=None, **settings):
    """Converts an expr to a string of ccuda code

    Parameters
    ----------
    expr : Expr
        A pyccel expression to be converted.
    filename : str
        The name of the file being translated. Used in error printing.
    assign_to : optional
        When given, the argument is used as the name of the variable to which
        the expression is assigned.
    **settings
        Forwarded verbatim to ``CCudaCodePrinter``.
        NOTE(review): the printer's __init__ only accepts ``prefix_module``;
        other keywords (e.g. ``precision``/``user_functions`` documented for
        the sympy-derived printers) would raise TypeError — confirm intended.
    """
    return CCudaCodePrinter(filename, **settings).doprint(expr, assign_to)
""" #if target_language is C no need for the binding - if self._target_language == 'c': + if self._target_language in ('c', 'ccuda'): return self.function_signature(expr) args = [a.var for a in expr.arguments] diff --git a/pyccel/codegen/utilities.py b/pyccel/codegen/utilities.py index 62ac36bd19..9c3a6df5cf 100644 --- a/pyccel/codegen/utilities.py +++ b/pyccel/codegen/utilities.py @@ -21,18 +21,19 @@ __all__ = ['copy_internal_library','recompile_object'] #============================================================================== -language_extension = {'fortran':'f90', 'c':'c', 'python':'py'} +language_extension = {'fortran':'f90', 'c':'c', 'python':'py', 'ccuda':'cu'} #============================================================================== # map internal libraries to their folders inside pyccel/stdlib and their compile objects # The compile object folder will be in the pyccel dirpath internal_libs = { - "ndarrays" : ("ndarrays", CompileObj("ndarrays.c",folder="ndarrays")), - "pyc_math_f90" : ("math", CompileObj("pyc_math_f90.f90",folder="math")), - "pyc_math_c" : ("math", CompileObj("pyc_math_c.c",folder="math")), - "cwrapper" : ("cwrapper", CompileObj("cwrapper.c",folder="cwrapper", accelerators=('python',))), - "numpy_f90" : ("numpy", CompileObj("numpy_f90.f90",folder="numpy")), - "numpy_c" : ("numpy", CompileObj("numpy_c.c",folder="numpy")), + "ndarrays" : ("ndarrays", CompileObj("ndarrays.c",folder="ndarrays")), + "cuda_ndarrays" : ("cuda_ndarrays", CompileObj("cuda_ndarrays.cu",folder="cuda_ndarrays")), + "pyc_math_f90" : ("math", CompileObj("pyc_math_f90.f90",folder="math")), + "pyc_math_c" : ("math", CompileObj("pyc_math_c.c",folder="math")), + "cwrapper" : ("cwrapper", CompileObj("cwrapper.c",folder="cwrapper", accelerators=('python',))), + "numpy_f90" : ("numpy", CompileObj("numpy_f90.f90",folder="numpy")), + "numpy_c" : ("numpy", CompileObj("numpy_c.c",folder="numpy")), } internal_libs["cwrapper_ndarrays"] = ("cwrapper_ndarrays", 
CompileObj("cwrapper_ndarrays.c",folder="cwrapper_ndarrays", accelerators = ('python',), diff --git a/pyccel/commands/console.py b/pyccel/commands/console.py index 22e9f8afd9..e6a5841478 100644 --- a/pyccel/commands/console.py +++ b/pyccel/commands/console.py @@ -60,7 +60,7 @@ def pyccel(files=None, mpi=None, openmp=None, openacc=None, output_dir=None, com # ... backend compiler options group = parser.add_argument_group('Backend compiler options') - group.add_argument('--language', choices=('fortran', 'c', 'python'), help='Generated language') + group.add_argument('--language', choices=('fortran', 'c', 'python', 'ccuda'), help='Generated language') group.add_argument('--compiler', help='Compiler family or json file containing a compiler description {GNU,intel,PGI}') diff --git a/pyccel/compilers/default_compilers.py b/pyccel/compilers/default_compilers.py index c04566a22b..5d559977c8 100644 --- a/pyccel/compilers/default_compilers.py +++ b/pyccel/compilers/default_compilers.py @@ -173,6 +173,16 @@ 'family': 'nvidia', } +#------------------------------------------------------------ +nvcc_info = {'exec' : 'nvcc', + 'language': 'ccuda', + 'debug_flags': ("-g",), + 'release_flags': ("-O3",), + 'general_flags' : ('--compiler-options', '-fPIC',), + 'standard_flags' : ('-std=c99',), + 'family': 'nvidia', + } + #------------------------------------------------------------ def change_to_lib_flag(lib): """ @@ -275,6 +285,7 @@ def change_to_lib_flag(lib): pgfortran_info.update(python_info) nvc_info.update(python_info) nvfort_info.update(python_info) +nvcc_info.update(python_info) available_compilers = {('GNU', 'c') : gcc_info, ('GNU', 'fortran') : gfort_info, @@ -283,6 +294,7 @@ def change_to_lib_flag(lib): ('PGI', 'c') : pgcc_info, ('PGI', 'fortran') : pgfortran_info, ('nvidia', 'c') : nvc_info, - ('nvidia', 'fortran') : nvfort_info} + ('nvidia', 'fortran') : nvfort_info, + ('nvidia', 'ccuda') : nvcc_info} vendors = ('GNU','intel','PGI','nvidia') diff --git 
a/pyccel/decorators.py b/pyccel/decorators.py index 0bb87aca15..2d263c4d9e 100644 --- a/pyccel/decorators.py +++ b/pyccel/decorators.py @@ -21,6 +21,7 @@ 'sympy', 'template', 'types', + 'kernel', ) def lambdify(f): @@ -98,3 +99,25 @@ def allow_negative_index(f,*args): def identity(f): return f return identity + +def kernel(f): + """ + This decorator is used to mark a Python function as a GPU kernel function, + allowing it to be executed on a GPU. + The decorator returns a NumPy array containing the decorated function object + to ensure that the function is treated as an array function. + This also allows the function to run in pure Python without errors related to indexing. + + Parameters + ---------- + f : Function + The function to be marked as a kernel. + + Returns + ------- + numpy.ndarray: A numpy array containing the function object. + + """ + from numpy import array + return array([[f]]) + diff --git a/pyccel/epyccel.py b/pyccel/epyccel.py index fecc7cca52..da6d932848 100644 --- a/pyccel/epyccel.py +++ b/pyccel/epyccel.py @@ -223,7 +223,7 @@ def epyccel( python_function_or_module, **kwargs ): verbose : bool Print additional information (default: False). - language : {'fortran', 'c', 'python'} + language : {'fortran', 'c', 'python', 'ccuda'} Language of generated code (default: 'fortran'). 
accelerators : iterable of str, optional diff --git a/pyccel/errors/messages.py b/pyccel/errors/messages.py index 84282094ec..9d2b56e333 100644 --- a/pyccel/errors/messages.py +++ b/pyccel/errors/messages.py @@ -161,3 +161,10 @@ WRONG_LINSPACE_ENDPOINT = 'endpoint argument must be boolean' NON_LITERAL_KEEP_DIMS = 'keep_dims argument must be a literal, otherwise rank is unknown' NON_LITERAL_AXIS = 'axis argument must be a literal, otherwise pyccel cannot determine which dimension to operate on' +KERNEL_STACK_ARRAY_ARG = "A variable allocated on the stack can't be passed to a Kernel function" +NON_KERNEL_FUNCTION_CUDA_VAR = 'Cuda internal variables should only be used in Kernel or Device functions' +INVALID_KERNEL_CALL_BP_GRID = 'Invalid Block per grid parameter for Kernel call' +INVALID_KERNEL_CALL_TP_BLOCK = 'Invalid Thread per Block parameter for Kernel call' +MISSING_KERNEL_CONFIGURATION = 'Kernel launch configuration not specified' +INVALID_FUNCTION_CALL = 'Invalid call for a non-kernel function' + diff --git a/pyccel/naming/__init__.py b/pyccel/naming/__init__.py index a71d841c8e..b740f1e754 100644 --- a/pyccel/naming/__init__.py +++ b/pyccel/naming/__init__.py @@ -10,7 +10,9 @@ from .fortrannameclashchecker import FortranNameClashChecker from .cnameclashchecker import CNameClashChecker from .pythonnameclashchecker import PythonNameClashChecker +from .ccudanameclashchecker import CCudaNameClashChecker name_clash_checkers = {'fortran':FortranNameClashChecker(), 'c':CNameClashChecker(), + 'ccuda':CCudaNameClashChecker(), 'python':PythonNameClashChecker()} diff --git a/pyccel/naming/ccudanameclashchecker.py b/pyccel/naming/ccudanameclashchecker.py new file mode 100644 index 0000000000..e5221139e6 --- /dev/null +++ b/pyccel/naming/ccudanameclashchecker.py @@ -0,0 +1,69 @@ +# coding: utf-8 +#------------------------------------------------------------------------------------------# +# This file is part of Pyccel which is released under MIT License. 
See the LICENSE file or # +# go to https://github.com/pyccel/pyccel/blob/master/LICENSE for full license details. # +#------------------------------------------------------------------------------------------# +""" +Handles name clash problems in Ccuda +""" +from pyccel.utilities.metaclasses import Singleton +from pyccel.utilities.strings import create_incremented_string + +class CCudaNameClashChecker(metaclass = Singleton): + """ Class containing functions to help avoid problematic names in Ccuda + """ + # Keywords as mentioned on https://en.cppreference.com/w/c/keyword + keywords = set(['auto', 'break', 'case', 'char', 'const', 'bool', + 'continue', 'default', 'do', 'double', 'else', 'enum', + 'extern', 'float', 'for', 'goto', 'if', 'inline', + 'int', 'int8_t', 'int16_t', 'int32_t', 'int64_t', + 'long', 'register', 'restrict', 'return', 'short', 'signed', + 'sizeof', 'static', 'struct', 'switch', 'typedef', 'union', + 'unsigned', 'void', 'volatile', 'while', '_Alignas', + '_Alignof', '_Atomic', '_Bool', '_Complex', 'Decimal128', + '_Decimal32', '_Decimal64', '_Generic', '_Imaginary', '__global__', + '_Noreturn', '_Static_assert', '_Thread_local', 't_ndarray', + 'array_create', 'new_slice', 'array_slicing', 'alias_assign', + 'transpose_alias_assign', 'array_fill', 't_slice', + 'GET_INDEX_EXP1', 'GET_INDEX_EXP2', 'GET_INDEX_EXP2', + 'GET_INDEX_EXP3', 'GET_INDEX_EXP4', 'GET_INDEX_EXP5', + 'GET_INDEX_EXP6', 'GET_INDEX_EXP7', 'GET_INDEX_EXP8', + 'GET_INDEX_EXP9', 'GET_INDEX_EXP10', 'GET_INDEX_EXP11', + 'GET_INDEX_EXP12', 'GET_INDEX_EXP13', 'GET_INDEX_EXP14', + 'GET_INDEX_EXP15', 'NUM_ARGS_H1', 'NUM_ARGS', + 'GET_INDEX_FUNC_H2', 'GET_INDEX_FUNC', 'GET_INDEX', + 'INDEX', 'GET_ELEMENT', 'free_array', 'free_pointer', + 'get_index', 'numpy_to_ndarray_strides', + 'numpy_to_ndarray_shape', 'get_size', + 'cuda_free_array', 'cuda_free_pointer', 'cuda_array_create', + 'threadIdx', 'blockIdx', 'blockDim', 'gridDim', + 'cuda_array_fill_double', 'cuda_array_fill_int64', + 
'cuda_array_fill_int32', 'cuda_array_fill_int8',
+        'cuda_array_arange_double', 'cuda_array_arange_int64',
+        'cuda_array_arange_int32', 'cuda_array_arange_int8',
+        'cudaMallocManaged', 'cudaSynchronize'])
+
+    def has_clash(self, name, symbols):
+        """ Indicate whether the proposed name causes any clashes
+        """
+        return any(name == k for k in self.keywords) or \
+               any(name == s for s in symbols)
+
+    def get_collisionless_name(self, name, symbols):
+        """ Get the name that will be used in the ccuda code
+        """
+        if len(name)>4 and all(name[i] == '_' for i in (0,1,-1,-2)):
+            # Ignore magic methods
+            return name
+        if name[0] == '_':
+            name = 'private'+name
+        prefix = name
+        coll_symbols = self.keywords.copy()
+        coll_symbols.update(s.lower() for s in symbols)
+        if prefix in coll_symbols:
+            counter = 1
+            new_name, counter = create_incremented_string(coll_symbols,
+                prefix = prefix, counter = counter)
+            name = name+new_name[-5:]
+        return name
+
diff --git a/pyccel/parser/semantic.py b/pyccel/parser/semantic.py
index 0534166047..18818c8ade 100644
--- a/pyccel/parser/semantic.py
+++ b/pyccel/parser/semantic.py
@@ -37,6 +37,7 @@ from pyccel.ast.core import ConstructorCall, InlineFunctionDef
 from pyccel.ast.core import FunctionDef, Interface, FunctionAddress, FunctionCall, FunctionCallArgument
 from pyccel.ast.core import DottedFunctionCall
+from pyccel.ast.core import KernelCall
 from pyccel.ast.core import ClassDef
 from pyccel.ast.core import For
 from pyccel.ast.core import Module
@@ -57,7 +58,7 @@ from pyccel.ast.core import PyccelFunctionDef
 from pyccel.ast.core import Assert
 
-from pyccel.ast.class_defs import NumpyArrayClass, TupleClass, get_cls_base
+from pyccel.ast.class_defs import NumpyArrayClass, TupleClass, get_cls_base, CudaArrayClass
 
 from pyccel.ast.datatypes import NativeRange, str_dtype
 from pyccel.ast.datatypes import NativeSymbol
@@ -92,6 +93,9 @@ from pyccel.ast.numpyext import NumpyNewArray, NumpyNonZero
 from pyccel.ast.numpyext import 
DtypePrecisionToCastFunction +from pyccel.ast.cupyext import CupyNewArray +from pyccel.ast.cudaext import CudaNewArray, CudaThreadIdx, CudaBlockDim, CudaBlockIdx, CudaGridDim + from pyccel.ast.omp import (OMP_For_Loop, OMP_Simd_Construct, OMP_Distribute_Construct, OMP_TaskLoop_Construct, OMP_Sections_Construct, Omp_End_Clause, OMP_Single_Construct) @@ -117,7 +121,7 @@ from pyccel.errors.errors import Errors from pyccel.errors.errors import PyccelSemanticError -from pyccel.errors.messages import (PYCCEL_RESTRICTION_TODO, UNDERSCORE_NOT_A_THROWAWAY, +from pyccel.errors.messages import (INVALID_FUNCTION_CALL, INVALID_KERNEL_CALL_BP_GRID, INVALID_KERNEL_CALL_TP_BLOCK, MISSING_KERNEL_CONFIGURATION,PYCCEL_RESTRICTION_TODO, UNDERSCORE_NOT_A_THROWAWAY, UNDEFINED_VARIABLE, IMPORTING_EXISTING_IDENTIFIED, INDEXED_TUPLE, LIST_OF_TUPLES, INVALID_INDICES, INCOMPATIBLE_ARGUMENT, INCOMPATIBLE_ORDERING, UNRECOGNISED_FUNCTION_CALL, STACK_ARRAY_SHAPE_UNPURE_FUNC, STACK_ARRAY_UNKNOWN_SHAPE, @@ -231,7 +235,6 @@ def __init__(self, inputs, *, parents = (), d_parsers = (), **kwargs): # self._code = parser._code # ... - self.annotate() # ... 
@@ -480,7 +483,6 @@ def _infer_type(self, expr, **settings): d_var = {} # TODO improve => put settings as attribut of Parser - if expr in (PythonInt, PythonFloat, PythonComplex, PythonBool, NumpyBool, NumpyInt, NumpyInt8, NumpyInt16, NumpyInt32, NumpyInt64, NumpyComplex, NumpyComplex64, NumpyComplex128, NumpyFloat, NumpyFloat64, NumpyFloat32): @@ -493,6 +495,7 @@ def _infer_type(self, expr, **settings): elif isinstance(expr, Variable): d_var['datatype' ] = expr.dtype d_var['memory_handling'] = expr.memory_handling + d_var['memory_location'] = expr.memory_location d_var['shape' ] = expr.shape d_var['rank' ] = expr.rank d_var['cls_base' ] = expr.cls_base @@ -550,6 +553,28 @@ def _infer_type(self, expr, **settings): d_var['cls_base' ] = NumpyArrayClass return d_var + elif isinstance(expr, CupyNewArray): + d_var['datatype' ] = expr.dtype + d_var['memory_handling'] = 'heap' if expr.rank > 0 else 'stack' + d_var['memory_location'] = expr.memory_location + d_var['shape' ] = expr.shape + d_var['rank' ] = expr.rank + d_var['order' ] = expr.order + d_var['precision' ] = expr.precision + d_var['cls_base' ] = CudaArrayClass + return d_var + + elif isinstance(expr, CudaNewArray): + d_var['datatype' ] = expr.dtype + d_var['memory_handling'] = 'heap' if expr.rank > 0 else 'stack' + d_var['memory_location'] = expr.memory_location + d_var['shape' ] = expr.shape + d_var['rank' ] = expr.rank + d_var['order' ] = expr.order + d_var['precision' ] = expr.precision + d_var['cls_base' ] = CudaArrayClass + return d_var + elif isinstance(expr, NumpyTranspose): var = expr.internal_var @@ -879,8 +904,17 @@ def _handle_function(self, expr, func, args, **settings): FunctionCall/PyccelInternalFunction The semantic representation of the call. 
""" + if isinstance(func, FunctionDef) and 'kernel' in func.decorators: + errors.report(MISSING_KERNEL_CONFIGURATION, symbol = expr, severity = 'fatal') if isinstance(func, PyccelFunctionDef): func = func.cls_name + if func in (CudaThreadIdx, CudaBlockDim, CudaBlockIdx, CudaGridDim): + if 'kernel' not in self.scope.decorators\ + and 'device' not in self.scope.decorators: + errors.report("Cuda internal variables should only be used in Kernel or Device functions", + symbol = expr, + severity = 'fatal') + args, kwargs = split_positional_keyword_arguments(*args) try: @@ -935,6 +969,92 @@ def _handle_function(self, expr, func, args, **settings): expr, func.is_elemental) return new_expr + def _handle_kernel(self, expr, func, args, **settings): + """ + Create a FunctionCall or an instance of a PyccelInternalFunction + from the function information and arguments + + Parameters + ========== + expr : PyccelAstNode + The expression where this call is found (used for error output) + func : FunctionDef instance, Interface instance or PyccelInternalFunction type + The function being called + args : tuple + The arguments passed to the function + + Returns + ======= + new_expr : FunctionCall or PyccelInternalFunction + """ + if 'kernel' not in func.decorators: + errors.report(INVALID_FUNCTION_CALL, + symbol = expr, + severity = 'fatal') + if isinstance(func, PyccelFunctionDef): + func = func.cls_name + args, kwargs = split_positional_keyword_arguments(*args) + for a in args: + if getattr(a,'dtype',None) == 'tuple': + self._infere_type(a, **settings) + for a in kwargs.values(): + if getattr(a,'dtype',None) == 'tuple': + self._infere_type(a, **settings) + + try: + new_expr = func(*args, **kwargs) + except TypeError: + errors.report(UNRECOGNISED_FUNCTION_CALL, + symbol = expr, + severity = 'fatal') + + return new_expr + else: + if isinstance(func, FunctionDef) and len(args) > len(func.arguments): + errors.report("Too many arguments passed in function call", + symbol = expr, + 
severity='fatal') + # TODO : type check the NUMBER OF BLOCKS 'numBlocks' and threads per block 'tpblock' + if not isinstance(expr.numBlocks, LiteralInteger): + # expr.numBlocks could be invalid type, or PyccelSymbol + if isinstance(expr.numBlocks, PyccelSymbol): + numBlocks = self.get_variable(expr.numBlocks) + if not isinstance(numBlocks.dtype, NativeInteger): + errors.report(INVALID_KERNEL_CALL_BP_GRID, + symbol = expr, + severity='error') + else: + errors.report(INVALID_KERNEL_CALL_BP_GRID, + symbol = expr, + severity='error') + if not isinstance(expr.tpblock, LiteralInteger): + # expr.tpblock could be invalid type, or PyccelSymbol + if isinstance(expr.tpblock, PyccelSymbol): + tpblock = self.get_variable(expr.tpblock) + if not isinstance(tpblock.dtype, NativeInteger): + errors.report(INVALID_KERNEL_CALL_TP_BLOCK, + symbol = expr, + severity='error') + else: + errors.report(INVALID_KERNEL_CALL_TP_BLOCK, + symbol = expr, + severity='error') + new_expr = KernelCall(func, args, expr.numBlocks, expr.tpblock, self._current_function) + + for a in new_expr.args: + if a is None: + errors.report("Too few arguments passed in function call", + symbol = expr, + severity='error') + elif isinstance(a.value, Variable) and a.value.on_stack: + errors.report("A variable allocated on the stack can't be passed to a Kernel function", + symbol = expr, + severity='error') + if isinstance(func, FunctionDef): + self._check_argument_compatibility(new_expr.args, func.arguments, + expr, func.is_elemental) + return new_expr + def _create_variable(self, name, dtype, rhs, d_lhs, arr_in_multirets=False): """ Create a new variable. 
@@ -1934,7 +2054,6 @@ def _visit_PyccelSymbol(self, expr, **settings): def _visit_DottedName(self, expr, **settings): - var = self.check_for_variable(_get_name(expr)) if var: return var @@ -2089,6 +2208,18 @@ def _visit_DottedName(self, expr, **settings): bounding_box=(self._current_fst_node.lineno, self._current_fst_node.col_offset), severity='fatal') + def _visit_KernelCall(self, expr, **settings): + name = expr.funcdef + try: + name = self.scope.get_expected_name(name) + except RuntimeError: + pass + func = self.scope.find(name, 'functions') + + args = self._handle_function_args(expr.args, **settings) + + return self._handle_kernel(expr, func, args, **settings) + def _visit_PyccelOperator(self, expr, **settings): args = [self._visit(a, **settings) for a in expr.args] return self._create_PyccelOperator(expr, args) diff --git a/pyccel/parser/syntactic.py b/pyccel/parser/syntactic.py index 35822a060f..605efa5ce6 100644 --- a/pyccel/parser/syntactic.py +++ b/pyccel/parser/syntactic.py @@ -17,7 +17,7 @@ from pyccel.ast.basic import Basic -from pyccel.ast.core import FunctionCall, FunctionCallArgument +from pyccel.ast.core import FunctionCall, FunctionCallArgument, KernelCall from pyccel.ast.core import Module from pyccel.ast.core import Assign from pyccel.ast.core import AugAssign @@ -911,6 +911,10 @@ def _visit_Call(self, stmt): elif isinstance(func, DottedName): func_attr = FunctionCall(func.name[-1], args) func = DottedName(*func.name[:-1], func_attr) + elif isinstance(func, IndexedElement): + if len(func.indices) != 2: + raise NotImplementedError + func = KernelCall(func.base, args, func.indices[0], func.indices[1]) else: raise NotImplementedError(' Unknown function type {}'.format(str(type(func)))) diff --git a/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.cu b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.cu new file mode 100644 index 0000000000..2563362433 --- /dev/null +++ b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.cu @@ -0,0 +1,185 @@ +#include "cuda_ndarrays.h" + 
+__global__ +void cuda_array_arange_int8(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int8[i] = (i + start); +} +__global__ +void cuda_array_arange_int32(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int32[i] = (i + start); +} +__global__ +void cuda_array_arange_int64(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int64[i] = (i + start); +} +__global__ +void cuda_array_arange_double(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_double[i] = (i + start); +} + +__global__ +void cuda_array_fill_int8(int8_t c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int8[i] = c; +} + +__global__ +void cuda_array_fill_int32(int32_t c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int32[i] = c; +} + +__global__ +void cuda_array_fill_int64(int64_t c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int64[i] = c; +} +__global__ +void cuda_array_fill_double(double c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_double[i] = c; +} + +void device_memory(void** devPtr, size_t 
size) +{ + cudaMalloc(devPtr, size); +} + +void managed_memory(void** devPtr, size_t size) +{ + cudaMallocManaged(devPtr, size); +} + +void host_memory(void** devPtr, size_t size) +{ + cudaMallocHost(devPtr, size); +} + +t_ndarray cuda_array_create(int32_t nd, int64_t *shape, + enum e_types type, bool is_view, enum e_memory_locations location) +{ + t_ndarray arr; + void (*fun_ptr_arr[])(void**, size_t) = {managed_memory, host_memory, device_memory}; + + arr.nd = nd; + arr.type = type; + switch (type) + { + case nd_int8: + arr.type_size = sizeof(int8_t); + break; + case nd_int16: + arr.type_size = sizeof(int16_t); + break; + case nd_int32: + arr.type_size = sizeof(int32_t); + break; + case nd_int64: + arr.type_size = sizeof(int64_t); + break; + case nd_float: + arr.type_size = sizeof(float); + break; + case nd_double: + arr.type_size = sizeof(double); + break; + case nd_bool: + arr.type_size = sizeof(bool); + break; + } + arr.is_view = is_view; + arr.length = 1; + cudaMallocManaged(&(arr.shape), arr.nd * sizeof(int64_t)); + for (int32_t i = 0; i < arr.nd; i++) + { + arr.length *= shape[i]; + arr.shape[i] = shape[i]; + } + arr.buffer_size = arr.length * arr.type_size; + cudaMallocManaged(&(arr.strides), nd * sizeof(int64_t)); + for (int32_t i = 0; i < arr.nd; i++) + { + arr.strides[i] = 1; + for (int32_t j = i + 1; j < arr.nd; j++) + arr.strides[i] *= arr.shape[j]; + } + if (!is_view) + (*fun_ptr_arr[location])(&(arr.raw_data), arr.buffer_size); + return (arr); +} + +int32_t cuda_free_host(t_ndarray arr) +{ + if (arr.shape == NULL) + return (0); + cudaFreeHost(arr.raw_data); + arr.raw_data = NULL; + cudaFree(arr.shape); + arr.shape = NULL; + cudaFree(arr.strides); + arr.strides = NULL; + return (1); +} + +__host__ __device__ +int32_t cuda_free(t_ndarray arr) +{ + if (arr.shape == NULL) + return (0); + cudaFree(arr.raw_data); + arr.raw_data = NULL; + cudaFree(arr.shape); + arr.shape = NULL; + cudaFree(arr.strides); + arr.strides = NULL; + return (1); +} + +__host__ 
__device__
+int32_t cuda_free_pointer(t_ndarray arr)
+{
+    if (arr.is_view == false || arr.shape == NULL)
+        return (0);
+    cudaFree(arr.shape);
+    arr.shape = NULL;
+    cudaFree(arr.strides);
+    arr.strides = NULL;
+    return (1);
+}
+
diff --git a/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.h b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.h
new file mode 100644
index 0000000000..8e88ecd998
--- /dev/null
+++ b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.h
@@ -0,0 +1,34 @@
+#ifndef CUDA_NDARRAYS_H
+# define CUDA_NDARRAYS_H
+
+#include "../ndarrays/ndarrays.h"
+
+__global__
+void cuda_array_arange_int8(t_ndarray arr, int start);
+__global__
+void cuda_array_arange_int32(t_ndarray arr, int start);
+__global__
+void cuda_array_arange_int64(t_ndarray arr, int start);
+__global__
+void cuda_array_arange_double(t_ndarray arr, int start);
+
+__global__
+void cuda_array_fill_int8(int8_t c, t_ndarray arr);
+__global__
+void cuda_array_fill_int32(int32_t c, t_ndarray arr);
+__global__
+void cuda_array_fill_int64(int64_t c, t_ndarray arr);
+__global__
+void cuda_array_fill_double(double c, t_ndarray arr);
+
+t_ndarray cuda_array_create(int32_t nd, int64_t *shape, enum e_types type, bool is_view, enum e_memory_locations location);
+int32_t cuda_free_array(t_ndarray dump);
+
+int32_t cuda_free_host(t_ndarray arr);
+
+__host__ __device__
+int32_t cuda_free(t_ndarray arr);
+
+__host__ __device__
+int32_t cuda_free_pointer(t_ndarray arr);
+#endif
diff --git a/pyccel/stdlib/ndarrays/ndarrays.c b/pyccel/stdlib/ndarrays/ndarrays.c
index 70ebd25a9e..6f952730f0 100644
--- a/pyccel/stdlib/ndarrays/ndarrays.c
+++ b/pyccel/stdlib/ndarrays/ndarrays.c
@@ -42,12 +42,14 @@ t_ndarray array_create(int32_t nd, int64_t *shape,
         case nd_bool:
             arr.type_size = sizeof(bool);
             break;
+        #ifndef __NVCC__
         case nd_cfloat:
             arr.type_size = sizeof(float complex);
             break;
         case nd_cdouble:
             arr.type_size = sizeof(double complex);
             break;
+        #endif
     }
     arr.is_view = is_view;
     arr.length = 1;
@@ -95,12 +97,14 @@ void 
stack_array_init(t_ndarray *arr) case nd_bool: arr->type_size = sizeof(bool); break; + #ifndef __NVCC__ case nd_cfloat: arr->type_size = sizeof(float complex); break; case nd_cdouble: arr->type_size = sizeof(double complex); break; + #endif } arr->length = 1; for (int32_t i = 0; i < arr->nd; i++) @@ -114,7 +118,7 @@ void stack_array_init(t_ndarray *arr) } } -void _array_fill_int8(int8_t c, t_ndarray arr) +void array_fill_int8(int8_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -123,7 +127,7 @@ void _array_fill_int8(int8_t c, t_ndarray arr) arr.nd_int8[i] = c; } -void _array_fill_int16(int16_t c, t_ndarray arr) +void array_fill_int16(int16_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -132,7 +136,7 @@ void _array_fill_int16(int16_t c, t_ndarray arr) arr.nd_int16[i] = c; } -void _array_fill_int32(int32_t c, t_ndarray arr) +void array_fill_int32(int32_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -141,7 +145,7 @@ void _array_fill_int32(int32_t c, t_ndarray arr) arr.nd_int32[i] = c; } -void _array_fill_int64(int64_t c, t_ndarray arr) +void array_fill_int64(int64_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -150,7 +154,7 @@ void _array_fill_int64(int64_t c, t_ndarray arr) arr.nd_int64[i] = c; } -void _array_fill_bool(bool c, t_ndarray arr) +void array_fill_bool(bool c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -159,7 +163,7 @@ void _array_fill_bool(bool c, t_ndarray arr) arr.nd_bool[i] = c; } -void _array_fill_float(float c, t_ndarray arr) +void array_fill_float(float c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -168,7 +172,7 @@ void _array_fill_float(float c, t_ndarray arr) arr.nd_float[i] = c; } -void _array_fill_double(double c, t_ndarray arr) +void array_fill_double(double c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -177,7 +181,8 @@ void 
_array_fill_double(double c, t_ndarray arr) arr.nd_double[i] = c; } -void _array_fill_cfloat(float complex c, t_ndarray arr) +#ifndef __NVCC__ +void array_fill_cfloat(float complex c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -187,7 +192,7 @@ void _array_fill_cfloat(float complex c, t_ndarray arr) } -void _array_fill_cdouble(double complex c, t_ndarray arr) +void array_fill_cdouble(double complex c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -195,6 +200,7 @@ void _array_fill_cdouble(double complex c, t_ndarray arr) for (int32_t i = 0; i < arr.length; i++) arr.nd_cdouble[i] = c; } +#endif /* ** deallocation @@ -437,5 +443,7 @@ NUMPY_SUM_(int32, int64_t, int32) NUMPY_SUM_(int64, int64_t, int64) NUMPY_SUM_(float32, float, float) NUMPY_SUM_(float64, double, double) +#ifndef __NVCC__ NUMPY_SUM_(complex64, float complex, cfloat) NUMPY_SUM_(complex128, double complex, cdouble) +#endif diff --git a/pyccel/stdlib/ndarrays/ndarrays.h b/pyccel/stdlib/ndarrays/ndarrays.h index ec3c07d6fc..bbfdb19b3f 100644 --- a/pyccel/stdlib/ndarrays/ndarrays.h +++ b/pyccel/stdlib/ndarrays/ndarrays.h @@ -10,16 +10,9 @@ # include # include -/* mapping the function array_fill to the correct type */ -# define array_fill(c, arr) _Generic((c), int64_t : _array_fill_int64,\ - int32_t : _array_fill_int32,\ - int16_t : _array_fill_int16,\ - int8_t : _array_fill_int8,\ - float : _array_fill_float,\ - double : _array_fill_double,\ - bool : _array_fill_bool,\ - float complex : _array_fill_cfloat,\ - double complex : _array_fill_cdouble)(c, arr) +#ifdef __cplusplus +extern "C" { +#endif enum e_slice_type { ELEMENT, RANGE }; @@ -69,8 +62,17 @@ enum e_types nd_int64 = 7, nd_float = 11, nd_double = 12, + #ifndef __NVCC__ nd_cfloat = 14, nd_cdouble = 15 + #endif +}; + +enum e_memory_locations +{ + managedMemory, + allocateMemoryOnHost, + allocateMemoryOnDevice }; typedef struct s_ndarray @@ -85,8 +87,10 @@ typedef struct s_ndarray float 
*nd_float; double *nd_double; bool *nd_bool; + #ifndef __NVCC__ double complex *nd_cdouble; float complex *nd_cfloat; + #endif }; /* number of dimensions */ int32_t nd; @@ -112,15 +116,17 @@ typedef struct s_ndarray void stack_array_init(t_ndarray *arr); t_ndarray array_create(int32_t nd, int64_t *shape, enum e_types type, bool is_view); -void _array_fill_int8(int8_t c, t_ndarray arr); -void _array_fill_int16(int16_t c, t_ndarray arr); -void _array_fill_int32(int32_t c, t_ndarray arr); -void _array_fill_int64(int64_t c, t_ndarray arr); -void _array_fill_float(float c, t_ndarray arr); -void _array_fill_double(double c, t_ndarray arr); -void _array_fill_bool(bool c, t_ndarray arr); -void _array_fill_cfloat(float complex c, t_ndarray arr); -void _array_fill_cdouble(double complex c, t_ndarray arr); +void array_fill_int8(int8_t c, t_ndarray arr); +void array_fill_int16(int16_t c, t_ndarray arr); +void array_fill_int32(int32_t c, t_ndarray arr); +void array_fill_int64(int64_t c, t_ndarray arr); +void array_fill_float(float c, t_ndarray arr); +void array_fill_double(double c, t_ndarray arr); +void array_fill_bool(bool c, t_ndarray arr); +#ifndef __NVCC__ +void array_fill_cfloat(float complex c, t_ndarray arr); +void array_fill_cdouble(double complex c, t_ndarray arr); +#endif /* slicing */ /* creating a Slice object */ @@ -152,7 +158,13 @@ int64_t numpy_sum_int32(t_ndarray arr); int64_t numpy_sum_int64(t_ndarray arr); float numpy_sum_float32(t_ndarray arr); double numpy_sum_float64(t_ndarray arr); +#ifndef __NVCC__ float complex numpy_sum_complex64(t_ndarray arr); double complex numpy_sum_complex128(t_ndarray arr); +#endif + +#ifdef __cplusplus +} +#endif #endif diff --git a/pytest.ini b/pytest.ini index 42eb0d72ba..41cd7cdace 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,6 +6,7 @@ markers = parallel: test to be run using 'mpiexec' fortran: test to generate fortran code c: test to generate c code + ccuda: test to generate Ccuda code python: test to generate python code 
xdist_incompatible: test which compiles a file also compiled by another test external: test using an external dll (problematic with conda on Windows) diff --git a/samples/test/test.py b/samples/test/test.py new file mode 100644 index 0000000000..027e3945e9 --- /dev/null +++ b/samples/test/test.py @@ -0,0 +1,28 @@ +# from numpy import array +from pyccel.decorators import kernel +from pyccel import cuda +from numpy import array as narray +# from pyccel.stdlib.internal import PyccelthreadIdx + + +@types('int[:]') +def abbah(z): + return z[0] + + +@kernel +@types('int[:]') +def func(a): + i = cuda.threadIdx(0) + """ + test test + """ + print("Hello World! ", a[i]) + a[i] += 1 + +if __name__ == "__main__": + # b = narray([1, 2, 3], dtype='int', order='C') + a = cuda.array([1, 2, 3], dtype='int', order='C') + func[1, 3](a) + cuda.synchronize() + print(a) \ No newline at end of file diff --git a/samples/test/tt.cu b/samples/test/tt.cu new file mode 100644 index 0000000000..d300c3ac6c --- /dev/null +++ b/samples/test/tt.cu @@ -0,0 +1,20 @@ +#include +#include + +__global__ void func(int *p) +{ + p[0] = 5; + printf("%s\n", "Hello World!"); +} + +int main() +{ + int *p1; + + cudaMallocManaged(&p1, 3*sizeof(int)); + p1[0] = 0; + func<<<1,2>>>(p1); + cudaDeviceSynchronize(); + printf("%d\n", p1[0]); + return 0; +} \ No newline at end of file diff --git a/tests/cuda_ndarrays/conftest.py b/tests/cuda_ndarrays/conftest.py new file mode 100644 index 0000000000..112323a6cf --- /dev/null +++ b/tests/cuda_ndarrays/conftest.py @@ -0,0 +1,120 @@ +# pylint: disable=missing-function-docstring, missing-module-docstring/ +import subprocess +import os +import pathlib +import sys +import shutil +import pytest + +NEEDS_FROM_PARENT = hasattr(pytest.Item, "from_parent") + +def pytest_collect_file(parent, path): + """ + A hook to collect test_*.cu test files. 
+ """ + if path.ext == ".cu" and path.basename.startswith("test_"): + if NEEDS_FROM_PARENT: + return CTestFile.from_parent(path=pathlib.Path(path), parent=parent) + return CTestFile(parent=parent, path=pathlib.Path(path)) + return None + + +class CTestFile(pytest.File): + """ + A custom file handler class for CCuda unit test files. + """ + + @classmethod + def from_parent(cls, parent, **kwargs): + return super().from_parent(parent=parent, **kwargs) + + def collect(self): + """ + Overridden collect method to collect the results from each + CCuda unit test executable. + """ + # Run the exe that corresponds to the .cu file and capture the output. + test_exe = os.path.splitext(self.fspath)[0] + rootdir = self.config.rootdir.strpath + test_exe = os.path.relpath(test_exe) + ndarray_path = os.path.join(rootdir , "pyccel", "stdlib", "ndarrays") + cuda_ndarray_path = os.path.join(rootdir, "pyccel", "stdlib", "cuda_ndarrays") + comp_cmd = [shutil.which("nvcc"), test_exe + ".cu", + os.path.join(ndarray_path, "ndarrays.c"), + os.path.join(cuda_ndarray_path, "cuda_ndarrays.cu"), + "-I", ndarray_path, + "-I", cuda_ndarray_path, + "-o", test_exe,] + + subprocess.run(comp_cmd, check=True) + if sys.platform.startswith("win"): + test_exe += ".exe" + test_output = subprocess.check_output("./" + test_exe) + + # Clean up the unit test output and remove non test data lines. + lines = test_output.decode().split("\n") + lines = [line.strip() for line in lines] + lines = [line for line in lines if line.startswith("[")] + + # Extract the test metadata from the unit test output. 
+ test_results = [] + for line in lines: + token, data = line.split(" ", 1) + token = token[1:-1] + if token in ("PASS", "FAIL"): + file_name, line_number, function_name = data.split(":") + test_results.append({"condition": token, + "file_name": file_name, + "function_name": function_name, + "line_number": int(line_number), + "INFO" : "no data found", + "DSCR" : "" + }) + elif token in ("INFO", "DSCR"): + test_results[-1][token] = data + for test_result in test_results: + if NEEDS_FROM_PARENT: + yield CTestItem.from_parent(name = test_result["function_name"] + " < " + test_result["DSCR"] + " >", + parent = self, test_result = test_result) + else: + yield CTestItem(name = test_result["function_name"] + " < " + test_result["DSCR"] + " >", parent = self, + test_result = test_result) + + + class CTestItem(pytest.Item): + """ + Pytest.Item subclass to handle each test result item. There may be + more than one test result from a test function. + """ + + def __init__(self, *, test_result, **kwargs): + """Overridden constructor to pass test results dict.""" + super().__init__(**kwargs) + self.test_result = test_result + + @classmethod + def from_parent(cls, parent, **kwargs): + return super().from_parent(parent=parent, **kwargs) + + def runtest(self): + """The test has already been run. We just evaluate the result.""" + if self.test_result["condition"] == "FAIL": + raise CTestException(self, self.name) + + def reportinfo(self): + """Called to display header information about the test case.""" + return self.fspath, self.test_result["line_number"], self.name + + def repr_failure(self, excinfo, style=None): + """ + Called when runtest() raises an exception. The method is used + to format the output of the failed test result. 
+ """ + if isinstance(excinfo.value, CTestException): + return f"Test failed : {self.test_result['file_name']}:{self.test_result['line_number']} {self.test_result['function_name']} < {self.test_result['DSCR']} >\n INFO : {self.test_result['INFO']}" + return super().repr_failure(excinfo) + + + +class CTestException(Exception): + """Custom exception to distinguish C unit test failures from others.""" diff --git a/tests/cuda_ndarrays/test_cuda_ndarrays.cu b/tests/cuda_ndarrays/test_cuda_ndarrays.cu new file mode 100644 index 0000000000..e3c5e57917 --- /dev/null +++ b/tests/cuda_ndarrays/test_cuda_ndarrays.cu @@ -0,0 +1,503 @@ +#include +#include +#include +#include +#include "ndarrays.h" +#include "cuda_ndarrays.h" + +void assert_double(double v1 , double v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%f != v2:%f\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_float(float v1 , float v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%f != v2:%f\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int64(int64_t v1, int64_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%ld != v2:%ld\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int32(int32_t v1 , int32_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + 
printf("[INFO] v1:%d != v2:%d\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n",file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int16(int16_t v1 , int16_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%d != v2:%d\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int8(int8_t v1 , int8_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%d != v2:%d\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void test_cuda_array_create_host_double() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_double, false, allocateMemoryOnHost); + double cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_double(arr.nd_double[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_double() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_double, false, managedMemory); + double cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; 
+ cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_double(arr.nd_double[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_double() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_double, false, allocateMemoryOnDevice); + double cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_double, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_double, arr.nd_double, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_double(b.nd_double[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_float() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_float, false, allocateMemoryOnHost); + float cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_float, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, 
__LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_float(arr.nd_float[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_float() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_float, false, managedMemory); + float cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_float, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_float(arr.nd_float[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_float() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_float, false, allocateMemoryOnDevice); + float cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_float, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_float, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_float, arr.nd_float, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_float(b.nd_float[i], cuda_array_dummy[i], 
"testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_int64() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int64, false, allocateMemoryOnHost); + int64_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int64, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int64(arr.nd_int64[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int64() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int64, false, managedMemory); + int64_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int64, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int64(arr.nd_int64[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int64() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int64, false, allocateMemoryOnDevice); + int64_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, 
cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int64, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int64, arr.nd_int64, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int64(b.nd_int64[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + + +void test_cuda_array_create_host_int32() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int32, false, allocateMemoryOnHost); + int32_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int32, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int32(arr.nd_int32[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int32() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int32, false, managedMemory); + int32_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int32, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + 
assert_int32(arr.nd_int32[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int32() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int32, false, allocateMemoryOnDevice); + int32_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int32, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int32, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int32, arr.nd_int32, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int32(b.nd_int32[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_int16() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int16, false, allocateMemoryOnHost); + int16_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int16, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int16(arr.nd_int16[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int16() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = 
{INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int16, false, managedMemory); + int16_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int16, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int16(arr.nd_int16[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int16() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int16, false, allocateMemoryOnDevice); + int16_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int16, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int16, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int16, arr.nd_int16, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int16(b.nd_int16[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_int8() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int8, false, allocateMemoryOnHost); + int8_t cuda_array_dummy[] = {1, 0, 0, 1, 116, 33, 5, 57, + 62, 70, 103, 120, 122, 26}; + cudaMemcpy(arr.nd_int8, cuda_array_dummy, arr.buffer_size, 
cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int8(arr.nd_int8[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int8() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int8, false, managedMemory); + int8_t cuda_array_dummy[] = {1, 0, 0, 1, 116, 33, 5, 57, + 62, 70, 103, 120, 122, 26}; + cudaMemcpy(arr.nd_int8, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int8(arr.nd_int8[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int8() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int8, false, allocateMemoryOnDevice); + int8_t cuda_array_dummy[] = {1, 0, 0, 1, 116, 33, 5, 57, + 62, 70, 103, 120, 122, 26}; + cudaMemcpy(arr.nd_int8, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int8, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int8, arr.nd_int8, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + 
assert_int8(b.nd_int8[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +int32_t main(void) +{ + /* Cuda array creation tests */ + test_cuda_array_create_host_double(); + test_cuda_array_create_managed_double(); + test_cuda_array_create_device_double(); + + test_cuda_array_create_host_float(); + test_cuda_array_create_managed_float(); + test_cuda_array_create_device_float(); + + test_cuda_array_create_host_int64(); + test_cuda_array_create_managed_int64(); + test_cuda_array_create_device_int64(); + + test_cuda_array_create_host_int32(); + test_cuda_array_create_managed_int32(); + test_cuda_array_create_device_int32(); + + test_cuda_array_create_host_int16(); + test_cuda_array_create_managed_int16(); + test_cuda_array_create_device_int16(); + + test_cuda_array_create_host_int8(); + test_cuda_array_create_managed_int8(); + test_cuda_array_create_device_int8(); + + return (0); +} diff --git a/tests/cuda_test/test_kernel_semantic.py b/tests/cuda_test/test_kernel_semantic.py new file mode 100644 index 0000000000..e08e4e3711 --- /dev/null +++ b/tests/cuda_test/test_kernel_semantic.py @@ -0,0 +1,218 @@ + +# pylint: disable=missing-function-docstring, missing-module-docstring/ +import pytest + +import numpy as np +from pyccel.epyccel import epyccel +from pyccel.decorators import stack_array, types, kernel +from pyccel.errors.errors import Errors, PyccelSemanticError +from pyccel.errors.messages import (INVALID_FUNCTION_CALL, KERNEL_STACK_ARRAY_ARG, MISSING_KERNEL_CONFIGURATION, + NON_KERNEL_FUNCTION_CUDA_VAR, + INVALID_KERNEL_CALL_BP_GRID, + INVALID_KERNEL_CALL_TP_BLOCK + ) + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_stack_array_kernel(language): + @stack_array('arr') + def kernel_caller(): + from numpy import ones + @kernel + @types('int[:]') + def stack_array_kernel(arr): + return arr[0] + arr = ones(1, dtype=int) + return 
stack_array_kernel[1,1](arr) + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(kernel_caller, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'stack_array_kernel' + assert KERNEL_STACK_ARRAY_ARG == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_cuda_intern_var_non_kernel(language): + def non_kernel_function(): + from pyccel import cuda + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(non_kernel_function, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.name[0] == 'cuda' + assert error_info.symbol.name[1].func_name == 'threadIdx' + assert NON_KERNEL_FUNCTION_CUDA_VAR == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_unvalid_block_number(language): + def unvalid_block_number(): + @kernel + def kernel_call(): + pass + kernel_call[1.2,1]() + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(unvalid_block_number, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = 
[*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_BP_GRID == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_unvalid_thread_per_block(language): + def unvalid_thread_per_block(): + @kernel + def kernel_call(): + pass + kernel_call[1,1.2]() + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(unvalid_thread_per_block, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_TP_BLOCK == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_missing_kernel_config(language): + def missing_kernel_config(): + @kernel + def kernel_call(): + pass + kernel_call() + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(missing_kernel_config, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.func_name == 'kernel_call' + assert MISSING_KERNEL_CONFIGURATION == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_invalid_block_number(language): + def invalid_block_number(): + @kernel + def kernel_call(): + pass + blocks_per_grid = 5.0 + kernel_call[blocks_per_grid, 1]() + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(invalid_block_number, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = 
[*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_BP_GRID == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_invalid_thread_per_block(language): + def invalid_thread_per_block(): + @kernel + def kernel_call(): + pass + threads_per_block = 5.0 + kernel_call[1, threads_per_block]() + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(invalid_thread_per_block, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_TP_BLOCK == error_info.message + + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_invalid_function_call(language): + def invalid_function_call(): + def non_kernel_func(): + pass + non_kernel_func[1, 2]() # pylint: disable=E1136 + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(invalid_function_call, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'non_kernel_func' + assert INVALID_FUNCTION_CALL == error_info.message diff --git a/tests/internal/scripts/ccuda/cuda_array_device.py b/tests/internal/scripts/ccuda/cuda_array_device.py new file mode 100644 index 0000000000..17c50a2824 --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_array_device.py @@ -0,0 +1,4 @@ +from pyccel import cuda + +if __name__ == '__main__': + arr = cuda.array([0,1,2,3,4], memory_location = 'device') diff --git a/tests/internal/scripts/ccuda/cuda_array_host.py b/tests/internal/scripts/ccuda/cuda_array_host.py new file mode 100644 index 0000000000..8702031bba --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_array_host.py 
@@ -0,0 +1,4 @@ +from pyccel import cuda + +if __name__ == '__main__': + arr = cuda.array([0,1,2,3,4], memory_location='host') diff --git a/tests/internal/scripts/ccuda/cuda_array_managed.py b/tests/internal/scripts/ccuda/cuda_array_managed.py new file mode 100644 index 0000000000..58070d84ae --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_array_managed.py @@ -0,0 +1,4 @@ +from pyccel import cuda + +if __name__ == '__main__': + a = cuda.array([0,1,2,3,4], memory_location = 'managed') diff --git a/tests/internal/scripts/ccuda/cuda_copy.py b/tests/internal/scripts/ccuda/cuda_copy.py new file mode 100644 index 0000000000..11a59c7fa0 --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_copy.py @@ -0,0 +1,6 @@ +from pyccel.decorators import kernel, types +from pyccel import cuda + +if __name__ == '__main__': + arr = cuda.array([1,2,3,4], memory_location='device') + out = cuda.copy(arr, 'host', is_async=True) diff --git a/tests/internal/scripts/ccuda/cuda_grid.py b/tests/internal/scripts/ccuda/cuda_grid.py new file mode 100644 index 0000000000..56c2d14325 --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_grid.py @@ -0,0 +1,20 @@ +from pyccel.decorators import kernel, types +from pyccel import cuda + +@kernel +@types('int[:]') +def func_1d(a): + i = cuda.grid(0) + print("1 dim :", a[i]) + +@kernel +@types('int[:]') +def func_2d(a): + i, j = cuda.grid(1) + print("2 dim :", a[i], a[j]) + +@kernel +@types('int[:]') +def func_3d(a): + i, j, k = cuda.grid(2) + print("3 dim :", a[i], a[j], a[k]) diff --git a/tests/internal/scripts/ccuda/cupy_arange.py b/tests/internal/scripts/ccuda/cupy_arange.py new file mode 100644 index 0000000000..91664bf956 --- /dev/null +++ b/tests/internal/scripts/ccuda/cupy_arange.py @@ -0,0 +1,4 @@ +import cupy as cp + +if __name__ == '__main__': + arr = cp.arange(32) diff --git a/tests/internal/scripts/ccuda/cupy_array.py b/tests/internal/scripts/ccuda/cupy_array.py new file mode 100644 index 0000000000..c168c156e5 --- /dev/null +++ 
b/tests/internal/scripts/ccuda/cupy_array.py @@ -0,0 +1,4 @@ +import cupy as cp + +if __name__ == '__main__': + arr = cp.array([0, 1, 2, 3, 4]) diff --git a/tests/internal/scripts/ccuda/free_pointer.py b/tests/internal/scripts/ccuda/free_pointer.py new file mode 100644 index 0000000000..4cc6e8f722 --- /dev/null +++ b/tests/internal/scripts/ccuda/free_pointer.py @@ -0,0 +1,16 @@ +# pylint: disable=missing-function-docstring, disable=unused-variable, missing-module-docstring + +import math +from pyccel.decorators import kernel +from pyccel import cuda + +@kernel +def func(arr:'int[:]'): + i = cuda.grid(0) + arr[i] = math.pow(arr[i], 2) + +if __name__ == '__main__': + a = cuda.array([1,2,3,4], memory_location='device') + func[1,4](a) + c = a + cuda.synchronize() diff --git a/tests/internal/scripts/ccuda/kernel.py b/tests/internal/scripts/ccuda/kernel.py new file mode 100644 index 0000000000..86d77418c6 --- /dev/null +++ b/tests/internal/scripts/ccuda/kernel.py @@ -0,0 +1,8 @@ +from pyccel.decorators import kernel, types +from pyccel import cuda + +@kernel +@types('int[:]') +def func(a): + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + print("Hello World! ", a[i]) diff --git a/tests/internal/scripts/ccuda/kernel_launch_config_literal.py b/tests/internal/scripts/ccuda/kernel_launch_config_literal.py new file mode 100644 index 0000000000..25a6d3f317 --- /dev/null +++ b/tests/internal/scripts/ccuda/kernel_launch_config_literal.py @@ -0,0 +1,11 @@ +from pyccel.decorators import kernel +from pyccel import cuda + + +@kernel +def func(): + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + print("Hello World! 
") + +if __name__ == '__main__': + func[1, 5]() diff --git a/tests/internal/scripts/ccuda/kernel_launch_config_variable.py b/tests/internal/scripts/ccuda/kernel_launch_config_variable.py new file mode 100644 index 0000000000..199e9ee6e5 --- /dev/null +++ b/tests/internal/scripts/ccuda/kernel_launch_config_variable.py @@ -0,0 +1,13 @@ +from pyccel.decorators import kernel +from pyccel import cuda + + +@kernel +def func(): + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + print("Hello World! ") + +if __name__ == '__main__': + nb = 1 + tpb = 5 + func[nb, tpb]() diff --git a/tests/internal/test_internal.py b/tests/internal/test_internal.py index 10630e7e8b..cf9fd01a8d 100644 --- a/tests/internal/test_internal.py +++ b/tests/internal/test_internal.py @@ -41,6 +41,14 @@ def test_mpi(f): def test_openmp(f, language): execute_pyccel(f, accelerators=['openmp'], language=language) +@pytest.mark.parametrize("f", get_files_from_folder('ccuda')) +@pytest.mark.parametrize( 'language', + (pytest.param("ccuda", marks = pytest.mark.ccuda),) +) +@pytest.mark.external +def test_ccuda(f, language): + execute_pyccel(f, language=language) + #@pytest.mark.parametrize("f", get_files_from_folder('openacc')) #@pytest.mark.external #def test_openacc(): @@ -98,3 +106,13 @@ def test_openmp(f, language): # print('> testing {0}'.format(str(os.path.basename(f)))) # test_openacc(f) # print('\n') + + print('*********************************') + print('*** ***') + print('*** TESTING INTERNAL/Cuda ***') + print('*** ***') + print('*********************************') + for f in get_files_from_folder('ccuda'): + print('> testing {0}'.format(str(os.path.basename(f)))) + test_ccuda(f) + print('\n') diff --git a/tests/ndarrays/test_ndarrays.c b/tests/ndarrays/test_ndarrays.c index eee6caa020..2c5116b11c 100644 --- a/tests/ndarrays/test_ndarrays.c +++ b/tests/ndarrays/test_ndarrays.c @@ -609,7 +609,7 @@ int32_t test_array_fill_int64(void) int64_t c_value; x = array_create(2, m_1_shape, 
nd_int64, false); - array_fill((int64_t)32, x); + array_fill_int64((int64_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -633,7 +633,7 @@ int32_t test_array_fill_int32(void) int32_t c_value; x = array_create(2, m_1_shape, nd_int32, false); - array_fill((int32_t)32, x); + array_fill_int32((int32_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -657,7 +657,7 @@ int32_t test_array_fill_int16(void) int16_t c_value; x = array_create(2, m_1_shape, nd_int16, false); - array_fill((int16_t)32, x); + array_fill_int16((int16_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -681,7 +681,7 @@ int32_t test_array_fill_int8(void) int8_t c_value; x = array_create(2, m_1_shape, nd_int8, false); - array_fill((int8_t)32, x); + array_fill_int8((int8_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -705,7 +705,7 @@ int32_t test_array_fill_double(void) double c_value; x = array_create(2, m_1_shape, nd_double, false); - array_fill(2., x); + array_fill_double(2., x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -729,7 +729,7 @@ int32_t test_array_fill_cdouble(void) double complex c_value; x = array_create(2, m_1_shape, nd_cdouble, false); - array_fill(0.3+0.54*I, x); + array_fill_cdouble(0.3+0.54*I, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -755,7 +755,7 @@ int32_t test_array_zeros_double(void) double c_value; x = array_create(2, m_1_shape, nd_double, false); - array_fill(0, x); + array_fill_double(0, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -779,7 +779,7 @@ int32_t test_array_zeros_int32(void) int32_t c_value; x = array_create(2, m_1_shape, nd_int32, false); - array_fill(0, x); + array_fill_int32(0, x); // testing the index [3, 1] index = 3 * 
x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -803,7 +803,7 @@ int32_t test_array_zeros_cdouble(void) double complex c_value; x = array_create(2, m_1_shape, nd_cdouble, false); - array_fill(0, x); + array_fill_cdouble(0, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; diff --git a/tests/run_tests_py3.sh b/tests/run_tests_py3.sh index 6145dca0d2..16fb78a4f0 100755 --- a/tests/run_tests_py3.sh +++ b/tests/run_tests_py3.sh @@ -15,7 +15,7 @@ SCRIPT_DIR=$(dirname -- "$(realpath -- "$0")") #python3 "$SCRIPT_DIR"/internal/test_internal.py #python3 "$SCRIPT_DIR"/external/test_external.py #python3 "$SCRIPT_DIR"/macro/test_macro.py - +python3 -m pytest "$SCRIPT_DIR"/cuda_test -v python3 -m pytest "$SCRIPT_DIR"/pyccel -v python3 -m pytest "$SCRIPT_DIR"/epyccel -v -m "not parallel" mpirun -n 4 python3 -m pytest "$SCRIPT_DIR"/epyccel/test_epyccel_mpi_modules.py -v diff --git a/tutorial/decorators.md b/tutorial/decorators.md index cf7edb3589..5f64b96a0a 100644 --- a/tutorial/decorators.md +++ b/tutorial/decorators.md @@ -600,6 +600,45 @@ The generated C code: ```c ``` +## GPU decorators + +### kernel + +This decorator is used to mark a Python function as a GPU kernel function using the Pyccel library. The decorated function can then be executed on a GPU by specifying the number of blocks and threads per block as arguments to the function using square brackets. The `kernel` decorator is provided by the Pyccel library and is used to create a GPU kernel from a Python function. + +#### Basic Example + +python Code: + +```Python +from pyccel.decorators import kernel + +@kernel +def func(): + #Code + +if __name__ == '__main__': + # the decorated function. 
+ func[1, 5]() +``` + +the Generated Code (Ccuda): + +```C +extern "C" __global__ void func(void) +{ + /*Code*/ + return; +} + +int main() +{ + /*the decorated function.*/ + func<<<1,5>>>(); + return 0; +} +``` + ## Getting Help If you face problems with Pyccel, please take the following steps: