diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..63440e5f3b --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# @global-owner1 and @global-owner2 will be requested for +# review when someone opens a pull request. +* @EmilyBourne @bauom + diff --git a/.github/actions/coverage_collection/action.yml b/.github/actions/coverage_collection/action.yml index 03d1b2cdfd..e32b82b317 100644 --- a/.github/actions/coverage_collection/action.yml +++ b/.github/actions/coverage_collection/action.yml @@ -6,7 +6,6 @@ runs: - name: Coverage collection run: | coverage combine - coverage xml rm ${SITE_DIR}/pyccel_cov.pth shell: bash diff --git a/.github/actions/coverage_install/action.yml b/.github/actions/coverage_install/action.yml index 44ef760dcb..fb55f29c45 100644 --- a/.github/actions/coverage_install/action.yml +++ b/.github/actions/coverage_install/action.yml @@ -10,7 +10,7 @@ runs: - name: Directory Creation run: | INSTALL_DIR=$(cd tests; python -c "import pyccel; print(pyccel.__path__[0])") - SITE_DIR=$(python -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])') + SITE_DIR=$(dirname ${INSTALL_DIR}) echo -e "import coverage; coverage.process_startup()" > ${SITE_DIR}/pyccel_cov.pth echo -e "[run]\nparallel = True\nsource = ${INSTALL_DIR}\ndata_file = $(pwd)/.coverage\n[report]\ninclude = ${INSTALL_DIR}/*\n[xml]\noutput = cobertura.xml" > .coveragerc echo "SITE_DIR=${SITE_DIR}" >> $GITHUB_ENV diff --git a/.github/actions/linux_install/action.yml b/.github/actions/linux_install/action.yml index 8fb5cd8505..0ef9a69b8e 100644 --- a/.github/actions/linux_install/action.yml +++ b/.github/actions/linux_install/action.yml @@ -9,22 +9,22 @@ runs: shell: bash - name: Install fortran run: - sudo apt-get install gfortran + sudo apt-get install -y gfortran shell: bash - name: Install LaPack run: - sudo apt-get install 
libblas-dev liblapack-dev + sudo apt-get install -y libblas-dev liblapack-dev shell: bash - name: Install MPI run: | - sudo apt-get install libopenmpi-dev openmpi-bin + sudo apt-get install -y libopenmpi-dev openmpi-bin echo "MPI_OPTS=--oversubscribe" >> $GITHUB_ENV shell: bash - name: Install OpenMP run: - sudo apt-get install libomp-dev libomp5 + sudo apt-get install -y libomp-dev libomp5 shell: bash - name: Install Valgrind run: - sudo apt-get install valgrind + sudo apt-get install -y valgrind shell: bash diff --git a/.github/actions/pytest_run/action.yml b/.github/actions/pytest_run/action.yml index 13f20d7c04..10113c373e 100644 --- a/.github/actions/pytest_run/action.yml +++ b/.github/actions/pytest_run/action.yml @@ -16,17 +16,18 @@ runs: - name: Test with pytest run: | which python - python -m pytest -n auto -rXx -v -m "not (parallel or xdist_incompatible) and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays + python -m pytest -n auto -rXx -v -m "not (parallel or xdist_incompatible) and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays if [ -n "${SITE_DIR}" ]; then echo "Touching" # Test ndarray folder update (requires parallel tests to avoid clean) touch ${SITE_DIR}/pyccel/stdlib/cwrapper/cwrapper.h python -m pytest -n auto -rXx -v -m c -k test_array_int32_1d_scalar epyccel/test_arrays.py fi - python -m pytest -rXx -m "xdist_incompatible and not parallel and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays + + python -m pytest -rXx -m "xdist_incompatible and not parallel and c ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays pyccel-clean - python -m pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and not (c or python) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays - python -m pytest -rXx -m "xdist_incompatible and not parallel and not (c or python) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays + python -m 
pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and not (c or python or ccuda) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays + python -m pytest -rXx -m "xdist_incompatible and not parallel and not (c or python or ccuda) ${{ inputs.pytest_mark }}" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays pyccel-clean python -m pytest ndarrays/ -rXx pyccel-clean diff --git a/.github/actions/pytest_run_cuda/action.yml b/.github/actions/pytest_run_cuda/action.yml new file mode 100644 index 0000000000..0c55ccd81a --- /dev/null +++ b/.github/actions/pytest_run_cuda/action.yml @@ -0,0 +1,17 @@ +name: 'Pyccel pytest commands generating Ccuda' +inputs: + shell_cmd: + description: 'Specifies the shell command (different for anaconda)' + required: false + default: "bash" + +runs: + using: "composite" + steps: + - name: Ccuda tests with pytest + run: | + # Catch exit 5 (no tests found) + sh -c 'python -m pytest -n auto -rx -m "not (parallel or xdist_incompatible) and ccuda" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret' + pyccel-clean + shell: ${{ inputs.shell_cmd }} + working-directory: ./tests diff --git a/.github/actions/pytest_run_python/action.yml b/.github/actions/pytest_run_python/action.yml index 842fd2eaf6..c9595dde1f 100644 --- a/.github/actions/pytest_run_python/action.yml +++ b/.github/actions/pytest_run_python/action.yml @@ -10,8 +10,8 @@ runs: steps: - name: Python tests with pytest run: | - python -m pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and python" --ignore=symbolic --ignore=ndarrays - python -m pytest -rXx -m "xdist_incompatible and not parallel and python" --ignore=symbolic --ignore=ndarrays + python -m pytest -n auto -rXx -m "not (parallel or xdist_incompatible) and python" --ignore=symbolic --ignore=ndarrays --ignore=cuda_ndarrays + python -m pytest -rXx -m "xdist_incompatible and not parallel and python" --ignore=symbolic 
--ignore=ndarrays --ignore=cuda_ndarrays pyccel-clean shell: ${{ inputs.shell_cmd }} working-directory: ./tests diff --git a/.github/actions/python_install/action.yml b/.github/actions/python_install/action.yml new file mode 100644 index 0000000000..f9b720e3e1 --- /dev/null +++ b/.github/actions/python_install/action.yml @@ -0,0 +1,17 @@ +name: 'Python installation commands' + +runs: + using: "composite" + steps: + - name: Install python + run: + sudo apt-get -y install python3-dev + shell: bash + - name: python as python3 + run: + sudo apt-get -y install python-is-python3 + shell: bash + - name: Install Pip + run: + sudo apt-get -y install python3-pip + shell: bash diff --git a/.github/workflows/Github_pytest.yml b/.github/workflows/Github_pytest.yml index b8f225a32e..a8ae6da2bc 100644 --- a/.github/workflows/Github_pytest.yml +++ b/.github/workflows/Github_pytest.yml @@ -2,7 +2,7 @@ name: Pull Request on: pull_request: - branches: [ devel ] + branches: [ main, devel ] types: [ opened, reopened, ready_for_review, synchronize, converted_to_draft ] issue_comment: types: [ created ] @@ -16,6 +16,7 @@ jobs: run_windows: ${{ steps.run_bot.outputs.run_windows }} run_macosx: ${{ steps.run_bot.outputs.run_macosx }} run_coverage: ${{ steps.run_bot.outputs.run_coverage }} + run_cuda: ${{ steps.run_bot.outputs.run_cuda }} run_docs: ${{ steps.run_bot.outputs.run_docs }} run_pylint: ${{ steps.run_bot.outputs.run_pylint }} run_lint: ${{ steps.run_bot.outputs.run_lint }} @@ -81,9 +82,28 @@ jobs: python_version: ${{ needs.Bot.outputs.python_version }} ref: ${{ needs.Bot.outputs.REF }} + cuda: + needs: Bot + if: ${{ needs.Bot.outputs.run_cuda == 'True' }} + uses: + ./.github/workflows/cuda.yml + with: + python_version: ${{ (needs.Bot.outputs.python_version == '') && '3.10' || needs.Bot.outputs.python_version }} + ref: ${{ needs.Bot.outputs.REF }} + + coverage_collection: + needs: [Bot, linux, cuda] + if: ${{ always() && needs.Bot.outputs.run_coverage == 'True' && 
needs.linux.result == 'success' && needs.cuda.result != 'failure' }} + uses: + ./.github/workflows/coverage_collect.yml + with: + python_version: ${{ (needs.Bot.outputs.python_version == '') && '3.7' || needs.Bot.outputs.python_version }} + ref: ${{ needs.Bot.outputs.REF }} + cuda_result: ${{ needs.cuda.result }} + coverage: - needs: [Bot, linux] - if: ${{ needs.Bot.outputs.run_coverage == 'True' }} + needs: [Bot, linux, cuda, coverage_collection] + if: ${{ always() && needs.Bot.outputs.run_coverage == 'True' && needs.coverage_collection.result == 'success' && needs.cuda.result != 'failure' }} uses: ./.github/workflows/coverage.yml with: diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index e27379f0cd..b8c551a5f6 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -26,7 +26,7 @@ jobs: - uses: actions/checkout@v3 with: ref: ${{ inputs.ref }} - - name: Set up Python 3.7 + - name: Set up Python ${{ inputs.python_version }} uses: actions/setup-python@v4 with: python-version: 3.7 @@ -44,7 +44,7 @@ - name: Collect coverage information uses: actions/download-artifact@v3 with: - name: coverage-artifact + name: coverage-artifact-xml - name: Collect diff information run: | git fetch diff --git a/.github/workflows/coverage_collect.yml b/.github/workflows/coverage_collect.yml new file mode 100644 index 0000000000..e0c4a0c3bc --- /dev/null +++ b/.github/workflows/coverage_collect.yml @@ -0,0 +1,67 @@ +name: Unit test coverage collection + +on: + workflow_call: + inputs: + python_version: + required: true + type: string + ref: + required: false + type: string + cuda_result: + required: true + type: string + +jobs: + CoverageChecker: + + runs-on: ubuntu-latest + name: Unit tests + + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ inputs.ref }} + - name: Set up Python ${{ inputs.python_version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python_version }} + - name: Install 
dependencies + uses: ./.github/actions/linux_install + - name: Install coverage + run: | + python -m pip install --upgrade pip + python -m pip install coverage + shell: bash + - name: Collect coverage information + uses: actions/download-artifact@v3 + with: + name: coverage-artifact + - name: Rename coverage file + run: mv .coverage .coverage.linux + - name: Collect coverage information + uses: actions/download-artifact@v3 + if: ${{ inputs.cuda_result == 'success' }} + with: + name: cuda-coverage-artifact + - name: Rename coverage file + if: ${{ inputs.cuda_result == 'success' }} + run: mv .coverage .coverage.cuda + - name: Generate coverage report + run: | + echo -e "[paths]\nsource =\n $(pwd)/pyccel\n */site-packages/pyccel\n[xml]\noutput = cobertura.xml" > .coveragerc + coverage combine + coverage xml + - name: Run codacy-coverage-reporter + uses: codacy/codacy-coverage-reporter-action@master + continue-on-error: True + with: + project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} + coverage-reports: cobertura.xml + - name: Save code coverage xml report + uses: actions/upload-artifact@v3 + with: + name: coverage-artifact-xml + path: cobertura.xml + retention-days: 1 diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml new file mode 100644 index 0000000000..9c9d181190 --- /dev/null +++ b/.github/workflows/cuda.yml @@ -0,0 +1,55 @@ +name: Cuda unit tests + +on: + workflow_call: + inputs: + python_version: + required: true + type: string + ref: + required: false + type: string + +jobs: + Cuda: + + runs-on: ubuntu-20.04 + name: Unit tests + + container: nvidia/cuda:11.7.1-devel-ubuntu20.04 + steps: + - uses: actions/checkout@v3 + - name: Prepare docker + run: | + apt update && apt install sudo + TZ=Europe/France + ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata + shell: bash + - name: CUDA Version + run: nvcc --version # cuda install check + - 
name: Install dependencies + uses: ./.github/actions/linux_install + - name: Install python (setup-python action doesn't work with containers) + uses: ./.github/actions/python_install + - name: Install Pyccel with tests + run: | + PATH=${PATH}:$HOME/.local/bin + echo "PATH=${PATH}" >> $GITHUB_ENV + python -m pip install --upgrade pip + python -m pip install --user .[test] + shell: bash + - name: Coverage install + uses: ./.github/actions/coverage_install + - name: Ccuda tests with pytest + uses: ./.github/actions/pytest_run_cuda + - name: Collect coverage information + continue-on-error: True + uses: ./.github/actions/coverage_collection + - name: Save code coverage report + uses: actions/upload-artifact@v3 + with: + name: cuda-coverage-artifact + path: .coverage + retention-days: 1 + diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index d2367d95b2..faced10877 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -53,15 +53,9 @@ jobs: - name: Collect coverage information continue-on-error: True uses: ./.github/actions/coverage_collection - - name: Run codacy-coverage-reporter - uses: codacy/codacy-coverage-reporter-action@master - continue-on-error: True - with: - project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} - coverage-reports: cobertura.xml - name: Save code coverage report uses: actions/upload-artifact@v3 with: name: coverage-artifact - path: cobertura.xml + path: .coverage retention-days: 1 diff --git a/MANIFEST.in b/MANIFEST.in index 656cdd153a..bf873bea31 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,6 +6,7 @@ include README.md recursive-include pyccel *.tx recursive-include pyccel *.pyh recursive-include pyccel *.c +recursive-include pyccel *.cu recursive-include pyccel *.f90 recursive-include pyccel *.h recursive-include pyccel *.pyccel diff --git a/ci_tools/bot_interaction.py b/ci_tools/bot_interaction.py index e68848cf7d..a5e5c21dac 100644 --- a/ci_tools/bot_interaction.py +++ 
b/ci_tools/bot_interaction.py @@ -13,7 +13,7 @@ senior_reviewer = ['yguclu', 'EmilyBourne'] trusted_reviewers = ['yguclu', 'EmilyBourne', 'ratnania', 'saidctb', 'bauom'] -pr_test_keys = ['linux', 'windows', 'macosx', 'coverage', 'docs', 'pylint', +pr_test_keys = ['linux', 'windows', 'macosx', 'cuda', 'coverage', 'docs', 'pylint', 'lint', 'spelling'] review_labels = ('needs_initial_review', 'Ready_for_review', 'Ready_to_merge') @@ -94,7 +94,9 @@ def run_tests(pr_id, tests, outputs, event): if outputs['run_coverage']: outputs['run_linux'] = True + outputs['run_cuda'] = True running.add('linux') + running.add('cuda') running_tests = bool(running) @@ -434,6 +436,7 @@ def flagged_as_trusted(pr_id, user): outputs = {'run_linux': False, 'run_windows': False, 'run_macosx': False, + 'run_cuda': False, 'run_coverage': False, 'run_docs': False, 'run_pylint': False, diff --git a/ci_tools/bot_messages/show_tests.txt b/ci_tools/bot_messages/show_tests.txt index 79fe6e3478..028d29224e 100644 --- a/ci_tools/bot_messages/show_tests.txt +++ b/ci_tools/bot_messages/show_tests.txt @@ -2,6 +2,7 @@ The following is a list of keywords which can be used to run tests. Tests in bol - **linux** : Runs the unit tests on a Linux system. - **windows** : Runs the unit tests on a Windows system. - **macosx** : Runs the unit tests on a MacOS X system. +- **cuda** : Runs the cuda unit tests on a Linux system. - **coverage** : Runs the unit tests on a Linux system and checks the coverage of the tests. - **docs** : Checks if the documentation follows the numpydoc format. - **pylint** : Runs pylint on files which are too big to be handled by codacy. 
diff --git a/pyccel/ast/class_defs.py b/pyccel/ast/class_defs.py index 1a08f2ab0a..bf25800c2d 100644 --- a/pyccel/ast/class_defs.py +++ b/pyccel/ast/class_defs.py @@ -160,6 +160,13 @@ #======================================================================================= +CudaArrayClass = ClassDef('cuda.ndarray', + methods=[ + FunctionDef('shape',[],[],body=[], + decorators={'property':'property', 'numpy_wrapper':Shape})]) + +#======================================================================================= + literal_classes = { NativeBool() : BooleanClass, NativeInteger() : IntegerClass, diff --git a/pyccel/ast/core.py b/pyccel/ast/core.py index d256fd1a50..512c5b56bd 100644 --- a/pyccel/ast/core.py +++ b/pyccel/ast/core.py @@ -57,9 +57,9 @@ 'Declare', 'Decorator', 'Del', + 'DoConcurrent', 'DottedFunctionCall', 'Duplicate', - 'DoConcurrent', 'EmptyNode', 'ErrorExit', 'Exit', @@ -75,10 +75,11 @@ 'If', 'IfSection', 'Import', - 'InlineFunctionDef', 'InProgram', + 'InlineFunctionDef', 'Interface', 'Iterable', + 'KernelCall', 'Module', 'ModuleHeader', 'Pass', @@ -93,8 +94,8 @@ 'SympyFunction', 'While', 'With', - 'create_variable', 'create_incremented_string', + 'create_variable', 'get_iterable_ranges', 'inline', 'subs' @@ -2270,6 +2271,46 @@ def prefix(self): """ return self._prefix +class KernelCall(FunctionCall): + """ + Represents a kernel call + (e.g. 
a[c, b]()) + + + + Parameters + ========== + func : FunctionDef + The definition of the function being called + args : tuple + The arguments being passed to the function + + numBlocks : NativeInteger + The number of blocks + tpblock : NativeInteger + The number of threads per block + """ + __slots__ = ('_numBlocks','_tpblock', '_func', '_args') + _attribute_nodes = (*FunctionCall._attribute_nodes, '_numBlocks', '_tpblock') + + def __init__(self, func, args, numBlocks, tpblock, current_function=None): + + self._numBlocks = numBlocks + self._tpblock = tpblock + super().__init__(func, args, current_function) + + @property + def numBlocks(self): + """ The number of blocks in which the kernel will run + """ + return self._numBlocks + + @property + def tpblock(self): + """ The number of threads in each block + """ + return self._tpblock + class Return(Basic): """Represents a function return in the code. diff --git a/pyccel/ast/cudaext.py b/pyccel/ast/cudaext.py new file mode 100644 index 0000000000..3f04c272ee --- /dev/null +++ b/pyccel/ast/cudaext.py @@ -0,0 +1,276 @@ +from .basic import PyccelAstNode +from .builtins import (PythonTuple,PythonList) + +from .core import Module, PyccelFunctionDef + +from .datatypes import NativeInteger + +from .internals import PyccelInternalFunction, get_final_precision + +from .literals import LiteralInteger +from .literals import LiteralTrue, LiteralFalse +from .operators import PyccelAdd, PyccelMul +from .variable import (Variable, HomogeneousTupleVariable) + +from .numpyext import process_dtype, process_shape, NumpyNewArray + +#============================================================================== +__all__ = ( + 'CudaArray', + 'CudaBlockDim', + 'CudaBlockIdx', + 'CudaCopy', + 'CudaGrid', + 'CudaGridDim', + 'CudaInternalVar', + 'CudaMemCopy', + 'CudaNewArray', + 'CudaSynchronize', + 'CudaThreadIdx' +) + +#============================================================================== +class CudaNewArray(NumpyNewArray): + """ 
Class from which all Cuda functions which imply a call to Allocate + inherit + """ + __slots__ = () + +#============================================================================== + +#============================================================================== +class CudaArray(CudaNewArray): + """ + Represents a call to cuda.array for code generation. + + arg : list, tuple, PythonList + + """ + __slots__ = ('_arg','_dtype','_precision','_shape','_rank','_order','_memory_location') + _attribute_nodes = ('_arg',) + name = 'array' + + def __init__(self, arg, dtype=None, order='C', memory_location='managed'): + + if not isinstance(arg, (PythonTuple, PythonList, Variable)): + raise TypeError(f"Unknown type of {type(arg)}.") + + is_homogeneous_tuple = isinstance(arg, (PythonTuple, PythonList, HomogeneousTupleVariable)) and arg.is_homogeneous + is_array = isinstance(arg, Variable) and arg.is_ndarray + + # TODO: treat inhomogenous lists and tuples when they have mixed ordering + if not (is_homogeneous_tuple or is_array): + raise TypeError('we only accept homogeneous arguments') + + # Verify dtype and get precision + if dtype is None: + dtype = arg.dtype + prec = get_final_precision(arg) + else: + dtype, prec = process_dtype(dtype) + # ... Determine ordering + order = str(order).strip("\'") + + shape = process_shape(False, arg.shape) + rank = len(shape) + + if rank < 2: + order = None + else: + # ... Determine ordering + order = str(order).strip("\'") + + if order not in ('K', 'A', 'C', 'F'): + raise ValueError(f"Cannot recognize '{order}' order") + + # TODO [YG, 18.02.2020]: set correct order based on input array + if order in ('K', 'A'): + order = 'C' + # ... 
+ #Verify memory location + if memory_location not in ('host', 'device', 'managed'): + raise ValueError("memory_location must be 'host', 'device' or 'managed'") + self._arg = arg + self._shape = shape + self._rank = rank + self._dtype = dtype + self._order = order + self._precision = prec + self._memory_location = memory_location + super().__init__() + + def __str__(self): + return str(self.arg) + + @property + def arg(self): + return self._arg + @property + def memory_location(self): + return self._memory_location + +class CudaSynchronize(PyccelInternalFunction): + "Represents a call to Cuda.deviceSynchronize for code generation." + + __slots__ = () + _attribute_nodes = () + _shape = None + _rank = 0 + _dtype = NativeInteger() + _precision = None + _order = None + def __init__(self): + super().__init__() + +class CudaInternalVar(PyccelAstNode): + """ + Represents a General Class For Cuda internal Variables Used To locate Thread In the GPU architecture" + + Parameters + ---------- + dim : NativeInteger + Represent the dimension where we want to locate our thread. + + """ + __slots__ = ('_dim','_dtype', '_precision') + _attribute_nodes = ('_dim',) + _shape = None + _rank = 0 + _order = None + + def __init__(self, dim=None): + + if isinstance(dim, int): + dim = LiteralInteger(dim) + if not isinstance(dim, LiteralInteger): + raise TypeError("dimension need to be an integer") + if dim not in (0, 1, 2): + raise ValueError("dimension need to be 0, 1 or 2") + #... + self._dim = dim + self._dtype = dim.dtype + self._precision = dim.precision + super().__init__() + + @property + def dim(self): + return self._dim + + +class CudaCopy(CudaNewArray): + """ + Represents a call to cuda.copy for code generation. + + Parameters + ---------- + arg : Variable + + memory_location : str + 'host' the newly created array is allocated on host. + 'device' the newly created array is allocated on device. 
+ + is_async: bool + Indicates whether the copy is asynchronous or not [Default value: False] + + """ + __slots__ = ('_arg','_dtype','_precision','_shape','_rank','_order','_memory_location', '_is_async') + + def __init__(self, arg, memory_location, is_async=False): + + if not isinstance(arg, Variable): + raise TypeError(f"unknown type of {type(arg)}.") + + # Verify the memory_location of src + if arg.memory_location not in ('device', 'host', 'managed'): + raise ValueError("The direction of the copy should be from 'host' or 'device'") + + # Verify the memory_location of dst + if memory_location not in ('device', 'host', 'managed'): + raise ValueError("The direction of the copy should be to 'host' or 'device'") + + # verify the type of is_async + if not isinstance(is_async, (LiteralTrue, LiteralFalse, bool)): + raise TypeError('is_async must be boolean') + + self._arg = arg + self._shape = arg.shape + self._rank = arg.rank + self._dtype = arg.dtype + self._order = arg.order + self._precision = arg.precision + self._memory_location = memory_location + self._is_async = is_async + super().__init__() + + @property + def arg(self): + return self._arg + + @property + def memory_location(self): + return self._memory_location + + @property + def is_async(self): + return self._is_async + +class CudaThreadIdx(CudaInternalVar): + __slots__ = () + pass +class CudaBlockDim(CudaInternalVar): + __slots__ = () + pass +class CudaBlockIdx(CudaInternalVar): + __slots__ = () + pass +class CudaGridDim(CudaInternalVar): + __slots__ = () + pass + +class CudaGrid(PyccelAstNode) : + """ + CudaGrid locates a thread in the GPU architecture using `CudaThreadIdx`, `CudaBlockDim`, `CudaBlockIdx` + to calculate the exact index of the thread automatically. + + Parameters + ---------- + dim : NativeInteger + Represent the dimension where we want to locate our thread. 
+ + """ + __slots__ = () + _attribute_nodes = () + def __new__(cls, dim=LiteralInteger(0)): + if not isinstance(dim, LiteralInteger): + raise TypeError("dimension need to be an integer") + if dim not in (0, 1, 2): + raise ValueError("dimension need to be 0, 1 or 2") + expr = [PyccelAdd(PyccelMul(CudaBlockIdx(d), CudaBlockDim(d)), CudaThreadIdx(d))\ + for d in range(dim.python_value + 1)] + if dim == 0: + return expr[0] + return PythonTuple(*expr) + + + +cuda_funcs = { + 'array' : PyccelFunctionDef('array' , CudaArray), + 'copy' : PyccelFunctionDef('copy' , CudaCopy), + 'synchronize' : PyccelFunctionDef('synchronize' , CudaSynchronize), + 'threadIdx' : PyccelFunctionDef('threadIdx' , CudaThreadIdx), + 'blockDim' : PyccelFunctionDef('blockDim' , CudaBlockDim), + 'blockIdx' : PyccelFunctionDef('blockIdx' , CudaBlockIdx), + 'gridDim' : PyccelFunctionDef('gridDim' , CudaGridDim), + 'grid' : PyccelFunctionDef('grid' , CudaGrid) +} + +cuda_Internal_Var = { + 'CudaThreadIdx' : 'threadIdx', + 'CudaBlockDim' : 'blockDim', + 'CudaBlockIdx' : 'blockIdx', + 'CudaGridDim' : 'gridDim' +} + +cuda_mod = Module('cuda', + variables = [], + funcs = cuda_funcs.values()) \ No newline at end of file diff --git a/pyccel/ast/cupyext.py b/pyccel/ast/cupyext.py new file mode 100644 index 0000000000..e0082cc6f3 --- /dev/null +++ b/pyccel/ast/cupyext.py @@ -0,0 +1,481 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +#------------------------------------------------------------------------------------------# +# This file is part of Pyccel which is released under MIT License. See the LICENSE file or # +# go to https://github.com/pyccel/pyccel/blob/master/LICENSE for full license details. 
# +#------------------------------------------------------------------------------------------# +""" Module containing objects from the cupy module understood by pyccel +""" + +from functools import reduce +import operator + +from pyccel.errors.errors import Errors + +from pyccel.utilities.stage import PyccelStage + +from .basic import PyccelAstNode +from .builtins import (PythonTuple, PythonList) + +from .core import Module, PyccelFunctionDef + +from .datatypes import (default_precision, NativeInteger, + NativeFloat, NativeComplex, NativeBool, str_dtype, + NativeNumeric) + +from .internals import PyccelInternalFunction, max_precision, get_final_precision +from .internals import PyccelArraySize + +from .literals import LiteralInteger, LiteralFloat, LiteralComplex +from .literals import LiteralTrue, LiteralFalse +from .literals import Nil +from .mathext import MathCeil +from .operators import PyccelMinus, PyccelDiv +from .variable import (Variable, HomogeneousTupleVariable) +from .cudaext import CudaNewArray +from .numpyext import process_dtype, process_shape, DtypePrecisionToCastFunction + +errors = Errors() +pyccel_stage = PyccelStage() + +__all__ = ( + 'CupyArange', + 'CupyArray', + 'CupyArraySize', + 'CupyAutoFill', + 'CupyEmpty', + 'CupyEmptyLike', + 'CupyFull', + 'CupyFullLike', + 'CupyNewArray', + 'CupyOnes', + 'CupyOnesLike', + 'CupyZeros', + 'CupyZerosLike', + 'Shape' +) + +#============================================================================== +class CupyNewArray(CudaNewArray): + """ Class from which all Cupy functions which imply a call to Allocate + inherit + """ + __slots__ = () + _memory_location = 'device' + def __init__(self): + super().__init__() + + @property + def memory_location(self): + """ Indicate if the array is allocated on the host, device or has a managed memory + """ + return self._memory_location + +#============================================================================== +class CupyArray(CupyNewArray): + """ + Represents a 
call to cupy.array for code generation. + + arg : list, tuple, PythonList + + """ + __slots__ = ('_arg','_dtype','_precision','_shape','_rank','_order') + _attribute_nodes = ('_arg',) + name = 'array' + + def __init__(self, arg, dtype=None, order='C'): + + if not isinstance(arg, (PythonTuple, PythonList, Variable)): + raise TypeError('Unknown type of %s.' % type(arg)) + + is_homogeneous_tuple = isinstance(arg, (PythonTuple, PythonList, HomogeneousTupleVariable)) and arg.is_homogeneous + is_array = isinstance(arg, Variable) and arg.is_ndarray + + # TODO: treat inhomogenous lists and tuples when they have mixed ordering + if not (is_homogeneous_tuple or is_array): + raise TypeError('we only accept homogeneous arguments') + + # Verify dtype and get precision + if dtype is None: + dtype = arg.dtype + prec = get_final_precision(arg) + else: + dtype, prec = process_dtype(dtype) + # ... Determine ordering + order = str(order).strip("\'") + + shape = process_shape(False, arg.shape) + rank = len(shape) + + if rank < 2: + order = None + else: + # ... Determine ordering + order = str(order).strip("\'") + + if order not in ('K', 'A', 'C', 'F'): + raise ValueError(f"Cannot recognize '{order}' order") + + # TODO [YG, 18.02.2020]: set correct order based on input array + if order in ('K', 'A'): + order = 'C' + # ... + + self._arg = arg + self._shape = shape + self._rank = rank + self._dtype = dtype + self._order = order + self._precision = prec + super().__init__() + + def __str__(self): + return str(self.arg) + + @property + def arg(self): + return self._arg + +#============================================================================== +class CupyArange(CupyNewArray): + """ + Represents a call to cupy.arange for code generation. 
#==============================================================================

class Shape(PyccelInternalFunction):
    """
    Represents a call to cupy.shape for code generation.

    ``__new__`` returns the argument's shape directly (as a PythonTuple),
    so no ``Shape`` node is ever inserted into the AST.
    """
    __slots__ = ()
    name = 'shape'
    def __new__(cls, arg):
        # The shape may already be stored as a PythonTuple; otherwise wrap it
        if isinstance(arg.shape, PythonTuple):
            return arg.shape
        else:
            return PythonTuple(*arg.shape)

#==============================================================================
class CupyFull(CupyNewArray):
    """
    Represents a call to cupy.full for code generation.

    Parameters
    ----------
    shape : int or sequence of ints
        Shape of the new array, e.g., ``(2, 3)`` or ``2``.

    fill_value : scalar
        Fill value.

    dtype : str, DataType
        Datatype for the constructed array.
        The default, `None`, means the dtype is inferred from `fill_value`.

    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous
        (row- or column-wise) order in memory.
    """
    __slots__ = ('_fill_value','_dtype','_precision','_shape','_rank','_order')
    name = 'full'

    def __init__(self, shape, fill_value, dtype=None, order='C'):

        # Convert shape to PythonTuple
        shape = process_shape(False, shape)
        # If there is no dtype, extract it from fill_value
        # TODO: must get dtype from an annotated node
        if dtype is None:
            dtype = fill_value.dtype
            precision = get_final_precision(fill_value)
        else:
            dtype, precision = process_dtype(dtype)

        # Cast fill_value to correct type.
        # NOTE(review): the truthiness test skips the cast for the Nil()
        # placeholder passed by CupyAutoFill — confirm no Literal subclass
        # defines a falsy __bool__.
        if fill_value:
            if fill_value.dtype != dtype or get_final_precision(fill_value) != precision:
                cast_func = DtypePrecisionToCastFunction[dtype.name][precision]
                fill_value = cast_func(fill_value)
        self._shape = shape
        self._rank  = len(self._shape)
        self._dtype = dtype
        self._order = CupyNewArray._process_order(self._rank, order)
        self._precision = precision

        super().__init__(fill_value)

    #--------------------------------------------------------------------------
    @property
    def fill_value(self):
        """ Value written into every element of the new array. """
        return self._args[0]

#==============================================================================
class CupyAutoFill(CupyFull):
    """ Abstract class for all classes which inherit from CupyFull but
    where the fill_value is implicitly specified (empty/zeros/ones).
    """
    __slots__ = ()
    def __init__(self, shape, dtype='float', order='C'):
        if not dtype:
            raise TypeError("Data type must be provided")
        # Nil() marks "no explicit fill value"; subclasses override fill_value
        super().__init__(shape, Nil(), dtype, order)

#==============================================================================
class CupyEmpty(CupyAutoFill):
    """ Represents a call to cupy.empty for code generation.
    """
    __slots__ = ()
    name = 'empty'

    def __init__(self, shape, dtype='float', order='C'):
        # Only triggered when a DataType instance (not a str) is passed:
        # normalise it to the matching cast function at default precision.
        if dtype in NativeNumeric:
            precision = default_precision[str_dtype(dtype)]
            dtype = DtypePrecisionToCastFunction[dtype.name][precision]
        super().__init__(shape, dtype, order)
    @property
    def fill_value(self):
        """ An empty array has no fill value. """
        return None


#==============================================================================
class CupyZeros(CupyAutoFill):
    """ Represents a call to cupy.zeros for code generation.
    """
    __slots__ = ()
    name = 'zeros'
    @property
    def fill_value(self):
        """ Zero literal matching the array dtype and precision. """
        dtype = self.dtype
        if isinstance(dtype, NativeInteger):
            value = LiteralInteger(0, precision = self.precision)
        elif isinstance(dtype, NativeFloat):
            value = LiteralFloat(0, precision = self.precision)
        elif isinstance(dtype, NativeComplex):
            value = LiteralComplex(0., 0., precision = self.precision)
        elif isinstance(dtype, NativeBool):
            value = LiteralFalse(precision = self.precision)
        else:
            raise TypeError('Unknown type')
        return value

#==============================================================================
class CupyOnes(CupyAutoFill):
    """ Represents a call to cupy.ones for code generation.
    """
    __slots__ = ()
    name = 'ones'
    @property
    def fill_value(self):
        """ One literal matching the array dtype and precision. """
        dtype = self.dtype
        if isinstance(dtype, NativeInteger):
            value = LiteralInteger(1, precision = self.precision)
        elif isinstance(dtype, NativeFloat):
            value = LiteralFloat(1., precision = self.precision)
        elif isinstance(dtype, NativeComplex):
            value = LiteralComplex(1., 0., precision = self.precision)
        elif isinstance(dtype, NativeBool):
            value = LiteralTrue(precision = self.precision)
        else:
            raise TypeError('Unknown type')
        return value

#=======================================================================================
class CupyFullLike(PyccelInternalFunction):
    """ Represents a call to cupy.full_like for code generation.

    ``__new__`` never creates a CupyFullLike node: it resolves the dtype,
    order and shape from `a` and returns an equivalent CupyFull.
    """
    __slots__ = ()
    name = 'full_like'
    def __new__(cls, a, fill_value, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape
        return CupyFull(shape, fill_value, dtype, order)

#=======================================================================================
class CupyEmptyLike(PyccelInternalFunction):
    """ Represents a call to cupy.empty_like for code generation.

    ``__new__`` resolves dtype/order/shape from `a` and returns a CupyEmpty.
    """
    __slots__ = ()
    name = 'empty_like'
    def __new__(cls, a, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape

        return CupyEmpty(shape, dtype, order)

#=======================================================================================
class CupyOnesLike(PyccelInternalFunction):
    """ Represents a call to cupy.ones_like for code generation.

    ``__new__`` resolves dtype/order/shape from `a` and returns a CupyOnes.
    """
    __slots__ = ()
    name = 'ones_like'
    def __new__(cls, a, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape

        return CupyOnes(shape, dtype, order)

#=======================================================================================
class CupyZerosLike(PyccelInternalFunction):
    """ Represents a call to cupy.zeros_like for code generation.

    ``__new__`` resolves dtype/order/shape from `a` and returns a CupyZeros.
    """
    __slots__ = ()
    name = 'zeros_like'
    def __new__(cls, a, dtype=None, order='K', subok=True, shape=None):

        # NOTE: we ignore 'subok' argument
        if dtype is None:
            dtype = DtypePrecisionToCastFunction[a.dtype.name][a.precision]
        order = a.order if str(order).strip('\'"') in ('K', 'A') else order
        shape = Shape(a) if shape is None else shape

        return CupyZeros(shape, dtype, order)

#=======================================================================================

class CupyArraySize(PyccelInternalFunction):
    """
    Class representing a call to the cupy size function, which returns
    the total number of elements of an array. When an axis is given the
    call is reduced to a PyccelArraySize along that dimension instead.

    Parameters
    ==========
    a : PyccelAstNode
        The array whose size is requested
    axis : int, optional
        The dimension along which the size is requested
    """
    __slots__ = ('_arg',)
    _attribute_nodes = ('_arg',)
    name = 'size'
    _dtype = NativeInteger()
    _precision = -1
    _rank  = 0
    _shape = None
    _order = None

    def __new__(cls, a, axis = None):
        # With an explicit axis this is just the size along that dimension
        if axis is not None:
            return PyccelArraySize(a, axis)
        elif not isinstance(a, (list,
                                tuple,
                                PyccelAstNode)):
            raise TypeError('Unknown type of %s.' % type(a))
        # Fully-known shape: fold to a literal at translation time
        elif all(isinstance(s, LiteralInteger) for s in a.shape):
            return LiteralInteger(reduce(operator.mul, [s.python_value for s in a.shape]))
        else:
            return super().__new__(cls)

    def __init__(self, a, axis = None):
        self._arg = a
        super().__init__(a)

    @property
    def arg(self):
        """ Object whose size is investigated
        """
        return self._arg

    def __str__(self):
        return f'Size({self.arg})'

#==============================================================================

cupy_funcs = {
    # ... array creation routines
    'full'      : PyccelFunctionDef('full'      , CupyFull),
    'empty'     : PyccelFunctionDef('empty'     , CupyEmpty),
    'zeros'     : PyccelFunctionDef('zeros'     , CupyZeros),
    'ones'      : PyccelFunctionDef('ones'      , CupyOnes),
    'full_like' : PyccelFunctionDef('full_like' , CupyFullLike),
    'empty_like': PyccelFunctionDef('empty_like', CupyEmptyLike),
    'zeros_like': PyccelFunctionDef('zeros_like', CupyZerosLike),
    'ones_like' : PyccelFunctionDef('ones_like' , CupyOnesLike),
    'array'     : PyccelFunctionDef('array'     , CupyArray),
    'arange'    : PyccelFunctionDef('arange'    , CupyArange),
    # ...
    'shape'     : PyccelFunctionDef('shape'     , Shape),
    'size'      : PyccelFunctionDef('size'      , CupyArraySize),
}

cupy_mod = Module('cupy',
    variables = [],
    funcs = cupy_funcs.values())

#==============================================================================

# Maps the *_like constructors to their direct equivalents; used when the
# semantic stage rewrites a *_like call once the template's properties are known.
cupy_target_swap = {
        cupy_funcs['full_like']  : cupy_funcs['full'],
        cupy_funcs['empty_like'] : cupy_funcs['empty'],
        cupy_funcs['zeros_like'] : cupy_funcs['zeros'],
        cupy_funcs['ones_like']  : cupy_funcs['ones']
    }
[ + Import('cupy', cupy_mod), Import('numpy', numpy_mod), Import('scipy', scipy_mod), Import('itertools', itertools_mod), diff --git a/pyccel/ast/variable.py b/pyccel/ast/variable.py index 3c84a7cf2d..550a51e3c1 100644 --- a/pyccel/ast/variable.py +++ b/pyccel/ast/variable.py @@ -55,6 +55,16 @@ class Variable(PyccelAstNode): rank : int, default: 0 The number of dimensions for an array. + memory_handling: str, default: 'stack' + 'heap' is used for arrays, if we need to allocate memory on the heap. + 'stack' if memory should be allocated on the stack, represents stack arrays and scalars. + 'alias' if object allows access to memory stored in another variable. + + memory_location: str, default: 'host' + 'host' the variable can only be accessed by the CPU. + 'device' the variable can only be accessed by the GPU. + 'managed' the variable can be accessed by CPU and GPU and is being managed by the Cuda API (memory transfer is being done implicitly). + memory_handling : str, default: 'stack' 'heap' is used for arrays, if we need to allocate memory on the heap. 'stack' if memory should be allocated on the stack, represents stack arrays and scalars. 
@@ -108,7 +118,7 @@ class Variable(PyccelAstNode): >>> Variable('int', DottedName('matrix', 'n_rows')) matrix.n_rows """ - __slots__ = ('_name', '_alloc_shape', '_memory_handling', '_is_const', + __slots__ = ('_name', '_alloc_shape', '_memory_handling', '_memory_location', '_is_const', '_is_target', '_is_optional', '_allows_negative_indexes', '_cls_base', '_is_argument', '_is_temp','_dtype','_precision', '_rank','_shape','_order','_is_private') @@ -121,6 +131,7 @@ def __init__( *, rank=0, memory_handling='stack', + memory_location='host', is_const=False, is_target=False, is_optional=False, @@ -155,6 +166,10 @@ def __init__( raise ValueError("memory_handling must be 'heap', 'stack' or 'alias'") self._memory_handling = memory_handling + if memory_location not in ('host', 'device', 'managed'): + raise ValueError("memory_location must be 'host', 'device' or 'managed'") + self._memory_location = memory_location + if not isinstance(is_const, bool): raise TypeError('is_const must be a boolean.') self._is_const = is_const @@ -309,6 +324,36 @@ def on_stack(self): """ return self.memory_handling == 'stack' + @property + def memory_location(self): + """ Indicates whether a Variable has a dynamic size + """ + return self._memory_location + + @memory_location.setter + def memory_location(self, memory_location): + if memory_location not in ('host', 'device', 'managed'): + raise ValueError("memory_location must be 'host', 'device' or 'managed'") + self._memory_location = memory_location + + @property + def on_host(self): + """ Indicates if memory is only accessible by the CPU + """ + return self.memory_location == 'host' + + @property + def on_device(self): + """ Indicates if memory is only accessible by the GPU + """ + return self.memory_location == 'device' + + @property + def is_managed(self): + """ Indicates if memory is being managed by CUDA API + """ + return self.memory_location == 'managed' + @property def is_stack_array(self): """ Indicates if the variable is located on 
stack and is an array diff --git a/pyccel/codegen/codegen.py b/pyccel/codegen/codegen.py index 1a5568bf96..149f831ee7 100644 --- a/pyccel/codegen/codegen.py +++ b/pyccel/codegen/codegen.py @@ -8,17 +8,19 @@ from pyccel.codegen.printing.fcode import FCodePrinter from pyccel.codegen.printing.ccode import CCodePrinter +from pyccel.codegen.printing.ccudacode import CCudaCodePrinter from pyccel.codegen.printing.pycode import PythonCodePrinter from pyccel.ast.core import FunctionDef, Interface, ModuleHeader from pyccel.errors.errors import Errors from pyccel.utilities.stage import PyccelStage -_extension_registry = {'fortran': 'f90', 'c':'c', 'python':'py'} -_header_extension_registry = {'fortran': None, 'c':'h', 'python':None} +_extension_registry = {'fortran': 'f90', 'c':'c', 'python':'py', 'ccuda': 'cu'} +_header_extension_registry = {'fortran': None, 'c':'h', 'python':None, 'ccuda': 'h'} printer_registry = { 'fortran':FCodePrinter, 'c':CCodePrinter, + 'ccuda':CCudaCodePrinter, 'python':PythonCodePrinter } @@ -140,7 +142,7 @@ def set_printer(self, **settings): language = settings.pop('language', 'fortran') # Set language - if not language in ['fortran', 'c', 'python']: + if not language in ['fortran', 'c', 'python', 'ccuda']: raise ValueError('{} language is not available'.format(language)) self._language = language diff --git a/pyccel/codegen/compiling/compilers.py b/pyccel/codegen/compiling/compilers.py index 32ccdaf518..62983538f3 100644 --- a/pyccel/codegen/compiling/compilers.py +++ b/pyccel/codegen/compiling/compilers.py @@ -333,7 +333,10 @@ def compile_program(self, compile_obj, output_folder, verbose = False): # Get compile options exec_cmd, includes, libs_flags, libdirs_flags, m_code = \ self._get_compile_components(compile_obj, accelerators) - linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] + if self._info['exec'] in ('nvcc', 'nvc', 'nvfortran'): + linker_libdirs_flags = ['-Xcompiler' if l == '-L' else f'"-Wl,-rpath,{l}"' for 
l in libdirs_flags] + else: + linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] if self._info['language'] == 'fortran': j_code = (self._info['module_output_flag'], output_folder) @@ -385,7 +388,10 @@ def compile_shared_library(self, compile_obj, output_folder, verbose = False, sh # Collect compile information exec_cmd, includes, libs_flags, libdirs_flags, m_code = \ self._get_compile_components(compile_obj, accelerators) - linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] + if self._info['exec'] in ('nvcc', 'nvc', 'nvfortran'): + linker_libdirs_flags = ['-Xcompiler' if l == '-L' else f'"-Wl,-rpath,{l}"' for l in libdirs_flags] + else: + linker_libdirs_flags = ['-Wl,-rpath' if l == '-L' else l for l in libdirs_flags] flags.insert(0,"-shared") diff --git a/pyccel/codegen/pipeline.py b/pyccel/codegen/pipeline.py index 99208311f1..79b2e32ef4 100644 --- a/pyccel/codegen/pipeline.py +++ b/pyccel/codegen/pipeline.py @@ -194,9 +194,9 @@ def handle_error(stage): if language is None: language = 'fortran' - # Choose Fortran compiler + # Choose default compiler family if compiler is None: - compiler = 'GNU' + compiler = 'nvidia' if language == 'ccuda' else 'GNU' fflags = [] if fflags is None else fflags.split() wrapper_flags = [] if wrapper_flags is None else wrapper_flags.split() diff --git a/pyccel/codegen/printing/ccode.py b/pyccel/codegen/printing/ccode.py index 01d1b71cc1..3b2159c07c 100644 --- a/pyccel/codegen/printing/ccode.py +++ b/pyccel/codegen/printing/ccode.py @@ -411,12 +411,14 @@ def arrayFill(self, expr): lhs = expr.lhs code_init = '' declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision) - + dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision) + dtype = dtype[3:] + if rhs.fill_value is not None: if isinstance(rhs.fill_value, Literal): - code_init += 'array_fill(({0}){1}, {2});\n'.format(declare_dtype, self._print(rhs.fill_value), 
self._print(lhs)) + code_init += 'array_fill_{0}(({1}){2}, {3});\n'.format(dtype, declare_dtype, self._print(rhs.fill_value), self._print(lhs)) else: - code_init += 'array_fill({0}, {1});\n'.format(self._print(rhs.fill_value), self._print(lhs)) + code_init += 'array_fill_{0}({1}, {2});\n'.format(dtype, self._print(rhs.fill_value), self._print(lhs)) return code_init def _init_stack_array(self, expr): diff --git a/pyccel/codegen/printing/ccudacode.py b/pyccel/codegen/printing/ccudacode.py new file mode 100644 index 0000000000..bfefbdc3dd --- /dev/null +++ b/pyccel/codegen/printing/ccudacode.py @@ -0,0 +1,564 @@ +# coding: utf-8 +#------------------------------------------------------------------------------------------# +# This file is part of Pyccel which is released under MIT License. See the LICENSE file or # +# go to https://github.com/pyccel/pyccel/blob/master/LICENSE for full license details. # +#------------------------------------------------------------------------------------------# +# pylint: disable=missing-function-docstring + + +from pyccel.ast.builtins import PythonTuple + +from pyccel.ast.core import (FunctionCall, Deallocate, FunctionAddress, + FunctionDefArgument, Assign, Import, + AliasAssign, Module) + +from pyccel.ast.datatypes import NativeTuple, datatype +from pyccel.ast.literals import LiteralTrue, Literal, Nil + +from pyccel.ast.numpyext import NumpyFull, NumpyArray, NumpyArange + +from pyccel.ast.cupyext import CupyFull, CupyArray, CupyArange + +from pyccel.ast.cudaext import CudaCopy, cuda_Internal_Var, CudaArray + +from pyccel.ast.variable import Variable + +from pyccel.ast.c_concepts import ObjectAddress + +from pyccel.codegen.printing.ccode import CCodePrinter + +from pyccel.errors.errors import Errors + + +errors = Errors() + +# TODO: add examples + +__all__ = ["CCudaCodePrinter", "ccudacode"] + +# dictionary mapping numpy function to (argument_conditions, C_function). 
# dictionary mapping numpy function to (argument_conditions, C_function).
# Used in CCodePrinter._print_NumpyUfuncBase(self, expr)
# NOTE(review): 'minval'/'maxval' are not standard C library functions —
# presumably supplied by pyccel's own C support headers; confirm.
numpy_ufunc_to_c_float = {
    'NumpyAbs'  : 'fabs',
    'NumpyFabs'  : 'fabs',
    'NumpyMin'  : 'minval',
    'NumpyMax'  : 'maxval',
    'NumpyFloor': 'floor',  # TODO: might require special treatment with casting
    # ---
    'NumpyExp' : 'exp',
    'NumpyLog' : 'log',
    'NumpySqrt': 'sqrt',
    # ---
    'NumpySin'    : 'sin',
    'NumpyCos'    : 'cos',
    'NumpyTan'    : 'tan',
    'NumpyArcsin' : 'asin',
    'NumpyArccos' : 'acos',
    'NumpyArctan' : 'atan',
    'NumpyArctan2': 'atan2',
    'NumpySinh'   : 'sinh',
    'NumpyCosh'   : 'cosh',
    'NumpyTanh'   : 'tanh',
    'NumpyArcsinh': 'asinh',
    'NumpyArccosh': 'acosh',
    'NumpyArctanh': 'atanh',
}

# Complex-valued counterparts of the table above (C99 <complex.h> names).
numpy_ufunc_to_c_complex = {
    'NumpyAbs'  : 'cabs',
    'NumpyMin'  : 'minval',
    'NumpyMax'  : 'maxval',
    # ---
    'NumpyExp' : 'cexp',
    'NumpyLog' : 'clog',
    'NumpySqrt': 'csqrt',
    # ---
    'NumpySin'    : 'csin',
    'NumpyCos'    : 'ccos',
    'NumpyTan'    : 'ctan',
    'NumpyArcsin' : 'casin',
    'NumpyArccos' : 'cacos',
    'NumpyArctan' : 'catan',
    'NumpySinh'   : 'csinh',
    'NumpyCosh'   : 'ccosh',
    'NumpyTanh'   : 'ctanh',
    'NumpyArcsinh': 'casinh',
    'NumpyArccosh': 'cacosh',
    'NumpyArctanh': 'catanh',
}

# dictionary mapping Math function to (argument_conditions, C_function).
# Used in CCodePrinter._print_MathFunctionBase(self, expr)
# Math function ref https://docs.python.org/3/library/math.html
math_function_to_c = {
    # ---------- Number-theoretic and representation functions ------------
    'MathCeil'     : 'ceil',
    # 'MathComb'   : 'com' # TODO
    'MathCopysign': 'copysign',
    'MathFabs'   : 'fabs',
    'MathFloor'    : 'floor',
    # 'MathFmod'   : '???',  # TODO
    # 'MathRexp'   : '???'   TODO requires two output
    # 'MathFsum'   : '???',  # TODO
    # 'MathIsclose' : '???',  # TODO
    'MathIsfinite': 'isfinite', # int isfinite(real-floating x);
    'MathIsinf'   : 'isinf', # int isinf(real-floating x);
    'MathIsnan'   : 'isnan', # int isnan(real-floating x);
    # 'MathIsqrt'  : '???' TODO
    'MathLdexp'   : 'ldexp',
    # 'MathModf'  : '???' TODO return two value
    # 'MathPerm'  : '???' TODO
    # 'MathProd'  : '???' TODO
    'MathRemainder' : 'remainder',
    'MathTrunc'   : 'trunc',

    # ----------------- Power and logarithmic functions -----------------------

    'MathExp'    : 'exp',
    'MathExpm1'  : 'expm1',
    'MathLog'    : 'log',      # take also an option arg [base]
    'MathLog1p'  : 'log1p',
    'MathLog2'  : 'log2',
    'MathLog10'  : 'log10',
    'MathPow'    : 'pow',
    'MathSqrt'   : 'sqrt',

    # --------------------- Trigonometric functions ---------------------------

    'MathAcos'   : 'acos',
    'MathAsin'   : 'asin',
    'MathAtan'   : 'atan',
    'MathAtan2'  : 'atan2',
    'MathCos'    : 'cos',
    # 'MathDist'  : '???', TODO
    'MathHypot'  : 'hypot',
    'MathSin'    : 'sin',
    'MathTan'    : 'tan',


    # -------------------------- Hyperbolic functions -------------------------

    'MathAcosh'  : 'acosh',
    'MathAsinh'  : 'asinh',
    'MathAtanh'  : 'atanh',
    'MathCosh'   : 'cosh',
    'MathSinh'   : 'sinh',
    'MathTanh'   : 'tanh',

    # --------------------------- Special functions ---------------------------

    'MathErf'    : 'erf',
    'MathErfc'   : 'erfc',
    'MathGamma'  : 'tgamma',
    'MathLgamma' : 'lgamma',

    # --------------------------- internal functions --------------------------
    # pyc_* functions come from pyccel's own pyc_math_c support library.
    'MathFactorial' : 'pyc_factorial',
    'MathGcd'       : 'pyc_gcd',
    'MathDegrees'   : 'pyc_degrees',
    'MathRadians'   : 'pyc_radians',
    'MathLcm'       : 'pyc_lcm',
}

# Standard C headers whose names may be emitted as includes.
c_library_headers = (
    "complex",
    "ctype",
    "float",
    "math",
    "stdarg",
    "stdbool",
    "stddef",
    "stdint",
    "stdio",
    "stdlib",
    "string",
    "tgmath",
)

# (python dtype name, precision in bytes) -> C scalar type
dtype_registry = {('float',8)   : 'double',
                  ('float',4)   : 'float',
                  ('complex',8) : 'double complex',
                  ('complex',4) : 'float complex',
                  ('int',4)     : 'int32_t',
                  ('int',8)     : 'int64_t',
                  ('int',2)     : 'int16_t',
                  ('int',1)     : 'int8_t',
                  ('bool',4)    : 'bool'}

# (python dtype name, precision in bytes) -> ndarray type-enum member
ndarray_type_registry = {
                  ('float',8)   : 'nd_double',
                  ('float',4)   : 'nd_float',
                  ('complex',8) : 'nd_cdouble',
                  ('complex',4) : 'nd_cfloat',
                  ('int',8)     : 'nd_int64',
                  ('int',4)     : 'nd_int32',
                  ('int',2)     : 'nd_int16',
                  ('int',1)     : 'nd_int8',
                  ('bool',4)    : 'nd_bool'}
# Maps internal module names to the name used in generated includes.
import_dict = {'omp_lib' : 'omp' }

# Pre-built Import nodes for the C/CUDA support headers.
# NOTE(review): 'math' is listed twice; harmless (the dict comprehension
# de-duplicates) but could be cleaned up.
c_imports = {n : Import(n, Module(n, (), ())) for n in
                ['stdlib',
                 'math',
                 'string',
                 'ndarrays',
                 'cuda_ndarrays',
                 'math',
                 'complex',
                 'stdint',
                 'pyc_math_c',
                 'stdio',
                 'stdbool',
                 'assert']}

class CCudaCodePrinter(CCodePrinter):
    """A printer to convert python expressions to strings of ccuda code"""
    printmethod = "_ccudacode"
    language = "ccuda"

    _default_settings = {
        'tabwidth': 4,
    }

    def __init__(self, filename, prefix_module = None):
        """ Set up printer state; `filename` is only used for error reporting. """

        errors.set_target(filename, 'file')

        super().__init__(filename)
        self.prefix_module = prefix_module
        self._additional_imports = {'stdlib':c_imports['stdlib']}
        self._additional_code = ''
        self._additional_args = []
        self._temporary_args = []
        self._current_module = None
        self._in_header = False
        # Dictionary linking optional variables to their
        # temporary counterparts which provide allocated
        # memory
        # Key is optional variable
        self._optional_partners = {}

    def function_signature(self, expr, print_arg_names = True):
        """
        Get the Ccuda representation of the function signature.

        Extract from the function definition `expr` all the
        information (name, input, output) needed to create the
        function signature and return a string describing the
        function.
        This is not a declaration as the signature does not end
        with a semi-colon.

        Parameters
        ----------
        expr : FunctionDef
            The function definition for which a signature is needed.
        print_arg_names : bool, default : True
            Indicates whether argument names should be printed.

        Returns
        -------
        str
            Signature of the function.
        """
        # Multiple results are returned through extra pointer arguments;
        # the C return type becomes int in that case.
        if len(expr.results) > 1:
            self._additional_args.append(expr.results)
        args = list(expr.arguments)
        if len(expr.results) == 1:
            ret_type = self.get_declare_type(expr.results[0])
        elif len(expr.results) > 1:
            ret_type = self._print(datatype('int'))
            args += [FunctionDefArgument(a) for a in expr.results]
        else:
            ret_type = self._print(datatype('void'))
        name = expr.name
        if not args:
            arg_code = 'void'
        else:
            def get_var_arg(arg, var):
                # Build "const <type> <name>" for one argument
                code = "const " * var.is_const
                code += self.get_declare_type(var) + ' '
                code += arg.name * print_arg_names
                return code

            var_list = [a.var for a in args]
            arg_code_list = [self.function_signature(var, False) if isinstance(var, FunctionAddress)
                        else get_var_arg(arg, var) for arg, var in zip(args, var_list)]
            arg_code = ', '.join(arg_code_list)

        if self._additional_args :
            self._additional_args.pop()

        # extern "C" prevents C++ name mangling; __global__ marks a kernel
        extern_word = 'extern "C"'
        cuda_deco = "__global__" if 'kernel' in expr.decorators else ''

        if isinstance(expr, FunctionAddress):
            return f'{extern_word} {ret_type} (*{name})({arg_code})'
        else:
            return f'{extern_word} {cuda_deco} {ret_type} {name}({arg_code})'

    def _print_Allocate(self, expr):
        """ Print a cuda_array_create call, freeing any prior allocation first. """
        free_code = ''
        #free the array if its already allocated and checking if its not null if the status is unknown
        if  (expr.status == 'unknown'):
            free_code = 'if (%s.shape != NULL)\n' % self._print(expr.variable.name)
            free_code += "{{\n{}}}\n".format(self._print(Deallocate(expr.variable)))
        elif  (expr.status == 'allocated'):
            free_code += self._print(Deallocate(expr.variable))
        shape = ", ".join(self._print(i) for i in expr.shape)
        shape_dtype = self.find_in_dtype_registry('int', 8)
        tmp_shape = self.scope.get_new_name('tmp_shape')
        dtype = self._print(expr.variable.dtype)
        dtype = self.find_in_ndarray_type_registry(dtype, expr.variable.precision)
        shape_Assign = "{} {}[] = {{{}}};".format(shape_dtype, tmp_shape, shape)
        is_view = 'false' if expr.variable.on_heap else 'true'
        self.add_import(c_imports['cuda_ndarrays'])
        # define the memory location for the created cuda array
        memory_location = expr.variable.memory_location
        if memory_location in ('device', 'host'):
            memory_location = 'allocateMemoryOn' + str(memory_location).capitalize()
        else:
            memory_location = 'managedMemory'
        alloc_code = f"{expr.variable} = \
cuda_array_create({len(expr.shape)}, {tmp_shape}, {dtype}, {is_view}, {memory_location});"
        return f"{free_code}\n{shape_Assign}\n{alloc_code}\n"

    def _print_Deallocate(self, expr):
        """ Print the matching cuda_free_* call for the variable's location. """
        var_code = self._print(expr.variable)
        if expr.variable.is_alias:
            return f"cuda_free_pointer({var_code});\n"
        else:
            if expr.variable.memory_location == 'host':
                return f"cuda_free_host({var_code});\n"
            else:
                return f"cuda_free({var_code});\n"

    def _print_KernelCall(self, expr):
        """ Print a kernel launch: name<<<numBlocks, tpblock>>>(args). """
        func = expr.funcdef
        if func.is_inline:
            return self._handle_inline_func_call(expr)
        # Ensure the correct syntax is used for pointers
        args = []
        for a, f in zip(expr.args, func.arguments):
            a = a.value if a else Nil()
            f = f.var
            if self.is_c_pointer(f):
                if isinstance(a, Variable):
                    args.append(ObjectAddress(a))
                elif not self.is_c_pointer(a):
                    # Non-addressable expression: materialise it in a temporary
                    tmp_var = self.scope.get_temporary_variable(f.dtype)
                    assign = Assign(tmp_var, a)
                    self._additional_code += self._print(assign)
                    args.append(ObjectAddress(tmp_var))
                else:
                    args.append(a)
            else :
                args.append(a)

        args += self._temporary_args
        self._temporary_args = []
        args = ', '.join(['{}'.format(self._print(a)) for a in args])
        # TODO: need to raise error in semantic if we have result , kernel can't return
        # NOTE(review): when func.results is non-empty this method falls
        # through and returns None, which will break the caller — confirm
        # the semantic stage guarantees kernels have no results.
        if not func.results:
            return '{}<<<{},{}>>>({});\n'.format(func.name, expr.numBlocks, expr.tpblock,args)

    def _print_Assign(self, expr):
        """ Dispatch array-constructor RHS nodes to their specialised
        printers; fall back to plain "lhs = rhs;". """
        prefix_code = ''
        lhs = expr.lhs
        rhs = expr.rhs
        if isinstance(lhs, Variable) and lhs.is_optional:
            if lhs in self._optional_partners:
                # Collect temporary variable which provides
                # allocated memory space for this optional variable
                tmp_var = self._optional_partners[lhs]
            else:
                # Create temporary variable to provide allocated
                # memory space before assigning to the pointer value
                # (may be NULL)
                tmp_var = self.scope.get_temporary_variable(lhs,
                        is_optional = False)
                self._optional_partners[lhs] = tmp_var
            # Point optional variable at an allocated memory space
            prefix_code = self._print(AliasAssign(lhs, tmp_var))
        if isinstance(rhs, FunctionCall) and isinstance(rhs.dtype, NativeTuple):
            self._temporary_args = [ObjectAddress(a) for a in lhs]
            return prefix_code+'{};\n'.format(self._print(rhs))
        # Inhomogenous tuples are unravelled and therefore do not exist in the c printer

        if isinstance(rhs, (CupyFull)):
            return prefix_code+self.cuda_arrayFill(expr)
        if isinstance(rhs, CupyArange):
            return prefix_code+self.cuda_Arange(expr)
        if isinstance(rhs, (CudaArray, CupyArray)):
            return prefix_code+self.copy_CudaArray_Data(expr)
        if isinstance(rhs, (NumpyArray, PythonTuple)):
            return prefix_code+self.copy_NumpyArray_Data(expr)
        if isinstance(rhs, (NumpyFull)):
            return prefix_code+self.arrayFill(expr)
        if isinstance(rhs, NumpyArange):
            return prefix_code+self.fill_NumpyArange(rhs, lhs)
        if isinstance(rhs, CudaCopy):
            return prefix_code+self.cudaCopy(lhs, rhs)
        lhs = self._print(expr.lhs)
        rhs = self._print(expr.rhs)
        return prefix_code+'{} = {};\n'.format(lhs, rhs)

    def arrayFill(self, expr):
        """ print the assignment of a NdArray (host-side NumpyFull)

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains a call to the C function array_fill using Cuda api,
        """
        rhs = expr.rhs
        lhs = expr.lhs
        code_init = ''
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        # Strip the 'nd_' prefix to get the per-dtype function suffix
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        dtype = dtype[3:]

        if rhs.fill_value is not None:
            if isinstance(rhs.fill_value, Literal):
                code_init += 'cuda_array_fill_{0}(({1}){2}, {3});\n'.format(dtype, declare_dtype, self._print(rhs.fill_value), self._print(lhs))
            else:
                code_init += 'cuda_array_fill_{0}({1}, {2});\n'.format(dtype, self._print(rhs.fill_value), self._print(lhs))
        return code_init

    def cuda_Arange(self, expr):
        """ print the assignment of a CupyArange as a kernel launch

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains a call to the C function array_arange using Cuda api,
        """
        rhs = expr.rhs
        lhs = expr.lhs
        code_init = ''
        # NOTE(review): declare_dtype is computed but unused here.
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        dtype = dtype[3:]

        #TODO: calculate best thread number to run the kernel
        code_init += 'cuda_array_arange_{0}<<<1,32>>>({1}, {2});\n'.format(dtype, self._print(lhs), self._print(rhs.start))
        return code_init

    def cuda_arrayFill(self, expr):
        """ print the assignment of a CupyFull as a kernel launch

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains a call to the C function array_fill using Cuda api,
        """
        rhs = expr.rhs
        lhs = expr.lhs
        code_init = ''
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        dtype = dtype[3:]

        if rhs.fill_value is not None:
            if isinstance(rhs.fill_value, Literal):
                code_init += 'cuda_array_fill_{0}<<<1,1>>>(({1}){2}, {3});\n'.format(dtype, declare_dtype, self._print(rhs.fill_value), self._print(lhs))
            else:
                code_init += 'cuda_array_fill_{0}<<<1,1>>>({1}, {2});\n'.format(dtype, self._print(rhs.fill_value), self._print(lhs))
        return code_init

    def copy_CudaArray_Data(self, expr):
        """ print the assignment of a Cuda NdArray

        parameters
        ----------
        expr : PyccelAstNode
            The Assign Node used to get the lhs and rhs
        Return
        ------
        String
            Return a str that contains the declaration of a dummy data_buffer
            and a call to an operator which copies it to a Cuda NdArray struct
            if the ndarray is a stack_array the str will contain the initialization
        """
        rhs = expr.rhs
        lhs = expr.lhs
        if rhs.rank == 0:
            raise NotImplementedError(str(expr))
        dummy_array_name = self.scope.get_new_name('cuda_array_dummy')
        declare_dtype = self.find_in_dtype_registry(self._print(rhs.dtype), rhs.precision)
        dtype = self.find_in_ndarray_type_registry(self._print(rhs.dtype), rhs.precision)
        arg = rhs.arg if isinstance(rhs, (CudaArray, CupyArray)) else rhs
        if rhs.rank > 1:
            # flattening the args to use them in C initialization.
            arg = self._flatten_list(arg)

        self.add_import(c_imports['string'])
        if isinstance(arg, Variable):
            arg = self._print(arg)
            cpy_data = "cudaMemcpy({0}.raw_data, {1}.{2}, {0}.buffer_size, cudaMemcpyHostToDevice);".format(lhs, arg, dtype)
            return '%s\n' % (cpy_data)
        else :
            arg = ', '.join(self._print(i) for i in arg)
            dummy_array = "%s %s[] = {%s};\n" % (declare_dtype, dummy_array_name, arg)
            # NOTE(review): the format string only uses {0}/{1}; the trailing
            # `dtype` argument is unused — harmless but likely leftover.
            cpy_data = "cudaMemcpy({0}.raw_data, {1}, {0}.buffer_size, cudaMemcpyHostToDevice);".format(self._print(lhs), dummy_array_name, dtype)
            return '%s%s\n' % (dummy_array, cpy_data)

    def _print_CudaSynchronize(self, expr):
        # NOTE(review): no trailing ';\n' unlike the other statement printers —
        # confirm the caller appends the terminator.
        return 'cudaDeviceSynchronize()'

    def _print_CudaInternalVar(self, expr):
        """ Print threadIdx/blockIdx/... with its .x/.y/.z component. """
        var_name = type(expr).__name__
        var_name = cuda_Internal_Var[var_name]
        dim_c = ('x', 'y', 'z')[expr.dim]
        return '{}.{}'.format(var_name, dim_c)

    def cudaCopy(self, lhs, rhs):
        """ Print a cudaMemcpy/cudaMemcpyAsync whose direction is derived
        from the source and destination memory locations. """
        from_location = 'Host'
        to_location = 'Host'
        if rhs.arg.memory_location in ('device', 'managed'):
            from_location = 'Device'
        if rhs.memory_location in ('device', 'managed'):
            to_location = 'Device'
        transfer_type = 'cudaMemcpy{0}To{1}'.format(from_location, to_location)
        if isinstance(rhs.is_async, LiteralTrue):
            cpy_data = "cudaMemcpyAsync({0}.raw_data, {1}.raw_data, {0}.buffer_size, {2}, 0);".format(lhs, rhs.arg, transfer_type)
        else:
            cpy_data = "cudaMemcpy({0}.raw_data, {1}.raw_data, {0}.buffer_size, {2});".format(lhs, rhs.arg, transfer_type)
        return '%s\n' % (cpy_data)

def ccudacode(expr, filename, assign_to=None, **settings):
    """Converts an expr to a string of ccuda code

    Parameters
    ----------
    expr : Expr
        A pyccel expression to be converted.
    filename : str
        The name of the file being translated. Used in error printing.
    assign_to : optional
        When given, the argument is used as the name of the variable to which
        the expression is assigned.
    **settings
        Forwarded verbatim to ``CCudaCodePrinter``.
        NOTE(review): the printer's __init__ only accepts ``prefix_module``;
        other keywords (e.g. ``precision``/``user_functions`` documented for
        the sympy-derived printers) would raise TypeError — confirm intended.
    """
    return CCudaCodePrinter(filename, **settings).doprint(expr, assign_to)
""" #if target_language is C no need for the binding - if self._target_language == 'c': + if self._target_language in ('c', 'ccuda'): return self.function_signature(expr) args = [a.var for a in expr.arguments] diff --git a/pyccel/codegen/utilities.py b/pyccel/codegen/utilities.py index 62ac36bd19..9c3a6df5cf 100644 --- a/pyccel/codegen/utilities.py +++ b/pyccel/codegen/utilities.py @@ -21,18 +21,19 @@ __all__ = ['copy_internal_library','recompile_object'] #============================================================================== -language_extension = {'fortran':'f90', 'c':'c', 'python':'py'} +language_extension = {'fortran':'f90', 'c':'c', 'python':'py', 'ccuda':'cu'} #============================================================================== # map internal libraries to their folders inside pyccel/stdlib and their compile objects # The compile object folder will be in the pyccel dirpath internal_libs = { - "ndarrays" : ("ndarrays", CompileObj("ndarrays.c",folder="ndarrays")), - "pyc_math_f90" : ("math", CompileObj("pyc_math_f90.f90",folder="math")), - "pyc_math_c" : ("math", CompileObj("pyc_math_c.c",folder="math")), - "cwrapper" : ("cwrapper", CompileObj("cwrapper.c",folder="cwrapper", accelerators=('python',))), - "numpy_f90" : ("numpy", CompileObj("numpy_f90.f90",folder="numpy")), - "numpy_c" : ("numpy", CompileObj("numpy_c.c",folder="numpy")), + "ndarrays" : ("ndarrays", CompileObj("ndarrays.c",folder="ndarrays")), + "cuda_ndarrays" : ("cuda_ndarrays", CompileObj("cuda_ndarrays.cu",folder="cuda_ndarrays")), + "pyc_math_f90" : ("math", CompileObj("pyc_math_f90.f90",folder="math")), + "pyc_math_c" : ("math", CompileObj("pyc_math_c.c",folder="math")), + "cwrapper" : ("cwrapper", CompileObj("cwrapper.c",folder="cwrapper", accelerators=('python',))), + "numpy_f90" : ("numpy", CompileObj("numpy_f90.f90",folder="numpy")), + "numpy_c" : ("numpy", CompileObj("numpy_c.c",folder="numpy")), } internal_libs["cwrapper_ndarrays"] = ("cwrapper_ndarrays", 
CompileObj("cwrapper_ndarrays.c",folder="cwrapper_ndarrays", accelerators = ('python',), diff --git a/pyccel/commands/console.py b/pyccel/commands/console.py index 22e9f8afd9..e6a5841478 100644 --- a/pyccel/commands/console.py +++ b/pyccel/commands/console.py @@ -60,7 +60,7 @@ def pyccel(files=None, mpi=None, openmp=None, openacc=None, output_dir=None, com # ... backend compiler options group = parser.add_argument_group('Backend compiler options') - group.add_argument('--language', choices=('fortran', 'c', 'python'), help='Generated language') + group.add_argument('--language', choices=('fortran', 'c', 'python', 'ccuda'), help='Generated language') group.add_argument('--compiler', help='Compiler family or json file containing a compiler description {GNU,intel,PGI}') diff --git a/pyccel/compilers/default_compilers.py b/pyccel/compilers/default_compilers.py index c04566a22b..5d559977c8 100644 --- a/pyccel/compilers/default_compilers.py +++ b/pyccel/compilers/default_compilers.py @@ -173,6 +173,16 @@ 'family': 'nvidia', } +#------------------------------------------------------------ +nvcc_info = {'exec' : 'nvcc', + 'language': 'ccuda', + 'debug_flags': ("-g",), + 'release_flags': ("-O3",), + 'general_flags' : ('--compiler-options', '-fPIC',), + 'standard_flags' : ('-std=c99',), + 'family': 'nvidia', + } + #------------------------------------------------------------ def change_to_lib_flag(lib): """ @@ -275,6 +285,7 @@ def change_to_lib_flag(lib): pgfortran_info.update(python_info) nvc_info.update(python_info) nvfort_info.update(python_info) +nvcc_info.update(python_info) available_compilers = {('GNU', 'c') : gcc_info, ('GNU', 'fortran') : gfort_info, @@ -283,6 +294,7 @@ def change_to_lib_flag(lib): ('PGI', 'c') : pgcc_info, ('PGI', 'fortran') : pgfortran_info, ('nvidia', 'c') : nvc_info, - ('nvidia', 'fortran') : nvfort_info} + ('nvidia', 'fortran') : nvfort_info, + ('nvidia', 'ccuda') : nvcc_info} vendors = ('GNU','intel','PGI','nvidia') diff --git 
a/pyccel/decorators.py b/pyccel/decorators.py index 0bb87aca15..2d263c4d9e 100644 --- a/pyccel/decorators.py +++ b/pyccel/decorators.py @@ -21,6 +21,7 @@ 'sympy', 'template', 'types', + 'kernel', ) def lambdify(f): @@ -98,3 +99,25 @@ def allow_negative_index(f,*args): def identity(f): return f return identity + +def kernel(f): + """ + This decorator is used to mark a Python function as a GPU kernel function, + allowing it to be executed on a GPU. + The decorator returns a NumPy array containing the decorated function object + to ensure that the function is treated as an array function. + This also allows the function to run in pure Python without errors related to indexing. + + Parameters + ---------- + f : Function + The function to be marked as a kernel. + + Returns + ------- + numpy.ndarray: A numpy array containing the function object. + + """ + from numpy import array + return array([[f]]) + diff --git a/pyccel/epyccel.py b/pyccel/epyccel.py index fecc7cca52..da6d932848 100644 --- a/pyccel/epyccel.py +++ b/pyccel/epyccel.py @@ -223,7 +223,7 @@ def epyccel( python_function_or_module, **kwargs ): verbose : bool Print additional information (default: False). - language : {'fortran', 'c', 'python'} + language : {'fortran', 'c', 'python', 'ccuda'} Language of generated code (default: 'fortran'). 
accelerators : iterable of str, optional diff --git a/pyccel/errors/messages.py b/pyccel/errors/messages.py index 84282094ec..9d2b56e333 100644 --- a/pyccel/errors/messages.py +++ b/pyccel/errors/messages.py @@ -161,3 +161,10 @@ WRONG_LINSPACE_ENDPOINT = 'endpoint argument must be boolean' NON_LITERAL_KEEP_DIMS = 'keep_dims argument must be a literal, otherwise rank is unknown' NON_LITERAL_AXIS = 'axis argument must be a literal, otherwise pyccel cannot determine which dimension to operate on' +KERNEL_STACK_ARRAY_ARG = "A variable allocated on the stack can't be passed to a Kernel function" +NON_KERNEL_FUNCTION_CUDA_VAR = 'Cuda internal variables should only be used in Kernel or Device functions' +INVALID_KERNEL_CALL_BP_GRID = 'Invalid Block per grid parameter for Kernel call' +INVALID_KERNEL_CALL_TP_BLOCK = 'Invalid Thread per Block parameter for Kernel call' +MISSING_KERNEL_CONFIGURATION = 'Kernel launch configuration not specified' +INVALID_FUNCTION_CALL = 'Invalid call for a non-kernel function' + diff --git a/pyccel/naming/__init__.py b/pyccel/naming/__init__.py index a71d841c8e..b740f1e754 100644 --- a/pyccel/naming/__init__.py +++ b/pyccel/naming/__init__.py @@ -10,7 +10,9 @@ from .fortrannameclashchecker import FortranNameClashChecker from .cnameclashchecker import CNameClashChecker from .pythonnameclashchecker import PythonNameClashChecker +from .ccudanameclashchecker import CCudaNameClashChecker name_clash_checkers = {'fortran':FortranNameClashChecker(), 'c':CNameClashChecker(), + 'ccuda':CCudaNameClashChecker(), 'python':PythonNameClashChecker()} diff --git a/pyccel/naming/ccudanameclashchecker.py b/pyccel/naming/ccudanameclashchecker.py new file mode 100644 index 0000000000..e5221139e6 --- /dev/null +++ b/pyccel/naming/ccudanameclashchecker.py @@ -0,0 +1,69 @@ +# coding: utf-8 +#------------------------------------------------------------------------------------------# +# This file is part of Pyccel which is released under MIT License. 
See the LICENSE file or # +# go to https://github.com/pyccel/pyccel/blob/master/LICENSE for full license details. # +#------------------------------------------------------------------------------------------# +""" +Handles name clash problems in Ccuda +""" +from pyccel.utilities.metaclasses import Singleton +from pyccel.utilities.strings import create_incremented_string + +class CCudaNameClashChecker(metaclass = Singleton): + """ Class containing functions to help avoid problematic names in Ccuda + """ + # Keywords as mentioned on https://en.cppreference.com/w/c/keyword + keywords = set(['auto', 'break', 'case', 'char', 'const', 'bool', + 'continue', 'default', 'do', 'double', 'else', 'enum', + 'extern', 'float', 'for', 'goto', 'if', 'inline', + 'int', 'int8_t', 'int16_t', 'int32_t', 'int64_t', + 'long', 'register', 'restrict', 'return', 'short', 'signed', + 'sizeof', 'static', 'struct', 'switch', 'typedef', 'union', + 'unsigned', 'void', 'volatile', 'while', '_Alignas', + '_Alignof', '_Atomic', '_Bool', '_Complex', 'Decimal128', + '_Decimal32', '_Decimal64', '_Generic', '_Imaginary', '__global__', + '_Noreturn', '_Static_assert', '_Thread_local', 't_ndarray', + 'array_create', 'new_slice', 'array_slicing', 'alias_assign', + 'transpose_alias_assign', 'array_fill', 't_slice', + 'GET_INDEX_EXP1', 'GET_INDEX_EXP2', 'GET_INDEX_EXP2', + 'GET_INDEX_EXP3', 'GET_INDEX_EXP4', 'GET_INDEX_EXP5', + 'GET_INDEX_EXP6', 'GET_INDEX_EXP7', 'GET_INDEX_EXP8', + 'GET_INDEX_EXP9', 'GET_INDEX_EXP10', 'GET_INDEX_EXP11', + 'GET_INDEX_EXP12', 'GET_INDEX_EXP13', 'GET_INDEX_EXP14', + 'GET_INDEX_EXP15', 'NUM_ARGS_H1', 'NUM_ARGS', + 'GET_INDEX_FUNC_H2', 'GET_INDEX_FUNC', 'GET_INDEX', + 'INDEX', 'GET_ELEMENT', 'free_array', 'free_pointer', + 'get_index', 'numpy_to_ndarray_strides', + 'numpy_to_ndarray_shape', 'get_size', + 'cuda_free_array', 'cuda_free_pointer', 'cuda_array_create', + 'threadIdx', 'blockIdx', 'blockDim', 'gridDim', + 'cuda_array_fill_double', 'cuda_array_fill_int64', + 
'cuda_array_fill_int32', 'cuda_array_fill_int8',
+        'cuda_array_arange_double', 'cuda_array_arange_int64',
+        'cuda_array_arange_int32', 'cuda_array_arange_int8',
+        'cudaMallocManaged', 'cudaSynchronize'])
+
+    def has_clash(self, name, symbols):
+        """ Indicate whether the proposed name causes any clashes
+        """
+        return any(name == k for k in self.keywords) or \
+               any(name == s for s in symbols)
+
+    def get_collisionless_name(self, name, symbols):
+        """ Get the name that will be used in the ccuda code
+        """
+        if len(name)>4 and all(name[i] == '_' for i in (0,1,-1,-2)):
+            # Ignore magic methods
+            return name
+        if name[0] == '_':
+            name = 'private'+name
+        prefix = name
+        coll_symbols = self.keywords.copy()
+        coll_symbols.update(s.lower() for s in symbols)
+        if prefix in coll_symbols:
+            counter = 1
+            new_name, counter = create_incremented_string(coll_symbols,
+                prefix = prefix, counter = counter)
+            name = name+new_name[-5:]
+        return name
+
diff --git a/pyccel/parser/semantic.py b/pyccel/parser/semantic.py
index 0534166047..18818c8ade 100644
--- a/pyccel/parser/semantic.py
+++ b/pyccel/parser/semantic.py
@@ -37,6 +37,7 @@ from pyccel.ast.core import ConstructorCall, InlineFunctionDef
 from pyccel.ast.core import FunctionDef, Interface, FunctionAddress, FunctionCall, FunctionCallArgument
 from pyccel.ast.core import DottedFunctionCall
+from pyccel.ast.core import KernelCall
 from pyccel.ast.core import ClassDef
 from pyccel.ast.core import For
 from pyccel.ast.core import Module
@@ -57,7 +58,7 @@ from pyccel.ast.core import PyccelFunctionDef
 from pyccel.ast.core import Assert
 
-from pyccel.ast.class_defs import NumpyArrayClass, TupleClass, get_cls_base
+from pyccel.ast.class_defs import NumpyArrayClass, TupleClass, get_cls_base, CudaArrayClass
 
 from pyccel.ast.datatypes import NativeRange, str_dtype
 from pyccel.ast.datatypes import NativeSymbol
@@ -92,6 +93,9 @@ from pyccel.ast.numpyext import NumpyNewArray, NumpyNonZero
 from pyccel.ast.numpyext import 
DtypePrecisionToCastFunction +from pyccel.ast.cupyext import CupyNewArray +from pyccel.ast.cudaext import CudaNewArray, CudaThreadIdx, CudaBlockDim, CudaBlockIdx, CudaGridDim + from pyccel.ast.omp import (OMP_For_Loop, OMP_Simd_Construct, OMP_Distribute_Construct, OMP_TaskLoop_Construct, OMP_Sections_Construct, Omp_End_Clause, OMP_Single_Construct) @@ -117,7 +121,7 @@ from pyccel.errors.errors import Errors from pyccel.errors.errors import PyccelSemanticError -from pyccel.errors.messages import (PYCCEL_RESTRICTION_TODO, UNDERSCORE_NOT_A_THROWAWAY, +from pyccel.errors.messages import (INVALID_FUNCTION_CALL, INVALID_KERNEL_CALL_BP_GRID, INVALID_KERNEL_CALL_TP_BLOCK, MISSING_KERNEL_CONFIGURATION,PYCCEL_RESTRICTION_TODO, UNDERSCORE_NOT_A_THROWAWAY, UNDEFINED_VARIABLE, IMPORTING_EXISTING_IDENTIFIED, INDEXED_TUPLE, LIST_OF_TUPLES, INVALID_INDICES, INCOMPATIBLE_ARGUMENT, INCOMPATIBLE_ORDERING, UNRECOGNISED_FUNCTION_CALL, STACK_ARRAY_SHAPE_UNPURE_FUNC, STACK_ARRAY_UNKNOWN_SHAPE, @@ -231,7 +235,6 @@ def __init__(self, inputs, *, parents = (), d_parsers = (), **kwargs): # self._code = parser._code # ... - self.annotate() # ... 
@@ -480,7 +483,6 @@ def _infer_type(self, expr, **settings): d_var = {} # TODO improve => put settings as attribut of Parser - if expr in (PythonInt, PythonFloat, PythonComplex, PythonBool, NumpyBool, NumpyInt, NumpyInt8, NumpyInt16, NumpyInt32, NumpyInt64, NumpyComplex, NumpyComplex64, NumpyComplex128, NumpyFloat, NumpyFloat64, NumpyFloat32): @@ -493,6 +495,7 @@ def _infer_type(self, expr, **settings): elif isinstance(expr, Variable): d_var['datatype' ] = expr.dtype d_var['memory_handling'] = expr.memory_handling + d_var['memory_location'] = expr.memory_location d_var['shape' ] = expr.shape d_var['rank' ] = expr.rank d_var['cls_base' ] = expr.cls_base @@ -550,6 +553,28 @@ def _infer_type(self, expr, **settings): d_var['cls_base' ] = NumpyArrayClass return d_var + elif isinstance(expr, CupyNewArray): + d_var['datatype' ] = expr.dtype + d_var['memory_handling'] = 'heap' if expr.rank > 0 else 'stack' + d_var['memory_location'] = expr.memory_location + d_var['shape' ] = expr.shape + d_var['rank' ] = expr.rank + d_var['order' ] = expr.order + d_var['precision' ] = expr.precision + d_var['cls_base' ] = CudaArrayClass + return d_var + + elif isinstance(expr, CudaNewArray): + d_var['datatype' ] = expr.dtype + d_var['memory_handling'] = 'heap' if expr.rank > 0 else 'stack' + d_var['memory_location'] = expr.memory_location + d_var['shape' ] = expr.shape + d_var['rank' ] = expr.rank + d_var['order' ] = expr.order + d_var['precision' ] = expr.precision + d_var['cls_base' ] = CudaArrayClass + return d_var + elif isinstance(expr, NumpyTranspose): var = expr.internal_var @@ -879,8 +904,17 @@ def _handle_function(self, expr, func, args, **settings): FunctionCall/PyccelInternalFunction The semantic representation of the call. 
""" + if isinstance(func, FunctionDef) and 'kernel' in func.decorators: + errors.report(MISSING_KERNEL_CONFIGURATION, symbol = expr, severity = 'fatal') if isinstance(func, PyccelFunctionDef): func = func.cls_name + if func in (CudaThreadIdx, CudaBlockDim, CudaBlockIdx, CudaGridDim): + if 'kernel' not in self.scope.decorators\ + and 'device' not in self.scope.decorators: + errors.report("Cuda internal variables should only be used in Kernel or Device functions", + symbol = expr, + severity = 'fatal') + args, kwargs = split_positional_keyword_arguments(*args) try: @@ -935,6 +969,92 @@ def _handle_function(self, expr, func, args, **settings): expr, func.is_elemental) return new_expr + def _handle_kernel(self, expr, func, args, **settings): + """ + Create a FunctionCall or an instance of a PyccelInternalFunction + from the function information and arguments + + Parameters + ========== + expr : PyccelAstNode + The expression where this call is found (used for error output) + func : FunctionDef instance, Interface instance or PyccelInternalFunction type + The function being called + args : tuple + The arguments passed to the function + + Returns + ======= + new_expr : FunctionCall or PyccelInternalFunction + """ + if 'kernel' not in func.decorators: + errors.report(INVALID_FUNCTION_CALL, + symbol = expr, + severity = 'fatal') + if isinstance(func, PyccelFunctionDef): + func = func.cls_name + args, kwargs = split_positional_keyword_arguments(*args) + for a in args: + if getattr(a,'dtype',None) == 'tuple': + self._infere_type(a, **settings) + for a in kwargs.values(): + if getattr(a,'dtype',None) == 'tuple': + self._infere_type(a, **settings) + + try: + new_expr = func(*args, **kwargs) + except TypeError: + errors.report(UNRECOGNISED_FUNCTION_CALL, + symbol = expr, + severity = 'fatal') + + return new_expr + else: + if isinstance(func, FunctionDef) and len(args) > len(func.arguments): + errors.report("Too many arguments passed in function call", + symbol = expr, + 
severity='fatal') + # TODO : type check the NUMBER OF BLOCKS 'numBlocks' and threads per block 'tpblock' + if not isinstance(expr.numBlocks, LiteralInteger): + # expr.numBlocks could be invalid type, or PyccelSymbol + if isinstance(expr.numBlocks, PyccelSymbol): + numBlocks = self.get_variable(expr.numBlocks) + if not isinstance(numBlocks.dtype, NativeInteger): + errors.report(INVALID_KERNEL_CALL_BP_GRID, + symbol = expr, + severity='error') + else: + errors.report(INVALID_KERNEL_CALL_BP_GRID, + symbol = expr, + severity='error') + if not isinstance(expr.tpblock, LiteralInteger): + # expr.tpblock could be invalid type, or PyccelSymbol + if isinstance(expr.tpblock, PyccelSymbol): + tpblock = self.get_variable(expr.tpblock) + if not isinstance(tpblock.dtype, NativeInteger): + errors.report(INVALID_KERNEL_CALL_TP_BLOCK, + symbol = expr, + severity='error') + else: + errors.report(INVALID_KERNEL_CALL_TP_BLOCK, + symbol = expr, + severity='error') + new_expr = KernelCall(func, args, expr.numBlocks, expr.tpblock, self._current_function) + + for a in new_expr.args: + if a is None: + errors.report("Too few arguments passed in function call", + symbol = expr, + severity='error') + elif isinstance(a.value, Variable) and a.value.on_stack: + errors.report("A variable allocated on the stack can't be passed to a Kernel function", + symbol = expr, + severity='error') + if isinstance(func, FunctionDef): + self._check_argument_compatibility(new_expr.args, func.arguments, + expr, func.is_elemental) + return new_expr + def _create_variable(self, name, dtype, rhs, d_lhs, arr_in_multirets=False): """ Create a new variable. 
@@ -1934,7 +2054,6 @@ def _visit_PyccelSymbol(self, expr, **settings): def _visit_DottedName(self, expr, **settings): - var = self.check_for_variable(_get_name(expr)) if var: return var @@ -2089,6 +2208,18 @@ def _visit_DottedName(self, expr, **settings): bounding_box=(self._current_fst_node.lineno, self._current_fst_node.col_offset), severity='fatal') + def _visit_KernelCall(self, expr, **settings): + name = expr.funcdef + try: + name = self.scope.get_expected_name(name) + except RuntimeError: + pass + func = self.scope.find(name, 'functions') + + args = self._handle_function_args(expr.args, **settings) + + return self._handle_kernel(expr, func, args, **settings) + def _visit_PyccelOperator(self, expr, **settings): args = [self._visit(a, **settings) for a in expr.args] return self._create_PyccelOperator(expr, args) diff --git a/pyccel/parser/syntactic.py b/pyccel/parser/syntactic.py index 35822a060f..605efa5ce6 100644 --- a/pyccel/parser/syntactic.py +++ b/pyccel/parser/syntactic.py @@ -17,7 +17,7 @@ from pyccel.ast.basic import Basic -from pyccel.ast.core import FunctionCall, FunctionCallArgument +from pyccel.ast.core import FunctionCall, FunctionCallArgument, KernelCall from pyccel.ast.core import Module from pyccel.ast.core import Assign from pyccel.ast.core import AugAssign @@ -911,6 +911,10 @@ def _visit_Call(self, stmt): elif isinstance(func, DottedName): func_attr = FunctionCall(func.name[-1], args) func = DottedName(*func.name[:-1], func_attr) + elif isinstance(func, IndexedElement): + if len(func.indices) != 2: + raise NotImplementedError + func = KernelCall(func.base, args, func.indices[0], func.indices[1]) else: raise NotImplementedError(' Unknown function type {}'.format(str(type(func)))) diff --git a/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.cu b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.cu new file mode 100644 index 0000000000..2563362433 --- /dev/null +++ b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.cu @@ -0,0 +1,185 @@ +#include "cuda_ndarrays.h" + 
+__global__ +void cuda_array_arange_int8(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int8[i] = (i + start); +} +__global__ +void cuda_array_arange_int32(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int32[i] = (i + start); +} +__global__ +void cuda_array_arange_int64(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int64[i] = (i + start); +} +__global__ +void cuda_array_arange_double(t_ndarray arr, int start) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_double[i] = (i + start); +} + +__global__ +void cuda_array_fill_int8(int8_t c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int8[i] = c; +} + +__global__ +void cuda_array_fill_int32(int32_t c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int32[i] = c; +} + +__global__ +void cuda_array_fill_int64(int64_t c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_int64[i] = c; +} +__global__ +void cuda_array_fill_double(double c, t_ndarray arr) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + + for(int i = index ; i < arr.length; i+=stride) + arr.nd_double[i] = c; +} + +void device_memory(void** devPtr, size_t 
size) +{ + cudaMalloc(devPtr, size); +} + +void managed_memory(void** devPtr, size_t size) +{ + cudaMallocManaged(devPtr, size); +} + +void host_memory(void** devPtr, size_t size) +{ + cudaMallocHost(devPtr, size); +} + +t_ndarray cuda_array_create(int32_t nd, int64_t *shape, + enum e_types type, bool is_view, enum e_memory_locations location) +{ + t_ndarray arr; + void (*fun_ptr_arr[])(void**, size_t) = {managed_memory, host_memory, device_memory}; + + arr.nd = nd; + arr.type = type; + switch (type) + { + case nd_int8: + arr.type_size = sizeof(int8_t); + break; + case nd_int16: + arr.type_size = sizeof(int16_t); + break; + case nd_int32: + arr.type_size = sizeof(int32_t); + break; + case nd_int64: + arr.type_size = sizeof(int64_t); + break; + case nd_float: + arr.type_size = sizeof(float); + break; + case nd_double: + arr.type_size = sizeof(double); + break; + case nd_bool: + arr.type_size = sizeof(bool); + break; + } + arr.is_view = is_view; + arr.length = 1; + cudaMallocManaged(&(arr.shape), arr.nd * sizeof(int64_t)); + for (int32_t i = 0; i < arr.nd; i++) + { + arr.length *= shape[i]; + arr.shape[i] = shape[i]; + } + arr.buffer_size = arr.length * arr.type_size; + cudaMallocManaged(&(arr.strides), nd * sizeof(int64_t)); + for (int32_t i = 0; i < arr.nd; i++) + { + arr.strides[i] = 1; + for (int32_t j = i + 1; j < arr.nd; j++) + arr.strides[i] *= arr.shape[j]; + } + if (!is_view) + (*fun_ptr_arr[location])(&(arr.raw_data), arr.buffer_size); + return (arr); +} + +int32_t cuda_free_host(t_ndarray arr) +{ + if (arr.shape == NULL) + return (0); + cudaFreeHost(arr.raw_data); + arr.raw_data = NULL; + cudaFree(arr.shape); + arr.shape = NULL; + cudaFree(arr.strides); + arr.strides = NULL; + return (1); +} + +__host__ __device__ +int32_t cuda_free(t_ndarray arr) +{ + if (arr.shape == NULL) + return (0); + cudaFree(arr.raw_data); + arr.raw_data = NULL; + cudaFree(arr.shape); + arr.shape = NULL; + cudaFree(arr.strides); + arr.strides = NULL; + return (1); +} + +__host__ 
__device__
+int32_t cuda_free_pointer(t_ndarray arr)
+{
+    if (arr.is_view == false || arr.shape == NULL)
+        return (0);
+    cudaFree(arr.shape);
+    arr.shape = NULL;
+    cudaFree(arr.strides);
+    arr.strides = NULL;
+    return (1);
+}
+
diff --git a/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.h b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.h
new file mode 100644
index 0000000000..8e88ecd998
--- /dev/null
+++ b/pyccel/stdlib/cuda_ndarrays/cuda_ndarrays.h
@@ -0,0 +1,34 @@
+#ifndef CUDA_NDARRAYS_H
+# define CUDA_NDARRAYS_H
+
+#include "../ndarrays/ndarrays.h"
+
+__global__
+void cuda_array_arange_int8(t_ndarray arr, int start);
+__global__
+void cuda_array_arange_int32(t_ndarray arr, int start);
+__global__
+void cuda_array_arange_int64(t_ndarray arr, int start);
+__global__
+void cuda_array_arange_double(t_ndarray arr, int start);
+
+__global__
+void cuda_array_fill_int8(int8_t c, t_ndarray arr);
+__global__
+void cuda_array_fill_int32(int32_t c, t_ndarray arr);
+__global__
+void cuda_array_fill_int64(int64_t c, t_ndarray arr);
+__global__
+void cuda_array_fill_double(double c, t_ndarray arr);
+
+t_ndarray cuda_array_create(int32_t nd, int64_t *shape, enum e_types type, bool is_view, enum e_memory_locations location);
+int32_t cuda_free_array(t_ndarray dump);
+
+int32_t cuda_free_host(t_ndarray arr);
+
+__host__ __device__
+int32_t cuda_free(t_ndarray arr);
+
+__host__ __device__
+int32_t cuda_free_pointer(t_ndarray arr);
+#endif
diff --git a/pyccel/stdlib/ndarrays/ndarrays.c b/pyccel/stdlib/ndarrays/ndarrays.c
index 70ebd25a9e..6f952730f0 100644
--- a/pyccel/stdlib/ndarrays/ndarrays.c
+++ b/pyccel/stdlib/ndarrays/ndarrays.c
@@ -42,12 +42,14 @@ t_ndarray array_create(int32_t nd, int64_t *shape,
         case nd_bool:
             arr.type_size = sizeof(bool);
             break;
+        #ifndef __NVCC__
         case nd_cfloat:
             arr.type_size = sizeof(float complex);
             break;
         case nd_cdouble:
             arr.type_size = sizeof(double complex);
             break;
+        #endif
     }
     arr.is_view = is_view;
     arr.length = 1;
@@ -95,12 +97,14 @@ void 
stack_array_init(t_ndarray *arr) case nd_bool: arr->type_size = sizeof(bool); break; + #ifndef __NVCC__ case nd_cfloat: arr->type_size = sizeof(float complex); break; case nd_cdouble: arr->type_size = sizeof(double complex); break; + #endif } arr->length = 1; for (int32_t i = 0; i < arr->nd; i++) @@ -114,7 +118,7 @@ void stack_array_init(t_ndarray *arr) } } -void _array_fill_int8(int8_t c, t_ndarray arr) +void array_fill_int8(int8_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -123,7 +127,7 @@ void _array_fill_int8(int8_t c, t_ndarray arr) arr.nd_int8[i] = c; } -void _array_fill_int16(int16_t c, t_ndarray arr) +void array_fill_int16(int16_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -132,7 +136,7 @@ void _array_fill_int16(int16_t c, t_ndarray arr) arr.nd_int16[i] = c; } -void _array_fill_int32(int32_t c, t_ndarray arr) +void array_fill_int32(int32_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -141,7 +145,7 @@ void _array_fill_int32(int32_t c, t_ndarray arr) arr.nd_int32[i] = c; } -void _array_fill_int64(int64_t c, t_ndarray arr) +void array_fill_int64(int64_t c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -150,7 +154,7 @@ void _array_fill_int64(int64_t c, t_ndarray arr) arr.nd_int64[i] = c; } -void _array_fill_bool(bool c, t_ndarray arr) +void array_fill_bool(bool c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -159,7 +163,7 @@ void _array_fill_bool(bool c, t_ndarray arr) arr.nd_bool[i] = c; } -void _array_fill_float(float c, t_ndarray arr) +void array_fill_float(float c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -168,7 +172,7 @@ void _array_fill_float(float c, t_ndarray arr) arr.nd_float[i] = c; } -void _array_fill_double(double c, t_ndarray arr) +void array_fill_double(double c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -177,7 +181,8 @@ void 
_array_fill_double(double c, t_ndarray arr) arr.nd_double[i] = c; } -void _array_fill_cfloat(float complex c, t_ndarray arr) +#ifndef __NVCC__ +void array_fill_cfloat(float complex c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -187,7 +192,7 @@ void _array_fill_cfloat(float complex c, t_ndarray arr) } -void _array_fill_cdouble(double complex c, t_ndarray arr) +void array_fill_cdouble(double complex c, t_ndarray arr) { if (c == 0) memset(arr.raw_data, 0, arr.buffer_size); @@ -195,6 +200,7 @@ void _array_fill_cdouble(double complex c, t_ndarray arr) for (int32_t i = 0; i < arr.length; i++) arr.nd_cdouble[i] = c; } +#endif /* ** deallocation @@ -437,5 +443,7 @@ NUMPY_SUM_(int32, int64_t, int32) NUMPY_SUM_(int64, int64_t, int64) NUMPY_SUM_(float32, float, float) NUMPY_SUM_(float64, double, double) +#ifndef __NVCC__ NUMPY_SUM_(complex64, float complex, cfloat) NUMPY_SUM_(complex128, double complex, cdouble) +#endif diff --git a/pyccel/stdlib/ndarrays/ndarrays.h b/pyccel/stdlib/ndarrays/ndarrays.h index ec3c07d6fc..bbfdb19b3f 100644 --- a/pyccel/stdlib/ndarrays/ndarrays.h +++ b/pyccel/stdlib/ndarrays/ndarrays.h @@ -10,16 +10,9 @@ # include # include -/* mapping the function array_fill to the correct type */ -# define array_fill(c, arr) _Generic((c), int64_t : _array_fill_int64,\ - int32_t : _array_fill_int32,\ - int16_t : _array_fill_int16,\ - int8_t : _array_fill_int8,\ - float : _array_fill_float,\ - double : _array_fill_double,\ - bool : _array_fill_bool,\ - float complex : _array_fill_cfloat,\ - double complex : _array_fill_cdouble)(c, arr) +#ifdef __cplusplus +extern "C" { +#endif enum e_slice_type { ELEMENT, RANGE }; @@ -69,8 +62,17 @@ enum e_types nd_int64 = 7, nd_float = 11, nd_double = 12, + #ifndef __NVCC__ nd_cfloat = 14, nd_cdouble = 15 + #endif +}; + +enum e_memory_locations +{ + managedMemory, + allocateMemoryOnHost, + allocateMemoryOnDevice }; typedef struct s_ndarray @@ -85,8 +87,10 @@ typedef struct s_ndarray float 
*nd_float; double *nd_double; bool *nd_bool; + #ifndef __NVCC__ double complex *nd_cdouble; float complex *nd_cfloat; + #endif }; /* number of dimensions */ int32_t nd; @@ -112,15 +116,17 @@ typedef struct s_ndarray void stack_array_init(t_ndarray *arr); t_ndarray array_create(int32_t nd, int64_t *shape, enum e_types type, bool is_view); -void _array_fill_int8(int8_t c, t_ndarray arr); -void _array_fill_int16(int16_t c, t_ndarray arr); -void _array_fill_int32(int32_t c, t_ndarray arr); -void _array_fill_int64(int64_t c, t_ndarray arr); -void _array_fill_float(float c, t_ndarray arr); -void _array_fill_double(double c, t_ndarray arr); -void _array_fill_bool(bool c, t_ndarray arr); -void _array_fill_cfloat(float complex c, t_ndarray arr); -void _array_fill_cdouble(double complex c, t_ndarray arr); +void array_fill_int8(int8_t c, t_ndarray arr); +void array_fill_int16(int16_t c, t_ndarray arr); +void array_fill_int32(int32_t c, t_ndarray arr); +void array_fill_int64(int64_t c, t_ndarray arr); +void array_fill_float(float c, t_ndarray arr); +void array_fill_double(double c, t_ndarray arr); +void array_fill_bool(bool c, t_ndarray arr); +#ifndef __NVCC__ +void array_fill_cfloat(float complex c, t_ndarray arr); +void array_fill_cdouble(double complex c, t_ndarray arr); +#endif /* slicing */ /* creating a Slice object */ @@ -152,7 +158,13 @@ int64_t numpy_sum_int32(t_ndarray arr); int64_t numpy_sum_int64(t_ndarray arr); float numpy_sum_float32(t_ndarray arr); double numpy_sum_float64(t_ndarray arr); +#ifndef __NVCC__ float complex numpy_sum_complex64(t_ndarray arr); double complex numpy_sum_complex128(t_ndarray arr); +#endif + +#ifdef __cplusplus +} +#endif #endif diff --git a/pytest.ini b/pytest.ini index 42eb0d72ba..41cd7cdace 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,6 +6,7 @@ markers = parallel: test to be run using 'mpiexec' fortran: test to generate fortran code c: test to generate c code + ccuda: test to generate Ccuda code python: test to generate python code 
xdist_incompatible: test which compiles a file also compiled by another test external: test using an external dll (problematic with conda on Windows) diff --git a/samples/test/test.py b/samples/test/test.py new file mode 100644 index 0000000000..027e3945e9 --- /dev/null +++ b/samples/test/test.py @@ -0,0 +1,28 @@ +# from numpy import array +from pyccel.decorators import kernel +from pyccel import cuda +from numpy import array as narray +# from pyccel.stdlib.internal import PyccelthreadIdx + + +@types('int[:]') +def abbah(z): + return z[0] + + +@kernel +@types('int[:]') +def func(a): + i = cuda.threadIdx(0) + """ + test test + """ + print("Hello World! ", a[i]) + a[i] += 1 + +if __name__ == "__main__": + # b = narray([1, 2, 3], dtype='int', order='C') + a = cuda.array([1, 2, 3], dtype='int', order='C') + func[1, 3](a) + cuda.synchronize() + print(a) \ No newline at end of file diff --git a/samples/test/tt.cu b/samples/test/tt.cu new file mode 100644 index 0000000000..d300c3ac6c --- /dev/null +++ b/samples/test/tt.cu @@ -0,0 +1,20 @@ +#include +#include + +__global__ void func(int *p) +{ + p[0] = 5; + printf("%s\n", "Hello World!"); +} + +int main() +{ + int *p1; + + cudaMallocManaged(&p1, 3*sizeof(int)); + p1[0] = 0; + func<<<1,2>>>(p1); + cudaDeviceSynchronize(); + printf("%d\n", p1[0]); + return 0; +} \ No newline at end of file diff --git a/tests/cuda_ndarrays/conftest.py b/tests/cuda_ndarrays/conftest.py new file mode 100644 index 0000000000..112323a6cf --- /dev/null +++ b/tests/cuda_ndarrays/conftest.py @@ -0,0 +1,120 @@ +# pylint: disable=missing-function-docstring, missing-module-docstring/ +import subprocess +import os +import pathlib +import sys +import shutil +import pytest + +NEEDS_FROM_PARENT = hasattr(pytest.Item, "from_parent") + +def pytest_collect_file(parent, path): + """ + A hook to collect test_*.cu test files. 
+ """ + if path.ext == ".cu" and path.basename.startswith("test_"): + if NEEDS_FROM_PARENT: + return CTestFile.from_parent(path=pathlib.Path(path), parent=parent) + return CTestFile(parent=parent, path=pathlib.Path(path)) + return None + + +class CTestFile(pytest.File): + """ + A custom file handler class for CCuda unit test files. + """ + + @classmethod + def from_parent(cls, parent, **kwargs): + return super().from_parent(parent=parent, **kwargs) + + def collect(self): + """ + Overridden collect method to collect the results from each + CCuda unit test executable. + """ + # Run the exe that corresponds to the .cu file and capture the output. + test_exe = os.path.splitext(self.fspath)[0] + rootdir = self.config.rootdir.strpath + test_exe = os.path.relpath(test_exe) + ndarray_path = os.path.join(rootdir , "pyccel", "stdlib", "ndarrays") + cuda_ndarray_path = os.path.join(rootdir, "pyccel", "stdlib", "cuda_ndarrays") + comp_cmd = [shutil.which("nvcc"), test_exe + ".cu", + os.path.join(ndarray_path, "ndarrays.c"), + os.path.join(cuda_ndarray_path, "cuda_ndarrays.cu"), + "-I", ndarray_path, + "-I", cuda_ndarray_path, + "-o", test_exe,] + + subprocess.run(comp_cmd, check=True) + if sys.platform.startswith("win"): + test_exe += ".exe" + test_output = subprocess.check_output("./" + test_exe) + + # Clean up the unit test output and remove non test data lines. + lines = test_output.decode().split("\n") + lines = [line.strip() for line in lines] + lines = [line for line in lines if line.startswith("[")] + + # Extract the test metadata from the unit test output. 
+ test_results = [] + for line in lines: + token, data = line.split(" ", 1) + token = token[1:-1] + if token in ("PASS", "FAIL"): + file_name, line_number, function_name = data.split(":") + test_results.append({"condition": token, + "file_name": file_name, + "function_name": function_name, + "line_number": int(line_number), + "INFO" : "no data found", + "DSCR" : "" + }) + elif token in ("INFO", "DSCR"): + test_results[-1][token] = data + for test_result in test_results: + if NEEDS_FROM_PARENT: + yield CTestItem.from_parent(name = test_result["function_name"] + " < " + test_result["DSCR"] + " >", + parent = self, test_result = test_result) + else: + yield CTestItem(name = test_result["function_name"] + " < " + test_result["DSCR"] + " >", parent = self, + test_result = test_result) + + + class CTestItem(pytest.Item): + """ + Pytest.Item subclass to handle each test result item. There may be + more than one test result from a test function. + """ + + def __init__(self, *, test_result, **kwargs): + """Overridden constructor to pass test results dict.""" + super().__init__(**kwargs) + self.test_result = test_result + + @classmethod + def from_parent(cls, parent, **kwargs): + return super().from_parent(parent=parent, **kwargs) + + def runtest(self): + """The test has already been run. We just evaluate the result.""" + if self.test_result["condition"] == "FAIL": + raise CTestException(self, self.name) + + def reportinfo(self): + """Called to display header information about the test case.""" + return self.fspath, self.test_result["line_number"], self.name + + def repr_failure(self, excinfo, style=None): + """ + Called when runtest() raises an exception. The method is used + to format the output of the failed test result. 
+ """ + if isinstance(excinfo.value, CTestException): + return f"Test failed : {self.test_result['file_name']}:{self.test_result['line_number']} {self.test_result['function_name']} < {self.test_result['DSCR']} >\n INFO : {self.test_result['INFO']}" + return super().repr_failure(excinfo) + + + +class CTestException(Exception): + """Custom exception to distinguish C unit test failures from others.""" diff --git a/tests/cuda_ndarrays/test_cuda_ndarrays.cu b/tests/cuda_ndarrays/test_cuda_ndarrays.cu new file mode 100644 index 0000000000..e3c5e57917 --- /dev/null +++ b/tests/cuda_ndarrays/test_cuda_ndarrays.cu @@ -0,0 +1,503 @@ +#include +#include +#include +#include +#include "ndarrays.h" +#include "cuda_ndarrays.h" + +void assert_double(double v1 , double v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%f != v2:%f\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_float(float v1 , float v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%f != v2:%f\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int64(int64_t v1, int64_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%ld != v2:%ld\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int32(int32_t v1 , int32_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + 
printf("[INFO] v1:%d != v2:%d\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n",file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int16(int16_t v1 , int16_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%d != v2:%d\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void assert_int8(int8_t v1 , int8_t v2, const char *dscr, + const char * func, const char *file, int32_t line) +{ + if (v1 != v2) + { + printf("[FAIL] %s:%d:%s\n", file, line, func); + printf("[INFO] v1:%d != v2:%d\n", v1, v2); + printf("[DSCR] %s\n", dscr); + return ; + } + printf("[PASS] %s:%d:%s\n", file, line, func); + printf("[DSCR] %s\n", dscr); +} + +void test_cuda_array_create_host_double() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_double, false, allocateMemoryOnHost); + double cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_double(arr.nd_double[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_double() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_double, false, managedMemory); + double cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; 
+ cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_double(arr.nd_double[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_double() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_double, false, allocateMemoryOnDevice); + double cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_double, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_double, arr.nd_double, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_double(b.nd_double[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_float() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_float, false, allocateMemoryOnHost); + float cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_float, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, 
__LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_float(arr.nd_float[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_float() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_float, false, managedMemory); + float cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_float, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_float(arr.nd_float[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_float() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_float, false, allocateMemoryOnDevice); + float cuda_array_dummy[] = {1.02, 0.25, 5e-05, 1.0, 200.0, 33.0, 5.0, 57.0, 62.0, 70.0, 103.009, 141.0, 122.0, 26.5}; + cudaMemcpy(arr.nd_float, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_float, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_float, arr.nd_float, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_float(b.nd_float[i], cuda_array_dummy[i], 
"testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_int64() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int64, false, allocateMemoryOnHost); + int64_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int64, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int64(arr.nd_int64[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int64() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int64, false, managedMemory); + int64_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int64, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int64(arr.nd_int64[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int64() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int64, false, allocateMemoryOnDevice); + int64_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_double, cuda_array_dummy, arr.buffer_size, 
cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int64, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int64, arr.nd_int64, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int64(b.nd_int64[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + + +void test_cuda_array_create_host_int32() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int32, false, allocateMemoryOnHost); + int32_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int32, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int32(arr.nd_int32[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int32() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int32, false, managedMemory); + int32_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int32, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + 
assert_int32(arr.nd_int32[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int32() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int32, false, allocateMemoryOnDevice); + int32_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int32, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int32, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int32, arr.nd_int32, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int32(b.nd_int32[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_int16() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int16, false, allocateMemoryOnHost); + int16_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int16, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int16(arr.nd_int16[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int16() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = 
{INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int16, false, managedMemory); + int16_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int16, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int16(arr.nd_int16[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int16() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int16, false, allocateMemoryOnDevice); + int16_t cuda_array_dummy[] = {1, 0, 0, 1, 200, 33, 5, 57, + 62, 70, 103, 141, 122, 26}; + cudaMemcpy(arr.nd_int16, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int16, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int16, arr.nd_int16, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int16(b.nd_int16[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +void test_cuda_array_create_host_int8() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + + arr = cuda_array_create(1, tmp_shape, nd_int8, false, allocateMemoryOnHost); + int8_t cuda_array_dummy[] = {1, 0, 0, 1, 116, 33, 5, 57, + 62, 70, 103, 120, 122, 26}; + cudaMemcpy(arr.nd_int8, cuda_array_dummy, arr.buffer_size, 
cudaMemcpyHostToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int8(arr.nd_int8[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free_host(arr); +} + +void test_cuda_array_create_managed_int8() +{ + t_ndarray arr = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int8, false, managedMemory); + int8_t cuda_array_dummy[] = {1, 0, 0, 1, 116, 33, 5, 57, + 62, 70, 103, 120, 122, 26}; + cudaMemcpy(arr.nd_int8, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + assert_int8(arr.nd_int8[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); +} + +void test_cuda_array_create_device_int8() +{ + t_ndarray arr = {.shape = NULL}; + t_ndarray b = {.shape = NULL}; + + int64_t tmp_shape[] = {INT64_C(14)}; + arr = cuda_array_create(1, tmp_shape, nd_int8, false, allocateMemoryOnDevice); + int8_t cuda_array_dummy[] = {1, 0, 0, 1, 116, 33, 5, 57, + 62, 70, 103, 120, 122, 26}; + cudaMemcpy(arr.nd_int8, cuda_array_dummy, arr.buffer_size, cudaMemcpyHostToDevice); + + int64_t tmp_shape_0001[] = {INT64_C(14)}; + b = cuda_array_create(1, tmp_shape_0001, nd_int8, false, allocateMemoryOnHost); + cudaMemcpy(b.nd_int8, arr.nd_int8, b.buffer_size, cudaMemcpyDeviceToHost); + + assert_int32(arr.nd, 1, "testing the number of dimensions", __func__, __FILE__, __LINE__); + assert_int64(arr.shape[0], tmp_shape[0], "testing the shape", __func__, __FILE__, __LINE__); + for(int i = 0; i < tmp_shape[0]; i++) + { + 
assert_int8(b.nd_int8[i], cuda_array_dummy[i], "testing the data", __func__, __FILE__, __LINE__); + } + cuda_free(arr); + cuda_free_host(b); +} + +int32_t main(void) +{ + /* Cuda array creation tests */ + test_cuda_array_create_host_double(); + test_cuda_array_create_managed_double(); + test_cuda_array_create_device_double(); + + test_cuda_array_create_host_float(); + test_cuda_array_create_managed_float(); + test_cuda_array_create_device_float(); + + test_cuda_array_create_host_int64(); + test_cuda_array_create_managed_int64(); + test_cuda_array_create_device_int64(); + + test_cuda_array_create_host_int32(); + test_cuda_array_create_managed_int32(); + test_cuda_array_create_device_int32(); + + test_cuda_array_create_host_int16(); + test_cuda_array_create_managed_int16(); + test_cuda_array_create_device_int16(); + + test_cuda_array_create_host_int8(); + test_cuda_array_create_managed_int8(); + test_cuda_array_create_device_int8(); + + return (0); +} diff --git a/tests/cuda_test/test_kernel_semantic.py b/tests/cuda_test/test_kernel_semantic.py new file mode 100644 index 0000000000..e08e4e3711 --- /dev/null +++ b/tests/cuda_test/test_kernel_semantic.py @@ -0,0 +1,218 @@ + +# pylint: disable=missing-function-docstring, missing-module-docstring/ +import pytest + +import numpy as np +from pyccel.epyccel import epyccel +from pyccel.decorators import stack_array, types, kernel +from pyccel.errors.errors import Errors, PyccelSemanticError +from pyccel.errors.messages import (INVALID_FUNCTION_CALL, KERNEL_STACK_ARRAY_ARG, MISSING_KERNEL_CONFIGURATION, + NON_KERNEL_FUNCTION_CUDA_VAR, + INVALID_KERNEL_CALL_BP_GRID, + INVALID_KERNEL_CALL_TP_BLOCK + ) + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_stack_array_kernel(language): + @stack_array('arr') + def kernel_caller(): + from numpy import ones + @kernel + @types('int[:]') + def stack_array_kernel(arr): + return arr[0] + arr = ones(1, dtype=int) + return 
stack_array_kernel[1,1](arr) + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(kernel_caller, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'stack_array_kernel' + assert KERNEL_STACK_ARRAY_ARG == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_cuda_intern_var_non_kernel(language): + def non_kernel_function(): + from pyccel import cuda + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(non_kernel_function, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.name[0] == 'cuda' + assert error_info.symbol.name[1].func_name == 'threadIdx' + assert NON_KERNEL_FUNCTION_CUDA_VAR == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_unvalid_block_number(language): + def unvalid_block_number(): + @kernel + def kernel_call(): + pass + kernel_call[1.2,1]() + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(unvalid_block_number, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = 
[*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_BP_GRID == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_unvalid_thread_per_block(language): + def unvalid_thread_per_block(): + @kernel + def kernel_call(): + pass + kernel_call[1,1.2]() + + # Initialize singleton that stores Pyccel errors + errors = Errors() + + # epyccel should raise an Exception + with pytest.raises(PyccelSemanticError): + epyccel(unvalid_thread_per_block, language=language) + + # Check that we got exactly 1 Pyccel error + assert errors.has_errors() + assert errors.num_messages() == 1 + + # Check that the error is correct + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_TP_BLOCK == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_missing_kernel_config(language): + def missing_kernel_config(): + @kernel + def kernel_call(): + pass + kernel_call() + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(missing_kernel_config, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.func_name == 'kernel_call' + assert MISSING_KERNEL_CONFIGURATION == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_invalid_block_number(language): + def invalid_block_number(): + @kernel + def kernel_call(): + pass + blocks_per_grid = 5.0 + kernel_call[blocks_per_grid, 1]() + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(invalid_block_number, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = 
[*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_BP_GRID == error_info.message + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_invalid_thread_per_block(language): + def invalid_thread_per_block(): + @kernel + def kernel_call(): + pass + threads_per_block = 5.0 + kernel_call[1, threads_per_block]() + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(invalid_thread_per_block, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'kernel_call' + assert INVALID_KERNEL_CALL_TP_BLOCK == error_info.message + + +@pytest.mark.parametrize( 'language', [ + pytest.param("ccuda", marks = pytest.mark.ccuda) + ] +) +def test_invalid_function_call(language): + def invalid_function_call(): + def non_kernel_func(): + pass + non_kernel_func[1, 2]() # pylint: disable=E1136 + + errors = Errors() + + with pytest.raises(PyccelSemanticError): + epyccel(invalid_function_call, language=language) + + assert errors.has_errors() + assert errors.num_messages() == 1 + + error_info = [*errors.error_info_map.values()][0][0] + assert error_info.symbol.funcdef == 'non_kernel_func' + assert INVALID_FUNCTION_CALL == error_info.message diff --git a/tests/internal/scripts/ccuda/cuda_array_device.py b/tests/internal/scripts/ccuda/cuda_array_device.py new file mode 100644 index 0000000000..17c50a2824 --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_array_device.py @@ -0,0 +1,4 @@ +from pyccel import cuda + +if __name__ == '__main__': + arr = cuda.array([0,1,2,3,4], memory_location = 'device') diff --git a/tests/internal/scripts/ccuda/cuda_array_host.py b/tests/internal/scripts/ccuda/cuda_array_host.py new file mode 100644 index 0000000000..8702031bba --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_array_host.py 
@@ -0,0 +1,4 @@ +from pyccel import cuda + +if __name__ == '__main__': + arr = cuda.array([0,1,2,3,4], memory_location='host') diff --git a/tests/internal/scripts/ccuda/cuda_array_managed.py b/tests/internal/scripts/ccuda/cuda_array_managed.py new file mode 100644 index 0000000000..58070d84ae --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_array_managed.py @@ -0,0 +1,4 @@ +from pyccel import cuda + +if __name__ == '__main__': + a = cuda.array([0,1,2,3,4], memory_location = 'managed') diff --git a/tests/internal/scripts/ccuda/cuda_copy.py b/tests/internal/scripts/ccuda/cuda_copy.py new file mode 100644 index 0000000000..11a59c7fa0 --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_copy.py @@ -0,0 +1,6 @@ +from pyccel.decorators import kernel, types +from pyccel import cuda + +if __name__ == '__main__': + arr = cuda.array([1,2,3,4], memory_location='device') + out = cuda.copy(arr, 'host', is_async=True) diff --git a/tests/internal/scripts/ccuda/cuda_grid.py b/tests/internal/scripts/ccuda/cuda_grid.py new file mode 100644 index 0000000000..56c2d14325 --- /dev/null +++ b/tests/internal/scripts/ccuda/cuda_grid.py @@ -0,0 +1,20 @@ +from pyccel.decorators import kernel, types +from pyccel import cuda + +@kernel +@types('int[:]') +def func_1d(a): + i = cuda.grid(0) + print("1 dim :", a[i]) + +@kernel +@types('int[:]') +def func_2d(a): + i, j = cuda.grid(1) + print("2 dim :", a[i], a[j]) + +@kernel +@types('int[:]') +def func_3d(a): + i, j, k = cuda.grid(2) + print("3 dim :", a[i], a[j], a[k]) diff --git a/tests/internal/scripts/ccuda/cupy_arange.py b/tests/internal/scripts/ccuda/cupy_arange.py new file mode 100644 index 0000000000..91664bf956 --- /dev/null +++ b/tests/internal/scripts/ccuda/cupy_arange.py @@ -0,0 +1,4 @@ +import cupy as cp + +if __name__ == '__main__': + arr = cp.arange(32) diff --git a/tests/internal/scripts/ccuda/cupy_array.py b/tests/internal/scripts/ccuda/cupy_array.py new file mode 100644 index 0000000000..c168c156e5 --- /dev/null +++ 
b/tests/internal/scripts/ccuda/cupy_array.py @@ -0,0 +1,4 @@ +import cupy as cp + +if __name__ == '__main__': + arr = cp.array([0, 1, 2, 3, 4]) diff --git a/tests/internal/scripts/ccuda/free_pointer.py b/tests/internal/scripts/ccuda/free_pointer.py new file mode 100644 index 0000000000..4cc6e8f722 --- /dev/null +++ b/tests/internal/scripts/ccuda/free_pointer.py @@ -0,0 +1,16 @@ +# pylint: disable=missing-function-docstring, disable=unused-variable, missing-module-docstring + +import math +from pyccel.decorators import kernel +from pyccel import cuda + +@kernel +def func(arr:'int[:]'): + i = cuda.grid(0) + arr[i] = math.pow(arr[i], 2) + +if __name__ == '__main__': + a = cuda.array([1,2,3,4], memory_location='device') + func[1,4](a) + c = a + cuda.synchronize() diff --git a/tests/internal/scripts/ccuda/kernel.py b/tests/internal/scripts/ccuda/kernel.py new file mode 100644 index 0000000000..86d77418c6 --- /dev/null +++ b/tests/internal/scripts/ccuda/kernel.py @@ -0,0 +1,8 @@ +from pyccel.decorators import kernel, types +from pyccel import cuda + +@kernel +@types('int[:]') +def func(a): + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + print("Hello World! ", a[i]) diff --git a/tests/internal/scripts/ccuda/kernel_launch_config_literal.py b/tests/internal/scripts/ccuda/kernel_launch_config_literal.py new file mode 100644 index 0000000000..25a6d3f317 --- /dev/null +++ b/tests/internal/scripts/ccuda/kernel_launch_config_literal.py @@ -0,0 +1,11 @@ +from pyccel.decorators import kernel +from pyccel import cuda + + +@kernel +def func(): + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + print("Hello World! 
") + +if __name__ == '__main__': + func[1, 5]() diff --git a/tests/internal/scripts/ccuda/kernel_launch_config_variable.py b/tests/internal/scripts/ccuda/kernel_launch_config_variable.py new file mode 100644 index 0000000000..199e9ee6e5 --- /dev/null +++ b/tests/internal/scripts/ccuda/kernel_launch_config_variable.py @@ -0,0 +1,13 @@ +from pyccel.decorators import kernel +from pyccel import cuda + + +@kernel +def func(): + i = cuda.threadIdx(0) + cuda.blockIdx(0) * cuda.blockDim(0) + print("Hello World! ") + +if __name__ == '__main__': + nb = 1 + tpb = 5 + func[nb, tpb]() diff --git a/tests/internal/test_internal.py b/tests/internal/test_internal.py index 10630e7e8b..cf9fd01a8d 100644 --- a/tests/internal/test_internal.py +++ b/tests/internal/test_internal.py @@ -41,6 +41,14 @@ def test_mpi(f): def test_openmp(f, language): execute_pyccel(f, accelerators=['openmp'], language=language) +@pytest.mark.parametrize("f", get_files_from_folder('ccuda')) +@pytest.mark.parametrize( 'language', + (pytest.param("ccuda", marks = pytest.mark.ccuda),) +) +@pytest.mark.external +def test_ccuda(f, language): + execute_pyccel(f, language=language) + #@pytest.mark.parametrize("f", get_files_from_folder('openacc')) #@pytest.mark.external #def test_openacc(): @@ -98,3 +106,13 @@ def test_openmp(f, language): # print('> testing {0}'.format(str(os.path.basename(f)))) # test_openacc(f) # print('\n') + + print('*********************************') + print('*** ***') + print('*** TESTING INTERNAL/Cuda ***') + print('*** ***') + print('*********************************') + for f in get_files_from_folder('ccuda'): + print('> testing {0}'.format(str(os.path.basename(f)))) + test_ccuda(f) + print('\n') diff --git a/tests/ndarrays/test_ndarrays.c b/tests/ndarrays/test_ndarrays.c index eee6caa020..2c5116b11c 100644 --- a/tests/ndarrays/test_ndarrays.c +++ b/tests/ndarrays/test_ndarrays.c @@ -609,7 +609,7 @@ int32_t test_array_fill_int64(void) int64_t c_value; x = array_create(2, m_1_shape, 
nd_int64, false); - array_fill((int64_t)32, x); + array_fill_int64((int64_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -633,7 +633,7 @@ int32_t test_array_fill_int32(void) int32_t c_value; x = array_create(2, m_1_shape, nd_int32, false); - array_fill((int32_t)32, x); + array_fill_int32((int32_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -657,7 +657,7 @@ int32_t test_array_fill_int16(void) int16_t c_value; x = array_create(2, m_1_shape, nd_int16, false); - array_fill((int16_t)32, x); + array_fill_int16((int16_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -681,7 +681,7 @@ int32_t test_array_fill_int8(void) int8_t c_value; x = array_create(2, m_1_shape, nd_int8, false); - array_fill((int8_t)32, x); + array_fill_int8((int8_t)32, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -705,7 +705,7 @@ int32_t test_array_fill_double(void) double c_value; x = array_create(2, m_1_shape, nd_double, false); - array_fill(2., x); + array_fill_double(2., x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -729,7 +729,7 @@ int32_t test_array_fill_cdouble(void) double complex c_value; x = array_create(2, m_1_shape, nd_cdouble, false); - array_fill(0.3+0.54*I, x); + array_fill_cdouble(0.3+0.54*I, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -755,7 +755,7 @@ int32_t test_array_zeros_double(void) double c_value; x = array_create(2, m_1_shape, nd_double, false); - array_fill(0, x); + array_fill_double(0, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -779,7 +779,7 @@ int32_t test_array_zeros_int32(void) int32_t c_value; x = array_create(2, m_1_shape, nd_int32, false); - array_fill(0, x); + array_fill_int32(0, x); // testing the index [3, 1] index = 3 * 
x.strides[0] + 1 * x.strides[1]; c_index = 7; @@ -803,7 +803,7 @@ int32_t test_array_zeros_cdouble(void) double complex c_value; x = array_create(2, m_1_shape, nd_cdouble, false); - array_fill(0, x); + array_fill_cdouble(0, x); // testing the index [3, 1] index = 3 * x.strides[0] + 1 * x.strides[1]; c_index = 7; diff --git a/tests/run_tests_py3.sh b/tests/run_tests_py3.sh index 6145dca0d2..16fb78a4f0 100755 --- a/tests/run_tests_py3.sh +++ b/tests/run_tests_py3.sh @@ -15,7 +15,7 @@ SCRIPT_DIR=$(dirname -- "$(realpath -- "$0")") #python3 "$SCRIPT_DIR"/internal/test_internal.py #python3 "$SCRIPT_DIR"/external/test_external.py #python3 "$SCRIPT_DIR"/macro/test_macro.py - +python3 -m pytest "$SCRIPT_DIR"/cuda_test -v python3 -m pytest "$SCRIPT_DIR"/pyccel -v python3 -m pytest "$SCRIPT_DIR"/epyccel -v -m "not parallel" mpirun -n 4 python3 -m pytest "$SCRIPT_DIR"/epyccel/test_epyccel_mpi_modules.py -v diff --git a/tutorial/decorators.md b/tutorial/decorators.md index cf7edb3589..5f64b96a0a 100644 --- a/tutorial/decorators.md +++ b/tutorial/decorators.md @@ -600,6 +600,45 @@ The generated C code: ```c ``` +## GPU decorators + +### kernel + +This decorator is used to mark a Python function as a GPU kernel function using the Pyccel library. The decorated function can then be executed on a GPU by specifying the number of blocks and threads per block as arguments to the function using square brackets. The `kernel` decorator is provided by the Pyccel library and is used to create a GPU kernel from a Python function. + +#### Basic Example + +python Code: + +```Python +from pyccel.decorators import kernel + +@kernel +def func(): + #Code + +if __name__ == '__main__': + # the decorated function. 
+ func[1, 5]() +``` + +the Generated Code (Ccuda): + +```C +extern "C" __global__ void func(void) +{ + /*Code*/ + return; +} + +int main() +{ + /*the decorated function.*/ + func<<<1,5>>>(); + return 0; +} +``` + ## Getting Help If you face problems with Pyccel, please take the following steps: