
ht.array, closed loophole allowing DNDarray construction with incompatible shapes of local arrays #1034

Merged · 59 commits · Nov 3, 2022
885f686
Merge branch 'release/1.2.x', set `main` version extension to "dev"
ClaudiaComito Apr 27, 2022
e30d739
Update link to llnl MPI tutorial and merge branch 'release/1.2.x'
ClaudiaComito May 26, 2022
08d0dcb
Replace bug report MD template with form in view of further automation
ClaudiaComito Jun 1, 2022
545a694
Fix bug report file name
ClaudiaComito Jun 1, 2022
295351c
Update bug_report.yml
ClaudiaComito Jun 1, 2022
91f8190
Update bug_report.yml
ClaudiaComito Jun 1, 2022
ef94a9b
Update bug_report.yml
ClaudiaComito Jun 1, 2022
4a1fc1a
Update bug_report.yml
ClaudiaComito Jun 1, 2022
cb94ad1
Auto generated release notes and changelog (#974)
JuanPedroGHM Jun 1, 2022
231831e
Tutorial note about local and global printing (#972)
JuanPedroGHM Jun 1, 2022
c1ef707
Updated the tutorial document. (#977)
Sai-Suraj-27 Jun 1, 2022
ee0ff4d
Set write permissions for workflow
ClaudiaComito Jul 2, 2022
a1f0b1b
Update schedule
ClaudiaComito Jul 2, 2022
96506fa
Update schedule
ClaudiaComito Jul 2, 2022
385374a
Update schedule
ClaudiaComito Jul 2, 2022
bff6b2c
Move pytorch version file out of workflows dir
ClaudiaComito Jul 4, 2022
4f59d69
Update paths
ClaudiaComito Jul 4, 2022
55fff46
[pre-commit.ci] pre-commit autoupdate
pre-commit-ci[bot] Jul 5, 2022
4fe322b
Push pytorch release update to release/1.2.x branch, not main
ClaudiaComito Jul 5, 2022
a52156b
Update schedule
ClaudiaComito Jul 5, 2022
229e172
Bypass `on push` trigger
ClaudiaComito Jul 5, 2022
6510ea4
Update schedule
ClaudiaComito Jul 5, 2022
2a947a1
Fix condition syntax
ClaudiaComito Jul 5, 2022
6ccec82
Fix syntax
ClaudiaComito Jul 5, 2022
3cc0080
On push trigger workaround
ClaudiaComito Jul 5, 2022
6c487d6
Update schedule
ClaudiaComito Jul 5, 2022
05a2acd
Update schedule
ClaudiaComito Jul 5, 2022
2223ce8
Merge branch 'main' into pre-commit-ci-update-config
ClaudiaComito Jul 6, 2022
d39cbfe
Enable non-negative sample size
neosunhan Jul 15, 2022
105b905
Read `min` value directly from torch return object
neosunhan Jul 15, 2022
54ab969
Enable non-negative number of samples for `logspace`
neosunhan Jul 15, 2022
2dfd591
Add test for `logspace`
neosunhan Jul 18, 2022
99aef73
Merge pull request #995 from helmholtz-analytics/features/994-linspac…
Markus-Goetz Jul 20, 2022
92e64c4
Merge branch 'main' into bug/996-iinfo-finfo-min
mtar Jul 21, 2022
891a983
Merge pull request #997 from helmholtz-analytics/bug/996-iinfo-finfo-min
mtar Jul 21, 2022
7ca100d
Merge branch 'main' into pre-commit-ci-update-config
mtar Jul 21, 2022
15240b0
Merge pull request #984 from helmholtz-analytics/pre-commit-ci-update…
mtar Jul 21, 2022
c85562b
Add MPI version field to bug report template
ClaudiaComito Aug 22, 2022
52d88c0
fix: set cuda rng state on gpu tests for test_random.py (#1014)
JuanPedroGHM Aug 23, 2022
e56e6ec
Merge release/1.2.x into main
ClaudiaComito Sep 13, 2022
dd4a396
[pre-commit.ci] pre-commit autoupdate (#1024)
pre-commit-ci[bot] Sep 13, 2022
c745479
Refactored code for readability
Mystic-Slice Sep 22, 2022
5ff0314
rename file and activate force push
mtar Sep 23, 2022
bd16040
Update bug_report.yml
mtar Sep 27, 2022
ba1225a
Update bug_report.yml
mtar Sep 27, 2022
9e4882a
Update README.md
mtar Sep 27, 2022
439d542
Update codecov.yml
mtar Sep 30, 2022
0949a48
Update codecov.yml
mtar Sep 30, 2022
9b53a25
Fixed code checking for non-matching local shapes while using is_spli…
Mystic-Slice Oct 1, 2022
398ac0d
Add section `Google Summer of Code 2022`
ClaudiaComito Oct 5, 2022
2fe13c3
Bug/1017 `prod` / `sum` with empty arrays (#1018)
neosunhan Oct 5, 2022
db44c93
Add section "Array API"
ClaudiaComito Oct 7, 2022
d5c14b7
Mirror Repository and run GitHub CI at HZDR (#1032)
mtar Oct 7, 2022
ea965ff
Bug/999 Fix `keepdim` in `any`/`all` (#1000)
neosunhan Oct 7, 2022
756f43f
Merge branch 'main' into CleanUpIsSplit
ClaudiaComito Oct 7, 2022
5ba9b32
[pre-commit.ci] pre-commit autoupdate
pre-commit-ci[bot] Oct 11, 2022
ae9dd50
Merge pull request #1033 from helmholtz-analytics/pre-commit-ci-updat…
mtar Oct 11, 2022
fe894b0
Merge branch 'main' into CleanUpIsSplit
ClaudiaComito Oct 20, 2022
aeb9ef3
Updated error message
Mystic-Slice Oct 24, 2022
9 changes: 5 additions & 4 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -1,7 +1,7 @@
name: Bug Report
description: File a bug report
title: "[Bug]: "
labels: ["bug"]
labels: ["bug :bug:"]

body:
- type: markdown
@@ -44,18 +44,19 @@ body:
label: Python version
description: What Python version?
options:
- 3.7
- 3.8
- 3.9
- 3.10
- 3.7
- "3.10"
- type: dropdown
id: pytorch-version
attributes:
label: PyTorch version
description: What PyTorch version?
options:
- 1.12
- 1.11
- 1.10
- "1.10"
- 1.9
- 1.8
- 1.7
3 changes: 0 additions & 3 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -50,6 +50,3 @@ my be illegible. It may be easiest to save the output of each to a file.

#### Does this change modify the behaviour of other functions? If so, which?
yes / no

<!-- Remove this line for GPU Cluster tests. It will need an approval. --->
skip ci
4 changes: 4 additions & 0 deletions .github/release-drafter.yml
@@ -29,6 +29,10 @@ categories:
labels:
- 'io'
- 'communication'
- title: 'Google Summer of Code 2022'
label: 'GSoC22'
- title: 'Array API'
label: 'array API'
change-template: '- #$NUMBER $TITLE (by @$AUTHOR)'
categorie-template: '### $TITLE'
exclude-labels:
10 changes: 5 additions & 5 deletions .github/workflows/mirrorci.yml → .github/workflows/ci_cpu.yml
@@ -10,11 +10,11 @@ jobs:
- name: Mirror + trigger CI
uses: SvanBoxel/gitlab-mirror-and-ci-action@master
with:
args: "https://gitlab.jsc.fz-juelich.de/haf/heat"
args: "https://gitlab.hzdr.de/haf/heat"
env:
FORCE_PUSH: "false"
GITLAB_HOSTNAME: "gitlab.jsc.fz-juelich.de"
FORCE_PUSH: "true"
GITLAB_HOSTNAME: "gitlab.hzdr.de"
GITLAB_USERNAME: ""
GITLAB_PASSWORD: ${{ secrets.GITLAB_TOKEN }}
GITLAB_PROJECT_ID: "4935"
GITLAB_PASSWORD: ${{ secrets.GITLAB_TOKEN_1 }}
GITLAB_PROJECT_ID: "845"
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
42 changes: 26 additions & 16 deletions .gitlab-ci.yml
@@ -1,19 +1,29 @@
test:
image: ubuntu:20.04
image: nvidia/cuda:11.6.2-runtime-ubuntu20.04
tags:
- heat
- cuda
- x86_64
script:
- apt update
- apt -y install build-essential python3-pip curl
- DEBIAN_FRONTEND=noninteractive apt -y install libopenmpi-dev openmpi-bin openmpi-doc
- apt -y install libhdf5-openmpi-dev libpnetcdf-dev
- pip install pytest coverage
- pip install .[hdf5,netcdf]
- COVERAGE_FILE=report/cov/coverage1 mpirun --allow-run-as-root -n 1 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report1.xml heat/
- COVERAGE_FILE=report/cov/coverage2 mpirun --allow-run-as-root -n 3 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report2.xml heat/
- COVERAGE_FILE=report/cov/coverage5 mpirun --allow-run-as-root -n 5 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report5.xml heat/
- COVERAGE_FILE=report/cov/coverage8 mpirun --allow-run-as-root -n 8 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report8.xml heat/
- coverage combine report/cov/*
- coverage report
- coverage xml
- curl -s https://codecov.io/bash | bash -s -- -c -F unit -f coverage.xml -t $CODECOV_TOKEN || echo "Codecov failed to upload"
- apt update
- apt -y install build-essential python3-pip curl git
- DEBIAN_FRONTEND=noninteractive apt -y install libopenmpi-dev openmpi-bin openmpi-doc
- apt -y install libhdf5-openmpi-dev libpnetcdf-dev
- pip install pytest coverage
- pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
- pip install .[hdf5,netcdf]
- COVERAGE_FILE=report/cov/coverage1 HEAT_TEST_USE_DEVICE=cpu mpirun --allow-run-as-root -n 1 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report1.xml heat/
- COVERAGE_FILE=report/cov/coverage2 HEAT_TEST_USE_DEVICE=gpu mpirun --allow-run-as-root -n 3 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report3.xml heat/
- COVERAGE_FILE=report/cov/coverage5 HEAT_TEST_USE_DEVICE=cpu mpirun --allow-run-as-root -n 5 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report5.xml heat/
- COVERAGE_FILE=report/cov/coverage8 HEAT_TEST_USE_DEVICE=gpu mpirun --allow-run-as-root -n 6 coverage run --source=heat --parallel-mode -m pytest --junitxml=report/test/report6.xml heat/
- coverage combine report/cov/*
- coverage report
- coverage xml
- curl -Os https://uploader.codecov.io/latest/linux/codecov
- chmod +x codecov
- ./codecov -F unit -f ./coverage.xml -t $CODECOV_TOKEN -Z
artifacts:
when: always
paths:
- report/test/report*.xml
reports:
junit: report/test/report*.xml
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -10,7 +10,7 @@ repos:
- id: check-added-large-files
- id: flake8
- repo: https://github.com/psf/black
rev: 22.6.0
rev: 22.10.0
hooks:
- id: black
- repo: https://github.com/pycqa/pydocstyle
4 changes: 1 addition & 3 deletions README.md
@@ -8,9 +8,7 @@ Heat is a distributed tensor framework for high performance data analytics.

Project Status
--------------

[![Jenkins](https://img.shields.io/jenkins/build?jobUrl=https%3A%2F%2Fheat-ci.fz-juelich.de%2Fjob%2Fheat%2Fjob%2Fheat%2Fjob%2Fmain%2F&label=CPU)](https://heat-ci.fz-juelich.de/blue/organizations/jenkins/heat%2Fheat/activity?branch=main)
[![Jenkins](https://img.shields.io/jenkins/build?jobUrl=https%3A%2F%2Fheat-ci.fz-juelich.de%2Fjob%2FGPU%2520Cluster%2Fjob%2Fmain%2F&label=GPU)](https://heat-ci.fz-juelich.de/blue/organizations/jenkins/GPU%20Cluster%2Fmain/activity)
[![Mirror and run GitLab CI](https://github.com/helmholtz-analytics/heat/actions/workflows/ci_cpu.yml/badge.svg)](https://github.com/helmholtz-analytics/heat/actions/workflows/ci_cpu.yml)
[![Documentation Status](https://readthedocs.org/projects/heat/badge/?version=latest)](https://heat.readthedocs.io/en/latest/?badge=latest)
[![codecov](https://codecov.io/gh/helmholtz-analytics/heat/branch/main/graph/badge.svg)](https://codecov.io/gh/helmholtz-analytics/heat)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
15 changes: 0 additions & 15 deletions codecov.yml
@@ -13,31 +13,16 @@ coverage:
# basic
target: auto
threshold: 3%
base: auto
flags:
- unit
- gpu
paths:
- "heat"
# advanced settings
branches:
- main
if_ci_failed: error #success, failure, error, ignore
informational: false
only_pulls: false
patch:
default:
# basic
target: auto
threshold: 3%
base: auto
# advanced
branches:
- main
if_ci_failed: error #success, failure, error, ignore
only_pulls: false
flags:
- "unit"
- "gpu"
paths:
- "heat"
2 changes: 1 addition & 1 deletion heat/core/_operations.py
@@ -422,7 +422,7 @@ def __reduce_op(
balanced = x.balanced

# if local tensor is empty, replace it with the identity element
if 0 in x.lshape and (axis is None or (x.split in axis)):
if x.is_distributed() and 0 in x.lshape and (axis is None or split in axis):
if neutral is None:
neutral = float("nan")
neutral_shape = x.gshape[:split] + (1,) + x.gshape[split + 1 :]
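The tightened guard in `__reduce_op` only substitutes the neutral element when the array is actually distributed and the local chunk is empty. A minimal single-process sketch of why a neutral element is needed at all (`local_reduce` is a hypothetical helper for illustration, not Heat's API):

```python
import torch

# When one process holds an empty local slice, its contribution to the
# global reduction must be the neutral element (0 for sum, 1 for prod),
# otherwise the combined result would be undefined or wrong.
def local_reduce(chunk: torch.Tensor, op: str) -> torch.Tensor:
    neutral = {"sum": 0.0, "prod": 1.0}[op]
    if chunk.numel() == 0:  # empty local slice -> contribute the neutral element
        return torch.tensor(neutral)
    return chunk.sum() if op == "sum" else chunk.prod()

# Two simulated "ranks": one holds data, one holds an empty slice.
chunks = [torch.tensor([2.0, 3.0]), torch.empty(0)]
global_prod = torch.stack([local_reduce(c, "prod") for c in chunks]).prod()
```

In the patched code the substitution is additionally skipped for non-distributed arrays, which is what fixes `prod`/`sum` on genuinely empty arrays.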
24 changes: 15 additions & 9 deletions heat/core/factories.py
@@ -381,9 +381,13 @@ def array(
obj = sanitize_memory_layout(obj, order=order)
# check with the neighboring rank whether the local shape would fit into a global shape
elif is_split is not None:
gshape = np.array(gshape)
lshape = np.array(lshape)
obj = sanitize_memory_layout(obj, order=order)

# Check whether the shape of distributed data
# matches in all dimensions except the split axis
neighbour_shape = np.array(gshape)
lshape = np.array(lshape)

if comm.rank < comm.size - 1:
comm.Isend(lshape, dest=comm.rank + 1)
if comm.rank != 0:
@@ -395,21 +399,23 @@
if length != len(lshape):
discard_buffer = np.empty(length)
comm.Recv(discard_buffer, source=comm.rank - 1)
gshape[is_split] = np.iinfo(gshape.dtype).min
neighbour_shape[is_split] = np.iinfo(neighbour_shape.dtype).min
else:
# check whether the individual shape elements match
comm.Recv(gshape, source=comm.rank - 1)
comm.Recv(neighbour_shape, source=comm.rank - 1)
for i in range(length):
if i == is_split:
continue
elif lshape[i] != gshape[i] and lshape[i] - 1 != gshape[i]:
Reviewer comment (Contributor): Good catch @Mystic-Slice !
gshape[is_split] = np.iinfo(gshape.dtype).min
elif lshape[i] != neighbour_shape[i]:
neighbour_shape[is_split] = np.iinfo(neighbour_shape.dtype).min

# sum up the elements along the split dimension
reduction_buffer = np.array(gshape[is_split])
comm.Allreduce(MPI.IN_PLACE, reduction_buffer, MPI.SUM)
reduction_buffer = np.array(neighbour_shape[is_split])
comm.Allreduce(MPI.IN_PLACE, reduction_buffer, MPI.MIN)
if reduction_buffer < 0:
raise ValueError("unable to construct tensor, shape of local data chunk does not match")
raise ValueError(
"Unable to construct DNDarray. Local data slices have inconsistent shapes or dimensions."
)
ttl_shape = np.array(obj.shape)
ttl_shape[is_split] = lshape[is_split]
comm.Allreduce(MPI.IN_PLACE, ttl_shape, MPI.SUM)
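The check above can be summarized as: each rank compares its local shape with its left neighbour's, poisons the split entry on mismatch, and a global MIN-reduction exposes the poison everywhere. A minimal sketch simulating this on a single process (`check_local_shapes` is a hypothetical helper; the real code uses `Isend`/`Recv` and `comm.Allreduce` with `MPI.MIN`):

```python
import numpy as np

# Each "rank" compares its local shape against its left neighbour's; on a
# mismatch in any dimension other than the split axis (or a dimension-count
# mismatch), it poisons the split entry with the smallest representable
# integer. Taking the minimum over all ranks stands in for the MPI
# Allreduce(MIN): any single mismatch drives the result negative, so every
# rank can reject the construction collectively.
def check_local_shapes(lshapes, is_split):
    poisoned = []
    for rank, lshape in enumerate(lshapes):
        entry = np.array(lshape, dtype=np.int64)
        if rank > 0:
            neighbour = np.array(lshapes[rank - 1], dtype=np.int64)
            mismatch = len(neighbour) != len(entry) or any(
                entry[i] != neighbour[i]
                for i in range(len(entry))
                if i != is_split
            )
            if mismatch:
                entry[is_split] = np.iinfo(np.int64).min
        poisoned.append(int(entry[is_split]))
    return min(poisoned)  # stands in for comm.Allreduce(..., MPI.MIN)

ok = check_local_shapes([(2, 4), (3, 4)], is_split=0)   # non-negative: accept
bad = check_local_shapes([(2, 4), (3, 5)], is_split=0)  # negative sentinel: reject
```

Note the loophole being closed: the old code also accepted an off-by-one mismatch (`lshape[i] - 1 != gshape[i]`), which this PR removes in favour of an exact match outside the split axis.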
6 changes: 6 additions & 0 deletions heat/core/logical.py
@@ -91,6 +91,9 @@ def all(
def local_all(t, *args, **kwargs):
return torch.all(t != 0, *args, **kwargs)

if keepdim and axis is None:
axis = tuple(range(x.ndim))

return _operations.__reduce_op(
x, local_all, MPI.LAND, axis=axis, out=out, neutral=1, keepdim=keepdim
)
@@ -196,6 +199,9 @@ def any(
def local_any(t, *args, **kwargs):
return torch.any(t != 0, *args, **kwargs)

if keepdim and axis is None:
axis = tuple(range(x.ndim))

return _operations.__reduce_op(
x, local_any, MPI.LOR, axis=axis, out=out, neutral=0, keepdim=keepdim
)
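The `keepdim` fix in `all`/`any` expands `axis=None` to the tuple of all axes before handing off to the reduction. A NumPy analogue of the intended behaviour (NumPy used purely for illustration):

```python
import numpy as np

# With axis=None a full reduction collapses to a scalar even when the caller
# asked to keep dimensions; expanding None to the tuple of all axes makes the
# reduction preserve a singleton in every dimension instead.
x = np.ones((2, 3))
axis = tuple(range(x.ndim))  # what the patch substitutes for axis=None
result = np.all(x != 0, axis=axis, keepdims=True)
```

`result` then has shape `(1, 1)` rather than being a bare scalar, matching the shapes asserted in the new `test_all`/`test_any` cases below.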
8 changes: 8 additions & 0 deletions heat/core/tests/test_arithmetics.py
Expand Up @@ -646,6 +646,10 @@ def test_prod(self):
self.assertEqual(shape_split_axis_tuple_prod.split, None)
self.assertTrue((shape_split_axis_tuple_prod == expected_result).all())

# empty array
empty = ht.array([])
self.assertEqual(ht.prod(empty), ht.array([1.0]))

# exceptions
with self.assertRaises(ValueError):
ht.ones(array_len).prod(axis=1)
@@ -792,6 +796,10 @@ def test_sum(self):
self.assertEqual(shape_split_axis_tuple_sum.split, None)
self.assertTrue((shape_split_axis_tuple_sum == expected_result).all())

# empty array
empty = ht.array([])
self.assertEqual(ht.sum(empty), ht.array([0.0]))

# exceptions
with self.assertRaises(ValueError):
ht.ones(array_len).sum(axis=1)
6 changes: 6 additions & 0 deletions heat/core/tests/test_factories.py
@@ -308,6 +308,12 @@ def test_array(self):
with self.assertRaises(TypeError):
ht.array((4,), comm={})

# data already distributed but don't match in shape
if self.get_size() > 1:
with self.assertRaises(ValueError):
dim = self.get_rank() + 1
ht.array([[0] * dim] * dim, is_split=0)

def test_asarray(self):
# same heat array
arr = ht.array([1, 2])
52 changes: 52 additions & 0 deletions heat/core/tests/test_logical.py
@@ -140,6 +140,32 @@ def test_all(self):
out_noaxis = ht.zeros((1, 2, 3, 5), split=1)
ht.all(ones_noaxis_split_axis_neg, axis=-2, out=out_noaxis)

# test keepdim
ones_2d = ht.ones((1, 1))
self.assertEqual(ones_2d.all(keepdim=True).shape, ones_2d.shape)

ones_2d_split = ht.ones((2, 2), split=0)
keepdim_is_one = ones_2d_split.all(keepdim=True)
self.assertEqual(keepdim_is_one.shape, (1, 1))
self.assertEqual(keepdim_is_one.split, None)
keepdim_is_one = ones_2d_split.all(axis=0, keepdim=True)
self.assertEqual(keepdim_is_one.shape, (1, 2))
self.assertEqual(keepdim_is_one.split, None)
keepdim_is_one = ones_2d_split.all(axis=1, keepdim=True)
self.assertEqual(keepdim_is_one.shape, (2, 1))
self.assertEqual(keepdim_is_one.split, 0)

ones_2d_split = ht.ones((2, 2), split=1)
keepdim_is_one = ones_2d_split.all(keepdim=True)
self.assertEqual(keepdim_is_one.shape, (1, 1))
self.assertEqual(keepdim_is_one.split, None)
keepdim_is_one = ones_2d_split.all(axis=0, keepdim=True)
self.assertEqual(keepdim_is_one.shape, (1, 2))
self.assertEqual(keepdim_is_one.split, 1)
keepdim_is_one = ones_2d_split.all(axis=1, keepdim=True)
self.assertEqual(keepdim_is_one.shape, (2, 1))
self.assertEqual(keepdim_is_one.split, None)

# exceptions
with self.assertRaises(ValueError):
ht.ones(array_len).all(axis=1)
@@ -212,6 +238,32 @@ def test_any(self):
self.assertEqual(any_tensor.dtype, ht.bool)
self.assertTrue(ht.equal(any_tensor, res))

# test keepdim
ones_2d = ht.ones((1, 1))
self.assertEqual(ones_2d.any(keepdim=True).shape, ones_2d.shape)

ones_2d_split = ht.ones((2, 2), split=0)
keepdim_any = ones_2d_split.any(keepdim=True)
self.assertEqual(keepdim_any.shape, (1, 1))
self.assertEqual(keepdim_any.split, None)
keepdim_any = ones_2d_split.any(axis=0, keepdim=True)
self.assertEqual(keepdim_any.shape, (1, 2))
self.assertEqual(keepdim_any.split, None)
keepdim_any = ones_2d_split.any(axis=1, keepdim=True)
self.assertEqual(keepdim_any.shape, (2, 1))
self.assertEqual(keepdim_any.split, 0)

ones_2d_split = ht.ones((2, 2), split=1)
keepdim_any = ones_2d_split.any(keepdim=True)
self.assertEqual(keepdim_any.shape, (1, 1))
self.assertEqual(keepdim_any.split, None)
keepdim_any = ones_2d_split.any(axis=0, keepdim=True)
self.assertEqual(keepdim_any.shape, (1, 2))
self.assertEqual(keepdim_any.split, 1)
keepdim_any = ones_2d_split.any(axis=1, keepdim=True)
self.assertEqual(keepdim_any.shape, (2, 1))
self.assertEqual(keepdim_any.split, None)

def test_isclose(self):
size = ht.communication.MPI_WORLD.size
a = ht.float32([[2, 2], [2, 2]])
2 changes: 1 addition & 1 deletion setup.py
@@ -33,7 +33,7 @@
install_requires=[
"mpi4py>=3.0.0",
"numpy>=1.13.0",
"torch>=1.7.0, <=1.12.1",
"torch>=1.7.0, <1.13",
"scipy>=0.14.0",
"pillow>=6.0.0",
"torchvision>=0.8.0",