Fix comment/version dependency documentation in build.py

Add rocm as a valid device option to build.py. Fix typo in install_env.py for rocm. Remove extra } in export_pip-r.sh. Fix Docker build for cuda 11.8. Pass weights_only explicitly (weights_only=False) to torch.load to clean up torch warnings. Fix delim_whitespace warning in data_utils.py. Fix AMD ROCm documentation.
Deep-MI · Aug 27, 2024 · b60716f · b60716f
1 parent 5b96538
commit b60716f
Show file tree

Hide file tree

Showing 9 changed files with 25 additions and 16 deletions.
diff --git a/Docker/Dockerfile b/Docker/Dockerfile
@@ -189,6 +189,9 @@ SHELL ["/bin/bash", "--login", "-c"]
 COPY --from=selected_freesurfer_build_image /opt/freesurfer /opt/freesurfer
 COPY --from=selected_conda_build_image /venv /venv
 
+# Fix for cuda11.8+cudnn8.7 bug+warning: https://github.com/pytorch/pytorch/issues/97041
+RUN if [[ "$DEVICE" == "cu118" ]] ; then cd /venv/python3.10/site-packages/torch/lib && ln -s libnvrtc-*.so.11.2 libnvrtc.so ; fi
+
 # Copy fastsurfer over from the build context and add PYTHONPATH
 COPY . /fastsurfer/
 ENV PYTHONPATH=/fastsurfer:/opt/freesurfer/python/packages \

diff --git a/Docker/README.md b/Docker/README.md
@@ -139,7 +139,7 @@ As you can see, only the tag of the image is changed from gpu to cpu and the sta
 
 Here we build an experimental image to test performance when running on AMD GPUs. Note that you need a supported OS and kernel version and a supported GPU for ROCm to work correctly. You need to install the kernel drivers into 
 your host machine kernel (amdgpu-install --usecase=dkms) for the AMD docker image to work. For this, follow:
-https://docs.amd.com/en/latest/deploy/linux/quick_start.html
+https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html#rocm-install-quick, https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html#amdgpu-install-dkms and https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html
 
 ```bash
 PYTHONPATH=<FastSurferRoot>
@@ -149,22 +149,22 @@ python build.py --device rocm --tag my_fastsurfer:rocm
 and run segmentation only:
 
 ```bash
-docker run --rm --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
-           --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host \
-	   --shm-size 8G \
+docker run --rm --security-opt seccomp=unconfined \
+           --device=/dev/kfd --device=/dev/dri --group-add video \
 	   -v /home/user/my_mri_data:/data \
 	   -v /home/user/my_fastsurfer_analysis:/output \
 	   my_fastsurfer:rocm \
 	   --t1 /data/subjectX/t1-weighted.nii.gz \
 	   --sid subjectX --sd /output 
 ```
 
-Note, we tested on an AMD Radeon Pro W6600, which is [not officially supported](https://docs.amd.com/en/latest/release/gpu_os_support.html), but setting `HSA_OVERRIDE_GFX_VERSION=10.3.0` [inside docker did the trick](https://en.opensuse.org/AMD_OpenCL#ROCm_-_Running_on_unsupported_hardware):
+Contrary to the official ROCm documentation (above), we also needed to add the `render` group with `--group-add render` (in addition to `--group-add video`).
+
+Note, we tested on an AMD Radeon Pro W6600, which is [not officially supported](https://docs.amd.com/en/latest/release/gpu_os_support.html), but setting `HSA_OVERRIDE_GFX_VERSION=10.3.0` [inside docker did the trick](https://en.opensuse.org/SDB:AMD_GPGPU#Using_CUDA_code_with_ZLUDA_and_ROCm):
 
 ```bash
-docker run --rm --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
-           --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host \
-	   --shm-size 8G \
+docker run --rm --security-opt seccomp=unconfined \
+           --device=/dev/kfd --device=/dev/dri --group-add video --group-add render \
 	   -v /home/user/my_mri_data:/data \
 	   -v /home/user/my_fastsurfer_analysis:/output \
 	   -e HSA_OVERRIDE_GFX_VERSION=10.3.0 \

diff --git a/Docker/build.py b/Docker/build.py
@@ -30,7 +30,7 @@
 Target = Literal['runtime', 'build_common', 'build_conda', 'build_freesurfer',
                  'build_base', 'runtime_cuda']
 CacheType = Literal["inline", "registry", "local", "gha", "s3", "azblob"]
-AllDeviceType = Literal["cpu", "cuda", "cu118", "cu121", "cu124", "rocm6.1"]
+AllDeviceType = Literal["cpu", "cuda", "cu118", "cu121", "cu124", "rocm", "rocm6.1"]
 DeviceType = Literal["cpu", "cu118", "cu121", "cu124", "rocm6.1"]
 
 CREATE_BUILDER = "Create builder with 'docker buildx create --name fastsurfer'."
@@ -58,6 +58,7 @@ class DEFAULTS:
     # and rocm versions, if pytorch comes with new versions.
     # torch 1.12.0 comes compiled with cu113, cu116, rocm5.0 and rocm5.1.1
     # torch 2.0.1 comes compiled with cu117, cu118, and rocm5.4.2
+    # torch 2.4 comes compiled with cu118, cu121, cu124 and rocm6.1
     MapDeviceType: Dict[AllDeviceType, DeviceType] = dict(
         ((d, d) for d in get_args(DeviceType)),
         rocm="rocm6.1",
@@ -230,6 +231,7 @@ def make_parser() -> argparse.ArgumentParser:
                  --cache type=registry,ref=server/fastbuild,mode=max.
                  Will default to the environment variable FASTSURFER_BUILD_CACHE: 
                  {cache_kwargs.get('default', 'N/A')}""",
+        metavar="type={inline,local,...}[,<param>=<value>[,...]]",
         **cache_kwargs,
     )
     parser.add_argument(

diff --git a/Docker/install_env.py b/Docker/install_env.py
@@ -19,7 +19,7 @@
 def mode(arg: str) -> str:
     if arg in ["base", "cpu"] or \
             re.match("^cu\\d+$", arg) or \
-            re.match("^rocm\\d+\\.\\d+(\\.\\d+)?$"):
+            re.match("^rocm\\d+\\.\\d+(\\.\\d+)?$", arg):
         return arg
     else:
         raise argparse.ArgumentTypeError(f"The mode was '{arg}', but should be "

diff --git a/FastSurferCNN/data_loader/data_utils.py b/FastSurferCNN/data_loader/data_utils.py
@@ -623,7 +623,7 @@ def read_classes_from_lut(lut_file: str | Path):
     if lut_file.suffix == ".csv":
         kwargs["sep"] = ","
     elif lut_file.suffix == ".txt":
-        kwargs["delim_whitespace"] = True
+        kwargs["sep"] = "\\s+"
     else:
         raise RuntimeError(
             f"Unknown LUT file extension {lut_file}, must be csv, txt or tsv."

diff --git a/FastSurferCNN/inference.py b/FastSurferCNN/inference.py
@@ -213,7 +213,9 @@ def load_checkpoint(self, ckpt: Union[str, os.PathLike]):
             # make sure the model is, where it is supposed to be
             self.model.to(self.device)
 
-        model_state = torch.load(ckpt, map_location=device)
+        # WARNING: weights_only=False can cause unsafe code execution, but here the
+        # checkpoint can be considered to be from a safe source
+        model_state = torch.load(ckpt, map_location=device, weights_only=False)
         self.model.load_state_dict(model_state["model_state"])
 
         # workaround for mps (move the model back to mps)

diff --git a/FastSurferCNN/utils/checkpoint.py b/FastSurferCNN/utils/checkpoint.py
@@ -228,7 +228,9 @@ def load_from_checkpoint(
     loaded_epoch : int
         Epoch number.
     """
-    checkpoint = torch.load(checkpoint_path, map_location="cpu")
+    # WARNING: weights_only=False can cause unsafe code execution, but here the
+    # checkpoint can be considered to be from a safe source
+    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
 
     if drop_classifier:
         classifier_conv = ["classifier.conv.weight", "classifier.conv.bias"]

diff --git a/HypVINN/inference.py b/HypVINN/inference.py
@@ -181,7 +181,9 @@ def load_checkpoint(self, ckpt: str):
             of a model.
         """
         logger.info("Loading checkpoint {}".format(ckpt))
-        model_state = torch.load(ckpt, map_location=self.device)
+        # WARNING: weights_only=False can cause unsafe code execution, but here the
+        # checkpoint can be considered to be from a safe source
+        model_state = torch.load(ckpt, map_location=self.device, weights_only=False)
         self.model.load_state_dict(model_state["model_state"])
 
     def get_modelname(self):

diff --git a/env/export_pip-r.sh b/env/export_pip-r.sh
@@ -73,5 +73,3 @@ pyversion=$(echo "$out" | head -n 1 | cut -d" " -f2)
   echo ""
   echo "# $out"
 } >> $1
-
-}