Skip to content

Commit

Permalink
Fix comment/version dependency documentation in build.py
Browse files Browse the repository at this point in the history
Add rocm as a valid device option to build.py
fix typo install_env.py for rocm
remove extra } in export_pip-r.sh
Fix Docker build for cuda 11.8
Add explicit weights_only=False for torch.load to clean up torch warnings.
Fix delim_whitespace warning in data_utils.py
Fix AMD ROCm documentation.
  • Loading branch information
dkuegler committed Aug 27, 2024
1 parent 5b96538 commit b60716f
Show file tree
Hide file tree
Showing 9 changed files with 25 additions and 16 deletions.
3 changes: 3 additions & 0 deletions Docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ SHELL ["/bin/bash", "--login", "-c"]
COPY --from=selected_freesurfer_build_image /opt/freesurfer /opt/freesurfer
COPY --from=selected_conda_build_image /venv /venv

# Fix for cuda11.8+cudnn8.7 bug+warning: https://github.com/pytorch/pytorch/issues/97041
RUN if [[ "$DEVICE" == "cu118" ]] ; then cd /venv/python3.10/site-packages/torch/lib && ln -s libnvrtc-*.so.11.2 libnvrtc.so ; fi

# Copy fastsurfer over from the build context and add PYTHONPATH
COPY . /fastsurfer/
ENV PYTHONPATH=/fastsurfer:/opt/freesurfer/python/packages \
Expand Down
16 changes: 8 additions & 8 deletions Docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ As you can see, only the tag of the image is changed from gpu to cpu and the sta

Here we build an experimental image to test performance when running on AMD GPUs. Note that you need a supported OS and kernel version and a supported GPU for ROCm to work correctly. You need to install the kernel drivers into
your host machine kernel (amdgpu-install --usecase=dkms) for the AMD docker image to work. For this follow:
https://docs.amd.com/en/latest/deploy/linux/quick_start.html
https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html#rocm-install-quick, https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html#amdgpu-install-dkms and https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html

```bash
PYTHONPATH=<FastSurferRoot>
Expand All @@ -149,22 +149,22 @@ python build.py --device rocm --tag my_fastsurfer:rocm
and run segmentation only:

```bash
docker run --rm --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video --ipc=host \
--shm-size 8G \
docker run --rm --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
-v /home/user/my_mri_data:/data \
-v /home/user/my_fastsurfer_analysis:/output \
my_fastsurfer:rocm \
--t1 /data/subjectX/t1-weighted.nii.gz \
--sid subjectX --sd /output
```

Note, we tested on an AMD Radeon Pro W6600, which is [not officially supported](https://docs.amd.com/en/latest/release/gpu_os_support.html), but setting `HSA_OVERRIDE_GFX_VERSION=10.3.0` [inside docker did the trick](https://en.opensuse.org/AMD_OpenCL#ROCm_-_Running_on_unsupported_hardware):
Contrary to the official ROCm documentation (above), we also needed to add the render group via `--group-add render` (in addition to `--group-add video`).

Note, we tested on an AMD Radeon Pro W6600, which is [not officially supported](https://docs.amd.com/en/latest/release/gpu_os_support.html), but setting `HSA_OVERRIDE_GFX_VERSION=10.3.0` [inside docker did the trick](https://en.opensuse.org/SDB:AMD_GPGPU#Using_CUDA_code_with_ZLUDA_and_ROCm):

```bash
docker run --rm --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video --ipc=host \
--shm-size 8G \
docker run --rm --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video --group-add render \
-v /home/user/my_mri_data:/data \
-v /home/user/my_fastsurfer_analysis:/output \
-e HSA_OVERRIDE_GFX_VERSION=10.3.0 \
Expand Down
4 changes: 3 additions & 1 deletion Docker/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
Target = Literal['runtime', 'build_common', 'build_conda', 'build_freesurfer',
'build_base', 'runtime_cuda']
CacheType = Literal["inline", "registry", "local", "gha", "s3", "azblob"]
AllDeviceType = Literal["cpu", "cuda", "cu118", "cu121", "cu124", "rocm6.1"]
AllDeviceType = Literal["cpu", "cuda", "cu118", "cu121", "cu124", "rocm", "rocm6.1"]
DeviceType = Literal["cpu", "cu118", "cu121", "cu124", "rocm6.1"]

CREATE_BUILDER = "Create builder with 'docker buildx create --name fastsurfer'."
Expand Down Expand Up @@ -58,6 +58,7 @@ class DEFAULTS:
# and rocm versions, if pytorch comes with new versions.
# torch 1.12.0 comes compiled with cu113, cu116, rocm5.0 and rocm5.1.1
# torch 2.0.1 comes compiled with cu117, cu118, and rocm5.4.2
# torch 2.4 comes compiled with cu118, cu121, cu124 and rocm6.1
MapDeviceType: Dict[AllDeviceType, DeviceType] = dict(
((d, d) for d in get_args(DeviceType)),
rocm="rocm6.1",
Expand Down Expand Up @@ -230,6 +231,7 @@ def make_parser() -> argparse.ArgumentParser:
--cache type=registry,ref=server/fastbuild,mode=max.
Will default to the environment variable FASTSURFER_BUILD_CACHE:
{cache_kwargs.get('default', 'N/A')}""",
metavar="type={inline,local,...}[,<param>=<value>[,...]]",
**cache_kwargs,
)
parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion Docker/install_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
def mode(arg: str) -> str:
if arg in ["base", "cpu"] or \
re.match("^cu\\d+$", arg) or \
re.match("^rocm\\d+\\.\\d+(\\.\\d+)?$"):
re.match("^rocm\\d+\\.\\d+(\\.\\d+)?$", arg):
return arg
else:
raise argparse.ArgumentTypeError(f"The mode was '{arg}', but should be "
Expand Down
2 changes: 1 addition & 1 deletion FastSurferCNN/data_loader/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ def read_classes_from_lut(lut_file: str | Path):
if lut_file.suffix == ".csv":
kwargs["sep"] = ","
elif lut_file.suffix == ".txt":
kwargs["delim_whitespace"] = True
kwargs["sep"] = "\\s+"
else:
raise RuntimeError(
f"Unknown LUT file extension {lut_file}, must be csv, txt or tsv."
Expand Down
4 changes: 3 additions & 1 deletion FastSurferCNN/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,9 @@ def load_checkpoint(self, ckpt: Union[str, os.PathLike]):
# make sure the model is where it is supposed to be
self.model.to(self.device)

model_state = torch.load(ckpt, map_location=device)
# WARNING: weights_only=False can cause unsafe code execution, but here the
# checkpoint can be considered to be from a safe source
model_state = torch.load(ckpt, map_location=device, weights_only=False)
self.model.load_state_dict(model_state["model_state"])

# workaround for mps (move the model back to mps)
Expand Down
4 changes: 3 additions & 1 deletion FastSurferCNN/utils/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,9 @@ def load_from_checkpoint(
loaded_epoch : int
Epoch number.
"""
checkpoint = torch.load(checkpoint_path, map_location="cpu")
# WARNING: weights_only=False can cause unsafe code execution, but here the
# checkpoint can be considered to be from a safe source
checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)

if drop_classifier:
classifier_conv = ["classifier.conv.weight", "classifier.conv.bias"]
Expand Down
4 changes: 3 additions & 1 deletion HypVINN/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ def load_checkpoint(self, ckpt: str):
of a model.
"""
logger.info("Loading checkpoint {}".format(ckpt))
model_state = torch.load(ckpt, map_location=self.device)
# WARNING: weights_only=False can cause unsafe code execution, but here the
# checkpoint can be considered to be from a safe source
model_state = torch.load(ckpt, map_location=self.device, weights_only=False)
self.model.load_state_dict(model_state["model_state"])

def get_modelname(self):
Expand Down
2 changes: 0 additions & 2 deletions env/export_pip-r.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,3 @@ pyversion=$(echo "$out" | head -n 1 | cut -d" " -f2)
echo ""
echo "# $out"
} >> $1

}

0 comments on commit b60716f

Please sign in to comment.