Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update geoformer to use cuda 11.3, pytorch 1.11.0, and spconv 2.3.6 #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions .devcontainer/Dockerfile_U2004_CUDA113
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Development image: CUDA 11.3 toolkit on Ubuntu 20.04 with a Miniconda
# environment providing PyTorch 1.11.0, the spconv cu113 wheel, faiss-gpu and
# the project's pointgroup_ops extension.
FROM nvidia/cuda:11.3.1-devel-ubuntu20.04

# Set before the first apt-get call so no package can stall the build on an
# interactive configuration prompt.
ENV DEBIAN_FRONTEND=noninteractive

# Refresh the package index in the same layer as the install so a cached
# "apt-get update" layer can never be paired with a newer install request.
RUN apt-get update && \
    apt-get install -yq wget git build-essential g++ gcc && \
    rm -rf /var/lib/apt/lists/*
# Unsure if openmpi is needed
# RUN apt-get update && apt-get install libgl1-mesa-glx libglib2.0-0 libxcb-* \
# openmpi-bin openmpi-common libopenmpi-dev libgtk2.0-dev -y

# Install miniconda
ENV CONDA_DIR=/opt/conda

# The installer is deleted in the same layer so it does not bloat the image.
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    /bin/bash ~/miniconda.sh -b -p "$CONDA_DIR" && \
    rm ~/miniconda.sh

# Put conda in path so we can use conda activate
ENV PATH=$CONDA_DIR/bin:/usr/local/bin:$PATH

# general packages
# "-y" on every conda call: a Docker build has no terminal to answer the
# "Proceed ([y]/n)?" confirmation.
RUN conda install -y python=3.8
RUN conda install -y numpy=1.23
RUN conda install -y -c anaconda jupyter
# Pin numpy so the later conda installs cannot move it off 1.23.x.
RUN echo "numpy==1.23.*" > "$CONDA_DIR/conda-meta/pinned"
RUN conda install -y pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch
RUN conda install -y conda=22.11
# NOTE(review): setuptools is presumably held at 59.5 for compatibility with
# PyTorch 1.11 extension builds -- confirm before bumping.
RUN conda install -y -c conda-forge setuptools=59.5

# Make sure CUDA is visible
# ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
# ARG TORCH_CUDA_ARCH_LIST="8.9"

# Install pointgroup_ops
# The apt lists were removed above, so refresh them again in this layer.
RUN apt-get update && \
    apt-get install -y libsparsehash-dev && \
    rm -rf /var/lib/apt/lists/*
COPY requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
# NOTE(review): on Ubuntu 20.04 /lib is normally a symlink to /usr/lib, so the
# sources likely end up next to system libraries -- confirm this is intended.
COPY lib /lib
RUN cd /lib/pointgroup_ops && python setup.py develop

# Install spconv
RUN conda install -y libboost && pip install pccm
RUN pip install spconv-cu113

# Install pointnet2
# RUN cd /lib/pointnet2 && python setup.py install

# Install faiss
RUN conda install -y -c conda-forge faiss-gpu

51 changes: 51 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Dev container definition: builds the CUDA 11.3 image from the sibling
// Dockerfile and runs it with GPU access and the repository bind-mounted at
// /workspace.
{
  "build": {
    "dockerfile": "Dockerfile_U2004_CUDA113",
    // The build context is the repository root so the Dockerfile can COPY
    // requirements.txt and lib/.
    "context": "..",
    "args": {
      "DOCKER_BUILDKIT": "0"
    }
  },
  "mounts": [
    "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached"
  ],
  "runArgs": [
    "--gpus",
    "all",
    "--shm-size",
    "16gb",
    // Share the host X11 socket directory; the target must be /tmp/.X11-unix
    // for the DISPLAY value set below to resolve inside the container.
    "-v",
    "/tmp/.X11-unix:/tmp/.X11-unix"
  ],
  "containerEnv": {
    "NVIDIA_DRIVER_CAPABILITIES": "all",
    "DISPLAY": "unix:0"
  },
  "forwardPorts": [
    8887,
    8888,
    8886
  ],
  "customizations": {
    "vscode": {
      "extensions": [
        "ms-python.python",
        "ms-python.vscode-pylance",
        "ms-toolsai.jupyter",
        "ms-python.black-formatter"
      ],
      "settings": {
        "python.defaultInterpreterPath": "/opt/conda/bin/python",
        "python.linting.enabled": true,
        "python.linting.pylintEnabled": true,
        "[python]": {
          "editor.defaultFormatter": "ms-python.black-formatter",
          "editor.formatOnSave": true
        }
      }
    }
  },
  "workspaceFolder": "/workspace"
}
47 changes: 34 additions & 13 deletions checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
loaded_keys = sorted(list(loaded_state_dict.keys()))
# get a matrix of string matches, where each (i, j) entry correspond to the size of the
# loaded_key string, if it matches
match_matrix = [len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys]
match_matrix = torch.as_tensor(match_matrix).view(len(current_keys), len(loaded_keys))
match_matrix = [
len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys
]
match_matrix = torch.as_tensor(match_matrix).view(
len(current_keys), len(loaded_keys)
)
max_match_size, idxs = match_matrix.max(1)
# remove indices that correspond to no-match
idxs[max_match_size == 0] = -1
Expand All @@ -44,15 +48,19 @@ def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
key = current_keys[idx_new]
key_old = loaded_keys[idx_old]
if loaded_state_dict[key_old].shape != model_state_dict[key].shape:
# if 'unet' in key or 'input_conv' in key:
# reshaped = loaded_state_dict[key_old].permute(4,0,1,2,3)
# loaded_state_dict[key_old] = reshaped
# else:
print(
"Skip loading parameter {}, required shape{}, "
"loaded shape{}.".format(key, model_state_dict[key].shape, loaded_state_dict[key_old].shape)
)
loaded_state_dict[key_old] = model_state_dict[key]
if "unet" in key or "input_conv" in key:
reshaped = loaded_state_dict[key_old].permute(4, 0, 1, 2, 3)
loaded_state_dict[key_old] = reshaped
else:
print(
"Skip loading parameter {}, required shape{}, "
"loaded shape{}.".format(
key,
model_state_dict[key].shape,
loaded_state_dict[key_old].shape,
)
)
loaded_state_dict[key_old] = model_state_dict[key]

model_state_dict[key] = loaded_state_dict[key_old]
logger.info(
Expand Down Expand Up @@ -87,15 +95,28 @@ def mkdir_p(path):
raise


def checkpoint(model, optimizer, epoch, log_dir, best_val=None, best_val_iter=None, postfix=None, last=False):
def checkpoint(
model,
optimizer,
epoch,
log_dir,
best_val=None,
best_val_iter=None,
postfix=None,
last=False,
):
mkdir_p(log_dir)

if last:
filename = "checkpoint_last.pth"
else:
filename = f"checkpoint_epoch_{epoch}.pth"
checkpoint_file = log_dir + "/" + filename
state = {"epoch": epoch, "state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
state = {
"epoch": epoch,
"state_dict": model.state_dict(),
"optimizer": optimizer.state_dict(),
}

torch.save(state, checkpoint_file)
logging.info(f"Checkpoint saved to {checkpoint_file}")
2 changes: 1 addition & 1 deletion lib/pointgroup_ops/src/bfs_cluster/bfs_cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ All Rights Reserved 2020.
#define BFS_CLUSTER_H
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
// #include <THC/THC.h>

#include "../datatype/datatype.h"

Expand Down
Loading