Skip to content

Commit

Permalink
[CI] Setup release (#324)
Browse files Browse the repository at this point in the history
  • Loading branch information
CSY-ModelCloud authored Aug 2, 2024
1 parent bab180e commit db15847
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 213 deletions.
249 changes: 155 additions & 94 deletions .github/workflows/build_wheels_cuda_linux.yml
Original file line number Diff line number Diff line change
@@ -1,115 +1,176 @@
name: Build GPTQModel Wheels with CUDA for Linux
name: Release

on: workflow_dispatch
# Default shell for every `run` step: bash started as a login shell (-l) that
# aborts the step on the first failing command (-e).
defaults:
run:
shell: bash -le {0}
# Triggers: a daily cron entry at minute 0 of hour 20, repository_dispatch
# events, and manual workflow_dispatch runs (no inputs are declared here).
on:
schedule:
- cron: '0 20 * * *'
repository_dispatch:
workflow_dispatch:

# Workflow-wide environment. AMD_SERVER and INTEL_SERVER are the two hosts the
# check-vm job's "Select server" step probes with curl.
env:
CUDA_DEVICE_ORDER: PCI_BUS_ID
AMD_SERVER: 10.0.13.31
INTEL_SERVER: 10.0.23.35

# Runs are grouped per git ref; a newly started run cancels the one already in
# progress in the same group.
concurrency:
group: ${{ github.ref }}-workflow
cancel-in-progress: true

jobs:
build_wheels:
if: ${{ github.repository_owner == 'ModelCloud' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.python }} and CUDA ${{ matrix.cuda }}
runs-on: ${{ matrix.os }}
# Probes the Intel and AMD CI hosts and publishes two job outputs:
#   tag - "intel" or "amd", derived from the runner status endpoint
#   ip  - address of the first host whose /gpu/status answers "200 OK"
# The job fails when the status is unexpected or neither host is reachable.
check-vm:
  runs-on: self-hosted
  container:
    image: modelcloud/gptqmodel:alpine-ci-v1
  outputs:
    ip: ${{ steps.get_ip.outputs.ip }}
    tag: ${{ steps.get_ip.outputs.tag }}
  steps:
    - name: Select server
      id: get_ip
      run: |
        # --- Runner tag ---------------------------------------------------
        # A runner whose name contains "intel" is treated as status 0 without
        # any request. Otherwise the Intel host is asked first and the AMD
        # host is asked only when that request fails. `|| response=error`
        # keeps a failing curl from aborting the step under `bash -e`.
        if [[ "${{ runner.name }}" == *"intel"* ]]; then
          echo "current ci is intel"
          response=0
        else
          echo "test intel vm status"
          response=$(curl --silent --fail --max-time 5 http://$INTEL_SERVER/gpu/runner/status/intel) || response=error
          if [ "$response" == "error" ]; then
            echo "test amd vm status"
            response=$(curl --silent --fail --max-time 5 http://${AMD_SERVER}/gpu/runner/status/intel) || response=error
          fi
        fi
        echo "response: $response"
        # Status "0" maps to the intel tag and "-1" to the amd tag; any other
        # value (including "error") fails the job.
        if [ "$response" == "0" ]; then
          tag="intel"
        elif [ "$response" == "-1" ]; then
          tag="amd"
        else
          echo "Error: Unexpected result - $response"
          exit 1
        fi
        echo "Runner tag: $tag"
        # --- Server IP ----------------------------------------------------
        # Prefer the Intel host; fall back to the AMD host. Only the response
        # headers are fetched (--head) and checked for "200 OK".
        response=$(curl -s --head --fail --max-time 5 http://${INTEL_SERVER}/gpu/status) || response=error
        if echo "$response" | grep "200 OK" > /dev/null; then
          # Assign before logging so the message shows the selected address
          # (previously $ip was still empty when this line was printed).
          ip=${INTEL_SERVER}
          echo "Intel server is online. set ip to $ip"
        else
          # --fail added so both probes treat HTTP errors the same way.
          response=$(curl -s --head --fail --max-time 5 http://${AMD_SERVER}/gpu/status) || response=error
          if echo "$response" | grep "200 OK" > /dev/null; then
            ip=${AMD_SERVER}
            echo "AMD server is online. set ip to $ip"
          else
            echo "AMD server is offline."
            exit 1
          fi
        fi
        echo "ip=$ip" >> "$GITHUB_OUTPUT"
        echo "tag=$tag" >> "$GITHUB_OUTPUT"
        echo "tag: $tag, ip: $ip"
# Wheel build job. fail-fast is disabled, so a failing matrix cell does not
# cancel the remaining cells.
build:
strategy:
fail-fast: false
matrix:
# NOTE(review): this page is a rendered diff without +/- markers. The
# os/pyver/cuda axes, the job-level `defaults` and the CUDA_VERSION env below
# presumably belong to the removed workflow version (the container image and
# pyenv steps further down use matrix.torch / matrix.python, not matrix.pyver)
# - confirm against the repository.
os: [ubuntu-22.04]
pyver: ["3.8", "3.9", "3.10", "3.11", "3.12"]
cuda: ["11.8"] # wheels for 12.1 are built in build_wheels_pypi.yml
defaults:
run:
shell: bash
env:
CUDA_VERSION: ${{ matrix.cuda }}

# Matrix axes. Python 3.12 is not listed because it is unsupported for now:
# https://github.com/intel/intel-extension-for-pytorch/issues/525
cuda: ["11.8", "12.1", "12.4"]
torch: ["2.0", "2.1", "2.2", "2.3", "2.4"]
python: ["3.9", "3.10", "3.11"]
exclude:
  # torch 2.0 is built for CUDA 11.8 only.
  - cuda: "12.1"
    torch: "2.0"
  - cuda: "12.4"
    torch: "2.0"
  # CUDA 12.4 is paired with torch 2.4 only.
  - cuda: "12.4"
    torch: "2.1"
  - cuda: "12.4"
    torch: "2.2"
  - cuda: "12.4"
    torch: "2.3"
  # torch 2.4 is not built for CUDA 11.8 or 12.1.
  - cuda: "11.8"
    torch: "2.4"
  - cuda: "12.1"
    torch: "2.4"
  # torch 2.0 / 2.1 are never combined with Python 3.12 (these entries match
  # nothing while 3.12 is absent from the python axis).
  - python: "3.12"
    torch: "2.0"
  - python: "3.12"
    torch: "2.1"
# Strategy-level cap: at most four matrix cells run at the same time.
max-parallel: 4
# The build job runs on a self-hosted runner after check-vm, inside a compiler
# image pulled from port 5000 of the server address that check-vm published.
# The image tag encodes the CUDA, torch and Python versions of the matrix cell.
runs-on: [ self-hosted ]
needs: check-vm
container:
image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:compiler_cuda${{ matrix.cuda }}-torch${{ matrix.torch }}-python${{ matrix.python }}
steps:
- uses: actions/checkout@v4

# NOTE(review): from here on the page interleaves lines of two step lists
# without +/- markers. Presumably the apt-get/rm disk cleanup, setup-python and
# "Setup Miniconda" lines belong to the removed "Free disk space" flow, while
# the pyenv lines and the version printouts form the new "Print Env" step -
# confirm against the repository.
- name: Free disk space
- name: Print Env
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.pyver }}
# Initialise pyenv from /opt/pyenv, then select the matrix Python version and
# activate the environment named torch<torch>_py<python>.
export PYENV_ROOT=/opt/pyenv && export PATH=$PYENV_ROOT/bin:$PATH && eval "$(pyenv init -)" && eval "$(pyenv init --path)"
env_name="torch${{ matrix.torch }}_py${{ matrix.python }}"
echo "env: $env_name"
pyenv local ${{ matrix.python }} && pyenv activate $env_name && pyenv versions
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v3.0.4
# Print the interpreter, nvcc and installed torch versions into the job log.
echo "========="
python --version
echo "========="
nvcc --version
echo "========="
pip show torch
# NOTE(review): this span interleaves two step lists without +/- markers.
# Presumably the Miniconda `with:` options, "Install Dependencies",
# "Check install", "Build Wheel" and the conda/CUDA_HOME lines belong to the
# removed workflow, while "Checkout Codes", "Install requirements" and
# "Compile" with their pyenv lines are the new steps - confirm against the
# repository.
- name: Checkout Codes
uses: actions/checkout@v4
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false

- name: Install Dependencies
# Checkout target is taken from the `repo` and `ref` dispatch inputs.
# NOTE(review): the visible workflow_dispatch trigger declares no inputs, so
# both expressions are presumably empty and checkout uses its defaults - confirm.
repository: ${{ github.event.inputs.repo }}
ref: ${{ github.event.inputs.ref }}

- name: Install requirements
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${CUDA_VERSION}.0"
export PYENV_ROOT=/opt/pyenv && export PATH=$PYENV_ROOT/bin:$PATH && eval "$(pyenv init -)" && eval "$(pyenv init --path)"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch --index-url https://download.pytorch.org/whl/cu118
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
env_name="torch${{ matrix.torch }}_py${{ matrix.python }}"
echo "env: $env_name"
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
pyenv local ${{ matrix.python }} && pyenv activate $env_name && pyenv versions
- name: Build Wheel
pip install cmake
# requirements.txt is installed from the package index served over plain HTTP
# by the host that check-vm selected; that host is passed as --trusted-host.
pip install -r requirements.txt -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
- name: Compile
run: |
# For some reason $CONDA_PREFIX is empty.
export CUDA_HOME=/usr/share/miniconda
export CUDA_PATH=/usr/share/miniconda
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CONDA_PREFIX}/lib"
export PYENV_ROOT=/opt/pyenv && export PATH=$PYENV_ROOT/bin:$PATH && eval "$(pyenv init -)" && eval "$(pyenv init --path)"
env_name="torch${{ matrix.torch }}_py${{ matrix.python }}"
echo "env: $env_name"
# List of CUDA compute capabilities exported for the build below
# (presumably consumed by PyTorch's extension build via setup.py - confirm).
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX"
pyenv local ${{ matrix.python }} && pyenv activate $env_name && pyenv versions
python setup.py bdist_wheel
# NOTE(review): this span interleaves two versions of the final steps without
# +/- markers. Presumably "List build dir", the CUDA_PATH / PYPI_RELEASE echoes,
# the unnamed upload-artifact step and the 'linux-cuda-wheels' / ./dist/*.whl
# values belong to the removed workflow - confirm against the repository.
- name: List build dir
run: |
export PYENV_ROOT=/opt/pyenv && export PATH=$PYENV_ROOT/bin:$PATH && eval "$(pyenv init -)" && eval "$(pyenv init --path)"
echo "CUDA_PATH:"
echo $CUDA_PATH
env_name="torch${{ matrix.torch }}_py${{ matrix.python }}"
echo "env: $env_name"
echo "PYPI_RELEASE:"
echo $PYPI_RELEASE
pyenv local ${{ matrix.python }} && pyenv activate $env_name && pyenv versions
# Build the source distribution and the wheel into dist/.
python setup.py sdist bdist_wheel
cd dist
ls -alh .
# Take the most recently modified wheel in dist/, validate it with twine, and
# export its file name as WHL_NAME through $GITHUB_ENV for the steps that follow.
whl=$(ls -t *.whl | head -n 1)
twine check $whl
echo "WHL_NAME=$whl" >> $GITHUB_ENV
- uses: actions/upload-artifact@v4
# Uploads the wheel selected above; with the env.WHL_NAME pair the artifact is
# named after the wheel file itself.
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: 'linux-cuda-wheels'
path: ./dist/*.whl
name: ${{ env.WHL_NAME }}
path: dist/${{ env.WHL_NAME }}
116 changes: 0 additions & 116 deletions .github/workflows/build_wheels_pypi_linux.yml

This file was deleted.

Loading

0 comments on commit db15847

Please sign in to comment.