diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3799d5017177..0d102563470e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -220,7 +220,7 @@ jobs:
           export CPLUS_INCLUDE_PATH=/usr/local/include
           # Used to run the newer GNUMake version from brew that supports --output-sync
           export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
-          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
+          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
       - name: Setup tmate session if tests fail
         if: ${{ failure() }}
         uses: mxschmitt/action-tmate@v3.18
diff --git a/Makefile b/Makefile
index 8764086a5cf7..88949acd2768 100644
--- a/Makefile
+++ b/Makefile
@@ -80,8 +80,8 @@ ifeq ($(OS),Darwin)
 		BUILD_TYPE=metal
 	# disable metal if on Darwin and any other value is explicitly passed.
 	else ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DLLAMA_METAL=OFF
-		export LLAMA_NO_ACCELERATE=1
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+		export GGML_NO_ACCELERATE=1
 	endif
 
 	ifeq ($(BUILD_TYPE),metal)
@@ -98,13 +98,13 @@ endif
 
 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
-	export LLAMA_CUBLAS=1
+	export GGML_CUDA=1
 	export WHISPER_CUDA=1
 	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 endif
 
 ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DLLAMA_VULKAN=1
+	CMAKE_ARGS+=-DGGML_VULKAN=1
 endif
 
 ifeq ($(BUILD_TYPE),hipblas)
@@ -118,13 +118,13 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export WHISPER_HIPBLAS=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
 
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-	export LLAMA_METAL=1
+	export GGML_METAL=1
 	export WHISPER_METAL=1
 endif
 
@@ -354,7 +354,7 @@ else
 endif
 
 dist-cross-linux-arm64:
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
 	STATIC=true $(MAKE) build
 	mkdir -p release
 	# if BUILD_ID is empty, then we don't append it to the binary name
@@ -711,21 +711,21 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx2
 	$(MAKE) -C backend/cpp/llama-avx2 purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
 
 backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-fallback
 	$(MAKE) -C backend/cpp/llama-fallback purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 	# TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
@@ -736,7 +736,7 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
 	$(MAKE) -C backend/cpp/llama-cuda purge
 	$(info ${GREEN}I llama-cpp build info:cuda${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
 backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
@@ -764,7 +764,7 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
 
 backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
diff --git a/backend/cpp/llama/Makefile b/backend/cpp/llama/Makefile
index d8cda4093402..175d2ade1391 100644
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -6,35 +6,35 @@ BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
 
-# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
-# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+	CMAKE_ARGS+=-DGGML_CUDA=ON
+# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
-	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
-# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DLLAMA_METAL=OFF
+		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		TARGET+=--target ggml-metal
 	endif
 endif
 
 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 endif
 
 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif
 
 llama.cpp:
diff --git a/docs/content/docs/advanced/fine-tuning.md b/docs/content/docs/advanced/fine-tuning.md
index 0680a2794872..0811c39c5b20 100644
--- a/docs/content/docs/advanced/fine-tuning.md
+++ b/docs/content/docs/advanced/fine-tuning.md
@@ -118,7 +118,7 @@ And we convert it to the gguf format that LocalAI can consume:
 
 # Convert to gguf
 git clone https://github.com/ggerganov/llama.cpp.git
-pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
+pushd llama.cpp && make GGML_CUDA=1 && popd
 
 # We need to convert the pytorch model into ggml for quantization
 # It crates 'ggml-model-f16.bin' in the 'merged' directory.
diff --git a/docs/content/docs/faq.md b/docs/content/docs/faq.md
index 49a3b20ad07d..9b2a54792ce0 100644
--- a/docs/content/docs/faq.md
+++ b/docs/content/docs/faq.md
@@ -55,4 +55,4 @@ This typically happens when your prompt exceeds the context size. Try to reduce
 
 ### I'm getting a 'SIGILL' error, what's wrong?
 
-Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make build`
\ No newline at end of file
+Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make build`
\ No newline at end of file
diff --git a/docs/content/docs/getting-started/build.md b/docs/content/docs/getting-started/build.md
index 8f8cf09fca91..c8428c2d97cb 100644
--- a/docs/content/docs/getting-started/build.md
+++ b/docs/content/docs/getting-started/build.md
@@ -101,14 +101,14 @@ Here is the list of the variables available that can be used to customize the bu
 LocalAI uses different backends based on ggml and llama.cpp to run models. If your CPU doesn't support common instruction sets, you can disable them during build:
 
 ```
-CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" make build
+CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" make build
 ```
 
 To have effect on the container image, you need to set `REBUILD=true`:
 
 ```
 docker run quay.io/go-skynet/localai
-docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
+docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
 ```
 
 {{% /alert %}}
diff --git a/entrypoint.sh b/entrypoint.sh
index fb8417dfc24c..389c846d53f3 100755
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -22,7 +22,7 @@ else
     echo "@@@@@"
     echo "If you are experiencing issues with the pre-compiled builds, try setting REBUILD=true"
     echo "If you are still experiencing issues with the build, try setting CMAKE_ARGS and disable the instructions set as needed:"
-    echo 'CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"'
+    echo 'CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF"'
     echo "see the documentation at: https://localai.io/basics/build/index.html"
     echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
     echo "@@@@@"
diff --git a/examples/e2e-fine-tuning/README.md b/examples/e2e-fine-tuning/README.md
index af3ab8a31811..d95d89148a8c 100644
--- a/examples/e2e-fine-tuning/README.md
+++ b/examples/e2e-fine-tuning/README.md
@@ -65,7 +65,7 @@ And we convert it to the gguf format that LocalAI can consume:
 
 # Convert to gguf
 git clone https://github.com/ggerganov/llama.cpp.git
-pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
+pushd llama.cpp && make GGML_CUDA=1 && popd
 
 # We need to convert the pytorch model into ggml for quantization
 # It crates 'ggml-model-f16.bin' in the 'merged' directory.
diff --git a/examples/e2e-fine-tuning/notebook.ipynb b/examples/e2e-fine-tuning/notebook.ipynb
index 4996da5d339e..e80dfce5d8f2 100644
--- a/examples/e2e-fine-tuning/notebook.ipynb
+++ b/examples/e2e-fine-tuning/notebook.ipynb
@@ -1600,7 +1600,7 @@
    "source": [
     "\n",
     "!git clone https://github.com/ggerganov/llama.cpp.git\n",
-    "!cd llama.cpp && make LLAMA_CUBLAS=1\n",
+    "!cd llama.cpp && make GGML_CUDA=1\n",
     "\n"
    ]
   },