From 042b0c269b2b24134c3ddd5e294d3ca5fd35b548 Mon Sep 17 00:00:00 2001 From: thxCode Date: Sat, 29 Jun 2024 17:29:59 +0800 Subject: [PATCH] refactor: bump llama.cpp Signed-off-by: thxCode --- .github/workflows/ci.yml | 42 +- CMakeLists.txt | 719 ++------------------- llama-box/CMakeLists.txt | 11 +- llama-box/main.cpp | 34 +- llama-box/param.hpp | 18 +- llama-box/ratelimiter.hpp | 24 +- llama-box/scripts/build-windows-oneapi.bat | 4 +- llama-box/utils.hpp | 2 +- llama.cpp | 2 +- 9 files changed, 123 insertions(+), 733 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4624f4b..922cf63 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,9 +77,9 @@ jobs: echo "===== BUILD =====" mkdir -p ${{ github.workspace }}/.cache cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/build -DCMAKE_BUILD_TYPE=Release \ - -DLLAMA_ACCELERATE=on -DLLAMA_METAL=on -DLLAMA_METAL_EMBED_LIBRARY=on \ - ${{ matrix.arch == 'amd64' && '-DLLAMA_NATIVE=off' || '-DLLAMA_NATIVE=on' }} \ - -DLLAMA_OPENMP=off + -DGGML_ACCELERATE=on -DGGML_METAL=on -DGGML_METAL_EMBED_LIBRARY=on \ + ${{ matrix.arch == 'amd64' && '-DGGML_NATIVE=off' || '-DGGML_NATIVE=on' }} \ + -DGGML_OPENMP=off cmake --build ${{ github.workspace }}/build --target llama-box --config Release -- -j $(nproc) echo "===== RESULT =====" @@ -91,7 +91,7 @@ jobs: echo "===== PACKAGE =====" mkdir -p ${{ github.workspace }}/out - zip -j ${{ github.workspace }}/out/llama-box-darwin-${{ matrix.arch }}-metal.zip ${{ github.workspace }}/build/bin/* + zip -j ${{ github.workspace }}/out/llama-box-darwin-${{ matrix.arch }}-metal.zip ${{ github.workspace }}/build/bin/llama-box - name: Upload Artifact uses: actions/upload-artifact@v4 with: @@ -157,9 +157,9 @@ jobs: git config --system --add safe.directory '*' mkdir -p ${{ github.workspace }}/.cache cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/build -DCMAKE_BUILD_TYPE=Release \ - -DLLAMA_HIPBLAS=on -DAMDGPU_TARGETS="${AMDGPU_TARGETS}" \ - ${{ matrix.arch == 'amd64' && '-DLLAMA_NATIVE=off' || '-DLLAMA_NATIVE=on' }} \ - -DLLAMA_OPENMP=off + -DGGML_HIPBLAS=on -DAMDGPU_TARGETS="${AMDGPU_TARGETS}" \ + ${{ matrix.arch == 'amd64' && '-DGGML_NATIVE=off' || '-DGGML_NATIVE=on' }} \ + -DGGML_OPENMP=off cmake --build ${{ github.workspace }}/build --target llama-box --config Release -- -j $(nproc) EOF chmod +x /tmp/entrypoint.sh @@ -201,7 +201,7 @@ jobs: strategy: fail-fast: false matrix: - arch: [ amd64 ] + arch: [ amd64, arm64 ] # see https://hub.docker.com/r/nvidia/cuda/tags?page=&page_size=&ordering=&name=devel. 
# 12.5 ==> 12.5.0 # 11.7 ==> 11.7.1 @@ -251,9 +251,9 @@ jobs: git config --system --add safe.directory '*' mkdir -p ${{ github.workspace }}/.cache cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/build -DCMAKE_BUILD_TYPE=Release \ - -DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}" \ - ${{ matrix.arch == 'amd64' && '-DLLAMA_NATIVE=off' || '-DLLAMA_NATIVE=on' }} \ - -DLLAMA_OPENMP=off + -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}" \ + ${{ matrix.arch == 'amd64' && '-DGGML_NATIVE=off' || '-DGGML_NATIVE=on' }} \ + -DGGML_OPENMP=off cmake --build ${{ github.workspace }}/build --target llama-box --config Release -- -j $(nproc) EOF chmod +x /tmp/entrypoint.sh @@ -340,9 +340,9 @@ jobs: git config --system --add safe.directory '*' mkdir -p ${{ github.workspace }}/.cache cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/build -DCMAKE_BUILD_TYPE=Release \ - -DLLAMA_SYCL=on -DLLAMA_SYCL_F16=on \ - ${{ matrix.arch == 'amd64' && '-DLLAMA_NATIVE=off' || '-DLLAMA_NATIVE=on' }} \ - -DLLAMA_OPENMP=off + -DGGML_SYCL=on -DGGML_SYCL_F16=on \ + ${{ matrix.arch == 'amd64' && '-DGGML_NATIVE=off' || '-DGGML_NATIVE=on' }} \ + -DGGML_OPENMP=off cmake --build ${{ github.workspace }}/build --target llama-box --config Release -- -j $(nproc) EOF chmod +x /tmp/entrypoint.sh @@ -356,7 +356,7 @@ jobs: --volume ${{ github.workspace }}:${{ github.workspace }} \ --workdir ${{ github.workspace }} \ --env CC=icx \ - --env CXX=icx \ + --env CXX=icpx \ --env CCACHE_DIR \ --volume /tmp/entrypoint.sh:/entrypoint.sh \ --entrypoint /entrypoint.sh \ @@ -445,9 +445,9 @@ jobs: $env:CMAKE_PREFIX_PATH = "${env:HIP_PATH}" cmake -G "Unix Makefiles" -S ${{ github.workspace }} -B ${{ github.workspace }}\build -DCMAKE_BUILD_TYPE=Release ` -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DLLAMA_HIPBLAS=on -DAMDGPU_TARGETS="${env:AMDGPU_TARGETS}" ` - ${{ matrix.arch == 'amd64' && '-DLLAMA_NATIVE=off' || '-DLLAMA_NATIVE=on' }} ` - -DLLAMA_OPENMP=off + -DGGML_HIPBLAS=on -DAMDGPU_TARGETS="${env:AMDGPU_TARGETS}" ` + ${{ matrix.arch == 'amd64' && '-DGGML_NATIVE=off' || '-DGGML_NATIVE=on' }} ` + -DGGML_OPENMP=off cmake --build ${{ github.workspace }}\build --target llama-box --config Release -- -j ${env:NUMBER_OF_PROCESSORS} Write-Host "===== RESULT =====" @@ -511,9 +511,9 @@ jobs: Write-Host "===== BUILD =====" Write-Host "CUDA_PATH=${env:CUDA_PATH}" cmake -S ${{ github.workspace }} -B ${{ github.workspace }}\build -DCMAKE_BUILD_TYPE=Release ` - -DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES="${env:CUDA_ARCHITECTURES}" ` - ${{ matrix.arch == 'amd64' && '-DLLAMA_NATIVE=off' || '-DLLAMA_NATIVE=on' }} ` - -DLLAMA_OPENMP=off + -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES="${env:CUDA_ARCHITECTURES}" ` + ${{ matrix.arch == 'amd64' && '-DGGML_NATIVE=off' || '-DGGML_NATIVE=on' }} ` + -DGGML_OPENMP=off cmake --build ${{ github.workspace }}\build --target llama-box --config Release -- /m:${env:NUMBER_OF_PROCESSORS} Write-Host "===== RESULT =====" diff --git a/CMakeLists.txt b/CMakeLists.txt index 13187c6..d924b32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,120 +7,47 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() +endif () set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(BUILD_SHARED_LIBS OFF CACHE BOOL 
"build shared libraries" FORCE) # -# Option list +# Options # -if (APPLE) - set(LLAMA_METAL_DEFAULT ON) - set(LLAMA_BLAS_DEFAULT ON) - set(LLAMA_BLAS_VENDOR_DEFAULT "Apple") -else() - set(LLAMA_METAL_DEFAULT OFF) - set(LLAMA_BLAS_DEFAULT OFF) - set(LLAMA_BLAS_VENDOR_DEFAULT "Generic") -endif() - # general -option(BUILD_SHARED_LIBS "build shared libraries" OFF) -option(LLAMA_STATIC "llama: static link libraries" OFF) -option(LLAMA_NATIVE "llama: enable -march=native flag" ON) -option(LLAMA_LTO "llama: enable link time optimization" OFF) -option(LLAMA_CCACHE "llama: use ccache if available" ON) +option(LLAMA_CCACHE "llama: use ccache if available" ON) # debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) -option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) -option(LLAMA_GPROF "llama: enable gprof" OFF) +option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) # build -option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) +option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) # sanitizers -option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) -option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) -option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) - -# instruction set specific -if (LLAMA_NATIVE) - set(INS_ENB OFF) -else() - set(INS_ENB ON) -endif() - -option(LLAMA_SVE "llama: enable SVE" OFF) -option(LLAMA_AVX "llama: enable AVX" ${INS_ENB}) -option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB}) -option(LLAMA_AVX512 "llama: enable AVX512" OFF) -option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) -option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) -option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF) -option(LLAMA_FMA "llama: enable FMA" ${INS_ENB}) -# in MSVC F16C is implied with AVX2/AVX512 -if (NOT MSVC) - option(LLAMA_F16C "llama: enable F16C" ${INS_ENB}) -endif() +option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) +option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) +option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) -if (WIN32) - set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version") -endif() +# extra artifacts +set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama: build tests" FORCE) +set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama: build examples" FORCE) +set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama: build server example" FORCE) # 3rd party libs -option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT}) -set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING - "llama: BLAS library vendor") -option(LLAMA_CUDA "llama: use CUDA" OFF) -option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) -option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) -set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") -set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") -option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) -set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") -set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING - "llama: max. 
batch size for using peer access")
-option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
-option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
-option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)
-
-option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
-option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
-option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
-option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
-option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
-option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF)
-set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
-    "llama: metal minimum macOS version")
-set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
-option(LLAMA_OPENMP "llama: use OpenMP" ON)
-option(LLAMA_SYCL "llama: use SYCL" OFF)
-option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
-set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
-
-set(LLAMA_BUILD_TESTS OFF CACHE BOOL "llama: build tests")
-set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "llama: build examples")
-set(LLAMA_BUILD_SERVER OFF CACHE BOOL "llama: build server example")
-option(LLAMA_LASX "llama: enable lasx" ON)
-option(LLAMA_LSX "llama: enable lsx" ON)
-
-# add perf arguments
-set(LLAMA_PERF OFF CACHE BOOL "llama: enable perf")
+set(LLAMA_CURL OFF CACHE BOOL "llama: use libcurl to download model from an URL" FORCE)

#
# Compile flags
#

-if (LLAMA_SYCL)
+if (GGML_SYCL)
    set(CMAKE_CXX_STANDARD 17)
-else()
+else ()
    set(CMAKE_CXX_STANDARD 11)
-endif()
-
+endif ()
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@@ -132,399 +59,92 @@ include(CheckCXXCompilerFlag)
# enable libstdc++ assertions for debug builds
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
-endif()
+endif ()
if (NOT MSVC)
    if (LLAMA_SANITIZE_THREAD)
        add_compile_options(-fsanitize=thread)
-        link_libraries (-fsanitize=thread)
-    endif()
+        link_libraries(-fsanitize=thread)
+    endif ()
    if (LLAMA_SANITIZE_ADDRESS)
        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries (-fsanitize=address)
-    endif()
+        link_libraries(-fsanitize=address)
+    endif ()
    if (LLAMA_SANITIZE_UNDEFINED)
        add_compile_options(-fsanitize=undefined)
-        link_libraries (-fsanitize=undefined)
-    endif()
-endif()
-
-if (APPLE AND LLAMA_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-
-    message(STATUS "Metal framework found")
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-    )
-endif()
-
-if (LLAMA_OPENMP)
-    find_package(OpenMP)
-
-    if (OpenMP_FOUND)
-        message(STATUS "OpenMP found, Libraries: ${OpenMP_LIBRARIES}")
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    else()
-        message(WARNING "OpenMP not found")
-    endif()
-endif()
-
-if (LLAMA_BLAS)
-    if (LLAMA_STATIC)
-        
set(BLA_STATIC ON) - endif() - #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) - # set(BLA_SIZEOF_INTEGER 8) - #endif() - - set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) - find_package(BLAS) - - if (BLAS_FOUND) - message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - - if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple")) - # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. - # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 - find_package(PkgConfig REQUIRED) - if (${LLAMA_BLAS_VENDOR} MATCHES "Generic") - pkg_check_modules(DepBLAS REQUIRED blas) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS") - # As of openblas v0.3.22, the 64-bit is named openblas64.pc - pkg_check_modules(DepBLAS openblas64) - if (NOT DepBLAS_FOUND) - pkg_check_modules(DepBLAS REQUIRED openblas) - endif() - elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME") - pkg_check_modules(DepBLAS REQUIRED blis) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS") - pkg_check_modules(DepBLAS REQUIRED blas-atlas) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS") - pkg_check_modules(DepBLAS REQUIRED flexiblas_api) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") - # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS REQUIRED mkl-sdl) - elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") - # this doesn't provide pkg-config - # suggest to assign BLAS_INCLUDE_DIRS on your own - if ("${NVHPC_VERSION}" STREQUAL "") - message(WARNING "Better to set NVHPC_VERSION") - else() - set(DepBLAS_FOUND ON) - set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") - endif() - endif() - if (DepBLAS_FOUND) - set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" - " detected by pkgconfig, trying to find cblas.h from possible paths...") - find_path(BLAS_INCLUDE_DIRS - NAMES cblas.h - HINTS - /usr/include - /usr/local/include - /usr/include/openblas - /opt/homebrew/opt/openblas/include - /usr/local/opt/openblas/include - /usr/include/x86_64-linux-gnu/openblas/include - ) - endif() - endif() - - message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - - add_compile_options(${BLAS_LINKER_FLAGS}) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS not found, please refer to " - "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" - " to set correct LLAMA_BLAS_VENDOR") - endif() -endif() - -if (LLAMA_CUDA) - cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES - - find_package(CUDAToolkit) - if (CUDAToolkit_FOUND) - message(STATUS "CUDA found") - - if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # 52 == lowest CUDA 12 standard - # 60 == f16 CUDA intrinsics - # 61 == integer CUDA intrinsics - # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster - if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics - else() - set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics - #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work - endif() - endif() - message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") - - enable_language(CUDA) - - if (LLAMA_STATIC) - if (WIN32) - # As of 12.3.1 CUDA Toolkit for Windows 
does not offer a static cublas library - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) - else () - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) - endif() - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) - endif() - - if (LLAMA_CUDA_NO_VMM) - # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... - endif() - else() - message(WARNING "CUDA not found") - endif() -endif() - -if (LLAMA_HIPBLAS) - if (NOT EXISTS $ENV{ROCM_PATH}) - if (NOT EXISTS /opt/rocm) - set(ROCM_PATH /usr) - else() - set(ROCM_PATH /opt/rocm) - endif() - else() - set(ROCM_PATH $ENV{ROCM_PATH}) - endif() - list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) - list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") - - # CMake on Windows doesn't support the HIP language yet - if(WIN32) - set(CXX_IS_HIPCC TRUE) - else() - string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") - endif() - - if(CXX_IS_HIPCC) - if(LINUX) - if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") - endif() - - message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." - " Prefer setting the HIP compiler directly. See README for details.") - endif() - else() - # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. - if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) - endif() - cmake_minimum_required(VERSION 3.21) - enable_language(HIP) - endif() - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - - message(STATUS "HIP and hipBLAS found") - - if (CXX_IS_HIPCC) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device) - endif() - - if (LLAMA_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) -endif() - -if (LLAMA_SYCL) - if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") - message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") - endif() - - if (NOT DEFINED ENV{ONEAPI_ROOT}) - message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") - endif() - #todo: AOT - - find_package(IntelSYCL REQUIRED) - - message(STATUS "SYCL found") - - add_compile_options(-I./) #include DPCT - add_compile_options(-I/${SYCL_INCLUDE_DIR}) - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") - if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - endif() - - if (WIN32) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib) - else() - if (LLAMA_SYCL_TARGET STREQUAL "INTEL") - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) - elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl) - endif() - endif() -endif() + 
link_libraries(-fsanitize=undefined)
+    endif ()
+endif ()

function(get_flags CCID CCVER)
    set(C_FLAGS "")
    set(CXX_FLAGS "")

    if (CCID MATCHES "Clang")
-        set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
+        set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)

        if (
-            (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+            (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
        )
            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
+        endif ()
    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS -Wdouble-promotion)
+        set(C_FLAGS -Wdouble-promotion)
        set(CXX_FLAGS -Wno-array-bounds)

        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
            list(APPEND CXX_FLAGS -Wno-format-truncation)
-        endif()
+        endif ()
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
+        endif ()
+    endif ()

-    set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
+    set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()

if (LLAMA_FATAL_WARNINGS)
    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        list(APPEND C_FLAGS -Werror)
+        list(APPEND C_FLAGS -Werror)
        list(APPEND CXX_FLAGS -Werror)
    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
        add_compile_options(/WX)
-    endif()
-endif()
+    endif ()
+endif ()

if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-            -Werror=implicit-int -Werror=implicit-function-declaration)
-        list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+        list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+            -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)

-        list(APPEND C_FLAGS ${WARNING_FLAGS})
+        list(APPEND C_FLAGS ${WARNING_FLAGS})
        list(APPEND CXX_FLAGS ${WARNING_FLAGS})

        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})

        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
-    else()
+                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+    else ()
        # todo : msvc
-        set(C_FLAGS "")
+        set(C_FLAGS "")
        set(CXX_FLAGS "")
-    endif()
-endif()
+    endif ()
+endif ()

-set(CUDA_CXX_FLAGS "")
-
-if (LLAMA_CUDA)
-    set(CUDA_FLAGS -use_fast_math)
-
-    if (LLAMA_FATAL_WARNINGS)
-        list(APPEND CUDA_FLAGS -Werror all-warnings)
-    endif()
-
-    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} 
${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
-    endif()
-
-    if (NOT MSVC)
-        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-    endif()
-endif()

if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

    if (BUILD_SHARED_LIBS)
        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
-endif()
-
-if (LLAMA_LTO)
-    include(CheckIPOSupported)
-    check_ipo_supported(RESULT result OUTPUT output)
-    if (result)
-        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
-    else()
-        message(WARNING "IPO is not supported: ${output}")
-    endif()
-endif()
+    endif ()
+endif ()

if (LLAMA_CCACHE)
    find_program(LLAMA_CCACHE_FOUND ccache)
@@ -532,252 +152,13 @@ if (LLAMA_CCACHE)
        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
        set(ENV{CCACHE_SLOPPINESS} time_macros)
        message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.")
-    else()
+    else ()
        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF")
    endif ()
-endif()
-
-# this version of Apple ld64 is buggy
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
-    ERROR_VARIABLE output
-    OUTPUT_QUIET
-)
-
-if (output MATCHES "dyld-1015\.7")
-    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
-endif()
-
-# Architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
-# feel free to update the Makefile for your architecture and send a pull request or issue
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-if (MSVC)
-    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
-    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
-else ()
-    set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()

-if (NOT MSVC)
-    if (LLAMA_STATIC)
-        add_link_options(-static)
-        if (MINGW)
-            add_link_options(-static-libgcc -static-libstdc++)
-        endif()
-    endif()
-    if (LLAMA_GPROF)
-        add_compile_options(-pg)
-    endif()
-endif()
-
-set(ARCH_FLAGS "")
-
-if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
-     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
-    message(STATUS "ARM detected")
-    if (MSVC)
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
-
-        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
-        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
-        if (GGML_COMPILER_SUPPORT_DOTPROD)
-            add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        endif ()
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
-        endif ()
-
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
-            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-        endif ()
-        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
-    else()
-        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-        if (NOT 
"${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - list(APPEND ARCH_FLAGS -mfp16-format=ieee) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) - endif() - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) - endif() - if (LLAMA_SVE) - list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) - endif() - endif() -elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) - message(STATUS "x86 detected") - if (MSVC) - # instruction set detection for MSVC only - if (LLAMA_NATIVE) - include(cmake/FindSIMD.cmake) - endif () - if (LLAMA_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. - if (LLAMA_AVX512_VBMI) - add_compile_definitions($<$:__AVX512VBMI__>) - add_compile_definitions($<$:__AVX512VBMI__>) - endif() - if (LLAMA_AVX512_VNNI) - add_compile_definitions($<$:__AVX512VNNI__>) - add_compile_definitions($<$:__AVX512VNNI__>) - endif() - if (LLAMA_AVX512_BF16) - add_compile_definitions($<$:__AVX512BF16__>) - add_compile_definitions($<$:__AVX512BF16__>) - endif() - elseif (LLAMA_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (LLAMA_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - else() - if (LLAMA_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (LLAMA_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (LLAMA_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (LLAMA_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (LLAMA_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (LLAMA_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (LLAMA_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (LLAMA_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (LLAMA_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") - message(STATUS "PowerPC detected") - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) - else() - list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - message(STATUS "loongarch64 detected") - - list(APPEND ARCH_FLAGS -march=loongarch64) - if (LLAMA_LASX) - list(APPEND ARCH_FLAGS -mlasx) - endif() - if (LLAMA_LSX) - list(APPEND ARCH_FLAGS -mlsx) - endif() - -else() - message(STATUS "Unknown architecture") -endif() - -add_compile_options("$<$:${ARCH_FLAGS}>") -add_compile_options("$<$:${ARCH_FLAGS}>") - -if (LLAMA_CUDA) - list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) - list(JOIN CUDA_CXX_FLAGS " " 
CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
-    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
-        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
-    endif()
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-endif()
-
-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
-endif()
-
-#
-# POSIX conformance
-#
-
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-add_compile_definitions(_XOPEN_SOURCE=600)
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    remove_definitions(-D_XOPEN_SOURCE=600)
-    add_compile_definitions(_XOPEN_SOURCE=700)
-endif()
-
-# Data types, macros and functions related to controlling CPU affinity and
-# some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions(_GNU_SOURCE)
-endif()
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-if (
-    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
-    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
-)
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    add_compile_definitions(__BSD_VISIBLE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
-    add_compile_definitions(_NETBSD_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_BSD_SOURCE)
-endif()
-
#
-# programs
+# Programs
#

add_subdirectory(llama.cpp)
diff --git a/llama-box/CMakeLists.txt b/llama-box/CMakeLists.txt
index 131ba9e..477dc08 100644
--- a/llama-box/CMakeLists.txt
+++ b/llama-box/CMakeLists.txt
@@ -1,5 +1,10 @@
# llama-box

+if (MINGW)
+    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif ()
+
#
# version
#
@@ -43,11 +48,11 @@ add_library(${TARGET} OBJECT version.cpp)
# llama-box
#
set(TARGET llama-box)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${CMAKE_SOURCE_DIR})
add_executable(${TARGET} main.cpp param.hpp ratelimiter.hpp utils.hpp)
target_link_libraries(${TARGET} PRIVATE version common ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR})
+target_include_directories(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
-endif()
+endif ()
target_compile_features(${TARGET} PUBLIC cxx_std_11)
diff --git a/llama-box/main.cpp b/llama-box/main.cpp
index 1ef695a..fc5dccb 100644
--- a/llama-box/main.cpp
+++ b/llama-box/main.cpp
@@ -3,19 +3,17 @@
#include
#include
#include
-#include
#include
-#include
+#include
#include

#include "llama.cpp/common/common.h"
-#include "llama.cpp/common/grammar-parser.h"
#include "llama.cpp/common/json-schema-to-grammar.h"
#define JSON_ASSERT GGML_ASSERT
#include 
"llama.cpp/common/json.hpp" #include "llama.cpp/common/log.h" -#include "llama.cpp/ggml.h" -#include "llama.cpp/llama.h" +#include "llama.cpp/ggml/include/ggml.h" +#include "llama.cpp/include/llama.h" #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 10485760 #include "llama.cpp/examples/server/httplib.h" @@ -658,14 +656,6 @@ struct server_context { return true; } - bool validate_model_chat_template() const { - llama_chat_message chat[] = {{"user", "test"}}; - - const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0); - - return res > 0; - } - std::string load_chat_template() const { std::vector model_template(2048, 0); // longest known template is about 1200 bytes std::string template_key = "tokenizer.chat_template"; @@ -1942,6 +1932,7 @@ struct server_context { slot.t_start_generation = 0; if (slot.infill) { + const bool add_bos = llama_should_add_bos_token(model); bool suff_rm_leading_spc = true; if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { @@ -1959,18 +1950,21 @@ struct server_context { } prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); + suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { - prefix_tokens.push_back(middle_token); + embd_inp.push_back(middle_token); } - prompt_tokens = prefix_tokens; + prompt_tokens = embd_inp; } else { prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there diff --git a/llama-box/param.hpp b/llama-box/param.hpp index 50b581f..7582230 100644 --- a/llama-box/param.hpp +++ b/llama-box/param.hpp @@ -7,8 +7,8 @@ #include "llama.cpp/common/json-schema-to-grammar.h" #define JSON_ASSERT GGML_ASSERT #include "llama.cpp/common/json.hpp" -#include "llama.cpp/ggml.h" -#include "llama.cpp/llama.h" +#include "llama.cpp/ggml/include/ggml.h" +#include "llama.cpp/include/llama.h" // version extern const char *LLAMA_BOX_GIT_TREE_STATE; @@ -50,7 +50,7 @@ static int invalid(const char *flag) { #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) #endif -void llama_box_params_print_usage(int, char **argv, const llama_box_params &bparams) { +static void llama_box_params_print_usage(int, char **argv, const llama_box_params &bparams) { struct opt { LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) @@ -180,6 +180,8 @@ void llama_box_params_print_usage(int, char **argv, const llama_box_params &bpar "add a control vector with user defined scaling SCALE" }); opts.push_back({ "*", " --control-vector-layer-range START END", "layer range to apply the control vector(s) to, start and end inclusive" }); + opts.push_back({ "*", " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" }); + if (llama_supports_gpu_offload()) { opts.push_back({ "*", "-ngl, --gpu-layers N", "number of layers to store in VRAM" }); opts.push_back({ "*", "-sm, --split-mode SPLIT_MODE", @@ -248,7 +250,7 @@ void llama_box_params_print_usage(int, char **argv, const llama_box_params &bpar printf("\n"); } -bool llama_box_params_parse(int argc, char **argv, llama_box_params &bparams) { +static bool llama_box_params_parse(int argc, char **argv, llama_box_params &bparams) { try { for (int i = 1; i < argc;) { const char *flag = argv[i++]; @@ -896,8 +898,14 @@ bool llama_box_params_parse(int argc, char **argv, llama_box_params &bparams) { continue; } + if (!strcmp(flag, "--spm-infill")) { + bparams.gparams.spm_infill = true; + continue; + } + if (llama_supports_gpu_offload()) { - if (!strcmp(flag, "-ngl") || !strcmp(flag, "--gpu-layers") || !strcmp(flag, "--n-gpu-layers")) { + if (!strcmp(flag, "-ngl") || !strcmp(flag, "--gpu-layers") || + !strcmp(flag, "--n-gpu-layers")) { if (i == argc) { missing("--gpu-layers"); } diff --git a/llama-box/ratelimiter.hpp b/llama-box/ratelimiter.hpp index 571f9f0..6ed9592 100644 --- a/llama-box/ratelimiter.hpp +++ b/llama-box/ratelimiter.hpp @@ -4,36 +4,38 @@ // lockless token bucket rate limiter class token_bucket { -private: + private: int capacity; int rate; - int tokens; + int tokens_remain; std::chrono::steady_clock::time_point last_time; void refill() { auto const now = std::chrono::steady_clock::now(); - auto const elapsed = std::chrono::duration_cast(now - last_time).count(); - int new_tokens = elapsed * rate / 1000; + auto const elapsed = + std::chrono::duration_cast(now - last_time).count(); + int new_tokens = int(elapsed) * rate / 1000; if (new_tokens > 0) { - tokens = std::min(capacity, tokens + new_tokens); + tokens_remain = std::min(capacity, tokens_remain + new_tokens); last_time = now; } } -public: - token_bucket(int capacity, int rate) : capacity(capacity), rate(rate) { - tokens = capacity; + public: + token_bucket(int capacity, int rate) + : capacity(capacity), rate(rate) { + tokens_remain = capacity; last_time = std::chrono::steady_clock::now(); } bool acquire(int tokens = 1) { - if (this->tokens < tokens) { + if (this->tokens_remain < tokens) { refill(); - if (this->tokens < tokens) { + if (this->tokens_remain < tokens) { return false; } } - this->tokens -= tokens; + this->tokens_remain -= tokens; return true; } }; \ No newline at end of file diff --git a/llama-box/scripts/build-windows-oneapi.bat b/llama-box/scripts/build-windows-oneapi.bat index b24b3ff..13cea72 100644 --- a/llama-box/scripts/build-windows-oneapi.bat +++ b/llama-box/scripts/build-windows-oneapi.bat @@ -5,9 +5,9 @@ set ARCH=%2 if %ERRORLEVEL% neq 0 (exit /B %ERRORLEVEL%) if "%ARCH%"=="amd64" ( - cmake -G "Ninja" -S %DIR% -B %DIR%\build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DLLAMA_SYCL=on -DLLAMA_SYCL_F16=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off + cmake -G "Ninja" -S %DIR% -B %DIR%\build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DGGML_SYCL=on -DGGML_SYCL_F16=on -DGGML_NATIVE=off -DGGML_OPENMP=off ) else ( - cmake -G "Ninja" -S %DIR% -B %DIR%\build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DLLAMA_SYCL=on -DLLAMA_SYCL_F16=on -DLLAMA_NATIVE=on -DLLAMA_OPENMP=off + cmake -G "Ninja" -S %DIR% -B %DIR%\build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DGGML_SYCL=on -DGGML_SYCL_F16=on -DGGML_NATIVE=on -DGGML_OPENMP=off ) if 
%ERRORLEVEL% neq 0 (exit /B %ERRORLEVEL%) diff --git a/llama-box/utils.hpp b/llama-box/utils.hpp index ea06a6e..dab4e5a 100644 --- a/llama-box/utils.hpp +++ b/llama-box/utils.hpp @@ -8,7 +8,7 @@ #include "llama.cpp/common/common.h" #define JSON_ASSERT GGML_ASSERT #include "llama.cpp/common/json.hpp" -#include "llama.cpp/llama.h" +#include "llama.cpp/include/llama.h" #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" diff --git a/llama.cpp b/llama.cpp index d50f889..72272b8 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit d50f8897a797a5a03f31228d1b5a7b8130ee1bc2 +Subproject commit 72272b83a3878e91251218c981b4c6ec16c33912
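---

Note on the ratelimiter.hpp hunk above: the counter rename (tokens -> tokens_remain) and the int(elapsed) cast do not change behavior; the bucket still holds at most `capacity` tokens and refills at `rate` tokens per second. A minimal usage sketch follows; the include path and the capacity/rate numbers are illustrative assumptions, not part of the patch:

    #include <chrono>
    #include <cstdio>
    #include <thread>

    #include "ratelimiter.hpp" // assumed include path

    int main() {
        token_bucket limiter(/*capacity=*/6, /*rate=*/2); // burst of 6, then refills at 2 tokens/s

        for (int i = 0; i < 10; ++i) {
            if (limiter.acquire()) { // take one token if available
                std::printf("request %d: admitted\n", i);
            } else {
                std::printf("request %d: throttled\n", i);
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(200));
        }
        return 0;
    }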
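Note on the main.cpp infill hunk above: with the new --spm-infill flag the prompt is assembled in Suffix/Prefix/Middle order instead of the default Prefix/Suffix/Middle, and BOS is now only prepended when llama_should_add_bos_token() allows it. Below is a self-contained sketch of just that ordering logic; the TOK_* constants are hypothetical stand-ins for the llama_token_prefix()/llama_token_suffix()/llama_token_middle()/llama_token_bos() lookups in the real code:

    #include <cassert>
    #include <vector>

    using llama_token = int;

    // Hypothetical sentinel values; the real ones come from the model vocab.
    enum : llama_token { TOK_PRE = 1, TOK_SUF = 2, TOK_MID = 3, TOK_BOS = 4 };

    static std::vector<llama_token> build_infill_prompt(std::vector<llama_token> prefix_tokens,
                                                        std::vector<llama_token> suffix_tokens,
                                                        bool spm_infill, bool add_bos) {
        // Tag each side with its sentinel token, as the patched code does.
        prefix_tokens.insert(prefix_tokens.begin(), TOK_PRE);
        suffix_tokens.insert(suffix_tokens.begin(), TOK_SUF);

        // PSM (default):       <PRE> prefix <SUF> suffix <MID>
        // SPM (--spm-infill):  <SUF> suffix <PRE> prefix <MID>
        auto embd_inp = spm_infill ? suffix_tokens : prefix_tokens;
        auto embd_end = spm_infill ? prefix_tokens : suffix_tokens;
        if (add_bos) {
            embd_inp.insert(embd_inp.begin(), TOK_BOS);
        }
        embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
        embd_inp.push_back(TOK_MID); // the real code skips this if the model has no middle token
        return embd_inp;
    }

    int main() {
        assert((build_infill_prompt({10, 11}, {20}, /*spm=*/false, /*bos=*/true) ==
                std::vector<llama_token>{TOK_BOS, TOK_PRE, 10, 11, TOK_SUF, 20, TOK_MID}));
        assert((build_infill_prompt({10, 11}, {20}, /*spm=*/true, /*bos=*/true) ==
                std::vector<llama_token>{TOK_BOS, TOK_SUF, 20, TOK_PRE, 10, 11, TOK_MID}));
        return 0;
    }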
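Note on load_chat_template(), visible as context lines in the main.cpp hunk: it reads the "tokenizer.chat_template" GGUF metadata key into a fixed 2048-byte buffer. Roughly, such a helper can be written against the public llama.h metadata API as below; this is a sketch under that assumption, not the exact llama-box implementation:

    #include <string>
    #include <vector>

    #include "llama.cpp/include/llama.h" // header path as used by this patch

    static std::string load_chat_template(const llama_model * model) {
        std::vector<char> buf(2048, 0); // longest known template is about 1200 bytes
        const int32_t len = llama_model_meta_val_str(model, "tokenizer.chat_template",
                                                     buf.data(), buf.size());
        if (len <= 0) {
            return ""; // the model ships no chat template
        }
        return std::string(buf.data(), static_cast<size_t>(len));
    }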