diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 5ae630c318d..9c1e3e138b3 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -187,6 +187,13 @@ jobs: generate-windows-cuda: environment: release runs-on: windows + strategy: + matrix: + cuda: + - version: "11" + url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe' + - version: "12" + url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe' env: KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} steps: @@ -220,11 +227,11 @@ jobs: with: go-version-file: go.mod cache: true - - name: 'Install CUDA' + - name: 'Install CUDA ${{ matrix.cuda.version }}' run: | $ErrorActionPreference = "Stop" write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" write-host "Installing CUDA" Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait write-host "Completed CUDA" @@ -256,15 +263,16 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-${{ matrix.cuda.version }} path: | llm/build/**/bin/* dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-${{ matrix.cuda.version }} path: dist/deps/* + # Import the prior generation steps and build the final windows assets build-windows: environment: release @@ -314,10 +322,16 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-11 + - uses: actions/download-artifact@v4 + with: + name: generate-windows-cuda-12 + - uses: actions/download-artifact@v4 + with: + name: windows-cuda-deps-11 - uses: actions/download-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-12 - uses: actions/download-artifact@v4 with: name: windows-rocm-deps @@ -363,7 +377,6 @@ jobs: - run: | ./scripts/build_linux.sh ./scripts/build_docker.sh - mv dist/deps/* dist/ - uses: actions/upload-artifact@v4 with: name: dist-linux-amd64 @@ -459,7 +472,10 @@ jobs: merge-multiple: true - run: | ls -lh dist/ - (cd dist; sha256sum * > sha256sum.txt) + (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) + mv sha256sum.txt dist/ + mv dist/linux-???64 . + mv dist/linux-amd64-rocm . 
cat dist/sha256sum.txt - name: Create or update Release run: | diff --git a/Dockerfile b/Dockerfile index c8efdd8a276..c46477b4925 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 -# this CUDA_VERSION corresponds with the one specified in docs/gpu.md -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" +ARG CUDA_VERSION_12=12.4.0 +ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 # Copy the minimal context we need to run the generate scripts @@ -10,7 +12,7 @@ COPY .git .git COPY .gitmodules .gitmodules COPY llm llm -FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -18,9 +20,34 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh - -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh + +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -28,7 +55,32 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES +ENV GOARCH arm64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + 
OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh + FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -40,15 +92,11 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh -RUN mkdir /tmp/scratch && \ - for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \ - cp ${dep} /tmp/scratch/ || exit 1 ; \ - done && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \ - mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \ - (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . ) - +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh +RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -59,16 +107,21 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 -RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64 ARG CMAKE_VERSION @@ -79,12 +132,15 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS +ENV GOARCH arm64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 -RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_CPU_TARGET="static" bash gen_linux.sh FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh # Intermediate stage used for ./scripts/build_linux.sh @@ -95,12 +151,16 @@ COPY . . 
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-amd64/bin/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -109,23 +169,36 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS ARG CGO_CFLAGS -RUN go build -trimpath . +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-arm64/bin/ollama . 
+ +# Strip out ROCm dependencies to keep the primary image lean +FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ +RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +COPY --from=amd64-libs-without-rocm /scratch/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ + FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +RUN ln -s /opt/rocm/lib /lib/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/app/ollama.iss b/app/ollama.iss index dc6178f7bb3..bce0a337e77 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -87,20 +87,11 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit -Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs +Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit +Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -#if DirExists("..\dist\windows-amd64\cuda") - Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\oneapi") - Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs -#endif -#if DirExists("..\dist\windows-amd64\rocm") - Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs -#endif - +Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" @@ -108,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" [Run] -Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden +Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden [UninstallRun] ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden @@ -143,8 +134,8 @@ 
SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi [Registry] Root: HKCU; Subkey: "Environment"; \ - ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \ - Check: NeedsAddPath('{app}') + ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \ + Check: NeedsAddPath('{app}\bin') [Code] diff --git a/cmd/cmd.go b/cmd/cmd.go index fd7246c8d43..a8a02605c26 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -223,6 +223,14 @@ func tempZipFiles(path string) (string, error) { } files = append(files, js...) + // bert models require a nested config.json + // TODO(mxyng): merge this with the glob above + js, err = glob(filepath.Join(path, "**/*.json"), "text/plain") + if err != nil { + return "", err + } + files = append(files, js...) + if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob // tokenizer.model might be a unresolved git lfs reference; error if it is @@ -252,6 +260,11 @@ func tempZipFiles(path string) (string, error) { return "", err } + zfi.Name, err = filepath.Rel(path, file) + if err != nil { + return "", err + } + zf, err := zipfile.CreateHeader(zfi) if err != nil { return "", err diff --git a/convert/convert.go b/convert/convert.go index 24c19aa4dcc..5a314cdd7ce 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -7,6 +7,7 @@ import ( "io" "io/fs" "log/slog" + "strings" "github.com/ollama/ollama/llm" ) @@ -58,14 +59,20 @@ type Converter interface { KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. + // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string - // tensorName returns the LLM tensor name for a specific input name - tensorName(string) string // specialTokenTypes returns any special token types the model uses specialTokenTypes() []string + // writeFile writes the model to the provided io.WriteSeeker writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } +type moreParser interface { + parseMore(fs.FS) error +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. 
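(Editor's aside, not part of the patch: the new Replacements() method supersedes the per-converter tensorName helpers removed later in this diff. Each converter now only returns old/new substring pairs, and parseTensors builds a single strings.Replacer from them and applies it to every raw tensor name read from safetensors or torch files. Below is a minimal, runnable sketch of that renaming; the pairs are taken verbatim from the llama converter in this diff, but the standalone program form is mine.)

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Old/new substring pairs, as returned by (*llama).Replacements() in this diff.
	pairs := []string{
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q",
		"mlp.down_proj", "ffn_down",
	}

	// parseTensors builds one strings.Replacer from the converter's pairs and
	// applies it to every tensor name as it is read.
	r := strings.NewReplacer(pairs...)
	fmt.Println(r.Replace("model.layers.0.self_attn.q_proj.weight")) // blk.0.attn_q.weight
	fmt.Println(r.Replace("model.layers.3.mlp.down_proj.weight"))    // blk.3.ffn_down.weight
}

(This is also why mixtral can simply append its block_sparse_moe.gate mapping to the base llama pairs later in the diff instead of overriding a whole name-mapping function.)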
@@ -93,8 +100,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Gemma2ForCausalLM": + conv = &gemma2{} case "Phi3ForCausalLM": conv = &phi3{} + case "BertModel": + conv = &bert{} default: return errors.New("unsupported architecture") } @@ -103,6 +114,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return err } + if t, ok := conv.(moreParser); ok { + if err := t.parseMore(fsys); err != nil { + return err + } + } + t, err := parseTokenizer(fsys, conv.specialTokenTypes()) if err != nil { return err @@ -119,7 +136,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(fsys) + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) if err != nil { return err } diff --git a/convert/convert_bert.go b/convert/convert_bert.go new file mode 100644 index 00000000000..6e7d59fec98 --- /dev/null +++ b/convert/convert_bert.go @@ -0,0 +1,174 @@ +package convert + +import ( + "cmp" + "encoding/json" + "io/fs" + "path/filepath" + "slices" + "strings" + + "github.com/ollama/ollama/llm" +) + +type bert struct { + Parameters + NLayers uint32 `json:"n_layers"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayer uint32 `json:"n_layer"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NCtx uint32 `json:"n_ctx"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NInner uint32 `json:"n_inner"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + LayerNormEPS float32 `json:"layer_norm_eps"` + LayerNormEpsilon float32 `json:"layer_norm_epsilon"` + NormEpsilon float32 `json:"norm_epsilon"` + + PoolingType uint32 +} + +var ( + _ Converter = (*bert)(nil) + _ moreParser = (*bert)(nil) +) + +func (p *bert) parseMore(fsys fs.FS) error { + bts, err := fs.ReadFile(fsys, "modules.json") + if err != nil { + return err + } + + var modules []struct { + Type string `json:"type"` + Path string `json:"path"` + } + + if err := json.Unmarshal(bts, &modules); err != nil { + return err + } + + var pooling string + for _, m := range modules { + if m.Type == "sentence_transformers.models.Pooling" { + pooling = m.Path + break + } + } + + if pooling != "" { + bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json")) + if err != nil { + return err + } + + var pc struct { + PoolingModeCLSToken bool `json:"pooling_mode_cls_token"` + PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"` + } + + if err := json.Unmarshal(bts, &pc); err != nil { + return err + } + + if pc.PoolingModeMeanTokens { + p.PoolingType = 1 + } else if pc.PoolingModeCLSToken { + p.PoolingType = 2 + } + } + + return nil +} + +func (p *bert) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "bert" + kv["bert.attention.causal"] = false + kv["bert.pooling_type"] = p.PoolingType + + kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + + if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { + kv["bert.context_length"] = contextLength + } + + if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { + kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + } + + if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { + kv["bert.feed_forward_length"] 
= cmp.Or(p.IntermediateSize, p.NInner) + } + + if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 { + kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + } + + if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { + kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon + } + + kv["tokenizer.ggml.model"] = "bert" + kv["tokenizer.ggml.token_type_count"] = uint32(2) + + // convert to phantom space tokens + for i, e := range t.Tokens { + if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") { + // noop + } else if strings.HasPrefix(e, "##") { + t.Tokens[i] = e[2:] + } else { + t.Tokens[i] = "\u2581" + e + } + } + + kv["tokenizer.ggml.tokens"] = t.Tokens + + return kv +} + +func (p *bert) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + if slices.Contains([]string{ + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + }, t.Name()) { + continue + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (bert) Replacements() []string { + return []string{ + "encoder.layer", "blk", + "encoder.layers", "blk", + "embeddings.word_embeddings", "token_embd", + "embeddings.token_type_embeddings", "token_types", + "embeddings.LayerNorm", "token_embd_norm", + "embeddings.position_embeddings", "position_embd", + "attention.self.query", "attn_q", + "attention.self.key", "attn_k", + "attention.self.value", "attn_v", + "attention.output.dense", "attn_output", + "attention.output.LayerNorm", "attn_output_norm", + "intermediate.dense", "ffn_up", + "output.dense", "ffn_down", + "output.LayerNorm", "layer_output_norm", + } +} diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 9213e15768d..c43168087b0 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -26,7 +26,6 @@ var _ Converter = (*gemma)(nil) func (p *gemma) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma" - kv["general.name"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.block_count"] = p.HiddenLayers @@ -44,15 +43,14 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { } func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor + out := make([]llm.Tensor, 0, len(ts)) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "_norm.weight") { + if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) tensorName(n string) string { - return strings.NewReplacer( +func (p *gemma) Replacements() []string { + return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", "model.layers", "blk", @@ -76,8 +74,7 @@ func (p *gemma) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - "block_sparse_moe.gate", "ffn_inp", - ).Replace(n) + } } func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go new file mode 100644 index 00000000000..084f9c529fd --- /dev/null +++ b/convert/convert_gemma2.go @@ -0,0 +1,43 @@ +package convert + +import ( + 
"github.com/ollama/ollama/llm" +) + +type gemma2 struct { + gemma + SlidingWindow uint32 `json:"sliding_window"` + AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` + FinalLogitSoftcap float32 `json:"final_logit_softcapping"` +} + +func (p *gemma2) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "gemma2" + kv["gemma2.context_length"] = p.MaxPositionEmbeddings + kv["gemma2.embedding_length"] = p.HiddenSize + kv["gemma2.block_count"] = p.HiddenLayers + kv["gemma2.feed_forward_length"] = p.IntermediateSize + kv["gemma2.attention.head_count"] = p.NumAttentionHeads + kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads + kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["gemma2.attention.key_length"] = p.HeadDim + kv["gemma2.attention.value_length"] = p.HeadDim + kv["gemma2.attention.sliding_window"] = p.SlidingWindow + kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap + kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap + kv["tokenizer.ggml.eot_token_id"] = uint32(107) + kv["tokenizer.ggml.middle_token_id"] = uint32(68) + kv["tokenizer.ggml.prefix_token_id"] = uint32(67) + kv["tokenizer.ggml.suffix_token_id"] = uint32(69) + return kv +} + +func (p *gemma2) Replacements() []string { + return append( + p.gemma.Replacements(), + "post_attention_layernorm", "post_attention_norm", + "pre_feedforward_layernorm", "ffn_norm", + "post_feedforward_layernorm", "post_ffw_norm", + ) +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 178b13f3348..27f924fbf25 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -3,6 +3,7 @@ package convert import ( "cmp" "fmt" + "math" "strings" "github.com/pdevine/tensor" @@ -27,8 +28,14 @@ type llama struct { NumKeyValueHeads uint32 `json:"num_key_value_heads"` RopeTheta float32 `json:"rope_theta"` RopeScaling struct { - Type string `json:"type"` - Factor float32 `json:"factor"` + Type string `json:"type"` + RopeType string `json:"rope_type"` + Factor float32 `json:"factor"` + LowFrequencyFactor float32 `json:"low_freq_factor"` + HighFrequencyFactor float32 `json:"high_freq_factor"` + OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"` + + factors ropeFactor } `json:"rope_scaling"` RMSNormEPS float32 `json:"rms_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"` @@ -42,7 +49,6 @@ var _ Converter = (*llama)(nil) func (p *llama) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "llama" - kv["general.name"] = "llama" kv["llama.vocab_size"] = p.VocabSize kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) @@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV { if p.RopeScaling.Type == "linear" { kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor + } else if p.RopeScaling.RopeType == "llama3" { + dim := p.HiddenSize / p.NumAttentionHeads + for i := uint32(0); i < dim; i += 2 { + factor := cmp.Or(p.RopeScaling.Factor, 8.0) + factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0) + factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0) + + original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192) + lambdaLow := float32(original) / factorLow + lambdaHigh := float32(original) / factorHigh + + lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim)) + if lambda < float64(lambdaHigh) { + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0) + } else if lambda > 
float64(lambdaLow) { + p.RopeScaling.factors = append(p.RopeScaling.factors, factor) + } else { + smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow) + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth)) + } + } } if p.NumKeyValueHeads > 0 { @@ -95,15 +122,24 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor + + if p.RopeScaling.factors != nil { + out = append(out, llm.Tensor{ + Name: "rope_freqs.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.factors))}, + WriterTo: p.RopeScaling.factors, + }) + } + for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "attn_q.weight") || - strings.HasSuffix(name, "attn_k.weight") { + if strings.HasSuffix(t.Name(), "attn_q.weight") || + strings.HasSuffix(t.Name(), "attn_k.weight") { t.SetRepacker(p.repack) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -113,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) tensorName(n string) string { - return strings.NewReplacer( +func (p *llama) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -128,9 +164,7 @@ func (p *llama) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - // mixtral - "block_sparse_moe.gate", "ffn_gate_inp", - ).Replace(n) + } } func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { @@ -140,9 +174,9 @@ func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, } var heads uint32 - if strings.HasSuffix(name, "q_proj.weight") { + if strings.HasSuffix(name, "attn_q.weight") { heads = p.NumAttentionHeads - } else if strings.HasSuffix(name, "k_proj.weight") { + } else if strings.HasSuffix(name, "attn_k.weight") { heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 3263a27b383..97a86b3034a 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -15,8 +15,6 @@ type mixtral struct { NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -var _ Converter = (*mixtral)(nil) - func (p *mixtral) KV(t *Tokenizer) llm.KV { kv := p.llama.KV(t) @@ -72,6 +70,13 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { return append(out, p.llama.Tensors(ts)...) 
} +func (p *mixtral) Replacements() []string { + return append( + p.llama.Replacements(), + "block_sparse_moe.gate", "ffn_gate_inp", + ) +} + type experts []Tensor func (e experts) WriteTo(w io.Writer) (int64, error) { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 0f645217d86..64d3d012e92 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -40,7 +40,6 @@ var _ Converter = (*phi3)(nil) func (p *phi3) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "phi3" - kv["general.name"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) kv["phi3.feed_forward_length"] = p.IntermediateSize @@ -74,8 +73,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { out := make([]llm.Tensor, 0, len(ts)+2) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasPrefix(name, "blk.0.") { + if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { out = append(out, llm.Tensor{ Name: "rope_factors_long.weight", @@ -92,7 +90,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -102,8 +100,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) tensorName(n string) string { - return strings.NewReplacer( +func (p *phi3) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -114,7 +112,7 @@ func (p *phi3) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.gate_up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - ).Replace(n) + } } type ropeFactor []float32 diff --git a/convert/convert_test.go b/convert/convert_test.go index cb2c585ef02..64b7df3b335 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -62,11 +62,14 @@ func TestMain(m *testing.M) { func TestConvertFull(t *testing.T) { cases := []string{ "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3.1-8B-Instruct", "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", + "all-MiniLM-L6-v2", + "gemma-2-9b-it", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index ce95208e122..5bba0406b1e 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -35,7 +35,9 @@ const ( ) func (t tensorBase) Kind() uint32 { - if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { + if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || + t.name == "token_types.weight" { + // these tensors are always F32 return 0 } @@ -55,10 +57,10 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) -func parseTensors(fsys fs.FS) ([]Tensor, error) { +func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { patterns := []struct { Pattern string - Func func(fs.FS, ...string) ([]Tensor, error) + Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error) }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, @@ -74,7 +76,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) { } if len(matches) > 0 { - return pattern.Func(fsys, matches...) + return pattern.Func(fsys, replacer, matches...) 
} } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 42f902a5a17..32a362cd97b 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -8,6 +8,7 @@ import ( "io" "io/fs" "slices" + "strings" "github.com/d4l3k/go-bfloat16" "github.com/x448/float16" @@ -20,7 +21,7 @@ type safetensorMetadata struct { Offsets []int64 `json:"data_offsets"` } -func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { f, err := fsys.Open(p) @@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { offset: safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), tensorBase: &tensorBase{ - name: key, + name: replacer.Replace(key), shape: value.Shape, }, }) diff --git a/convert/reader_torch.go b/convert/reader_torch.go index 531996bf23f..1b3e1c9f101 100644 --- a/convert/reader_torch.go +++ b/convert/reader_torch.go @@ -3,12 +3,13 @@ package convert import ( "io" "io/fs" + "strings" "github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/types" ) -func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { pt, err := pytorch.Load(p) @@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { ts = append(ts, torch{ storage: t.(*pytorch.Tensor).Source, tensorBase: &tensorBase{ - name: k.(string), + name: replacer.Replace(k.(string)), shape: shape, }, }) diff --git a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json new file mode 100644 index 00000000000..ad7cd20ac65 --- /dev/null +++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json @@ -0,0 +1,3 @@ +{ + "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222" +} diff --git a/convert/testdata/all-MiniLM-L6-v2.json b/convert/testdata/all-MiniLM-L6-v2.json new file mode 100644 index 00000000000..15c8f039c64 --- /dev/null +++ b/convert/testdata/all-MiniLM-L6-v2.json @@ -0,0 +1,124 @@ +{ + "general.architecture": "bert", + "general.file_type": "1", + "general.quantization_version": "2", + "bert.attention.causal": "false", + "bert.attention.head_count": "12", + "bert.attention.layer_norm_epsilon": "1e-12", + "bert.block_count": "6", + "bert.context_length": "512", + "bert.embedding_length": "384", + "bert.feed_forward_length": "1536", + "bert.pooling_type": "1", + "tokenizer.ggml.model": "bert", + "tokenizer.ggml.padding_token_id": "0", + "tokenizer.ggml.unknown_token_id": "100", + "tokenizer.ggml.cls_token_id": "101", + "tokenizer.ggml.seperator_token_id": "102", + "tokenizer.ggml.mask_token_id": "103", + "tokenizer.ggml.token_type_count": "2", + "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f", + "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1", + "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86", + "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a", + "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6", + "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21", + "token_embd_norm.weight": 
"75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4", + "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d", + "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c", + "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62", + "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba", + "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9", + "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3", + "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb", + "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802", + "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701", + "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62", + "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50", + "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874", + "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34", + "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca", + "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054", + "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8", + "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a", + "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10", + "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069", + "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d", + "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f", + "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21", + "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199", + "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73", + "blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d", + "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8", + "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d", + "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7", + "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a", + "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613", + "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88", + "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd", + "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164", + "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e", + "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9", + "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289", + "blk.2.attn_k.bias": 
"9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d", + "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993", + "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab", + "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79", + "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3", + "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21", + "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08", + "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59", + "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252", + "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef", + "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27", + "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35", + "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247", + "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6", + "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb", + "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18", + "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e", + "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b", + "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1", + "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502", + "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae", + "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0", + "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40", + "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0", + "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b", + "blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847", + "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de", + "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482", + "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab", + "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc", + "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb", + "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16", + "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60", + "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a", + "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6", + "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44", + "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222", + "blk.4.attn_output_norm.weight": 
"02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44", + "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374", + "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223", + "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017", + "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21", + "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1", + "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f", + "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c", + "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d", + "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b", + "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5", + "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77", + "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923", + "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c", + "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471", + "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219", + "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271", + "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712", + "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430", + "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370", + "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368", + "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291", + "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a", + "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766" +} diff --git a/convert/testdata/gemma-2-9b-it.json b/convert/testdata/gemma-2-9b-it.json new file mode 100644 index 00000000000..90cdbee4744 --- /dev/null +++ b/convert/testdata/gemma-2-9b-it.json @@ -0,0 +1,6 @@ +{ + "general.architecture": "gemma2", + "gemma2.attention.sliding_window": "4096", + "gemma2.attn_logit_softcapping": "50", + "gemma2.final_logit_softcapping": "30" +} diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 0d42a6d8ab7..653df6d2898 100644 --- a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -1,7 +1,6 @@ package convert import ( - "cmp" "crypto/sha256" "encoding/hex" "encoding/json" @@ -11,6 +10,8 @@ import ( "log/slog" "os" "slices" + + "golang.org/x/exp/maps" ) const ( @@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { return nil, err } - var tokens []token + tokens := make(map[int]token, len(t.Model.Vocab)) for k, v := range t.Model.Vocab { - tokens = append(tokens, token{ + tokens[v] = token{ ID: v, Content: k, - }) + } } - for _, t := range t.AddedTokens { - t.UserDefined = true - tokens = append(tokens, t) + for _, token := range t.AddedTokens { + token.UserDefined = true + tokens[token.ID] = token } - slices.SortFunc(tokens, func(i, j token) int { - return cmp.Compare(i.ID, 
j.ID) - }) + keys := maps.Keys(tokens) + slices.Sort(keys) v := Vocabulary{Model: "gpt2"} - for _, t := range tokens { - v.Tokens = append(v.Tokens, t.Content) - v.Scores = append(v.Scores, float32(t.ID)) + for _, k := range keys { + token := tokens[k] + v.Tokens = append(v.Tokens, token.Content) + v.Scores = append(v.Scores, float32(token.ID)) switch { - case t.Special: + case token.Special: v.Types = append(v.Types, tokenTypeControl) - case t.UserDefined: + case token.UserDefined: v.Types = append(v.Types, tokenTypeUserDefined) default: v.Types = append(v.Types, tokenTypeNormal) diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go index babf702c77c..5e506087c29 100644 --- a/convert/tokenizer_spm.go +++ b/convert/tokenizer_spm.go @@ -15,6 +15,11 @@ import ( ) func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { + ast, err := parseAdditionalSpecialTokens(fsys) + if err != nil { + return nil, err + } + bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err @@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { sentencepiece.ModelProto_SentencePiece_BYTE: v.Types = append(v.Types, int32(t)) default: - v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) + tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL) + if slices.Contains(ast, piece.GetPiece()) { + tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL) + } + + v.Types = append(v.Types, tt) } } @@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { return &v, nil } + +func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) { + f, err := fsys.Open("special_tokens_map.json") + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } else if err != nil { + return nil, err + } + defer f.Close() + + var m struct { + AdditionalSpecialTokens []string `json:"additional_special_tokens"` + } + + if err := json.NewDecoder(f).Decode(&m); err != nil { + return nil, err + } + + return m.AdditionalSpecialTokens, nil +} diff --git a/docs/linux.md b/docs/linux.md index ec7306560d8..d1d5892c43e 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,13 +20,12 @@ GPU. ## Manual install -### Download the `ollama` binary +### Download `ollama` -Ollama is distributed as a self-contained binary. Download it to a directory in your PATH: +Download and extract the Linux package: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ### Adding Ollama as a startup service (recommended) @@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ## Installing specific versions diff --git a/envconfig/config.go b/envconfig/config.go index b82b773d4ad..7e45a4f5194 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -174,7 +174,7 @@ func RunnersDir() (p string) { defer func() { if p == "" { - slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") + slog.Error("unable to locate llm runner directory. 
Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'") } }() @@ -190,17 +190,17 @@ func RunnersDir() (p string) { } var paths []string - for _, root := range []string{filepath.Dir(exe), cwd} { + for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} { paths = append(paths, root, - filepath.Join(root, "windows-"+runtime.GOARCH), - filepath.Join(root, "dist", "windows-"+runtime.GOARCH), + filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), + filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH), ) } // Try a few variations to improve developer experience when building from source in the local tree for _, path := range paths { - candidate := filepath.Join(path, "ollama_runners") + candidate := filepath.Join(path, "lib", "ollama", "runners") if _, err := os.Stat(candidate); err == nil { p = candidate break diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 2839cb7c31f..72d204f77f7 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) { // Installer payload location if we're running the installed binary exe, err := os.Executable() if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm") + rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index edabeb43a87..a0ae7c960d2 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) { // Installer payload (if we're running from some other location) localAppData := os.Getenv("LOCALAPPDATA") appDir := filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, "rocm") + rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index c90a644c48e..827cc9b480d 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -4,9 +4,17 @@ package gpu import ( "log/slog" + "os" + "regexp" + "runtime" + "strconv" "strings" ) +// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. +// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. +var CudaTegra string = os.Getenv("JETSON_JETPACK") + func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { ids := []string{} for _, info := range gpuInfo { @@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { } return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } + +func cudaVariant(gpuInfo CudaGPUInfo) string { + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + return "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + return "jetpack5" + case 36: + return "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + + if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 { + return "v11" + } + return "v12" +} diff --git a/gpu/gpu.go b/gpu/gpu.go index dc124a3ed59..72d237a6c96 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -64,10 +64,6 @@ var RocmComputeMin = 9 // TODO find a better way to detect iGPU instead of minimum memory const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU -// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. -// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. -var CudaTegra string = os.Getenv("JETSON_JETPACK") - // Note: gpuMutex must already be held func initCudaHandles() *cudaHandles { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing @@ -215,7 +211,7 @@ func GetGPUInfo() GpuInfoList { GpuInfo: GpuInfo{ memInfo: mem, Library: "cpu", - Variant: cpuCapability, + Variant: cpuCapability.String(), ID: "0", }, }, @@ -229,11 +225,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - // On windows we bundle the nvidia library one level above the runner dir - depPath := "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda") - } + depPath := LibraryDir() // Load ALL libraries cHandles = initCudaHandles() @@ -269,11 +261,23 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) + gpuInfo.computeMajor = int(memInfo.major) + gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - gpuInfo.DependencyPath = depPath + variant := cudaVariant(gpuInfo) + if depPath != "" { + gpuInfo.DependencyPath = depPath + // Check for variant specific directory + if variant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant) + } + } + } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates @@ -306,13 +310,6 @@ func GetGPUInfo() GpuInfoList { if envconfig.IntelGPU() { oHandles = initOneAPIHandles() if oHandles != nil && oHandles.oneapi != nil { - - // On windows we bundle the oneapi library one level above the runner dir - depPath = "" - if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" { - depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi") - } - for d := range oHandles.oneapi.num_drivers { if oHandles.oneapi == nil { // shouldn't happen @@ -467,10 +464,12 @@ func GetGPUInfo() GpuInfoList { func FindGPULibs(baseLibName string, defaultPatterns []string) []string { // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them 
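(Editor's aside, not part of the patch: with the split v11/v12 payloads, GetGPUInfo above records a per-GPU variant from cudaVariant and, when a matching cuda_v11 or cuda_v12 subdirectory exists under the bundled lib/ollama directory, points DependencyPath at that subdirectory so the matching runtime libraries are used. A condensed, runnable sketch of that selection follows; the dependencyPath helper name and standalone form are mine, not code from this change.)

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// dependencyPath condenses the logic added to GetGPUInfo: start from the shared
// lib/ollama directory returned by LibraryDir(), then prefer a variant-specific
// subdirectory such as cuda_v11 or cuda_v12 when it exists on disk.
func dependencyPath(libDir, variant string) string {
	if libDir == "" {
		return ""
	}
	if variant != "" {
		if _, err := os.Stat(filepath.Join(libDir, "cuda_"+variant)); err == nil {
			return filepath.Join(libDir, "cuda_"+variant)
		}
	}
	return libDir
}

func main() {
	fmt.Println(dependencyPath("/usr/lib/ollama", "v12"))
}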
var ldPaths []string - var patterns []string gpuLibPaths := []string{} slog.Debug("Searching for GPU library", "name", baseLibName) + // Start with our bundled libraries + patterns := []string{filepath.Join(LibraryDir(), baseLibName)} + switch runtime.GOOS { case "windows": ldPaths = strings.Split(os.Getenv("PATH"), ";") @@ -479,13 +478,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { default: return gpuLibPaths } - // Start with whatever we find in the PATH/LD_LIBRARY_PATH + + // Then with whatever we find in the PATH/LD_LIBRARY_PATH for _, ldPath := range ldPaths { d, err := filepath.Abs(ldPath) if err != nil { continue } - patterns = append(patterns, filepath.Join(d, baseLibName+"*")) + patterns = append(patterns, filepath.Join(d, baseLibName)) } patterns = append(patterns, defaultPatterns...) slog.Debug("gpu library search", "globs", patterns) @@ -641,3 +641,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { return "", "" } } + +func LibraryDir() string { + // On Windows/Linux we bundle the dependencies at the same level as the executable + appExe, err := os.Executable() + if err != nil { + slog.Warn("failed to lookup executable path", "error", err) + } + cwd, err := os.Getwd() + if err != nil { + slog.Warn("failed to lookup working directory", "error", err) + } + // Scan for any of our dependencies, and pick first match + for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} { + libDep := filepath.Join("lib", "ollama") + if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { + return filepath.Join(root, libDep) + } + // Developer mode, local build + if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil { + return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep) + } + } + slog.Warn("unable to locate gpu dependency libraries") + return "" +} diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 9d9fd84ebf4..417b48dfa88 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } @@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go index d6d2675c889..d4d20bc4fec 100644 --- a/gpu/gpu_linux.go +++ b/gpu/gpu_linux.go @@ -47,7 +47,7 @@ var ( CudartMgmtName = "libcudart.so*" NvcudaMgmtName = "libcuda.so*" NvmlMgmtName = "" // not currently wired on linux - OneapiMgmtName = "libze_intel_gpu.so" + OneapiMgmtName = "libze_intel_gpu.so*" ) func GetCPUMem() (memInfo, error) { diff --git a/gpu/types.go b/gpu/types.go index 8d22b06bfa5..4cbbeb841b5 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -19,7 +19,7 @@ type GpuInfo struct { Library string `json:"library,omitempty"` // Optional variant to select (e.g.
versions, cpu feature flags) - Variant CPUCapability `json:"variant"` + Variant string `json:"variant"` // MinimumMemory represents the minimum memory required to use the GPU MinimumMemory uint64 `json:"-"` @@ -53,8 +53,10 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - OSOverhead uint64 // Memory overhead between the driver library and management library - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint + computeMajor int //nolint:unused,nolintlint + computeMinor int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo @@ -81,8 +83,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != CPUCapabilityNone.String() { + requested += "_" + info.Variant } for i, lib := range libs { if lib == requested { @@ -105,6 +107,7 @@ func (l GpuInfoList) LogDetails() { slog.Info("inference compute", "id", g.ID, "library", g.Library, + "variant", g.Variant, "compute", g.Compute, "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), "name", g.Name, diff --git a/integration/embed_test.go b/integration/embed_test.go index 10333d5dfa9..4a68af68ae7 100644 --- a/integration/embed_test.go +++ b/integration/embed_test.go @@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) { t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0]) } - if res.PromptEvalCount != 8 { - t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount) + if res.PromptEvalCount != 6 { + t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount) } } @@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) { t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0]) } - if res.PromptEvalCount != 16 { - t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount) + if res.PromptEvalCount != 12 { + t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount) } } diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index bfc97c63d99..90fd0ef251f 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -1,12 +1,13 @@ set(TARGET ollama_llama_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>) -target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 5717c17a905..8e08b850f8e 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1429,7 +1429,13 @@ struct llama_server_context switch (task.type) { case TASK_TYPE_COMPLETION: { - server_slot *slot = prefix_slot(task.data["prompt"]); + server_slot *slot = nullptr; + if (task.embedding_mode) { + // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0 + slot = slots[0].available() ?
&slots[0] : nullptr; + } else { + slot = prefix_slot(task.data["prompt"]); + } if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index da1b068829b..40115936d84 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -9,11 +9,14 @@ init_vars() { ARCH="arm64" ;; *) - ARCH=$(uname -m | sed -e "s/aarch64/arm64/g") + echo "GOARCH must be set" + echo "this script is meant to be run from within go generate" + exit 1 + ;; esac LLAMACPP_DIR=../llama.cpp - CMAKE_DEFS="" + CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on" CMAKE_TARGETS="--target ollama_llama_server" if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" @@ -27,6 +30,7 @@ init_vars() { WHOLE_ARCHIVE="-Wl,-force_load" NO_WHOLE_ARCHIVE="" GCC_ARCH="-arch ${ARCH}" + DIST_BASE=../../dist/darwin-${GOARCH}/ ;; "Linux") LIB_EXT="so" @@ -35,6 +39,7 @@ init_vars() { # Cross compiling not supported on linux - Use docker GCC_ARCH="" + DIST_BASE=../../dist/linux-${GOARCH}/ ;; *) ;; @@ -42,6 +47,7 @@ init_vars() { if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" fi + GZIP=$(which pigz 2>/dev/null || echo "gzip") } git_module_setup() { @@ -85,26 +91,36 @@ build() { compress() { echo "Compressing payloads to reduce overall binary size..." - pids="" rm -rf ${BUILD_DIR}/bin/*.gz for f in ${BUILD_DIR}/bin/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done # check for lib directory if [ -d ${BUILD_DIR}/lib ]; then for f in ${BUILD_DIR}/lib/* ; do - gzip -n --best -f ${f} & - pids+=" $!" + ${GZIP} -n --best -f ${f} & + compress_pids+=" $!" done fi echo - for pid in ${pids}; do +} + +wait_for_compress() { + for pid in ${compress_pids}; do wait $pid done echo "Finished compression" } +install() { + echo "Installing libraries to bin dir ${BUILD_DIR}/bin/" + for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do + rm -f "${BUILD_DIR}/bin/$(basename ${lib})" + cp -af "${lib}" "${BUILD_DIR}/bin/" + done +} + # Keep the local tree clean after we're done with the build cleanup() { (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt) diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 6c0b62cb7f2..f22c0f8e759 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -6,6 +6,7 @@ set -ex set -o pipefail +compress_pids="" echo "Starting darwin generate script" source $(dirname $0)/gen_common.sh init_vars @@ -98,4 +99,5 @@ case "${GOARCH}" in esac cleanup +wait_for_compress echo "go generate completed. 
LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index db2c6c30c9b..6927dda8bcb 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -13,6 +13,7 @@ set -ex set -o pipefail +compress_pids="" # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference amdGPUs() { @@ -51,7 +52,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" +COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -77,10 +78,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build + install compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 @@ -93,7 +95,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" + COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) @@ -103,6 +105,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building LCD CPU" build + install compress fi @@ -120,6 +123,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx" echo "Building AVX CPU" build + install compress fi @@ -133,6 +137,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" build + install compress fi fi @@ -160,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. 
|| true) - if [ -n "${CUDA_MAJOR}" ]; then + if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi if [ "${ARCH}" == "arm64" ]; then @@ -178,29 +183,19 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" fi - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" + export CUDAFLAGS="-t8" + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" - EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build - - # Carry the CUDA libs as payloads to help reduce dependency burden on users - # - # TODO - in the future we may shift to packaging these separately and conditionally - # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )" - for lib in libcudart.so libcublas.so libcublasLt.so ; do - DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) - if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/" - else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/" - fi + install + echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" + mkdir -p "${CUDA_DIST_DIR}" + for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do + cp -a "${lib}" "${CUDA_DIST_DIR}" done compress @@ -218,21 +213,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" + ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" + export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build # copy oneAPI dependencies + mkdir -p "${ONEAPI_DIST_DIR}" for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do - cp "${dep}" "${BUILD_DIR}/bin/" + cp -a "${dep}" "${ONEAPI_DIST_DIR}" done - cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/" - cp 
"${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" + install compress fi @@ -262,23 +260,22 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" + # ROCm dependencies are too large to fit into a unified bundle + ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" + # TODO figure out how to disable runpath (rpath) + # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work + export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - # Record the ROCM dependencies - rm -f "${BUILD_DIR}/bin/deps.txt" - touch "${BUILD_DIR}/bin/deps.txt" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do - echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt" + # copy the ROCM dependencies + mkdir -p "${ROCM_DIST_DIR}" + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + cp -a "${dep}"* "${ROCM_DIST_DIR}" done - # bomb out if for some reason we didn't get a few deps - if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then - cat "${BUILD_DIR}/bin/deps.txt" - echo "ERROR: deps file short" - exit 1 - fi + install compress fi cleanup +wait_for_compress echo "go generate completed. 
LLM runners: $(cd ${BUILD_DIR}/..; echo *)" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index d8bce92d6fe..cbdfd09f145 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -35,7 +35,7 @@ function init_vars { ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() - $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners" + $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners" md "$script:DIST_BASE" -ea 0 > $null if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") @@ -117,7 +117,7 @@ function build { if ($cmakeDefs -contains "-G") { $extra=@("-j8") } else { - $extra= @("--", "/p:CL_MPcount=8") + $extra= @("--", "/maxCpuCount:8") } write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra @@ -261,7 +261,7 @@ function build_cuda() { if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) { # Then build cuda as a dynamically loaded library $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe" - $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename + $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0] if ($null -ne $script:CUDA_VERSION) { $script:CUDA_VARIANT="_"+$script:CUDA_VERSION } @@ -273,9 +273,9 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", - "-DCMAKE_CUDA_FLAGS=-t8", - "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + "-DCMAKE_CUDA_FLAGS=-t6", + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", + "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" @@ -286,12 +286,11 @@ function build_cuda() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { write-host "Skipping CUDA generation step" } @@ -325,18 +324,17 @@ function build_oneapi() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - md 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { Write-Host "Skipping oneAPI generation step" } @@ -386,12 +384,11 @@ function build_rocm() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" + cp 
"${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/llm/memory_test.go b/llm/memory_test.go index 6cf0119f946..ffb14286b8f 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) { assert.Len(t, tensors, inputLayerCount+1) err = WriteGGUF(f, KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(inputLayerCount), diff --git a/llm/patches/08-pooling.diff b/llm/patches/08-pooling.diff deleted file mode 100644 index 2e4fe11eef9..00000000000 --- a/llm/patches/08-pooling.diff +++ /dev/null @@ -1,60 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 721b8f4e..cfe7ac40 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -8420,14 +8420,14 @@ struct llm_build_context { - } - - struct ggml_tensor * build_inp_mean() { -- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); -+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; - } - - struct ggml_tensor * build_inp_cls() { -- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); -+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; -@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); - - float * data = (float *) lctx.inp_mean->data; -- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); -+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean)); - - std::vector sum(n_tokens, 0); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); -- - sum[seq_id] += 1; - } - -- std::vector div(n_tokens, 0.0f); -- for (int i = 0; i < n_tokens; ++i) { -+ std::vector div(cparams.n_seq_max, 0.0f); -+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); -@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); - - uint32_t * data = (uint32_t *) lctx.inp_cls->data; -- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); -+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls)); - - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); -- - if (pos == 0) { - data[seq_id] = i; - } diff --git a/llm/payload.go b/llm/payload.go index b402e1f2439..963b32959ce 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ availableServers := getAvailableServers() requested := info.Library - if info.Variant != gpu.CPUCapabilityNone { - requested += "_" + 
info.Variant.String() + if info.Variant != gpu.CPUCapabilityNone.String() { + requested += "_" + info.Variant } servers := []string{} diff --git a/llm/server.go b/llm/server.go index d2b8db9b365..9347a4589a2 100644 --- a/llm/server.go +++ b/llm/server.go @@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS == "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies - libraryPaths := []string{dir, filepath.Dir(dir)} + // Start with the server directory for the LD_LIBRARY_PATH/PATH + libraryPaths := []string{dir} if libraryPath, ok := os.LookupEnv(pathEnv); ok { - // Append our runner directory to the path - // This will favor system libraries over our bundled library dependencies + // favor our bundled library dependencies over system libraries libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } // Note: we always put the dependency path first - // since this was the exact version we verified for AMD GPUs - // and we favor what the user had in their path + // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != "" { - // TODO refine for multi-gpu support + // assume gpus from the same library have the same dependency path libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) } diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 27c4ff1f8c0..6cb0d0cd065 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -4,6 +4,7 @@ set -eu export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +GZIP=$(which pigz 2>/dev/null || echo "gzip") BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""} @@ -21,11 +22,16 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH - - if [ "$TARGETARCH" = "amd64" ]; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/ + rm -rf ./dist/linux-$TARGETARCH + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist + if echo ${TARGETARCH} | grep "amd64" > /dev/null; then + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist fi - docker rm builder-$TARGETARCH + echo "Compressing final linux bundle..." + rm -f ./dist/ollama-linux-$TARGETARCH.tgz + (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) + if [ -d dist/linux-$TARGETARCH-rocm ]; then + (cd dist/linux-$TARGETARCH-rocm && tar cf - . 
| ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) + fi done diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index edc737593fc..9cebf1f408e 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,6 +7,7 @@ $ErrorActionPreference = "Stop" function checkEnv() { + $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" @@ -15,26 +16,23 @@ function checkEnv() { $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } - # Try to find the CUDA dir - if ($null -eq $env:NVIDIA_DIR) { + # Locate CUDA versions + # Note: this assumes every version found will be built + $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') + if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path - if ($d -ne $null) { - $script:NVIDIA_DIR=($d| split-path -parent) - } else { - $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') - if ($cudaList.length > 0) { - $script:NVIDIA_DIR=$cudaList[0] - } + if ($null -ne $d) { + $script:CUDA_DIRS=@($d| split-path -parent) } } else { - $script:NVIDIA_DIR=$env:NVIDIA_DIR + $script:CUDA_DIRS=$cudaList } $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $env:CGO_ENABLED="1" - echo "Checking version" + Write-Output "Checking version" if (!$env:VERSION) { $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always) $pattern="v(.+)" @@ -71,7 +69,48 @@ function checkEnv() { function buildOllama() { write-host "Building ollama CLI" if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { - & go generate ./... + Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" + + # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle + # which targets to build + + # Start by skipping CUDA to build everything else + pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + + # Then skip everything else and build all the CUDA variants + foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) { + write-host "Building CUDA ${env:CUDA_LIB_DIR}" + + if ($env:CUDA_LIB_DIR.Contains("v12")) { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" + $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$env:CUDA_LIB_DIR;$env:PATH" + & go generate ./... + } + } else { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" + $env:OLLAMA_CUSTOM_CUDA_DEFS="" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$env:CUDA_LIB_DIR;$env:PATH" + & go generate ./...
+ } + } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" @@ -83,8 +122,8 @@ function buildOllama() { /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } - New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force - cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\ + New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force + cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\ } function buildApp() { @@ -103,22 +142,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null + md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index 03af5a69e39..25f57565a3b 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -63,16 +63,36 @@ if [ -n "$NEEDS" ]; then exit 1 fi -status "Downloading ollama..." -curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" - for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done +OLLAMA_INSTALL_DIR=$(dirname ${BINDIR}) -status "Installing ollama to $BINDIR..." 
+status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR -$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama +$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" +if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then + status "Downloading Linux ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + BUNDLE=1 + if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" + fi +else + status "Downloading Linux ${ARCH} CLI" + curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\ + "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" + $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama + BUNDLE=0 + if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" + fi +fi + install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' @@ -178,6 +198,16 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then + if [ $BUNDLE -ne 0 ]; then + status "Downloading Linux ROCm ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + + install_success + status "AMD GPU ready." + exit 0 + fi # Look for pre-existing ROCm v6 before downloading the dependencies for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh index 81648d68e22..b4c9afd6386 100644 --- a/scripts/rh_linux_deps.sh +++ b/scripts/rh_linux_deps.sh @@ -3,6 +3,7 @@ # Script for common Dockerfile dependency installation in redhat linux based images set -ex +set -o pipefail MACHINE=$(uname -m) if grep -i "centos" /etc/system-release >/dev/null; then @@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then dnf install -y rh-git227-git ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git fi - dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ + dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz elif grep -i "rocky" /etc/system-release >/dev/null; then # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC) cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo @@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial EOF dnf install -y git \ gcc-toolset-10-gcc-10.2.1-8.2.el8 \ - gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 + gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ + pigz else echo "ERROR Unexpected distro" exit 1 fi +if [ "${MACHINE}" = "x86_64" ] ; then + curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \ + mv /tmp/ccache /usr/local/bin/ +else + yum -y install epel-release + yum install -y ccache +fi + if [ -n "${CMAKE_VERSION}" ]; then curl -s -L 
https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 fi diff --git a/server/layer.go b/server/layer.go index c666bd10624..0bdee72b3cb 100644 --- a/server/layer.go +++ b/server/layer.go @@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) { if err := os.Rename(temp.Name(), blob); err != nil { return Layer{}, err } + if err := os.Chmod(blob, 0o644); err != nil { + return Layer{}, err + } } return Layer{ diff --git a/server/sched.go b/server/sched.go index 9947fd32577..58071bf035c 100644 --- a/server/sched.go +++ b/server/sched.go @@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) { break } + // Embedding models should always be loaded with parallel=1 + if pending.model.CheckCapabilities(CapabilityCompletion) != nil { + numParallel = 1 + } + // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode @@ -734,7 +739,10 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL // If multiple Libraries are detected, pick the Library which loads the most layers for the model func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { - *numParallel = 1 + if *numParallel <= 0 { + *numParallel = 1 + req.opts.NumCtx = req.origNumCtx + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus diff --git a/server/sched_test.go b/server/sched_test.go index 713b9259e12..fb049574f46 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, llm.WriteGGUF(f, llm.KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(1), diff --git a/server/upload.go b/server/upload.go index 2f115436efa..020e895518b 100644 --- a/server/upload.go +++ b/server/upload.go @@ -45,7 +45,7 @@ type blobUpload struct { } const ( - numUploadParts = 64 + numUploadParts = 16 minUploadPartSize int64 = 100 * format.MegaByte maxUploadPartSize int64 = 1000 * format.MegaByte )
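
Note: as a minimal sketch of how the resulting Linux bundle is consumed (the amd64 architecture and /usr prefix below are illustrative assumptions; scripts/install.sh above derives the prefix from the first of /usr/local/bin, /usr/bin, or /bin found on PATH), the tarball produced by scripts/build_linux.sh could also be unpacked by hand:

# Fetch and extract the bundled CLI plus its lib/ollama payloads (runners, CUDA/ROCm libraries)
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -xzf - -C /usr
# The binary lands under bin/ and the GPU dependencies under lib/ollama,
# which LibraryDir() in gpu/gpu.go resolves relative to the running executable
ls /usr/bin/ollama /usr/lib/ollama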