Merge pull request #557 from sirajperson/main
This PR updates the docker files for launching local LLMs. It updates…
sirajperson authored Jun 30, 2023
2 parents 7f1c1a9 + ae019db commit c6cff18
Showing 3 changed files with 21 additions and 29 deletions.
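
For reference, the updated stack would normally be brought up by pointing docker compose at this file; the following is a minimal sketch, assuming the repository root as the working directory and that local-llm-gpu is passed via -f (the PR itself does not show the launch command):

# Build the images and start the services defined in local-llm-gpu
docker compose -f local-llm-gpu up --build -d

# Follow the text-generation-webui container logs
docker logs -f super__tgwui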
13 changes: 6 additions & 7 deletions local-llm-gpu
@@ -42,14 +42,16 @@ services:

super__tgwui:
build:
context: .
context: ./tgwui/
target: llama-cublas
dockerfile: ./tgwui/DockerfileTGWUI
dockerfile: DockerfileTGWUI
# args:
# - LCL_SRC_DIR=text-generation-webui # Developers - see Dockerfile app_base
image: atinoda/text-generation-webui:llama-cublas # Specify variant as the :tag
container_name: super__tgwui
environment:
- EXTRA_LAUNCH_ARGS="--listen --no-mmap --verbose --extensions openai --auto-devices --n_ctx 1600 --gpu-memory 20 20 --n-gpu-layers 128 --threads 8 --model vicuna-13b-cot.ggmlv3.q8_0.bin"
- EXTRA_LAUNCH_ARGS="--no-mmap --verbose --extensions openai --auto-devices --n_ctx 2000 --gpu-memory 22 22 --n-gpu-layers 128 --threads 8"
# - BUILD_EXTENSIONS_LIVE="silero_tts whisper_stt" # Install named extensions during every container launch. THIS WILL SIGNIFICANTLY SLOW LAUNCH TIME.
ports:
- 7860:7860 # Default web port
- 5000:5000 # Default API port
@@ -62,15 +64,14 @@ services:
- ./tgwui/config/prompts:/app/prompts
- ./tgwui/config/softprompts:/app/softprompts
- ./tgwui/config/training:/app/training
- ./tgwui/config/embeddings:/app/embeddings
# - ./config/extensions:/app/extensions
logging:
driver: json-file
options:
max-file: "3" # maximum number of log files to keep
max-size: '10m'
networks:
- super_network
### Uncomment the following lines to run the container using the host machine's GPU resources
deploy:
resources:
reservations:
@@ -79,8 +80,6 @@ services:
# count: "all"
device_ids: ['0', '1'] # must comment the above line if this line is uncommented.
capabilities: [gpu]


super__redis:
image: "docker.io/library/redis:latest"
networks:
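
Once the super__tgwui service is running, the GPU reservation above (device_ids '0' and '1') and the new EXTRA_LAUNCH_ARGS can be checked from inside the container; a minimal sketch, assuming the NVIDIA container toolkit is installed on the host:

# List the GPUs visible to the text-generation-webui container
docker exec -it super__tgwui nvidia-smi

# Confirm the launch flags the container was started with
docker exec super__tgwui printenv EXTRA_LAUNCH_ARGS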
37 changes: 15 additions & 22 deletions tgwui/DockerfileTGWUI
@@ -19,28 +19,26 @@ RUN git clone https://github.com/oobabooga/text-generation-webui /src
# To use local source: comment out the git clone command then set the build arg `LCL_SRC_DIR`
#ARG LCL_SRC_DIR="text-generation-webui"
#COPY ${LCL_SRC_DIR} /src
# This is required to get multi-gpu support until the main branch updates the requirements.txt file to include llama-cpp-python 0.1.59 or greater.

#################################
ENV LLAMA_CUBLAS=1
# Copy source to app
RUN cp -ar /src /app
# Install oobabooga/text-generation-webui
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -r /app/requirements.txt
# Install extensions
COPY tgwui/scripts/build_extensions.sh /app/scripts/build_extensions.sh
COPY ./scripts/build_extensions.sh /scripts/build_extensions.sh
RUN --mount=type=cache,target=/root/.cache/pip \
chmod +x /app/scripts/build_extensions.sh && . /app/scripts/build_extensions.sh

## Clone default GPTQ
RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git
## Build and install default GPTQ ('quant_cuda')
chmod +x /scripts/build_extensions.sh && . /scripts/build_extensions.sh
# Clone default GPTQ
RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda /app/repositories/GPTQ-for-LLaMa
# Build and install default GPTQ ('quant_cuda')
ARG TORCH_CUDA_ARCH_LIST="6.1;7.0;7.5;8.0;8.6+PTX"
RUN cd GPTQ-for-LLaMa/ && python3 setup_cuda.py install
RUN cd /app/repositories/GPTQ-for-LLaMa/ && python3 setup_cuda.py install

FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS base
# Runtime pre-reqs
RUN apt-get update && apt-get install --no-install-recommends -y \
python3-venv python3-dev git
python3-venv python3-dev git
# Copy app and src
COPY --from=app_base /app /app
COPY --from=app_base /src /src
@@ -49,30 +47,26 @@ COPY --from=app_base /venv /venv
ENV VIRTUAL_ENV=/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
## Link models directory to container
#ADD ./config/models/ /app/models/
# Finalise app setup
WORKDIR /app
EXPOSE 7860
EXPOSE 5000
EXPOSE 5005
EXPOSE 5001
# Required for Python print statements to appear in logs
ENV PYTHONUNBUFFERED=1
# Force variant layers to sync cache by setting --build-arg BUILD_DATE
ARG BUILD_DATE
ENV BUILD_DATE=$BUILD_DATE
RUN echo "$BUILD_DATE" > /build_date.txt

# Set embeddings model for llama
#ENV OPENEDAI_EMBEDDING_MODEL=/app/embeddings/SGPT-125M-weightedmean-nli-bitfit
#COPY tgwui/config/embeddings/SGPT-125M-weightedmean-nli-bitfit /app/embeddings
#RUN echo -e "Embeddings model $OPENEDAI_EMBEDDING_MODEL"
#RUN python extensions/openai/cache_embedding_model.py

# Copy and enable all scripts
COPY ./scripts /scripts
RUN chmod +x /scripts/*
# Run
COPY tgwui/scripts/docker-entrypoint.sh /scripts/docker-entrypoint.sh
RUN chmod +x /scripts/docker-entrypoint.sh
ENTRYPOINT ["/scripts/docker-entrypoint.sh"]


# VARIANT BUILDS
FROM base AS cuda
RUN echo "CUDA" >> /variant.txt
@@ -100,8 +94,7 @@ FROM base AS llama-cublas
RUN echo "LLAMA-CUBLAS" >> /variant.txt
RUN apt-get install --no-install-recommends -y git python3-dev build-essential python3-pip
ENV LLAMA_CUBLAS=1
RUN pip uninstall -y llama-cpp-python && \
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
RUN pip uninstall -y llama-cpp-python && pip install llama-cpp-python
ENV EXTRA_LAUNCH_ARGS=""
CMD ["python3", "/app/server.py"]

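
The compose change above (context ./tgwui/, dockerfile DockerfileTGWUI, target llama-cublas) corresponds roughly to the following manual build; a sketch run from the repository root, with the BUILD_DATE value shown here being an illustrative choice rather than anything mandated by the Dockerfile:

# Build the llama-cublas variant against the new ./tgwui build context
docker build \
  -f tgwui/DockerfileTGWUI \
  --target llama-cublas \
  --build-arg BUILD_DATE="$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  -t atinoda/text-generation-webui:llama-cublas \
  tgwui/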
Empty file.
