Commit 7897ba2 (parent: 7d60c66): 29 changed files with 1,984 additions and 0 deletions.
Dockerfile
@@ -0,0 +1,71 @@
ARG PYVERSION=py38
FROM baseten/truss-server-base:3.8-gpu-v0.4.9

ENV PYTHON_EXECUTABLE python3

RUN grep -w 'ID=debian\|ID_LIKE=debian' /etc/os-release || { echo "ERROR: Supplied base image is not a debian image"; exit 1; }
RUN $PYTHON_EXECUTABLE -c "import sys; sys.exit(0) if sys.version_info.major == 3 and sys.version_info.minor >= 8 and sys.version_info.minor <= 11 else sys.exit(1)" \
    || { echo "ERROR: Supplied base image does not have 3.8 <= python <= 3.11"; exit 1; }

RUN pip install --upgrade pip --no-cache-dir

# If a user base image is supplied in config, apply build commands from the Truss base image
RUN apt-get update
RUN apt-get install -y socat
RUN apt-get install -y curl

COPY ./requirements.txt requirements.txt
RUN pip install -r requirements.txt --no-cache-dir

ENV APP_HOME /app
WORKDIR $APP_HOME

# Copy data before code for better caching
COPY ./data /app/data

COPY ./server /app
COPY ./start.sh /app

COPY ./config.yaml /app/config.yaml

## RUN cd /app/data && curl -O https://huggingface.co/codellama/CodeLlama-7b-hf/resolve/main/pytorch_model-00001-of-00003.bin && curl -O https://huggingface.co/codellama/CodeLlama-7b-hf/resolve/main/pytorch_model-00002-of-00003.bin && curl -O https://huggingface.co/codellama/CodeLlama-7b-hf/resolve/main/pytorch_model-00003-of-00003.bin

COPY ./model /app/model
## RUN apt-get install git

## RUN cd /app/data && git clone https://huggingface.co/codellama/CodeLlama-7b-hf/resolve/main/pytorch_model-00001-of-00003.bin && git clone https://huggingface.co/codellama/CodeLlama-7b-hf/resolve/main/pytorch_model-00002-of-00003.bin && git clone https://huggingface.co/codellama/CodeLlama-7b-hf/resolve/main/pytorch_model-00003-of-00003.bin

ENV INFERENCE_SERVER_PORT 8080
ENV SERVER_START_CMD="python3 /app/inference_server.py"
CMD bash start.sh

## ENTRYPOINT ["python3", "/app/inference_server.py"]
README.md
@@ -0,0 +1,89 @@
[![Deploy to Baseten](https://user-images.githubusercontent.com/2389286/236301770-16f46d4f-4e23-4db5-9462-f578ec31e751.svg)](https://app.baseten.co/explore/alpaca)

# Alpaca-7B Truss

This is a [Truss](https://truss.baseten.co/) for Alpaca-7B, a fine-tuned variant of LLaMA-7B. LLaMA is a family of language models released by Meta. This README walks you through deploying this Truss on Baseten to get your own instance of Alpaca-7B.

## Truss

Truss is an open-source model serving framework developed by Baseten. It allows you to develop and deploy machine learning models onto Baseten (and other platforms like [AWS](https://truss.baseten.co/deploy/aws) or [GCP](https://truss.baseten.co/deploy/gcp)). Using Truss, you can develop a GPU model using [live-reload](https://baseten.co/blog/technical-deep-dive-truss-live-reload), package models and their associated code, create Docker containers, and deploy on Baseten.

## Deploying Alpaca-7B

To deploy the Alpaca-7B Truss, follow these steps:

1. __Prerequisites__: Make sure you have a Baseten account and API key. You can sign up for a Baseten account [here](https://app.baseten.co/signup).

2. __Install Truss and the Baseten Python client__: If you haven't already, install the Baseten Python client and Truss in your development environment:
```
pip install --upgrade baseten truss
```

3. __Load the Alpaca-7B Truss__: Assuming you've cloned this repo, spin up an IPython shell and load the Truss into memory:
```
import truss
alpaca7b_truss = truss.load("path/to/alpaca7b_truss")
```

4. __Log in to Baseten__: Log in to your Baseten account using your API key (found [here](https://app.baseten.co/settings/account/api_keys)):
```
import baseten
baseten.login("PASTE_API_KEY_HERE")
```

5. __Deploy the Alpaca-7B Truss__: Deploy the Alpaca-7B Truss to Baseten with the following command:
```
baseten.deploy(alpaca7b_truss)
```

Once your Truss is deployed, you can start using the Alpaca-7B model through the Baseten platform! Navigate to the Baseten UI to watch the model build and deploy, and invoke it via the REST API.

## Alpaca-7B API documentation

This section provides an overview of the Alpaca-7B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction.

### API route: `predict`

The `predict` route is the primary method for generating text completions from a given instruction. It takes several parameters:

- __instruction__: The input text that you want the model to generate a response for.
- __temperature__ (optional, default=0.1): Controls the randomness of the generated text. Higher values produce more diverse results, while lower values produce more deterministic results.
- __top_p__ (optional, default=0.75): The cumulative probability threshold for nucleus sampling. The model samples only from the smallest set of tokens whose cumulative probability exceeds this threshold.
- __top_k__ (optional, default=40): The number of top tokens to consider when sampling. The model will only consider the `top_k` highest-probability tokens.
- __num_beams__ (optional, default=4): The number of beams used for beam search. Increasing this value can produce higher-quality output but increases the computational cost.

The API also supports passing any parameter supported by Hugging Face's `transformers` `generate` method, as sketched below.
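As a rough illustration of how these parameters behave, here is a minimal, hypothetical sketch that calls `generate` directly on the base checkpoint with the same defaults; the deployed model's actual serving code may differ:
```
# Illustrative only: map the API parameters above onto transformers' generate.
from transformers import LlamaForCausalLM, LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")

inputs = tokenizer("What is the meaning of life?", return_tensors="pt")
output_ids = model.generate(
    **inputs,
    do_sample=True,      # enable sampling so temperature/top_p/top_k take effect
    temperature=0.1,     # randomness of sampling
    top_p=0.75,          # nucleus sampling threshold
    top_k=40,            # consider only the 40 highest-probability tokens
    num_beams=4,         # beam search width
    max_new_tokens=128,  # assumed cap on generated length
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```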
## Example usage

You can use the `baseten` model package to invoke your model from Python:
```
import baseten
# You can retrieve your deployed model ID from the UI
model = baseten.deployed_model_version_id('YOUR_MODEL_ID')
request = {
    "prompt": "What's the meaning of life?",
    "temperature": 0.1,
    "top_p": 0.75,
    "top_k": 40,
    "num_beams": 4,
}
response = model.predict(request)
```

You can also invoke your model via a REST API:
```
curl -X POST "https://app.baseten.co/models/YOUR_MODEL_ID/predict" \
     -H "Content-Type: application/json" \
     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
     -d '{
       "prompt": "What'\''s the meaning of life?",
       "temperature": 0.1,
       "top_p": 0.75,
       "top_k": 40,
       "num_beams": 4
     }'
```
config.yaml
@@ -0,0 +1,64 @@
apply_library_patches: true
base_image: null
build:
  arguments: {}
  model_server: TrussServer
bundled_packages_dir: packages
data_dir: data
description: Generate text from an instructional prompt with this tuned version
  of LLaMA 7B.
environment_variables: {}
examples_filename: examples.yaml
external_data:
- backend: http_public
  local_data_path: pytorch_model-00001-of-00002.bin
  url: https://baseten-public.s3.us-west-2.amazonaws.com/models/alpaca/pytorch_model-00001-of-00002.bin
- backend: http_public
  local_data_path: pytorch_model-00002-of-00002.bin
  url: https://baseten-public.s3.us-west-2.amazonaws.com/models/alpaca/pytorch_model-00002-of-00002.bin
external_package_dirs: []
hf_cache: null
input_type: Any
live_reload: false
model_class_filename: model.py
model_class_name: Model
model_framework: custom
model_metadata:
  avatar_url: https://cdn.baseten.co/production/static/explore/meta.png
  cover_image_url: https://cdn.baseten.co/production/static/explore/alpaca.png
  example_model_input:
    num_beams: 4
    prompt: What's the meaning of life?
    temperature: 0.1
    top_p: 0.75
  tags:
  - text-generation
model_module_dir: model
model_name: Alpaca 7B
model_type: custom
python_version: py38
requirements:
- torch==2.0.1
- peft==0.3.0
- sentencepiece==0.1.99
- git+https://github.com/huggingface/transformers.git
resources:
  accelerator: A10G
  cpu: '3'
  memory: 14Gi
  use_gpu: true
runtime:
  predict_concurrency: 1
secrets: {}
spec_version: '2.0'
system_packages: []
train:
  resources:
    accelerator: null
    cpu: 500m
    memory: 512Mi
    use_gpu: false
  training_class_filename: train.py
  training_class_name: Train
  training_module_dir: train
  variables: {}
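The two `external_data` entries above are weight shards that Truss downloads into the data directory at build time. Purely as an illustration (you would not normally do this by hand), the same files could be fetched locally like so:
```
# Hypothetical manual download of the external_data shards; Truss normally
# resolves these URLs into the data directory during the image build.
import urllib.request

BASE = "https://baseten-public.s3.us-west-2.amazonaws.com/models/alpaca"
for name in ("pytorch_model-00001-of-00002.bin", "pytorch_model-00002-of-00002.bin"):
    urllib.request.urlretrieve(f"{BASE}/{name}", name)
```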
config.json
@@ -0,0 +1,24 @@
{
  "_name_or_path": "decapoda-research/llama-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "max_sequence_length": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": -1,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.29.0.dev0",
  "use_cache": true,
  "vocab_size": 32000
}
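Since this is a standard Hugging Face model config, a quick sanity check of the architecture it describes (a sketch, assuming the file sits in the current directory) might look like:
```
# Illustrative sketch: parse the config above with transformers and confirm
# the LLaMA-7B shape it declares (32 layers, 32 heads, 4096 hidden dim).
from transformers import LlamaConfig

config = LlamaConfig.from_json_file("config.json")
assert config.num_hidden_layers == 32
assert config.num_attention_heads == 32
assert config.hidden_size == 4096
print(config.model_type)  # "llama"
```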
generation_config.json
@@ -0,0 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.29.0.dev0"
}
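These are the defaults that `transformers` applies when `generate` is called without explicit arguments. A minimal sketch of loading them, assuming the file is in the current directory:
```
# Illustrative sketch: load the generation defaults shown above.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained(".")  # reads ./generation_config.json
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)  # 0 1 0
```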