diff --git a/manifests/modules/aiml/chatbot/gradio-mistral/gradio-ui.yaml b/manifests/modules/aiml/chatbot/gradio-mistral/gradio-ui.yaml
new file mode 100644
index 000000000..8cf572c20
--- /dev/null
+++ b/manifests/modules/aiml/chatbot/gradio-mistral/gradio-ui.yaml
@@ -0,0 +1,130 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gradio-mistral-tran1
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gradio-deployment
+  namespace: gradio-mistral-tran1
+  labels:
+    app: gradio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio
+    spec:
+      containers:
+        - name: gradio
+          image: public.ecr.aws/data-on-eks/gradio-web-app-base:latest
+          imagePullPolicy: IfNotPresent
+          ports:
+            - containerPort: 7860
+          resources:
+            requests:
+              cpu: "512m"
+              memory: "2048Mi"
+            limits:
+              cpu: "1"
+              memory: "4096Mi"
+          env:
+            - name: MODEL_ENDPOINT
+              value: "/infer"
+            - name: SERVICE_NAME
+              value: "http://mistral-serve-svc.mistral.svc.cluster.local:8000"
+          volumeMounts:
+            - name: gradio-app-script
+              mountPath: /app/gradio-app.py
+              subPath: gradio-app-mistral-tran1.py
+      volumes:
+        - name: gradio-app-script
+          configMap:
+            name: gradio-app-script
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gradio-service
+  namespace: gradio-mistral-tran1
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+spec:
+  selector:
+    app: gradio
+  ports:
+    - name: http
+      protocol: TCP
+      port: 80
+      targetPort: 7860
+  type: LoadBalancer
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: gradio-app-script
+  namespace: gradio-mistral-tran1
+data:
+  gradio-app-mistral-tran1.py: |
+    import gradio as gr
+    import requests
+    import os
+
+    # Constants for model endpoint and service name
+    model_endpoint = "/infer"
+    service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")
+
+    # Function to generate text
+    def text_generation(message, history):
+        prompt = message
+
+        # Create the URL for the inference request
+        url = f"{service_name}{model_endpoint}"
+
+        try:
+            # Send the request to the model service
+            response = requests.get(url, params={"sentence": prompt}, timeout=180)
+            response.raise_for_status()  # Raise an exception for HTTP errors
+
+            full_output = response.json()[0]
+            # Remove the original question from the output
+            answer_only = full_output.replace(prompt, "", 1).strip('["]?\n')
+
+            # Safety filter to remove harmful or inappropriate content
+            answer_only = filter_harmful_content(answer_only)
+            return answer_only
+        except requests.exceptions.RequestException as e:
+            # Handle any request exceptions (e.g., connection errors)
+            return f"AI: Error: {str(e)}"
+
+    # Define the safety filter function (you can implement this as needed)
+    def filter_harmful_content(text):
+        # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text
+
+        # For now, simply return the text as-is
+        return text
+
+    # Define the Gradio ChatInterface
+    chat_interface = gr.ChatInterface(
+        text_generation,
+        chatbot=gr.Chatbot(line_breaks=True),
+        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
+        title="neuron-mistral7bv0.3 AI Chat",
+        description="Ask me any question",
+        theme="soft",
+        examples=["How many languages are in India", "What is Generative AI?"],
+        cache_examples=False,
+        retry_btn=None,
+        undo_btn="Delete Previous",
+        clear_btn="Clear",
+    )
+
+    # Launch the ChatInterface
+    chat_interface.launch(server_name="0.0.0.0")
diff --git a/manifests/modules/aiml/chatbot/gradio-mistral/kustomization.yaml b/manifests/modules/aiml/chatbot/gradio-mistral/kustomization.yaml
new file mode 100644
index 000000000..1cca24122
--- /dev/null
+++ b/manifests/modules/aiml/chatbot/gradio-mistral/kustomization.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - gradio-ui.yaml
diff --git a/manifests/modules/aiml/chatbot/nodepool/kustomization.yaml b/manifests/modules/aiml/chatbot/nodepool/kustomization.yaml
index b0f432bde..64ff67cc9 100644
--- a/manifests/modules/aiml/chatbot/nodepool/kustomization.yaml
+++ b/manifests/modules/aiml/chatbot/nodepool/kustomization.yaml
@@ -3,3 +3,4 @@ kind: Kustomization
 resources:
   - nodepool-inf2.yaml
   - nodepool-x86.yaml
+  - nodepool-tran1.yaml
diff --git a/manifests/modules/aiml/chatbot/nodepool/nodepool-tran1.yaml b/manifests/modules/aiml/chatbot/nodepool/nodepool-tran1.yaml
new file mode 100644
index 000000000..f3cb372d3
--- /dev/null
+++ b/manifests/modules/aiml/chatbot/nodepool/nodepool-tran1.yaml
@@ -0,0 +1,67 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: trainium-tran1
+spec:
+  template:
+    metadata:
+      labels:
+        instanceType: trainium
+        provisionerType: Karpenter
+        neuron.amazonaws.com/neuron-device: "true"
+    spec:
+      startupTaints:
+        - key: node.kubernetes.io/not-ready
+          effect: "NoExecute"
+      taints:
+        - key: aws.amazon.com/neuron
+          effect: "NoSchedule"
+      requirements:
+        - key: node.kubernetes.io/instance-type
+          operator: In
+          values: ["trn1.2xlarge"]
+        - key: "kubernetes.io/arch"
+          operator: In
+          values: ["amd64"]
+        - key: "karpenter.sh/capacity-type"
+          operator: In
+          values: ["on-demand", "spot"]
+      expireAfter: 720h
+      terminationGracePeriod: 24h
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: trainium-tran1
+  limits:
+    cpu: 100
+    memory: 400Gi
+    aws.amazon.com/neuron: 10
+  disruption:
+    consolidateAfter: 300s
+    consolidationPolicy: WhenEmptyOrUnderutilized
+
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+  name: trainium-tran1
+spec:
+  amiFamily: AL2
+  amiSelectorTerms:
+    - alias: al2@latest
+  blockDeviceMappings:
+    - deviceName: /dev/xvda
+      ebs:
+        deleteOnTermination: true
+        encrypted: true
+        volumeSize: 500Gi
+        volumeType: gp3
+  role: ${KARPENTER_NODE_ROLE}
+  securityGroupSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: ${EKS_CLUSTER_NAME}
+  subnetSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: ${EKS_CLUSTER_NAME}
+  tags:
+    app.kubernetes.io/created-by: eks-workshop
diff --git a/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml b/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
index 41937cfaf..6c4027803 100644
--- a/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
+++ b/manifests/modules/aiml/chatbot/nodepool/nodepool-x86.yaml
@@ -11,6 +11,9 @@ spec:
         provisionerType: Karpenter
         workload: rayhead
     spec:
+      startupTaints:
+        - key: node.kubernetes.io/not-ready
+          effect: "NoExecute"
       requirements:
         - key: "karpenter.k8s.aws/instance-family"
           operator: In
diff --git a/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/Dockerfile b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/Dockerfile
new file mode 100644
index 000000000..c5e7276a2
--- /dev/null
+++ b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/Dockerfile
@@ -0,0 +1,46 @@
+# Base image: https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
+FROM rayproject/ray:2.32.0-py310
+
+# Maintainer label
+LABEL maintainer="DoEKS"
+
+# Set environment variables to non-interactive (this prevents some prompts)
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Switch to root to add the Neuron repo and install necessary packages
+USER root
+
+# Set up the Neuron repository and install Neuron packages
+RUN . /etc/os-release && \
+    sudo echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
+    sudo wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
+    sudo apt-get update -y && \
+    sudo apt-get install aws-neuronx-dkms aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y && \
+    sudo apt-get clean && \
+    sudo rm -rf /var/lib/apt/lists/*
+
+# Switch back to a non-root user for the subsequent commands
+USER $USER
+
+# Set pip repository pointing to the Neuron repository and install required Python packages
+RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
+    pip install wget awscli regex neuronx-cc==2.* torch-neuronx torchvision transformers-neuronx sentencepiece transformers huggingface_hub tenacity psutil fastapi uvicorn mistral-inference mistral-common
+
+# Add Neuron path to PATH
+ENV PATH /opt/aws/neuron/bin:$PATH
+
+# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
+ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+
+# Create the serve application directory
+RUN mkdir -p /serve_app
+
+# Set working directory
+WORKDIR /serve_app
+
+COPY mistral1.py /serve_app/mistral1.py
+
diff --git a/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/kustomization.yaml b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/kustomization.yaml
new file mode 100644
index 000000000..1f5a41bc2
--- /dev/null
+++ b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/kustomization.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ray_service_mistral.yaml
diff --git a/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/mistral1.py b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/mistral1.py
new file mode 100644
index 000000000..ba8fcedd4
--- /dev/null
+++ b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/mistral1.py
@@ -0,0 +1,142 @@
+import os
+import json
+import logging
+from fastapi import FastAPI
+from ray import serve
+import torch
+import torch_neuronx
+from transformers import AutoTokenizer
+from transformers_neuronx.mistral.model import MistralForSampling
+from huggingface_hub import snapshot_download
+
+# Initialize FastAPI
+app = FastAPI()
+
+neuron_cores = int(os.getenv('NEURON_CORES', 2))  # Default to 2 for trn1.2xlarge
+cacheDir = os.path.join('/tmp', 'model', 'neuron-mistral7bv0.3')
+
+# --- Logging Setup ---
+logger = logging.getLogger("ray.serve")
+logger.setLevel(logging.INFO)
+logging.basicConfig(level=logging.INFO)
+
+@serve.deployment(num_replicas=1)
+@serve.ingress(app)
+class APIIngress:
+    def __init__(self, mistral_model_handle):
+        self.handle = mistral_model_handle
+
+    @app.get("/infer")
+    async def infer(self, sentence: str):
+        result = await self.handle.infer.remote(sentence)
+        return result
+
+@serve.deployment(
+    name="mistral-7b",
+    autoscaling_config={"min_replicas": 1, "max_replicas": 1},
+    ray_actor_options={
+        "resources": {"neuron_cores": neuron_cores},
+        "memory": 28000000000
+    }
+)
+class MistralModel:
+    def __init__(self):
+        try:
+            logger.info("Initializing model with pre-compiled files...")
+
+            mistral_model = os.getenv('MODEL_ID', 'askulkarni2/neuron-mistral7bv0.3')
+            logger.info(f"Using model ID: {mistral_model}")
+
+            model_path = '/tmp/model/neuron-mistral7bv0.3'
+            model_cache = '/tmp/model/cache'
+
+            # Initialize model state
+            self.neuron_model = None
+            self.tokenizer = None
+
+            # Download the model files to the local directory if they are not already present
+            if not os.path.exists(model_path):
+                os.makedirs(cacheDir, exist_ok=True)
+                os.makedirs(model_cache, exist_ok=True)
+                logger.info("Downloading model files to /tmp/model/neuron-mistral7bv0.3")
+                model_path = snapshot_download(repo_id=mistral_model, local_dir=cacheDir, local_dir_use_symlinks=False)
+                logger.info(f"model path: {model_path}")
+
+            logger.info(f"Checking model path contents: {os.listdir(model_path)}")
+
+            # Configure the Neuron runtime environment variables
+            os.environ.update({
+                "NEURON_RT_VISIBLE_CORES": "0,1",
+                "NEURON_RT_NUM_CORES": "2",
+                "NEURON_RT_USE_PREFETCHED_NEFF": "1",
+            })
+
+            logger.info("Loading tokenizer...")
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_path,
+                local_files_only=True
+            )
+
+            # Set padding token
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+                logger.info("Set padding token to EOS token")
+
+            logger.info("Loading model...")
+            # Load model with minimal configuration
+            self.neuron_model = MistralForSampling.from_pretrained(
+                model_path, batch_size=1, tp_degree=2, amp='bf16'
+            )
+
+            logger.info("Model preparation...")
+
+            neuronxcc_dirs = [d for d in os.listdir(model_cache)]
+            if not neuronxcc_dirs:
+                # Compile the model on first start and save the compiled artifacts in the cache dir
+                self.neuron_model.to_neuron()
+                self.neuron_model.save(model_cache)
+            else:
+                # Load pre-compiled .neff files from the cache dir
+                self.neuron_model.load(model_cache)
+                self.neuron_model.to_neuron()
+
+            logger.info("Model successfully prepared for inference")
+
+            # Verify initialization
+            if not self._verify_model_state():
+                raise RuntimeError("Model initialization failed verification")
+
+            logger.info("Model initialization complete")
+
+        except Exception as e:
+            logger.error(f"Error during model initialization: {e}")
+            raise
+
+    def _verify_model_state(self):
+        if self.neuron_model is None:
+            return False
+        if not hasattr(self.neuron_model, 'sample'):
+            return False
+        if self.tokenizer is None:
+            return False
+        return True
+
+    def infer(self, sentence: str):
+        input_ids = self.tokenizer.encode(sentence, return_tensors="pt")
+        with torch.inference_mode():
+            try:
+                logger.info(f"Performing inference on input: {sentence}")
+                generated_sequences = self.neuron_model.sample(
+                    input_ids, sequence_length=2048, top_k=50
+                )
+                decoded_sequences = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in generated_sequences]
+                logger.info(f"Inference result: {decoded_sequences}")
+                return decoded_sequences
+            except Exception as e:
+                logger.error(f"Error during inference: {e}")
+                return {"error": "Inference failed"}
+
+# Create an entry point for the FastAPI application
+entrypoint = APIIngress.bind(MistralModel.bind())
diff --git a/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/ray_service_mistral.yaml b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/ray_service_mistral.yaml
new file mode 100644
index 000000000..73b8f3bd3
--- /dev/null
+++ b/manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/ray_service_mistral.yaml
@@ -0,0 +1,194 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: mistral
+---
+#----------------------------------------------------------------------
+# NOTE: For deployment instructions, refer to the DoEKS website.
+#----------------------------------------------------------------------
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: mistral
+  namespace: mistral
+spec:
+  serviceUnhealthySecondThreshold: 900
+  deploymentUnhealthySecondThreshold: 300
+  serveConfigV2: |
+    applications:
+      - name: mistral-deployment
+        import_path: "mistral1:entrypoint"
+        route_prefix: "/"
+        runtime_env:
+          env_vars:
+            MODEL_ID: "askulkarni2/neuron-mistral7bv0.3"
+            NEURON_CC_FLAGS: "-O1"
+            LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+            NEURON_CORES: "2"
+            NEURON_COMPILE_CACHE_URL: "/tmp/model/cache"
+            NEURON_RT_CACHE_DIRECTORY: "/tmp/model/cache"
+        deployments:
+          - name: mistral-7b
+            autoscaling_config:
+              min_replicas: 1
+              max_replicas: 1
+              target_num_ongoing_requests_per_replica: 1
+            ray_actor_options:
+              resources: {"neuron_cores": 2}
+              memory: 28000000000
+  rayClusterConfig:
+    rayVersion: '2.32.0'
+    enableInTreeAutoscaling: true
+    headGroupSpec:
+      serviceType: NodePort
+      headService:
+        metadata:
+          name: mistral
+      rayStartParams:
+        dashboard-host: '0.0.0.0'
+        num-cpus: "0" # this is to ensure no tasks or actors are scheduled on the head Pod
+      template:
+        spec:
+          containers:
+            - name: head
+              image: public.ecr.aws/e3e2e5u9/aiml/mistral-7b:latest
+              imagePullPolicy: Always # Ensure the image is always pulled when updated
+              lifecycle:
+                preStop:
+                  exec:
+                    command: ["/bin/sh", "-c", "ray stop"]
+              ports:
+                - containerPort: 6379
+                  name: gcs
+                - containerPort: 8265
+                  name: dashboard
+                - containerPort: 10001
+                  name: client
+                - containerPort: 8000
+                  name: serve
+              volumeMounts:
+                - mountPath: /tmp/ray
+                  name: ray-logs
+                - mountPath: /tmp/model/cache
+                  name: model-cache
+              resources:
+                limits:
+                  cpu: "4"
+                  memory: 16Gi
+                requests:
+                  cpu: "2"
+                  memory: 8Gi
+              env:
+                - name: PORT
+                  value: "8000"
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+          nodeSelector:
+            instanceType: mixed-x86
+            provisionerType: Karpenter
+            workload: rayhead
+          volumes:
+            - name: ray-logs
+              emptyDir: {}
+            - name: model-cache
+              emptyDir: {}
+    workerGroupSpecs:
+      - groupName: worker-group
+        replicas: 1
+        minReplicas: 1
+        maxReplicas: 1
+        rayStartParams:
+          resources: '"{\"neuron_cores\": 2}"'
+          num-cpus: "6"
+        template:
+          spec:
+            containers:
+              - name: worker
+                image: public.ecr.aws/e3e2e5u9/aiml/mistral-7b:latest
+                imagePullPolicy: Always # Ensure the image is always pulled when updated
+                lifecycle:
+                  preStop:
+                    exec:
+                      command: ["/bin/sh", "-c", "ray stop"]
+                # Each request uses 2 Neuron cores, so a single worker serves one request at a time
+                resources:
+                  limits:
+                    memory: "30Gi"
+                    aws.amazon.com/neuron: "1"
+                  requests:
+                    memory: "28Gi"
+                    aws.amazon.com/neuron: "1"
+                env:
+                  # Model and Neuron configuration
+                  - name: MODEL_ID
+                    value: "askulkarni2/neuron-mistral7bv0.3"
+                  - name: NEURON_CORES
+                    value: "2"
+                  - name: NEURON_RT_NUM_CORES
+                    value: "2"
+                  - name: NEURON_RT_VISIBLE_CORES
+                    value: "0,1"
+                  - name: NEURON_CC_FLAGS
+                    value: "-O1" # Changed from --no-compile
+                  - name: NEURON_COMPILE_ONLY
+                    value: "0"
+                  - name: NEURON_RT_LOG_LEVEL
+                    value: "INFO"
+                  # Cache configuration
+                  - name: NEURON_COMPILE_CACHE_URL
+                    value: "/tmp/model/cache"
+                  - name: NEURON_RT_CACHE_DIRECTORY
+                    value: "/tmp/model/cache"
+                  - name: NEURON_RT_USE_PREFETCHED_NEFF
+                    value: "1" # Added to use pre-compiled NEFF files
+                  # Memory management
+                  - name: NEURON_RT_MAX_WORKSPACE_SIZE
+                    value: "8589934592"
+                  - name: XLA_TENSOR_ALLOCATOR_MAXSIZE
+                    value: "12884901888"
+                  - name: MALLOC_ARENA_MAX
+                    value: "32"
+                  - name: MALLOC_TRIM_THRESHOLD_
+                    value: "128K"
+                  - name: XLA_PYTHON_CLIENT_MEM_FRACTION
+                    value: "0.95"
+                  # Runtime configuration
+                  - name: NEURON_RT_STALL_ENABLE
+                    value: "1"
+                  - name: NEURON_RT_BLOCKING_IO
+                    value: "1"
+                  - name: NEURON_RT_EXEC_TIMEOUT
+                    value: "900"
+                  - name: RAY_memory_monitor_refresh_ms
+                    value: "5000"
+                  - name: RAY_memory_usage_threshold
+                    value: "0.90"
+                  # System paths
+                  - name: LD_LIBRARY_PATH
+                    value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+                  - name: PORT
+                    value: "8000"
+                volumeMounts:
+                  - mountPath: /tmp/ray
+                    name: ray-logs
+                  - mountPath: /dev/shm
+                    name: dshm
+                  - mountPath: /tmp/model/cache
+                    name: model-cache
+            volumes:
+              - name: dshm
+                emptyDir:
+                  medium: Memory
+              - name: ray-logs
+                emptyDir: {}
+              - name: model-cache
+                emptyDir: {}
+            nodeSelector:
+              instanceType: trainium
+              provisionerType: Karpenter
+              neuron.amazonaws.com/neuron-device: "true"
+            tolerations:
+              - key: "aws.amazon.com/neuron"
+                operator: "Exists"
+                effect: "NoSchedule"
+
diff --git a/website/docs/aiml/chatbot/add-mistral.md b/website/docs/aiml/chatbot/add-mistral.md
new file mode 100644
index 000000000..d55033619
--- /dev/null
+++ b/website/docs/aiml/chatbot/add-mistral.md
@@ -0,0 +1,86 @@
+---
+title: "Deploying The Mistral-7B-Instruct-v0.3 Chat Model on Ray Serve"
+sidebar_position: 60
+---
+
+With all the node pools provisioned, we can now proceed to deploy the Mistral-7B-Instruct-v0.3 chatbot infrastructure.
+
+Let's begin by deploying the `ray_service_mistral.yaml` file:
+
+```bash wait=5
+$ kubectl apply -k ~/environment/eks-workshop/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot
+namespace/mistral created
+rayservice.ray.io/mistral created
+```
+
+### Creating the Ray Service Pods for Inference
+
+The `ray_service_mistral.yaml` file defines the Kubernetes configuration for deploying the Ray Serve service for the Mistral-7B-Instruct-v0.3 chatbot:
+
+```file
+manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/ray_service_mistral.yaml
+```
+
+This configuration accomplishes the following:
+
+1. Creates a Kubernetes namespace named `mistral` for resource isolation
+2. Deploys a RayService named `rayservice.ray.io/mistral` that uses a Python script to create the Ray Serve components
+3. Provisions a head pod and worker pods that pull Docker images from Amazon Elastic Container Registry (ECR)
+
+After applying the configuration, we'll monitor the progress of the head and worker pods:
+
+```bash wait=5
+$ kubectl get pod -n mistral
+NAME                                                  READY   STATUS    RESTARTS   AGE
+mistral-raycluster-ltvjb-head-7rd7d                   0/2     Pending   0          4s
+mistral-raycluster-ltvjb-worker-worker-group-nff7x    0/1     Pending   0          4s
+```
+
+:::caution
+It may take up to 15 minutes for both pods to be ready.
+:::
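+
+While you wait, you can optionally watch Karpenter provision a Trainium node for the worker pod. The `trainium-tran1` node pool applies an `instanceType: trainium` label to the nodes it launches, so a label selector along the lines of the following illustrative command will show the new node once it registers (press `Ctrl+C` to stop watching):
+
+```bash
+$ kubectl get nodes -l instanceType=trainium --watch
+```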
+
+We can wait for the pods to be ready using the following command:
+
+```bash timeout=900
+$ kubectl wait pod \
+--all \
+--for=condition=Ready \
+--namespace=mistral \
+--timeout=15m
+pod/mistral-raycluster-ltvjb-head-7rd7d condition met
+pod/mistral-raycluster-ltvjb-worker-worker-group-nff7x condition met
+```
+
+Once the pods are fully deployed, we'll verify that everything is in place:
+
+```bash
+$ kubectl get all -n mistral
+NAME                                                      READY   STATUS    RESTARTS   AGE
+pod/mistral-raycluster-ltvjb-head-7rd7d                   2/2     Running   0          7m
+pod/mistral-raycluster-ltvjb-worker-worker-group-nff7x    1/1     Running   0          7m
+
+NAME                        TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)                                                                        AGE
+service/mistral             NodePort   172.20.74.49    <none>        6379:32625/TCP,8265:30941/TCP,10001:32430/TCP,8000:31393/TCP,8080:31361/TCP   94m
+service/mistral-head-svc    NodePort   172.20.121.46   <none>        8000:30481/TCP,8080:32609/TCP,6379:31066/TCP,8265:31006/TCP,10001:30220/TCP   92m
+service/mistral-serve-svc   NodePort   172.20.241.50   <none>        8000:32351/TCP                                                                 92m
+
+NAME                                         DESIRED WORKERS   AVAILABLE WORKERS   CPUS   MEMORY   GPUS   STATUS   AGE
+raycluster.ray.io/mistral-raycluster-ltvjb   1                 1                   2      36Gi     0      ready    94m
+
+NAME                        SERVICE STATUS   NUM SERVE ENDPOINTS
+rayservice.ray.io/mistral   Running          2
+```
+
+:::caution
+Configuring the RayService may take up to 10 minutes.
+:::
+
+We can wait for the RayService to be running with this command:
+
+```bash wait=5 timeout=600
+$ kubectl wait --for=jsonpath='{.status.serviceStatus}'=Running rayservice/mistral -n mistral --timeout=10m
+rayservice.ray.io/mistral condition met
+```
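+
+At this point the model is reachable inside the cluster through the `mistral-serve-svc` service on port 8000, which exposes the `/infer` route defined in `mistral1.py`. As an optional sanity check (not a required lab step), you could port-forward the service from your workshop terminal and send a test prompt; the following is a sketch of what that request looks like:
+
+```bash
+$ kubectl port-forward -n mistral svc/mistral-serve-svc 8000:8000 &
+$ curl -G "http://localhost:8000/infer" --data-urlencode "sentence=What is Kubernetes?"
+```
+
+The response contains the generated text with the original prompt at the beginning; the Gradio application we deploy next sends this same request and strips the prompt from the output before displaying it.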
+
+With everything properly deployed, we can now proceed to create the web interface for the chatbot.
diff --git a/website/docs/aiml/chatbot/gradio-mistral.md b/website/docs/aiml/chatbot/gradio-mistral.md
new file mode 100644
index 000000000..61a7217ee
--- /dev/null
+++ b/website/docs/aiml/chatbot/gradio-mistral.md
@@ -0,0 +1,68 @@
+---
+title: "Configuring the Gradio Web User Interface for Access"
+sidebar_position: 70
+---
+
+After all the resources have been configured within the Ray Serve cluster, it's now time to directly access the Mistral-7B-Instruct-v0.3 chatbot. The web interface is powered by the Gradio UI.
+
+:::tip
+You can learn more about Load Balancers in the [Load Balancer module](../../../fundamentals/exposing/loadbalancer/index.md) provided in this workshop.
+:::
+
+### Deploying the Gradio Web User Interface
+
+Once the AWS Load Balancer Controller has been installed, we can deploy the Gradio UI components:
+
+```file
+manifests/modules/aiml/chatbot/gradio-mistral/gradio-ui.yaml
+```
+
+The components consist of a `Deployment`, `Service`, and `ConfigMap` to launch the application. In particular, the `Service` component is named `gradio-service` and is deployed as a `LoadBalancer`.
+
+```bash
+$ kubectl apply -k ~/environment/eks-workshop/modules/aiml/chatbot/gradio-mistral
+namespace/gradio-mistral-tran1 created
+configmap/gradio-app-script created
+service/gradio-service created
+deployment.apps/gradio-deployment created
+```
+
+To check the status of each component, run the following commands:
+
+```bash
+$ kubectl get deployments -n gradio-mistral-tran1
+NAME                READY   UP-TO-DATE   AVAILABLE   AGE
+gradio-deployment   1/1     1            1           95s
+```
+
+```bash
+$ kubectl get configmaps -n gradio-mistral-tran1
+NAME                DATA   AGE
+gradio-app-script   1      110s
+kube-root-ca.crt    1      111s
+```
+
+### Accessing the Chatbot Website
+
+Once the load balancer has finished deploying, use its external address to access the website:
+
+```bash wait=10
+$ kubectl get services -n gradio-mistral-tran1
+NAME             TYPE           CLUSTER-IP     EXTERNAL-IP                                                                      PORT(S)        AGE
+gradio-service   LoadBalancer   172.20.84.26   k8s-gradioll-gradiose-a6d0b586ce-06885d584b38b400.elb.us-west-2.amazonaws.com   80:30802/TCP   8m42s
+```
+
+To wait until the Network Load Balancer has finished provisioning, run the following command:
+
+```bash wait=240 timeout=600
+$ curl --head -X GET --retry 30 --retry-all-errors --retry-delay 15 --connect-timeout 5 --max-time 10 \
+-k $(kubectl get service -n gradio-mistral-tran1 gradio-service -o jsonpath="{.status.loadBalancer.ingress[*].hostname}{'\n'}")
+```
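+
+Once the load balancer responds, you can print the URL to open in your browser. This illustrative helper command reuses the same `jsonpath` expression as the check above:
+
+```bash
+$ echo "http://$(kubectl get service -n gradio-mistral-tran1 gradio-service -o jsonpath='{.status.loadBalancer.ingress[*].hostname}')"
+```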
+
+Now that our application is exposed to the outside world, let's access it by pasting the URL in your web browser. You will see the Mistral-7B-Instruct-v0.3 chatbot and will be able to interact with it by asking questions.
+
+
+This concludes the current lab on deploying the Mistral-7B-Instruct-v0.3 chatbot model within an EKS cluster via Karpenter.
diff --git a/website/docs/aiml/chatbot/mistral.md b/website/docs/aiml/chatbot/mistral.md
new file mode 100644
index 000000000..ccda8ed3d
--- /dev/null
+++ b/website/docs/aiml/chatbot/mistral.md
@@ -0,0 +1,31 @@
+---
+title: "Understanding the Mistral-7B-Instruct-v0.3 Chat Model"
+sidebar_position: 50
+sidebar_custom_props: { "module": true }
+weight: 30
+description: "Use AWS Trainium to accelerate deep learning inference workloads on Amazon Elastic Kubernetes Service."
+---
+
+The Mistral-7B-Instruct-v0.3 model represents a significant advancement in language model technology, combining powerful capabilities such as text generation and completion, information extraction, data analysis, API interaction, and complex reasoning with practical efficiency.
+
+As a 7B parameter model, it offers remarkable performance while remaining deployable on standard hardware configurations. It requires approximately 26-28 GB of memory: roughly 13-14 GB for the model weights (7 billion parameters at 2 bytes each in bf16) plus a comparable amount for the KV cache, input/output tensors, and runtime overhead. A `trn1.2xlarge` instance with 32 GB of memory is therefore suitable for running the Mistral-7B model, as it provides enough headroom for all of these.
+
+Mistral-7B-Instruct-v0.3 is implemented using FastAPI, Ray Serve, and PyTorch-based Hugging Face Transformers to create a seamless API for text generation.
+
+Here's the code for compiling the model that we'll use:
+
+```file
+manifests/modules/aiml/chatbot/ray-service-neuron-mistral-chatbot/mistral1.py
+```
+
+This Python code performs the following tasks:
+
+1. Configures an APIIngress class responsible for handling inference requests
+2. Defines a MistralModel class responsible for managing the Mistral language model
+3. Loads and compiles the model based on existing parameters
+4. Creates an entry point for the FastAPI application
+
+Through these steps, the Mistral-7B-Instruct-v0.3 chat model exposes an endpoint that accepts input sentences and generates text outputs. Its processing efficiency allows it to handle a wide variety of natural language processing applications, such as chatbots and text generation tasks.
+
+In this lab, we'll see how the Mistral-7B-Instruct-v0.3 model is configured as a RayService in Kubernetes, allowing users to understand how to deploy their own natural language processing applications.