diff --git a/README.md b/README.md
index 34bba8fd9d..5c6bc6ca4e 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ Refer to [torchserve docker](docker/README.md) for details.
 
 ## 🏆 Highlighted Examples
 
+* [Chatbot with Llama 2 on Mac 🦙💬](examples/LLM/llama2/chat_app)
 * [🤗 HuggingFace Transformers](examples/Huggingface_Transformers) with a [Better Transformer Integration/ Flash Attention & Xformer Memory Efficient ](examples/Huggingface_Transformers#Speed-up-inference-with-Better-Transformer)
 * [Model parallel inference](examples/Huggingface_Transformers#model-parallelism)
 * [MultiModal models with MMF](https://github.com/pytorch/serve/tree/master/examples/MMF-activity-recognition) combining text, audio and video
diff --git a/examples/LLM/llama2/chat_app/Readme.md b/examples/LLM/llama2/chat_app/Readme.md
new file mode 100644
index 0000000000..4684bd3132
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/Readme.md
@@ -0,0 +1,142 @@
+
+# TorchServe Llama 2 Chatapp
+
+This is an example showing how to deploy a Llama 2 chat app using TorchServe.
+We use [streamlit](https://github.com/streamlit/streamlit) to create the app.
+
+We are using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) in this example.
+
+You can run this example on your laptop to understand how to use TorchServe.
+
+
+## Architecture
+
+![Chatbot Architecture](./screenshots/architecture.png)
+
+
+## Pre-requisites
+
+The following example has been tested on an M1 Mac.
+Before you install TorchServe, make sure you have the following installed:
+1) JDK 17
+
+Make sure your javac version is `17.x.x`:
+```
+javac --version
+javac 17.0.8
+```
+You can download it from [java](https://www.oracle.com/java/technologies/downloads/#jdk17-mac)
+2) Install conda with support for arm64
+
+3) Since we are running this example on a Mac, we use the 7B Llama 2 model.
+Download the llama2-7b weights by following the instructions [here](https://github.com/pytorch/serve/tree/master/examples/large_models/Huggingface_accelerate/llama2#step-1-download-model-permission)
+
+4) Install streamlit with
+
+```
+python -m pip install -r requirements.txt
+```
+
+
+### Steps
+
+#### Install TorchServe
+Install TorchServe with the following steps:
+
+```
+python ts_scripts/install_dependencies.py
+pip install torchserve torch-model-archiver torch-workflow-archiver
+```
+
+#### Package model for TorchServe
+
+Run this script to create `llamacpp.tar.gz` to be loaded in TorchServe:
+
+```
+source package_llama.sh
+```
+This creates the quantized weights in `$LLAMA2_WEIGHTS`.
+
+For subsequent runs, we don't need to regenerate these weights; we only need to package the handler and model-config.yaml in the tar file.
+
+Hence, you can skip the model generation by running the script as follows:
+
+```
+source package_llama.sh false
+```
+
+You might need to run the command below if the script output indicates it:
+```
+sudo xcodebuild -license
+```
+
+The script sets an env variable `LLAMA2_Q4_MODEL` and uses it in the handler.
+In an actual use-case, you would set the path to the weights in `model-config.yaml`
+
+```
+handler:
+    model_name: "llama-cpp"
+    model_path: "=1.26.0
\ No newline at end of file
diff --git a/examples/LLM/llama2/chat_app/screenshots/Client.png b/examples/LLM/llama2/chat_app/screenshots/Client.png
new file mode 100644
index 0000000000..6e876c244a
Binary files /dev/null and b/examples/LLM/llama2/chat_app/screenshots/Client.png differ
diff --git a/examples/LLM/llama2/chat_app/screenshots/Server.png b/examples/LLM/llama2/chat_app/screenshots/Server.png
new file mode 100644
index 0000000000..5e8c1d7b7c
Binary files /dev/null and b/examples/LLM/llama2/chat_app/screenshots/Server.png differ
diff --git a/examples/LLM/llama2/chat_app/screenshots/Workers.png b/examples/LLM/llama2/chat_app/screenshots/Workers.png
new file mode 100644
index 0000000000..a8418a4fd2
Binary files /dev/null and b/examples/LLM/llama2/chat_app/screenshots/Workers.png differ
diff --git a/examples/LLM/llama2/chat_app/screenshots/architecture.png b/examples/LLM/llama2/chat_app/screenshots/architecture.png
new file mode 100644
index 0000000000..5c158175c3
Binary files /dev/null and b/examples/LLM/llama2/chat_app/screenshots/architecture.png differ
diff --git a/examples/LLM/llama2/chat_app/screenshots/batch_size.png b/examples/LLM/llama2/chat_app/screenshots/batch_size.png
new file mode 100644
index 0000000000..1cb128ade5
Binary files /dev/null and b/examples/LLM/llama2/chat_app/screenshots/batch_size.png differ
diff --git a/examples/LLM/llama2/chat_app/torchserve_server_app.py b/examples/LLM/llama2/chat_app/torchserve_server_app.py
new file mode 100644
index 0000000000..74b1b2060d
--- /dev/null
+++ b/examples/LLM/llama2/chat_app/torchserve_server_app.py
@@ -0,0 +1,171 @@
+import json
+import os
+
+import requests
+import streamlit as st
+
+MODEL_NAME = "llamacpp"
+# App title
+st.set_page_config(page_title="🦙💬 Llama 2 TorchServe Serve")
+
+
+def start_server():
+    os.system("torchserve --start --model-store model_store --ncs")
+    st.session_state.started = True
+    st.session_state.stopped = False
+    st.session_state.registered = False
+
+
+def stop_server():
+    os.system("torchserve --stop")
+    st.session_state.stopped = True
+    st.session_state.started = False
+    st.session_state.registered = False
+
+
+def _register_model(url):
+    res = requests.post(url)
+    if res.status_code != 200:
+        server_state_container.error("Error registering model", icon="🚫")
+        st.session_state.started = True
+        return
+    st.session_state.registered = True
+    st.session_state.started = False
+    st.session_state.stopped = False
+    server_state_container.caption(res.text)
+
+
+def register_model():
+    if not st.session_state.started:
+        server_state_container.caption("TorchServe is not running. Start it")
+        return
+    url = (
+        f"http://localhost:8081/models?model_name={MODEL_NAME}&url={MODEL_NAME}"
+        f".tar.gz&initial_workers=1&synchronous=true"
+    )
+    _register_model(url)
+
+
+def get_status():
+    if st.session_state.registered:
+        url = f"http://localhost:8081/models/{MODEL_NAME}"
+        res = requests.get(url)
+        if res.status_code != 200:
+            model_state_container.error("Error getting model status", icon="🚫")
+            return
+        status = json.loads(res.text)[0]
+        model_state_container.write(status)
+
+
+def scale_workers(workers):
+    if st.session_state.registered:
+        num_workers = st.session_state[workers]
+        url = (
+            f"http://localhost:8081/models/{MODEL_NAME}?min_worker="
+            f"{str(num_workers)}&synchronous=true"
+        )
+        res = requests.put(url)
+        server_state_container.caption(res.text)
+
+
+def set_batch_size(batch_size):
+    if st.session_state.registered:
+        url = f"http://localhost:8081/models/{MODEL_NAME}/1.0"
+        res = requests.delete(url)
+        server_state_container.caption(res.text)
+        st.session_state.registered = False
+
+    batch_size = st.session_state[batch_size]
+    url = (
+        f"http://localhost:8081/models?model_name={MODEL_NAME}&url={MODEL_NAME}"
+        f".tar.gz&batch_size={str(batch_size)}&initial_workers={str(workers)}"
+        f"&synchronous=true&max_batch_delay={str(max_batch_delay)}"
+    )
+    _register_model(url)
+
+
+def set_max_batch_delay(max_batch_delay):
+    if st.session_state.registered:
+        url = f"http://localhost:8081/models/{MODEL_NAME}/1.0"
+        res = requests.delete(url)
+        server_state_container.caption(res.text)
+        st.session_state.registered = False
+
+    max_batch_delay = st.session_state[max_batch_delay]
+    url = (
+        f"http://localhost:8081/models?model_name={MODEL_NAME}&url="
+        f"{MODEL_NAME}.tar.gz&batch_size={str(batch_size)}&initial_workers="
+        f"{str(workers)}&synchronous=true&max_batch_delay={str(max_batch_delay)}"
+    )
+    _register_model(url)
+
+
+if "started" not in st.session_state:
+    st.session_state.started = False
+if "stopped" not in st.session_state:
+    st.session_state.stopped = False
+if "registered" not in st.session_state:
+    st.session_state.registered = False
+
+with st.sidebar:
+    st.title("🦙💬 Llama 2 TorchServe Server ")
+
+    st.button("Start Server", on_click=start_server)
+    st.button("Stop Server", on_click=stop_server)
+    st.button("Register Llama2", on_click=register_model)
+    workers = st.sidebar.slider(
+        "Num Workers",
+        key="Num Workers",
+        min_value=1,
+        max_value=4,
+        value=1,
+        step=1,
+        on_change=scale_workers,
+        args=("Num Workers",),
+    )
+    batch_size = st.sidebar.select_slider(
+        "Batch Size",
+        key="Batch Size",
+        options=[2**j for j in range(0, 8)],
+        on_change=set_batch_size,
+        args=("Batch Size",),
+    )
+    max_batch_delay = st.sidebar.slider(
+        "Max Batch Delay",
+        key="Max Batch Delay",
+        min_value=100,
+        max_value=10000,
+        value=100,
+        step=100,
+        on_change=set_max_batch_delay,
+        args=("Max Batch Delay",),
+    )
+
+    if st.session_state.started:
+        st.success("Started TorchServe", icon="✅")
+
+    if st.session_state.stopped:
+        st.success("Stopped TorchServe", icon="✅")
+
+    if st.session_state.registered:
+        st.success("Registered model", icon="✅")
+
+st.title("TorchServe Status")
+server_state_container = st.container()
+server_state_container.subheader("Server status:")
+
+if st.session_state.started:
+    server_state_container.success("Started TorchServe", icon="✅")
+
+if st.session_state.stopped:
+    server_state_container.success("Stopped TorchServe", icon="✅")
+
+if st.session_state.registered:
+    server_state_container.success("Registered model", icon="✅")
+
+model_state_container = st.container()
+with model_state_container:
+    st.subheader("Model Status")
+
+with model_state_container:
+    st.button("Model Status", on_click=get_status)
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index 641873358f..2b1b907552 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1095,4 +1095,13 @@ PreprocessCallCount
 AOT
 microbatches
 tokenization
+Chatapp
+autoscaled
+cpp
+javac
+llamacpp
+streamlit
 tp
+quantized
+Chatbot
+LLM
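Note: once the server app above has registered the model, it can also be queried directly over TorchServe's standard inference API, independent of the Streamlit client. A minimal sketch, assuming TorchServe is running locally on the default inference port 8080 and the model is registered as `llamacpp`; the prompt string and timeout are illustrative:

```
import requests

# Hypothetical smoke test: send a raw prompt to the registered llamacpp model.
# Assumes TorchServe is running locally with the default inference port (8080).
prompt = "What is the capital of France?"
response = requests.post(
    "http://localhost:8080/predictions/llamacpp",
    data=prompt,
    timeout=120,
)
print(response.status_code)
print(response.text)
```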