diff --git a/examples/llm_serving/README.rst b/examples/llm_serving/README.rst index fec1a6649..38668332a 100644 --- a/examples/llm_serving/README.rst +++ b/examples/llm_serving/README.rst @@ -3,7 +3,8 @@ Serving OPT-175B using Alpa =========================== This tutorial shows how to setup a serving system to serve the largest available pretrained language model `OPT-175B `_. -You can also try a live demo at `Alpa-OPT Demo `_. + +👉 Try a live demo at `Alpa-OPT Demo `_ 👈 Overview ======== @@ -203,12 +204,11 @@ They will use two ports. The port of the website is defined in the command line .. code:: shell - # Launch the website - uvicorn launch_website:app --host 0.0.0.0 --port 8001 - - # Launch the model worker (in a new terminal) + # Launch the model worker python3 launch_model_worker.py --model alpa/opt-175b + # Launch the website (in a new terminal) + uvicorn launch_website:app --host 0.0.0.0 --port 8001 Then open ``http://[IP-ADDRESS]:8001`` in your browser to try out the model! diff --git a/examples/llm_serving/client.py b/examples/llm_serving/client.py index 5d03297ed..e48fd29e1 100644 --- a/examples/llm_serving/client.py +++ b/examples/llm_serving/client.py @@ -3,7 +3,7 @@ import requests -DEFAULT_URL = "https://opt.alpa.ai/" +DEFAULT_URL = "https://opt.alpa.ai" class Client(object): diff --git a/examples/llm_serving/launch_model_worker.py b/examples/llm_serving/launch_model_worker.py index 0d96cd208..35b9ccb7e 100644 --- a/examples/llm_serving/launch_model_worker.py +++ b/examples/llm_serving/launch_model_worker.py @@ -336,7 +336,6 @@ def check_authorization(self, args, request): parser.add_argument("--host", type=str, default="0.0.0.0") parser.add_argument("--torch-device", type=str, default="cpu") parser.add_argument("--no-recaptcha", action="store_true") - parser.add_argument("--keys-file", type=str, default="keys.json") parser.add_argument("--register-name", type=str, default="default") parser.add_argument("--ssl-keyfile", type=str) 
parser.add_argument("--ssl-certfile", type=str)
diff --git a/examples/llm_serving/launch_website.py b/examples/llm_serving/launch_website.py
index 342378581..6be801a72 100644
--- a/examples/llm_serving/launch_website.py
+++ b/examples/llm_serving/launch_website.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from typing import Union
 
 from fastapi import FastAPI, Request
@@ -19,12 +20,103 @@ else:
     sampling_css = ""
 
 
-
 recaptcha = load_recaptcha(USE_RECAPTCHA)
 
 
+def log_scope(request):
+    """Log the request's ASGI scope and return a trimmed copy of it.
+
+    The entries ``app``, ``fastapi_astack``, ``router``, ``endpoint`` and
+    ``route`` are left out of the copy and a ``tstamp`` entry holding the
+    current time is added. ``request.scope`` itself is not modified, and a
+    key that is absent from it is simply skipped.
+    """
+    dropped = ("app", "fastapi_astack", "router", "endpoint", "route")
+    scope = {k: v for k, v in request.scope.items() if k not in dropped}
+    scope["tstamp"] = time.time()
+    logging.info(scope)
+    return scope
+
+
+##### Redirect Begin #####
+import asyncio
+import pickle
+import time
+
+from alpa.serve.http_util import HTTPRequestWrapper, make_error_response
+import ray
+from starlette.responses import JSONResponse
+ray.init(address="auto", namespace="alpa_serve")
+
+manager = None
+
+async def connect_manager():
+    """Poll once per second for the manager actor and cache its handle.
+
+    Never returns: whenever ``manager`` is ``None`` the actor named
+    ``mesh_group_manager_0`` is looked up again.
+    """
+    global manager
+    while True:
+        if manager is None:
+            try:
+                manager = ray.get_actor("mesh_group_manager_0")
+            except ValueError:
+                manager = None
+        await asyncio.sleep(1)
+
+asyncio.get_event_loop().create_task(connect_manager())
+
+async def redirect(request):
+    """Forward an HTTP request to the manager actor and return its reply.
+
+    The logged scope and the raw body are wrapped in ``HTTPRequestWrapper``
+    and pickled before being sent. When the actor is not connected, has
+    died, or hands back an exception, a JSON error response with status
+    400 is returned instead.
+    """
+    global manager
+
+    body = await request.body()
+    scope = log_scope(request)
+    payload = pickle.dumps(HTTPRequestWrapper(scope, body))
+    if manager is None:
+        # Not connected yet; ``connect_manager`` keeps retrying.
+        ret = RuntimeError("The model worker is not connected.")
+    else:
+        try:
+            ret = await manager.handle_request.remote("default", payload)
+        except ray.exceptions.RayActorError as e:
+            # Drop the dead handle so ``connect_manager`` looks it up
+            # again, and report the failure rather than leave ``ret``
+            # unbound.
+            manager = None
+            ret = e
+    if isinstance(ret, Exception):
+        ret = make_error_response(ret)
+        ret = JSONResponse(ret, status_code=400)
+    return ret
+
+
+@app.post("/completions")
+async def completions(request: Request):
+    return await redirect(request)
+
+
+@app.post("/logprobs")
+async def logprobs(request: Request):
+    return await redirect(request)
+
+
+@app.post("/call")
+async def call(request: Request):
+    return await redirect(request)
+
+##### Redirect End #####
+
 @app.get("/")
async def homepage(request: Request): + log_scope(request) return templates.TemplateResponse("index.html", { "request": request, "num_return_sequences": NUM_RETURN_SEQ, diff --git a/examples/llm_serving/log_config.yaml b/examples/llm_serving/log_config.yaml new file mode 100644 index 000000000..1e8b7da0c --- /dev/null +++ b/examples/llm_serving/log_config.yaml @@ -0,0 +1,21 @@ +version: 1 +formatters: + simple: + format: "%(asctime)s | %(levelname)s | %(name)s | %(message)s" + datefmt: "%Y-%m-%d %H:%M:%S" +handlers: + console: + class : logging.StreamHandler + formatter: simple + level : INFO + stream : ext://sys.stdout + file: + class : logging.handlers.TimedRotatingFileHandler + filename: weblogs/llm_serving.website.log + when: "D" + utc: True + formatter: simple + level : INFO +root: + level: INFO + handlers: [console, file] diff --git a/examples/llm_serving/service/constants.py b/examples/llm_serving/service/constants.py index 842975c23..83404f76d 100644 --- a/examples/llm_serving/service/constants.py +++ b/examples/llm_serving/service/constants.py @@ -3,6 +3,7 @@ # Alpa serve url ALPA_SERVE_PORT = 20001 ALPA_SERVE_URL = f"window.location.protocol + '//' + window.location.hostname + ':{ALPA_SERVE_PORT}/completions'" +#ALPA_SERVE_URL = f'"completions"' # Generation params NUM_BEAMS = 1 diff --git a/examples/llm_serving/service/static/index.html b/examples/llm_serving/service/static/index.html index 3ea1a1ea6..ae2729f2d 100644 --- a/examples/llm_serving/service/static/index.html +++ b/examples/llm_serving/service/static/index.html @@ -178,16 +178,17 @@ if ("responseJSON" in xhr) { msg = "Error: " + xhr.responseJSON.message; if (msg.includes("No replica of model") || - msg.includes("is not registered")) { - msg += "\nThe server is probably under maintenance. " + - "Please come back later."; + msg.includes("is not registered") || + msg.includes("object has no attribute")) { + msg += "\nThe server is probably under regular maintenance. 
" + + "Please come back 10 minutes later."; } $("#error").text(msg); } else { $("#error").text( - "Cannot connect to the server due to unknown error. " + - "\nThe server is probably under maintenance. " + - "Please come back later."); + "Cannot connect to the server due to unknown errors. " + + "\nThe server is probably under regular maintenance. " + + "Please come back 10 minutes later."); } } }); @@ -379,7 +380,7 @@

Large Model for Everyone

@@ -423,7 +424,7 @@

Large Model for Everyone

-
0.7 diff --git a/examples/llm_serving/service/utils.py b/examples/llm_serving/service/utils.py index a5d24369a..01338b985 100644 --- a/examples/llm_serving/service/utils.py +++ b/examples/llm_serving/service/utils.py @@ -42,13 +42,9 @@ def build_logger(): # Add a file handler for all loggers if handler is None: os.makedirs(LOGDIR, exist_ok=True) - logfile_path = os.path.join( - LOGDIR, - f"alpa.llm_serving.log.{datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}" - ) - handler = logging.handlers.RotatingFileHandler(logfile_path, - maxBytes=1024 * 1024, - backupCount=100000) + filename = os.path.join(LOGDIR, f"llm_serving.worker.log") + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True) handler.setFormatter(formatter) for name, item in logging.root.manager.loggerDict.items():