From 2c5bebce76df6485f4f712c91068aaa422af8a1f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 16 Jul 2024 13:46:26 +0800 Subject: [PATCH 01/24] update --- README.md | 10 ++-- docs/en/benchmark/profile_api_server.md | 2 +- docs/en/get_started.md | 2 +- docs/en/index.rst | 60 +++++++++---------- docs/en/{serving => llm}/api_server.md | 4 +- docs/en/llm/function_call.md | 1 + docs/en/{serving => llm}/gradio.md | 2 +- docs/en/llm/introduction.md | 1 + docs/en/{inference => llm}/pipeline.md | 2 +- docs/en/multi_modal/introduction.md | 1 + docs/en/multi_modal/minicpmv.md | 2 +- .../{inference => multi_modal}/vl_pipeline.md | 2 +- docs/en/quantization/kv_quant.md | 2 +- docs/en/quantization/w4a16.md | 6 +- docs/en/quantization/w8a8.md | 2 +- docs/en/supported_models/codellama.md | 2 +- 16 files changed, 50 insertions(+), 51 deletions(-) rename docs/en/{serving => llm}/api_server.md (98%) create mode 100644 docs/en/llm/function_call.md rename docs/en/{serving => llm}/gradio.md (98%) create mode 100644 docs/en/llm/introduction.md rename docs/en/{inference => llm}/pipeline.md (99%) create mode 100644 docs/en/multi_modal/introduction.md rename docs/en/{inference => multi_modal}/vl_pipeline.md (99%) diff --git a/README.md b/README.md index f18c42d9e..86b2ec572 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ ______________________________________________________________________ - \[2024/03\] Support DeepSeek-VL offline inference pipeline and serving. - \[2024/03\] Support VLM offline inference pipeline and serving. - \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on. -- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/api_server.md). +- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](docs/en/llm/api_server.md). - \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md) - \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies. @@ -180,7 +180,7 @@ print(response) > > `export LMDEPLOY_USE_MODELSCOPE=True` -For more information about inference pipeline, please refer to [here](./docs/en/inference/pipeline.md). +For more information about inference pipeline, please refer to [here](docs/en/llms/pipeline.md). 
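The ModelScope switch mentioned above only changes where the weights are pulled from; the pipeline call itself stays the same. Below is a minimal sketch, assuming the environment variable also takes effect when set from Python before the first download and that `Shanghai_AI_Laboratory/internlm2_5-7b-chat` is the corresponding ModelScope repo id (neither assumption is stated in this README):

```python
import os

# The README sets this in the shell; setting it before the first download is
# triggered is assumed to have the same effect.
os.environ['LMDEPLOY_USE_MODELSCOPE'] = 'True'

from lmdeploy import pipeline

# Assumed ModelScope mirror of internlm2_5-7b-chat; substitute the model you need.
pipe = pipeline('Shanghai_AI_Laboratory/internlm2_5-7b-chat')
print(pipe(['Hi, pls intro yourself', 'Shanghai is']))
```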
# Tutorials @@ -189,9 +189,9 @@ Please overview [getting_started](./docs/en/get_started.md) section for the basi For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/): - User Guide - - [LLM Inference pipeline](./docs/en/inference/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - - [VLM Inference pipeline](./docs/en/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - - [LLM Serving](docs/en/serving/api_server.md) + - [LLM Inference pipeline](docs/en/llms/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) + - [VLM Inference pipeline](docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) + - [LLM Serving](docs/en/llm/api_server.md) - [VLM Serving](docs/en/serving/api_server_vl.md) - [Quantization](docs/en/quantization) - Advance Guide diff --git a/docs/en/benchmark/profile_api_server.md b/docs/en/benchmark/profile_api_server.md index 456ee308a..07dfc4900 100644 --- a/docs/en/benchmark/profile_api_server.md +++ b/docs/en/benchmark/profile_api_server.md @@ -41,7 +41,7 @@ In this section, we take [internlm/internlm-7b](https://huggingface.co/internlm/ lmdeploy serve api_server internlm/internlm-7b ``` -If you would like to change the server's port or other parameters, such as inference engine, max batch size and etc., please run `lmdeploy serve api_server -h` or read [this](../serving/api_server.md) guide to get the detailed explanation. +If you would like to change the server's port or other parameters, such as inference engine, max batch size and etc., please run `lmdeploy serve api_server -h` or read [this](../llm/api_server.md) guide to get the detailed explanation. ### Profile diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 7cbb6c3c1..6774bc39b 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -27,7 +27,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -For more information on inference pipeline parameters, please refer to [here](./inference/pipeline.md). +For more information on inference pipeline parameters, please refer to [here](llms/pipeline.md). ## Serving diff --git a/docs/en/index.rst b/docs/en/index.rst index 430c6fa0d..c25b021ef 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -3,54 +3,39 @@ Welcome to LMDeploy's tutorials! .. _get_started: .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Get Started + installation.md get_started.md -.. _build: -.. toctree:: - :maxdepth: 1 - :caption: Build - - build.md - -.. _benchmark: -.. toctree:: - :maxdepth: 1 - :caption: Benchmark - - benchmark/profile_generation.md - benchmark/profile_throughput.md - benchmark/profile_api_server.md - benchmark/profile_triton_server.md - benchmark/evaluate_with_opencompass.md - .. _supported_models: .. toctree:: :maxdepth: 1 - :caption: Supported Models + :caption: Models supported_models/supported_models.md -.. _inference: +.. _llm_deployment: .. 
toctree:: :maxdepth: 1 - :caption: Inference - - inference/pipeline.md - inference/vl_pipeline.md + :caption: Large Language Models(LLMs) Deployment + llm/introduction.md + llm/pipeline.md + llm/api_server.md + llm/function_call.md + llm/gradio.md -.. _serving: +.. _vlm_deployment: .. toctree:: :maxdepth: 1 - :caption: serving + :caption: Vision-Language Models(LLMs) Deployment - serving/api_server.md - serving/api_server_vl.md - serving/gradio.md - serving/proxy_server.md + multi_modal/introduction.md + multi_modal/llava.md + multi_modal/internvl.md + multi_modal/minicpmv.md .. _quantization: .. toctree:: @@ -58,8 +43,19 @@ Welcome to LMDeploy's tutorials! :caption: Quantization quantization/w4a16.md - quantization/kv_quant.md quantization/w8a8.md + quantization/kv_quant.md + +.. _benchmark: +.. toctree:: + :maxdepth: 1 + :caption: Benchmark + + benchmark/profile_generation.md + benchmark/profile_throughput.md + benchmark/profile_api_server.md + benchmark/profile_triton_server.md + benchmark/evaluate_with_opencompass.md .. toctree:: :maxdepth: 1 diff --git a/docs/en/serving/api_server.md b/docs/en/llm/api_server.md similarity index 98% rename from docs/en/serving/api_server.md rename to docs/en/llm/api_server.md index 1286eba8b..a1af8bfcb 100644 --- a/docs/en/serving/api_server.md +++ b/docs/en/llm/api_server.md @@ -1,7 +1,7 @@ -# Serving LLM with OpenAI Compatible Server +# OpenAI Compatible Server This article primarily discusses the deployment of a single LLM model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API. -For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](./proxy_server.md). +For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](../serving/proxy_server.md). In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario. diff --git a/docs/en/llm/function_call.md b/docs/en/llm/function_call.md new file mode 100644 index 000000000..fcbbc4a4a --- /dev/null +++ b/docs/en/llm/function_call.md @@ -0,0 +1 @@ +# Function Call diff --git a/docs/en/serving/gradio.md b/docs/en/llm/gradio.md similarity index 98% rename from docs/en/serving/gradio.md rename to docs/en/llm/gradio.md index 7b223565f..584f5e0cc 100644 --- a/docs/en/serving/gradio.md +++ b/docs/en/llm/gradio.md @@ -1,4 +1,4 @@ -# Serving with Gradio +# WebUI Demo Starting an LLM model's gradio service with LMDeploy and interacting with the model on the WebUI is incredibly simple. diff --git a/docs/en/llm/introduction.md b/docs/en/llm/introduction.md new file mode 100644 index 000000000..72efd7acb --- /dev/null +++ b/docs/en/llm/introduction.md @@ -0,0 +1 @@ +# Introduction (readme first) diff --git a/docs/en/inference/pipeline.md b/docs/en/llm/pipeline.md similarity index 99% rename from docs/en/inference/pipeline.md rename to docs/en/llm/pipeline.md index 5540a4c81..27b08092e 100644 --- a/docs/en/inference/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -1,4 +1,4 @@ -# LLM Offline Inference Pipeline +# Offline Inference Pipeline In this tutorial, We will present a list of examples to introduce the usage of `lmdeploy.pipeline`. 
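As a reference point for the examples that follow, the smallest useful invocation looks like the sketch below. It only assumes the `internlm/internlm2_5-7b-chat` checkpoint used throughout these docs and otherwise relies on default engine settings:

```python
from lmdeploy import pipeline, GenerationConfig

# Default engine settings; TurboMind is preferred automatically when it supports the model.
pipe = pipeline('internlm/internlm2_5-7b-chat')

# Sampling parameters are optional and may be omitted entirely.
gen_config = GenerationConfig(max_new_tokens=256, top_p=0.8, temperature=0.8)
responses = pipe(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_config)
for res in responses:
    # Each returned item is a Response whose .text field holds the generated string.
    print(res.text)
```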
diff --git a/docs/en/multi_modal/introduction.md b/docs/en/multi_modal/introduction.md new file mode 100644 index 000000000..235724180 --- /dev/null +++ b/docs/en/multi_modal/introduction.md @@ -0,0 +1 @@ +# Introduction(readme first) diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index efc4ec823..d83beadb5 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -1,4 +1,4 @@ -# MiniCPM-V +# MiniCPM-V Deployment ## Introduction diff --git a/docs/en/inference/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md similarity index 99% rename from docs/en/inference/vl_pipeline.md rename to docs/en/multi_modal/vl_pipeline.md index 047fb37af..55f544e7c 100644 --- a/docs/en/inference/vl_pipeline.md +++ b/docs/en/multi_modal/vl_pipeline.md @@ -1,6 +1,6 @@ # VLM Offline Inference Pipeline -LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](./pipeline.md). +LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](../llms/pipeline.md). Currently, it supports the following models. diff --git a/docs/en/quantization/kv_quant.md b/docs/en/quantization/kv_quant.md index c349b635a..ed3982b6b 100644 --- a/docs/en/quantization/kv_quant.md +++ b/docs/en/quantization/kv_quant.md @@ -1,4 +1,4 @@ -# Key-Value(KV) Cache Quantization +# INT4/INT8 KV Cache Since v0.4.0, LMDeploy has supported **online** key-value (kv) cache quantization with int4 and int8 numerical precision, utilizing an asymmetric quantization method that is applied on a per-head, per-token basis. The original kv offline quantization method has been removed. diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 7c70c8d4b..ed1962839 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -1,4 +1,4 @@ -# W4A16 Quantization +# AWQ LMDeploy adopts [AWQ](https://arxiv.org/abs/2306.00978) algorithm for 4bit weight-only quantization. By developed the high-performance cuda kernel, the 4bit quantized model inference achieves up to 2.4x faster than FP16. @@ -88,7 +88,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -For more information about the pipeline parameters, please refer to [here](../inference/pipeline.md). +For more information about the pipeline parameters, please refer to [here](../llms/pipeline.md). In addition to performing inference with the quantized model on localhost, LMDeploy can also execute inference for the 4bit quantized model derived from AWQ algorithm available on Huggingface Hub, such as models from the [lmdeploy space](https://huggingface.co/lmdeploy) and [TheBloke space](https://huggingface.co/TheBloke) @@ -124,7 +124,7 @@ The default port of `api_server` is `23333`. After the server is launched, you c lmdeploy serve api_client http://0.0.0.0:23333 ``` -You can overview and try out `api_server` APIs online by swagger UI at `http://0.0.0.0:23333`, or you can also read the API specification from [here](../serving/api_server.md). +You can overview and try out `api_server` APIs online by swagger UI at `http://0.0.0.0:23333`, or you can also read the API specification from [here](../llm/api_server.md). 
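If a scripted client is preferable to the Swagger UI, the server can also be driven from Python. The sketch below uses LMDeploy's bundled `APIClient`; treat the import path and method names as assumptions to verify against the version you installed, and note that the plain OpenAI-compatible endpoints described in the linked guide work just as well.

```python
# Minimal sketch; verify the import path against your installed lmdeploy version.
from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient('http://0.0.0.0:23333')
model_name = api_client.available_models[0]
messages = [{'role': 'user', 'content': 'Hi, pls intro yourself'}]

# The client yields results incrementally, so iterate over the generator.
for output in api_client.chat_completions_v1(model=model_name, messages=messages):
    print(output)
```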
## Performance diff --git a/docs/en/quantization/w8a8.md b/docs/en/quantization/w8a8.md index 12e873b4f..1b1726bd5 100644 --- a/docs/en/quantization/w8a8.md +++ b/docs/en/quantization/w8a8.md @@ -1,4 +1,4 @@ -# W8A8 LLM Model Deployment +# SmoothQuant LMDeploy provides functions for quantization and inference of large language models using 8-bit integers. diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md index 9bbd7ba3a..5ef5bfa69 100644 --- a/docs/en/supported_models/codellama.md +++ b/docs/en/supported_models/codellama.md @@ -108,4 +108,4 @@ or through webui after launching gradio, lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -Regarding the detailed information of RESTful API, you can refer to the [guide](../serving/api_server.md). +Regarding the detailed information of RESTful API, you can refer to the [guide](../llm/api_server.md). From e45ed4584453e4c13eb8e35db2bcd90881ccb3dd Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 16 Jul 2024 14:36:26 +0800 Subject: [PATCH 02/24] adjust directory structure --- README.md | 6 +++--- docs/en/get_started.md | 2 +- docs/en/index.rst | 14 ++++++-------- docs/en/installation.md | 1 + docs/en/llm/api_server.md | 2 +- docs/en/{serving => llm}/api_server_tools.md | 2 +- docs/en/llm/function_call.md | 1 - docs/en/{serving => llm}/proxy_server.md | 0 docs/en/{serving => llm}/qos.md | 0 docs/en/{serving => multi_modal}/api_server_vl.md | 4 ++-- docs/en/multi_modal/index.rst | 12 ++++++++++++ docs/en/multi_modal/internvl.md | 1 + docs/en/multi_modal/llava.md | 1 + docs/en/multi_modal/vl_pipeline.md | 4 ++-- docs/en/quantization/kv_quant.md | 2 +- docs/en/quantization/w4a16.md | 2 +- 16 files changed, 33 insertions(+), 21 deletions(-) create mode 100644 docs/en/installation.md rename docs/en/{serving => llm}/api_server_tools.md (99%) delete mode 100644 docs/en/llm/function_call.md rename docs/en/{serving => llm}/proxy_server.md (100%) rename docs/en/{serving => llm}/qos.md (100%) rename docs/en/{serving => multi_modal}/api_server_vl.md (97%) create mode 100644 docs/en/multi_modal/index.rst create mode 100644 docs/en/multi_modal/internvl.md create mode 100644 docs/en/multi_modal/llava.md diff --git a/README.md b/README.md index 11cc5ec1f..32e1d735c 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ ______________________________________________________________________ - \[2024/03\] Support VLM offline inference pipeline and serving. - \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on. - \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](docs/en/llm/api_server.md). -- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md) +- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](docs/en/llm/proxy_server.md) - \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies. 
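For readers who want to try the pure-Python engine mentioned in that entry, opting in is a one-line change to the pipeline call. A minimal sketch, reusing the model id that appears elsewhere in these docs:

```python
from lmdeploy import pipeline, PytorchEngineConfig

# Request the PyTorch engine explicitly instead of the default TurboMind engine.
pipe = pipeline('internlm/internlm2_5-7b-chat',
                backend_config=PytorchEngineConfig(session_len=8192))
print(pipe(['Hi, pls intro yourself', 'Shanghai is']))
```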
@@ -201,7 +201,7 @@ For detailed user guides and advanced guides, please refer to our [tutorials](ht - [LLM Inference pipeline](docs/en/llms/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - [VLM Inference pipeline](docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - [LLM Serving](docs/en/llm/api_server.md) - - [VLM Serving](docs/en/serving/api_server_vl.md) + - [VLM Serving](docs/en/multi_modal/api_server_vl.md) - [Quantization](docs/en/quantization) - Advance Guide - [Inference Engine - TurboMind](docs/en/inference/turbomind.md) @@ -210,7 +210,7 @@ For detailed user guides and advanced guides, please refer to our [tutorials](ht - [Add a new model](docs/en/advance/pytorch_new_model.md) - gemm tuning - [Long context inference](docs/en/advance/long_context.md) - - [Multi-model inference service](docs/en/serving/proxy_server.md) + - [Multi-model inference service](docs/en/llm/proxy_server.md) # Third-party projects diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 63c56ad1e..91788b129 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -27,7 +27,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -For more information on inference pipeline parameters, please refer to [here](llms/pipeline.md). +For more information on inference pipeline parameters, please refer to [here](llm/pipeline.md). ## Serving diff --git a/docs/en/index.rst b/docs/en/index.rst index 7d304d6f2..51eb907c2 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -59,20 +59,19 @@ Documentation llm/introduction.md llm/pipeline.md llm/api_server.md - llm/function_call.md + llm/api_server_tools.md llm/gradio.md + llm/proxy_server.md -.. _serving: .. _vlm_deployment: .. toctree:: :maxdepth: 1 - :caption: Serving - :caption: Vision-Language Models(LLMs) Deployment + :caption: Vision-Language Models(VLMs) Deployment multi_modal/introduction.md - multi_modal/llava.md - multi_modal/internvl.md - multi_modal/minicpmv.md + multi_modal/vl_pipeline.md + multi_modal/api_server_vl.md + multi_modal/index.rst .. _quantization: .. toctree:: @@ -104,7 +103,6 @@ Documentation advance/long_context.md advance/chat_template.md advance/debug_turbomind.md - serving/qos.md .. toctree:: :maxdepth: 1 diff --git a/docs/en/installation.md b/docs/en/installation.md new file mode 100644 index 000000000..25267fe2b --- /dev/null +++ b/docs/en/installation.md @@ -0,0 +1 @@ +# Installation diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index 2aa625115..2699ab5f4 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -1,7 +1,7 @@ # OpenAI Compatible Server This article primarily discusses the deployment of a single LLM model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API. -For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](../serving/proxy_server.md). +For the sake of convenience, we refer to this service as `api_server`. 
Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](proxy_server.md). In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario. diff --git a/docs/en/serving/api_server_tools.md b/docs/en/llm/api_server_tools.md similarity index 99% rename from docs/en/serving/api_server_tools.md rename to docs/en/llm/api_server_tools.md index 162964093..2f631acd3 100644 --- a/docs/en/serving/api_server_tools.md +++ b/docs/en/llm/api_server_tools.md @@ -1,4 +1,4 @@ -# Tools +# Function Call ## Single Round Invocation diff --git a/docs/en/llm/function_call.md b/docs/en/llm/function_call.md deleted file mode 100644 index fcbbc4a4a..000000000 --- a/docs/en/llm/function_call.md +++ /dev/null @@ -1 +0,0 @@ -# Function Call diff --git a/docs/en/serving/proxy_server.md b/docs/en/llm/proxy_server.md similarity index 100% rename from docs/en/serving/proxy_server.md rename to docs/en/llm/proxy_server.md diff --git a/docs/en/serving/qos.md b/docs/en/llm/qos.md similarity index 100% rename from docs/en/serving/qos.md rename to docs/en/llm/qos.md diff --git a/docs/en/serving/api_server_vl.md b/docs/en/multi_modal/api_server_vl.md similarity index 97% rename from docs/en/serving/api_server_vl.md rename to docs/en/multi_modal/api_server_vl.md index 390da44c5..878ddfea9 100644 --- a/docs/en/serving/api_server_vl.md +++ b/docs/en/multi_modal/api_server_vl.md @@ -1,7 +1,7 @@ -# Serving VLM with OpenAI Compatible Server +# OpenAI Compatible Server This article primarily discusses the deployment of a single large vision language model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API. -For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](./proxy_server.md). +For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](../llm/proxy_server.md). In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario. diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst new file mode 100644 index 000000000..04bf8570e --- /dev/null +++ b/docs/en/multi_modal/index.rst @@ -0,0 +1,12 @@ +Vision-Language Models +================================= + +.. 
toctree:: + :maxdepth: 1 + :caption: Examples + + llava.md + internvl.md + xcomposer2d5.md + cogvlm.md + minicpmv.md diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md new file mode 100644 index 000000000..28c011199 --- /dev/null +++ b/docs/en/multi_modal/internvl.md @@ -0,0 +1 @@ +# InternVL diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md new file mode 100644 index 000000000..c10506421 --- /dev/null +++ b/docs/en/multi_modal/llava.md @@ -0,0 +1 @@ +# LLaVA diff --git a/docs/en/multi_modal/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md index 55f544e7c..72eb0b459 100644 --- a/docs/en/multi_modal/vl_pipeline.md +++ b/docs/en/multi_modal/vl_pipeline.md @@ -1,6 +1,6 @@ -# VLM Offline Inference Pipeline +# Offline Inference Pipeline -LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](../llms/pipeline.md). +LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference [pipeline](../llm/pipeline.md). Currently, it supports the following models. diff --git a/docs/en/quantization/kv_quant.md b/docs/en/quantization/kv_quant.md index 8aaa402aa..22a8ec91a 100644 --- a/docs/en/quantization/kv_quant.md +++ b/docs/en/quantization/kv_quant.md @@ -4,7 +4,7 @@ Since v0.4.0, LMDeploy has supported **online** key-value (kv) cache quantizatio Intuitively, quantization is beneficial for increasing the number of kv block. Compared to fp16, the number of kv block for int4/int8 kv can be increased by 4 times and 2 times respectively. This means that under the same memory conditions, the system can support a significantly increased number of concurrent operations after kv quantization, thereby ultimately enhancing throughput. -However, quantization typically brings in some loss of model accuracy. We have used OpenCompass to evaluate the accuracy of several models after applying int4/int8 quantization. int8 kv keeps the accuracy while int4 kv has slight loss. The detailed results are presented in the [Evaluation](#Evaluation) section. You can refer to the information and choose wisely based on your requirements. +However, quantization typically brings in some loss of model accuracy. We have used OpenCompass to evaluate the accuracy of several models after applying int4/int8 quantization. int8 kv keeps the accuracy while int4 kv has slight loss. The detailed results are presented in the [Evaluation](#evaluation) section. You can refer to the information and choose wisely based on your requirements. LMDeploy inference with quantized kv supports the following NVIDIA GPU models: diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 289ac1318..7b0306c9e 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -88,7 +88,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -For more information about the pipeline parameters, please refer to [here](../llms/pipeline.md). +For more information about the pipeline parameters, please refer to [here](../llm/pipeline.md). 
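Because the KV-cache guide above and this AWQ guide are often combined in practice, here is a minimal sketch of enabling both at once. The local path is a placeholder for the directory produced by the quantization step earlier in this guide, and `quant_policy=8` is assumed to be the engine switch that turns on the int8 online kv cache; check both against your LMDeploy version.

```python
from lmdeploy import pipeline, TurbomindEngineConfig

# './internlm2_5-7b-chat-4bit' is a placeholder for the output directory of the
# quantization step in this guide; point it at your own quantized model.
engine_config = TurbomindEngineConfig(model_format='awq',  # 4-bit AWQ weights
                                      quant_policy=8)      # int8 online kv cache
pipe = pipeline('./internlm2_5-7b-chat-4bit', backend_config=engine_config)
print(pipe(['Hi, pls intro yourself', 'Shanghai is']))
```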
In addition to performing inference with the quantized model on localhost, LMDeploy can also execute inference for the 4bit quantized model derived from AWQ algorithm available on Huggingface Hub, such as models from the [lmdeploy space](https://huggingface.co/lmdeploy) and [TheBloke space](https://huggingface.co/TheBloke) From 3dc3236b5d297408f21f42b3a24c15c181f9c796 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 16 Jul 2024 14:39:26 +0800 Subject: [PATCH 03/24] set depth 2 --- docs/en/multi_modal/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst index 04bf8570e..3c6061e77 100644 --- a/docs/en/multi_modal/index.rst +++ b/docs/en/multi_modal/index.rst @@ -2,7 +2,7 @@ Vision-Language Models ================================= .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :caption: Examples llava.md From 1633e04129091d53bf2b8d69ce3e9d31747ea177 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 16 Jul 2024 19:29:31 +0800 Subject: [PATCH 04/24] check in installation.md --- docs/en/installation.md | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/docs/en/installation.md b/docs/en/installation.md index 25267fe2b..acf2402cc 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -1 +1,59 @@ # Installation + +LMDeploy is a python library for compressing, deploying, and serving Large Language Models and Vision-Language Models. +Its core inference engines include TurboMind Engine and PyTorch Engine. The former is developed by C++ and CUDA, striving for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to decrease the barriers for developers. + +## Install with pip (Recommend) + +You can install lmdeploy using pip (python 3.8 - 3.12) as follows: + +```shell +pip install lmdeploy +``` + +The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by: + +```shell +export LMDEPLOY_VERSION=0.5.1 +export PYTHON_VERSION=38 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## Install nightly-build package with pip + +The release frequency of LMDeploy is approximately once or twice monthly. If your desired feature has been merged to LMDeploy main branch but hasn't been published yet, you can experiment with the nightly-built package available [here](https://github.com/zhyncs/lmdeploy-build) according to your CUDA and Python versions + +## Install from the source + +If you are using the PyTorch Engine for inference, the installation from the source is quite simple: + +```shell +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +pip install -e . +``` + +But if you are using the TurboMind Engine, you have to build the source as shown below: + +Clone LMDeploy source code and change to its root directory: + +```shell +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +``` + +Run the following command to build the whl package according to your CUDA and Python versions. +Kindly select judiciously from the provided `docker_tag` options `{cuda12.1, cuda11.8}` and the Python version set `{py38, py39, py310, py311, py312}`. 
+ +```shell +docker_tag="cuda12.1" +py_version="py310" +output_dir="lmdeploy_wheel" +bash builder/manywheel/build_wheel.sh ${py_version} "manylinux2014_x86_64" ${docker_tag} ${output_dir} +``` + +After the whl is built successfully, you can install it by: + +```shell +pip install builder/manywheel/lmdeploy_wheel/*.whl +``` From 4c8a8cdecfbcbd4a90e5dfeff69235871269ba9f Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 16 Jul 2024 21:26:31 +0800 Subject: [PATCH 05/24] check in installation.md --- docs/en/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/installation.md b/docs/en/installation.md index acf2402cc..b50510001 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -55,5 +55,5 @@ bash builder/manywheel/build_wheel.sh ${py_version} "manylinux2014_x86_64" ${doc After the whl is built successfully, you can install it by: ```shell -pip install builder/manywheel/lmdeploy_wheel/*.whl +pip install builder/manywheel/${output_dir}/*.whl ``` From 5a59bd69d0978f11cea780dda861444503c2ed31 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 18 Jul 2024 20:38:57 +0800 Subject: [PATCH 06/24] update quick start --- README.md | 4 +- docs/en/get_started.md | 197 +++++++++++++++++++++++++++++++++-------- 2 files changed, 163 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 32e1d735c..7bbe55031 100644 --- a/README.md +++ b/README.md @@ -189,7 +189,7 @@ print(response) > > `export LMDEPLOY_USE_MODELSCOPE=True` -For more information about inference pipeline, please refer to [here](docs/en/llms/pipeline.md). +For more information about inference pipeline, please refer to [here](docs/en/llm/pipeline.md). # Tutorials @@ -198,7 +198,7 @@ Please overview [getting_started](./docs/en/get_started.md) section for the basi For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/): - User Guide - - [LLM Inference pipeline](docs/en/llms/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) + - [LLM Inference pipeline](docs/en/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - [VLM Inference pipeline](docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - [LLM Serving](docs/en/llm/api_server.md) - [VLM Serving](docs/en/multi_modal/api_server_vl.md) diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 91788b129..2d8050667 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -1,69 +1,194 @@ -# Get Started +# Quick Start -LMDeploy offers functionalities such as model quantization, offline batch inference, online serving, etc. Each function can be completed with just a few simple lines of code or commands. 
+This tutorial shows the usage of LMDeploy on: -## Installation +- Offline inference of LLM model and VLM model +- Serve a LLM or VLM model by the OpenAI compatible server +- Console CLI to interactively chat with LLM model -Install lmdeploy with pip (python 3.8+) or [from source](./build.md) +Before reading further, please ensure that you have installed lmdeploy as outlined in the [installation guide](installation.md) -```shell -pip install lmdeploy +## Offline batch inference + +### LLM inference + +```python +from lmdeploy import pipeline +pipe = pipeline('internlm/internlm2_5-7b-chat') +response = pipe(['Hi, pls intro yourself', 'Shanghai is']) +print(response) ``` -The default prebuilt package is compiled on **CUDA 12**. However, if CUDA 11+ is required, you can install lmdeploy by: +When constructing the `pipeline`, if an inference engine is not designated between the TurboMind Engine and the PyTorch Engine, LMDeploy will automatically assign one based on [their respective capabilities](supported_models/supported_models.md), with the TurboMind Engine taking precedence by default. -```shell -export LMDEPLOY_VERSION=0.5.0 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +However, you have the option to manually select an engine. For instance, + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=TurbomindEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) ``` -## Offline batch inference +or, ```python -import lmdeploy -pipe = lmdeploy.pipeline("internlm/internlm2_5-7b-chat") -response = pipe(["Hi, pls intro yourself", "Shanghai is"]) +from lmdeploy import pipeline, PytorchEngineConfig +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=PytorchEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) +``` + +The parameters "max_batch_size", "cache_max_entry_count" and "session_len" significantly influence the GPU memory footprint, especially "cache_max_entry_count" playing a dominant role. If you encounter an Out of Memory(OOM) error, you should consider reducing their values. + +When use the callable `pipe()` to perform token generation with given prompts, you can set the sampling parameters via `GenerationConfig` as below: + +```python +from lmdeploy import GenerationConfig, pipeline + +pipe = pipeline('internlm/internlm2_5-7b-chat') +prompts = ['Hi, pls intro yourself', 'Shanghai is'] +response = pipe(prompts, + gen_config=GenerationConfig( + max_new_tokens=1024, + top_p=0.8, + top_k=40, + temperature=0.6 + )) +``` + +In the `GenerationConfig`, `top_k=1` or `temperature=0.0` indicates greedy search. + +For more information about pipeline, please read the [detailed tutorial](llm/pipeline.md) + +### VLM inference + +The usage of VLM inference pipeline is akin to that of LLMs, with the additional capability of processing image data with the pipeline. 
+For example, you can utilize the following code snippet to perform the inference with an InternVL model: + +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B') + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) print(response) ``` -For more information on inference pipeline parameters, please refer to [here](llm/pipeline.md). +In VLM pipeline, the default image processing batch size is 1. This can be adjusted by `VisionConfig`. For instance, you might set it like this: + +```python +from lmdeploy import pipeline, VisionConfig +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B', + vision_config=VisionConfig( + max_batch_size=8 + )) + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +However, the larger the image batch size, the greater risk of an OOM error, because the LLM component within the VLM model pre-allocates a massive amount of memory in advance. + +We encourage you to manually choose between the TurboMind Engine and the PyTorch Engine based on their respective capabilities, as detailed in [the supported-models metrix](./supported_models/supported_models.md). +Additionally, follow the instructions in [LLM Inference](#llm-inference) section to reduce the values of memory-related parameters ## Serving -LMDeploy offers various serving methods, choosing one that best meet your requirements. +As demonstrated in the previous [offline batch inference](#offline-batch-inference) section, this part presents the respective serving methods for LLMs and VLMs. -- [Serving with openai compatible server](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html) -- [Serving with docker](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html#option-2-deploying-with-docker) -- [Serving with gradio](https://lmdeploy.readthedocs.io/en/latest/serving/gradio.html) +### Serve a LLM model -## Quantization +```shell +lmdeploy serve api_server internlm/internlm2_5-7b-chat +``` -LMDeploy provides the following quantization methods. Please visit the following links for the detailed guide +This command will launch an OpenAI-compatible server on the localhost at port 23333. You can specify a different server port by using the `--server-port` option. +For more options, consult the help documentation by running `lmdeploy serve api_server --help`. Most of these options align with the engine configuration. -- [4bit weight-only quantization](quantization/w4a16.md) -- [k/v quantization](quantization/kv_quant.md) -- [w8a8 quantization](quantization/w8a8.md) +To access the service, you can utilize the official OpenAI Python package `pip install openai`. 
Below is an example demonstrating how to use the entrypoint `v1/chat/completions` -## Useful Tools +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + temperature=0.8, + top_p=0.8 +) +print(response) +``` -LMDeploy CLI offers the following utilities, helping users experience LLM features conveniently +We encourage you to refer to the detailed guide for more comprehensive information about [serving with Docker](./llm/api_server.md), [function calls](llm/api_server_tools.md) and other topics -### Inference with Command line Interface +### Serve a VLM model ```shell -lmdeploy chat internlm/internlm2_5-7b-chat +lmdeploy serve api_server OpenGVLab/InternVL2-8B ``` -### Serving with Web UI +LMDeploy reuses the vision component from upstream VLM repository. Consequently, Serving VLMs can vary, as the upstream VLM repo might introduce different dependencies and offer distinct functionalities. +We invite users to explore the serving method of each supported VLM from [here](multi_modal) -LMDeploy adopts gradio to develop the online demo. +You can access the VLM service in a manner similar to how you would access the `gptv4` service by modifying the `api_key` and `base_url` parameters: + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` + +## Inference with Command line Interface + +LMDeploy offers a very convenient CLI tool for users to chat with the LLM model locally. For example: ```shell -# install dependencies -pip install lmdeploy[serve] -# launch gradio server -lmdeploy serve gradio internlm/internlm2_5-7b-chat +lmdeploy chat internlm/internlm2_5-7b-chat --backend turbomind ``` -![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) +It is designed to assist users in checking and verifying whether LMDeploy supports their model, whether the chat template is applied correctly, and whether the inference results are delivered smoothly. + +Another tool, `lmdeploy check_env`, aims to gather the essential environment information. It is crucial when reporting an issue to us, as it helps us diagnose and resolve the problem more effectively. + +If you have any doubt about their usage, you can try using the `--help` option to obtain detailed information. 
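Returning to the OpenAI-compatible server from the serving section above: for long answers it is usually nicer to stream tokens as they are produced rather than wait for the full completion. A minimal sketch with the same `openai` package, keeping the default host and port used earlier:

```python
from openai import OpenAI

client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id

# stream=True yields incremental chunks instead of one final message.
stream = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': 'provide three suggestions about time management'}],
    temperature=0.8,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end='', flush=True)
print()
```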
From 7320e6d4c820af183c9a0513a8f4cb263afdd911 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 18 Jul 2024 20:54:21 +0800 Subject: [PATCH 07/24] update supported platforms --- docs/en/installation.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/installation.md b/docs/en/installation.md index b50510001..eef49bd07 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -3,6 +3,8 @@ LMDeploy is a python library for compressing, deploying, and serving Large Language Models and Vision-Language Models. Its core inference engines include TurboMind Engine and PyTorch Engine. The former is developed by C++ and CUDA, striving for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to decrease the barriers for developers. +It supports both Linux and Windows platform, with minimum requirement of CUDA version 11.3. + ## Install with pip (Recommend) You can install lmdeploy using pip (python 3.8 - 3.12) as follows: From e821c9546c87f8e23e87085cf48a7e21a5ff91f5 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 18 Jul 2024 20:57:35 +0800 Subject: [PATCH 08/24] update supported GPUs --- docs/en/installation.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/en/installation.md b/docs/en/installation.md index eef49bd07..32197fa0a 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -3,7 +3,12 @@ LMDeploy is a python library for compressing, deploying, and serving Large Language Models and Vision-Language Models. Its core inference engines include TurboMind Engine and PyTorch Engine. The former is developed by C++ and CUDA, striving for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to decrease the barriers for developers. -It supports both Linux and Windows platform, with minimum requirement of CUDA version 11.3. +It supports deployment both Linux and Windows platform, with minimum requirement of CUDA version 11.3. Furthermore, it is compatible with the following NVIDIA GPUs: + +- Volta(sm70): V100 +- Turing(sm75): 20 series, T4 +- Ampere(sm80,sm86): 30 series, A10, A16, A30, A100 +- Ada Lovelace(sm89): 40 series ## Install with pip (Recommend) From 82fddb5db699380709ee0caa647872025d393495 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 18 Jul 2024 21:27:25 +0800 Subject: [PATCH 09/24] typo --- docs/en/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/installation.md b/docs/en/installation.md index 32197fa0a..d49b960bd 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -3,7 +3,7 @@ LMDeploy is a python library for compressing, deploying, and serving Large Language Models and Vision-Language Models. Its core inference engines include TurboMind Engine and PyTorch Engine. The former is developed by C++ and CUDA, striving for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to decrease the barriers for developers. -It supports deployment both Linux and Windows platform, with minimum requirement of CUDA version 11.3. Furthermore, it is compatible with the following NVIDIA GPUs: +It supports deployment on both Linux and Windows platform, with minimum requirement of CUDA version 11.3. 
Furthermore, it is compatible with the following NVIDIA GPUs: - Volta(sm70): V100 - Turing(sm75): 20 series, T4 From 6045b4a4303a906d82091e92684b7c22b687b865 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 22 Jul 2024 11:19:07 +0800 Subject: [PATCH 10/24] update --- docs/en/llm/pipeline.md | 74 +-------------------------------- docs/en/multi_modal/cogvlm.md | 2 +- docs/en/multi_modal/minicpmv.md | 2 +- 3 files changed, 3 insertions(+), 75 deletions(-) diff --git a/docs/en/llm/pipeline.md b/docs/en/llm/pipeline.md index 557067357..1bef1f221 100644 --- a/docs/en/llm/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -6,41 +6,7 @@ You can overview the detailed pipeline API in [this](https://lmdeploy.readthedoc ## Usage -- **An example using default parameters:** - -```python -from lmdeploy import pipeline - -pipe = pipeline('internlm/internlm2_5-7b-chat') -response = pipe(['Hi, pls intro yourself', 'Shanghai is']) -print(response) -``` - -In this example, the pipeline by default allocates a predetermined percentage of GPU memory for storing k/v cache. The ratio is dictated by the parameter `TurbomindEngineConfig.cache_max_entry_count`. - -There have been alterations to the strategy for setting the k/v cache ratio throughout the evolution of LMDeploy. The following are the change histories: - -1. `v0.2.0 <= lmdeploy <= v0.2.1` - - `TurbomindEngineConfig.cache_max_entry_count` defaults to 0.5, indicating 50% GPU **total memory** allocated for k/v cache. Out Of Memory (OOM) errors may occur if a 7B model is deployed on a GPU with memory less than 40G. If you encounter an OOM error, please decrease the ratio of the k/v cache occupation as follows: - - ```python - from lmdeploy import pipeline, TurbomindEngineConfig - - # decrease the ratio of the k/v cache occupation to 20% - backend_config = TurbomindEngineConfig(cache_max_entry_count=0.2) - - pipe = pipeline('internlm/internlm2_5-7b-chat', - backend_config=backend_config) - response = pipe(['Hi, pls intro yourself', 'Shanghai is']) - print(response) - ``` - -2. `lmdeploy > v0.2.1` - - The allocation strategy for k/v cache is changed to reserve space from the **GPU free memory** proportionally. The ratio `TurbomindEngineConfig.cache_max_entry_count` has been adjusted to 0.8 by default. If OOM error happens, similar to the method mentioned above, please consider reducing the ratio value to decrease the memory usage of the k/v cache. 
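As a first taste of that RESTful interface, the two calls most clients start with are listing the served model and posting a chat completion. Below is a minimal `requests` sketch, assuming the server follows the standard OpenAI response schema that the client examples elsewhere in these docs rely on:

```python
import requests

base_url = 'http://0.0.0.0:23333'

# Fetch the served model name first; it is required by the OpenAI-compatible routes.
model_name = requests.get(f'{base_url}/v1/models').json()['data'][0]['id']

payload = {
    'model': model_name,
    'messages': [{'role': 'user', 'content': 'Hi, pls intro yourself'}],
    'temperature': 0.8,
}
resp = requests.post(f'{base_url}/v1/chat/completions', json=payload)
print(resp.json()['choices'][0]['message']['content'])
```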
- -- **An example showing how to set tensor parallel num**: +## Use multi GPUs ```python from lmdeploy import pipeline, TurbomindEngineConfig @@ -52,23 +18,6 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is']) print(response) ``` -- **An example for setting sampling parameters:** - -```python -from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig - -backend_config = TurbomindEngineConfig(tp=2) -gen_config = GenerationConfig(top_p=0.8, - top_k=40, - temperature=0.8, - max_new_tokens=1024) -pipe = pipeline('internlm/internlm2_5-7b-chat', - backend_config=backend_config) -response = pipe(['Hi, pls intro yourself', 'Shanghai is'], - gen_config=gen_config) -print(response) -``` - - **An example for OpenAI format prompt input:** ```python @@ -163,27 +112,6 @@ response = pipe(prompts, gen_config=gen_config) print(response) ``` -- **An example for slora.** - -```python -from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig - -backend_config = PytorchEngineConfig(session_len=2048, - adapters=dict(lora_name_1='chenchi/lora-chatglm2-6b-guodegang')) -gen_config = GenerationConfig(top_p=0.8, - top_k=40, - temperature=0.8, - max_new_tokens=1024) -pipe = pipeline('THUDM/chatglm2-6b', - backend_config=backend_config) -prompts = [[{ - 'role': 'user', - 'content': '您猜怎么着' -}]] -response = pipe(prompts, gen_config=gen_config, adapter_name='lora_name_1') -print(response) -``` - ## FAQs - **RuntimeError: An attempt has been made to start a new process before the current process has finished its bootstrapping phase**. diff --git a/docs/en/multi_modal/cogvlm.md b/docs/en/multi_modal/cogvlm.md index ed2e438df..e183d99f3 100644 --- a/docs/en/multi_modal/cogvlm.md +++ b/docs/en/multi_modal/cogvlm.md @@ -1,4 +1,4 @@ -# cogvlm +# CogVLM ## Introduction diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index d83beadb5..efc4ec823 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -1,4 +1,4 @@ -# MiniCPM-V Deployment +# MiniCPM-V ## Introduction From 47a4d7f912a2114d21bb61b6d13c2793aab13ed3 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 24 Jul 2024 13:47:38 +0800 Subject: [PATCH 11/24] update api_server --- docs/en/llm/api_server.md | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index 2699ab5f4..285b0e32f 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -3,7 +3,7 @@ This article primarily discusses the deployment of a single LLM model across multiple GPUs on a single node, providing a service that is compatible with the OpenAI interface, as well as the usage of the service API. For the sake of convenience, we refer to this service as `api_server`. Regarding parallel services with multiple models, please refer to the guide about [Request Distribution Server](proxy_server.md). -In the following sections, we will first introduce two methods for starting the service, choosing the appropriate one based on your application scenario. +In the following sections, we will first introduce methods for starting the service, choosing the appropriate one based on your application scenario. Next, we focus on the definition of the service's RESTful API, explore the various ways to interact with the interface, and demonstrate how to try the service through the Swagger UI or LMDeploy CLI tools. 
@@ -242,10 +242,6 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ ## Integrate with WebUI -LMDeploy utilizes `gradio` or [OpenAOE](https://github.com/InternLM/OpenAOE) to integrate a web ui for `api_server` - -### Option 1: gradio - ```shell # api_server_url is what printed in api_server.py, e.g. http://localhost:23333 # server_ip and server_port here are for gradio ui @@ -253,21 +249,12 @@ LMDeploy utilizes `gradio` or [OpenAOE](https://github.com/InternLM/OpenAOE) to lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -### Option 2: OpenAOE - -```shell -pip install -U openaoe -openaoe -f /path/to/your/config-template.yaml -``` - -Please refer to the [guidance](https://github.com/InternLM/OpenAOE/blob/main/docs/tech-report/model_serving_by_lmdeploy/model_serving_by_lmdeploy.md) for more deploy information. - ## FAQ 1. When user got `"finish_reason":"length"`, it means the session is too long to be continued. The session length can be modified by passing `--session_len` to api_server. -2. When OOM appeared at the server side, please reduce the `cache_max_entry_count` of `backend_config` when lanching the service. +2. When OOM appeared at the server side, please reduce the `cache_max_entry_count` of `backend_config` when launching the service. 3. When the request with the same `session_id` to `/v1/chat/interactive` got a empty return value and a negative `tokens`, please consider setting `interactive_mode=false` to restart the session. From e0c5f86d541c4ba47f1ff39a6b45088dbc781820 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 2 Aug 2024 13:04:04 +0800 Subject: [PATCH 12/24] update --- docs/en/installation.md | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/docs/en/installation.md b/docs/en/installation.md index d49b960bd..6ded59fa3 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -40,27 +40,38 @@ cd lmdeploy pip install -e . ``` -But if you are using the TurboMind Engine, you have to build the source as shown below: +But if you are using the TurboMind Engine, you have to build the source as shown below. The `openmmlab/lmdeploy:{tag}` docker image is strongly recommended. -Clone LMDeploy source code and change to its root directory: +**Step 1** Get LMDeploy's docker image + +```shell +docker pull openmmlab/lmdeploy:latest +``` + +```{note} +The "openmmlab/lmdeploy:latest" is based on "nvidia/cuda:12.4.1-devel-ubuntu22.04". If you are working on a platform with cuda 11+ driver, please use "openmmlab/lmdeploy:latest-cu11". +The pattern of the LMDeploy docker image tag is "openmmlab/lmdeploy:{version}-cu(11|12)" since v0.5.3. +``` + +**Step 2** Clone LMDeploy source code and change to its root directory: ```shell git clone https://github.com/InternLM/lmdeploy.git cd lmdeploy ``` -Run the following command to build the whl package according to your CUDA and Python versions. -Kindly select judiciously from the provided `docker_tag` options `{cuda12.1, cuda11.8}` and the Python version set `{py38, py39, py310, py311, py312}`. 
+**Step 3** launch docker container in interactive mode ```shell -docker_tag="cuda12.1" -py_version="py310" -output_dir="lmdeploy_wheel" -bash builder/manywheel/build_wheel.sh ${py_version} "manylinux2014_x86_64" ${docker_tag} ${output_dir} +docker run --gpus all --net host --shm-size 16g -v $(pwd):/opt/lmdeploy --name lmdeploy -it openmmlab/lmdeploy:latest bin/bash ``` -After the whl is built successfully, you can install it by: +**Step 4** build and installation: ```shell -pip install builder/manywheel/${output_dir}/*.whl +cd /opt/lmdeploy +mkdir -p build && cd build +../generate.sh make +make -j$(nproc) && make install +cd .. ``` From c334592b5d96bb48d9e255aee274b5671f6ba16d Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 2 Aug 2024 15:08:50 +0800 Subject: [PATCH 13/24] format the doc --- docs/en/installation.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/installation.md b/docs/en/installation.md index 6ded59fa3..5a2b75223 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -42,7 +42,7 @@ pip install -e . But if you are using the TurboMind Engine, you have to build the source as shown below. The `openmmlab/lmdeploy:{tag}` docker image is strongly recommended. -**Step 1** Get LMDeploy's docker image +**Step 1** - Get LMDeploy's docker image ```shell docker pull openmmlab/lmdeploy:latest @@ -53,20 +53,20 @@ The "openmmlab/lmdeploy:latest" is based on "nvidia/cuda:12.4.1-devel-ubuntu22.0 The pattern of the LMDeploy docker image tag is "openmmlab/lmdeploy:{version}-cu(11|12)" since v0.5.3. ``` -**Step 2** Clone LMDeploy source code and change to its root directory: +**Step 2** - Clone LMDeploy source code and change to its root directory ```shell git clone https://github.com/InternLM/lmdeploy.git cd lmdeploy ``` -**Step 3** launch docker container in interactive mode +**Step 3** - launch docker container in interactive mode ```shell docker run --gpus all --net host --shm-size 16g -v $(pwd):/opt/lmdeploy --name lmdeploy -it openmmlab/lmdeploy:latest bin/bash ``` -**Step 4** build and installation: +**Step 4** - build and installation ```shell cd /opt/lmdeploy From c9fb63c16ed12779fad71ee5e98373c23e02be0b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 2 Aug 2024 19:23:33 +0800 Subject: [PATCH 14/24] fix lint --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7ce57504..2676a13b9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLMs inference - \[2024/07\] 🎉🎉 Support Llama3.1 8B, 70B and its TOOLS CALLING -- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/serving/api_server_tools.md) of InternLM2.5 +- \[2024/07\] Support [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5 - \[2024/06\] PyTorch engine support DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, LlaVA-Next - \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs - \[2024/05\] Support 4-bits weight-only quantization 
and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2 From 49284b14ed03275b477f38ff69be8f6aa620c5b0 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 2 Aug 2024 19:29:46 +0800 Subject: [PATCH 15/24] update generate.sh --- generate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate.sh b/generate.sh index 6648d2e22..6634a8ec3 100755 --- a/generate.sh +++ b/generate.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash builder="-G Ninja" From 12945c1b60e1cd1213097bc35cc6313996bd3251 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Fri, 2 Aug 2024 19:39:51 +0800 Subject: [PATCH 16/24] rollback pipeline.md --- docs/en/llm/api_server_tools.md | 2 +- docs/en/llm/introduction.md | 1 - docs/en/llm/pipeline.md | 74 ++++++++++++++++++++++++++++- docs/en/multi_modal/introduction.md | 1 - 4 files changed, 74 insertions(+), 4 deletions(-) delete mode 100644 docs/en/llm/introduction.md delete mode 100644 docs/en/multi_modal/introduction.md diff --git a/docs/en/llm/api_server_tools.md b/docs/en/llm/api_server_tools.md index 8869497c1..ec9328fe7 100644 --- a/docs/en/llm/api_server_tools.md +++ b/docs/en/llm/api_server_tools.md @@ -155,7 +155,7 @@ ChatCompletion(id='2', choices=[Choice(finish_reason='tool_calls', index=0, logp Meta announces in [Llama3's official user guide](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1) that, -```{text} +```{note} There are three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt: 1. Brave Search: Tool call to perform web searches. diff --git a/docs/en/llm/introduction.md b/docs/en/llm/introduction.md deleted file mode 100644 index 72efd7acb..000000000 --- a/docs/en/llm/introduction.md +++ /dev/null @@ -1 +0,0 @@ -# Introduction (readme first) diff --git a/docs/en/llm/pipeline.md b/docs/en/llm/pipeline.md index 1bef1f221..557067357 100644 --- a/docs/en/llm/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -6,7 +6,41 @@ You can overview the detailed pipeline API in [this](https://lmdeploy.readthedoc ## Usage -## Use multi GPUs +- **An example using default parameters:** + +```python +from lmdeploy import pipeline + +pipe = pipeline('internlm/internlm2_5-7b-chat') +response = pipe(['Hi, pls intro yourself', 'Shanghai is']) +print(response) +``` + +In this example, the pipeline by default allocates a predetermined percentage of GPU memory for storing k/v cache. The ratio is dictated by the parameter `TurbomindEngineConfig.cache_max_entry_count`. + +There have been alterations to the strategy for setting the k/v cache ratio throughout the evolution of LMDeploy. The following are the change histories: + +1. `v0.2.0 <= lmdeploy <= v0.2.1` + + `TurbomindEngineConfig.cache_max_entry_count` defaults to 0.5, indicating 50% GPU **total memory** allocated for k/v cache. Out Of Memory (OOM) errors may occur if a 7B model is deployed on a GPU with memory less than 40G. If you encounter an OOM error, please decrease the ratio of the k/v cache occupation as follows: + + ```python + from lmdeploy import pipeline, TurbomindEngineConfig + + # decrease the ratio of the k/v cache occupation to 20% + backend_config = TurbomindEngineConfig(cache_max_entry_count=0.2) + + pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=backend_config) + response = pipe(['Hi, pls intro yourself', 'Shanghai is']) + print(response) + ``` + +2. `lmdeploy > v0.2.1` + + The allocation strategy for k/v cache is changed to reserve space from the **GPU free memory** proportionally. 
The ratio `TurbomindEngineConfig.cache_max_entry_count` has been adjusted to 0.8 by default. If OOM error happens, similar to the method mentioned above, please consider reducing the ratio value to decrease the memory usage of the k/v cache. + +- **An example showing how to set tensor parallel num**: ```python from lmdeploy import pipeline, TurbomindEngineConfig @@ -18,6 +52,23 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is']) print(response) ``` +- **An example for setting sampling parameters:** + +```python +from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig + +backend_config = TurbomindEngineConfig(tp=2) +gen_config = GenerationConfig(top_p=0.8, + top_k=40, + temperature=0.8, + max_new_tokens=1024) +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=backend_config) +response = pipe(['Hi, pls intro yourself', 'Shanghai is'], + gen_config=gen_config) +print(response) +``` + - **An example for OpenAI format prompt input:** ```python @@ -112,6 +163,27 @@ response = pipe(prompts, gen_config=gen_config) print(response) ``` +- **An example for slora.** + +```python +from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig + +backend_config = PytorchEngineConfig(session_len=2048, + adapters=dict(lora_name_1='chenchi/lora-chatglm2-6b-guodegang')) +gen_config = GenerationConfig(top_p=0.8, + top_k=40, + temperature=0.8, + max_new_tokens=1024) +pipe = pipeline('THUDM/chatglm2-6b', + backend_config=backend_config) +prompts = [[{ + 'role': 'user', + 'content': '您猜怎么着' +}]] +response = pipe(prompts, gen_config=gen_config, adapter_name='lora_name_1') +print(response) +``` + ## FAQs - **RuntimeError: An attempt has been made to start a new process before the current process has finished its bootstrapping phase**. diff --git a/docs/en/multi_modal/introduction.md b/docs/en/multi_modal/introduction.md deleted file mode 100644 index 235724180..000000000 --- a/docs/en/multi_modal/introduction.md +++ /dev/null @@ -1 +0,0 @@ -# Introduction(readme first) From 0225cf93f59098a775e160ca2fae8f1438053854 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 6 Aug 2024 16:26:48 +0800 Subject: [PATCH 17/24] update --- docs/en/get_started.md | 8 ++++++-- docs/en/installation.md | 11 ++++++----- docs/en/llm/api_server_tools.md | 2 +- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/docs/en/get_started.md b/docs/en/get_started.md index e00e45816..9361e1cc0 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -47,7 +47,11 @@ pipe = pipeline('internlm/internlm2_5-7b-chat', )) ``` -The parameters "max_batch_size", "cache_max_entry_count" and "session_len" significantly influence the GPU memory footprint, especially "cache_max_entry_count" playing a dominant role. If you encounter an Out of Memory(OOM) error, you should consider reducing their values. +```{note} +The parameter "cache_max_entry_count" significantly influences the GPU memory occupation. It means the proportion of FREE GPU memory occupied by the k/v cache after loading the model weight. +The default value is 0.8. Once allocated, the K/V cache memory is reused repeatedly, which is why it is common to observe that the built pipeline consumes a substantial amount of GPU memory. +If you encounter an Out-of-Memory(OOM) error, you may need to consider lowering the value of cache_max_entry_count“. 
+``` When use the callable `pipe()` to perform token generation with given prompts, you can set the sampling parameters via `GenerationConfig` as below: @@ -116,7 +120,7 @@ As demonstrated in the previous [offline batch inference](#offline-batch-inferen lmdeploy serve api_server internlm/internlm2_5-7b-chat ``` -This command will launch an OpenAI-compatible server on the localhost at port 23333. You can specify a different server port by using the `--server-port` option. +This command will launch an OpenAI-compatible server on the localhost at port `23333`. You can specify a different server port by using the `--server-port` option. For more options, consult the help documentation by running `lmdeploy serve api_server --help`. Most of these options align with the engine configuration. To access the service, you can utilize the official OpenAI Python package `pip install openai`. Below is an example demonstrating how to use the entrypoint `v1/chat/completions` diff --git a/docs/en/installation.md b/docs/en/installation.md index 5a2b75223..2b1a23b2d 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -1,9 +1,9 @@ # Installation -LMDeploy is a python library for compressing, deploying, and serving Large Language Models and Vision-Language Models. +LMDeploy is a python library for compressing, deploying, and serving Large Language Models(LLMs) and Vision-Language Models(VLMs). Its core inference engines include TurboMind Engine and PyTorch Engine. The former is developed by C++ and CUDA, striving for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to decrease the barriers for developers. -It supports deployment on both Linux and Windows platform, with minimum requirement of CUDA version 11.3. Furthermore, it is compatible with the following NVIDIA GPUs: +It supports LLMs and VLMs deployment on both Linux and Windows platform, with minimum requirement of CUDA version 11.3. Furthermore, it is compatible with the following NVIDIA GPUs: - Volta(sm70): V100 - Turing(sm75): 20 series, T4 @@ -21,7 +21,7 @@ pip install lmdeploy The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by: ```shell -export LMDEPLOY_VERSION=0.5.1 +export LMDEPLOY_VERSION=0.5.3 export PYTHON_VERSION=38 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` @@ -42,7 +42,7 @@ pip install -e . But if you are using the TurboMind Engine, you have to build the source as shown below. The `openmmlab/lmdeploy:{tag}` docker image is strongly recommended. -**Step 1** - Get LMDeploy's docker image +**Step 1** - Get the docker image of LMDeploy ```shell docker pull openmmlab/lmdeploy:latest @@ -71,7 +71,8 @@ docker run --gpus all --net host --shm-size 16g -v $(pwd):/opt/lmdeploy --name l ```shell cd /opt/lmdeploy mkdir -p build && cd build -../generate.sh make +bash ../generate.sh make make -j$(nproc) && make install cd .. +pip install -e . ``` diff --git a/docs/en/llm/api_server_tools.md b/docs/en/llm/api_server_tools.md index ec9328fe7..0a6b8f776 100644 --- a/docs/en/llm/api_server_tools.md +++ b/docs/en/llm/api_server_tools.md @@ -1,4 +1,4 @@ -# Function Call +# Tools Calling LMDeploy supports tools for InternLM2, InternLM2.5 and llama3.1 models. 
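
A minimal client-side sketch of the tools workflow that the renamed `api_server_tools.md` covers, assuming the server launched above (`lmdeploy serve api_server internlm/internlm2_5-7b-chat` on the default port `23333`); the `get_current_weather` schema, the prompt, and the dummy `api_key` are illustrative assumptions and not part of this patch:

```python
from openai import OpenAI

# Hypothetical tool schema for illustration only; replace it with your own functions.
tools = [{
    'type': 'function',
    'function': {
        'name': 'get_current_weather',
        'description': 'Get the current weather in a given location',
        'parameters': {
            'type': 'object',
            'properties': {
                'location': {'type': 'string', 'description': 'City and state, e.g. Boston, MA'},
                'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']},
            },
            'required': ['location'],
        },
    },
}]

client = OpenAI(api_key='YOUR_API_KEY',  # a dummy key, as in the other client examples in this series
                base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': "What's the weather like in Boston today?"}],
    tools=tools,
    temperature=0.8,
    top_p=0.8)
# With a tools-capable model such as InternLM2.5, finish_reason is expected to be
# "tool_calls" and the parsed call is available on the returned message object.
print(response.choices[0].message.tool_calls)
```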
From 88f61fef4f1e4ae42d64681e8e22b3916cd00692 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 6 Aug 2024 16:42:30 +0800 Subject: [PATCH 18/24] update zh_cn --- README_zh-CN.md | 18 +++--- docs/en/index.rst | 2 - docs/zh_cn/benchmark/profile_api_server.md | 2 +- docs/zh_cn/get_started.md | 2 +- docs/zh_cn/index.rst | 55 ++++++++----------- docs/zh_cn/{serving => llm}/api_server.md | 0 .../{serving => llm}/api_server_tools.md | 0 docs/zh_cn/{serving => llm}/gradio.md | 0 docs/zh_cn/{inference => llm}/pipeline.md | 0 docs/zh_cn/{serving => llm}/proxy_server.md | 0 docs/zh_cn/{serving => llm}/qos.md | 0 .../{serving => multi_modal}/api_server_vl.md | 2 +- docs/zh_cn/multi_modal/index.rst | 12 ++++ .../{inference => multi_modal}/vl_pipeline.md | 2 +- docs/zh_cn/quantization/w4a16.md | 4 +- docs/zh_cn/supported_models/codellama.md | 2 +- 16 files changed, 52 insertions(+), 49 deletions(-) rename docs/zh_cn/{serving => llm}/api_server.md (100%) rename docs/zh_cn/{serving => llm}/api_server_tools.md (100%) rename docs/zh_cn/{serving => llm}/gradio.md (100%) rename docs/zh_cn/{inference => llm}/pipeline.md (100%) rename docs/zh_cn/{serving => llm}/proxy_server.md (100%) rename docs/zh_cn/{serving => llm}/qos.md (100%) rename docs/zh_cn/{serving => multi_modal}/api_server_vl.md (99%) create mode 100644 docs/zh_cn/multi_modal/index.rst rename docs/zh_cn/{inference => multi_modal}/vl_pipeline.md (99%) diff --git a/README_zh-CN.md b/README_zh-CN.md index c0c3f20f0..affe6c643 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeploy现已集成至 [modelscope/swift](https://github.com/modelscope/swift),成为 VLMs 推理的默认加速引擎 - \[2024/07\] 🎉🎉 支持 Llama3.1 8B 和 70B 模型,以及工具调用功能 -- \[2024/07\] 支持 [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) 全系列模型,[InternLM-XComposer2.5](docs/zh_cn/multi_modal/xcomposer2d5.md) 模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/serving/api_server_tools.md) +- \[2024/07\] 支持 [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e) 全系列模型,[InternLM-XComposer2.5](docs/zh_cn/multi_modal/xcomposer2d5.md) 模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/llm/api_server_tools.md) - \[2024/06\] PyTorch engine 支持了 DeepSeek-V2 和若干 VLM 模型推理, 比如 CogVLM2,Mini-InternVL,LlaVA-Next - \[2024/05\] 在多 GPU 上部署 VLM 模型时,支持把视觉部分的模型均分到多卡上 - \[2024/05\] 支持InternVL v1.5, LLaVa, InternLMXComposer2 等 VLMs 模型的 4bit 权重量化和推理 @@ -39,8 +39,8 @@ ______________________________________________________________________ - \[2024/03\] 支持 DeepSeek-VL 的离线推理 pipeline 和推理服务 - \[2024/03\] 支持视觉-语言模型(VLM)的离线推理 pipeline 和推理服务 - \[2024/02\] 支持 Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOE 等模型 -- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](./docs/zh_cn/serving/api_server.md) -- \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](./docs/zh_cn/serving/proxy_server.md) +- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](docs/zh_cn/llm/api_server.md) +- \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](docs/zh_cn/llm/proxy_server.md) - \[2024/01\] 增加 [PyTorch 推理引擎](./docs/zh_cn/inference/pytorch.md),作为 TurboMind 引擎的补充。帮助降低开发门槛,和快速实验新特性、新技术 @@ -196,7 +196,7 @@ print(response) > > `export LMDEPLOY_USE_MODELSCOPE=True` -关于 pipeline 的更多推理参数说明,请参考[这里](./docs/zh_cn/inference/pipeline.md) +关于 pipeline 的更多推理参数说明,请参考[这里](docs/zh_cn/llm/pipeline.md) # 用户教程 @@ -205,10 +205,10 @@ print(response) 
为了帮助用户更进一步了解 LMDeploy,我们准备了用户指南和进阶指南,请阅读我们的[文档](https://lmdeploy.readthedocs.io/zh-cn/latest/): - 用户指南 - - [LLM 推理 pipeline](./docs/zh_cn/inference/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - - [VLM 推理 pipeline](./docs/zh_cn/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - - [LLM 推理服务](./docs/zh_cn/serving/api_server.md) - - [VLM 推理服务](./docs/zh_cn/serving/api_server_vl.md) + - [LLM 推理 pipeline](docs/zh_cn/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) + - [VLM 推理 pipeline](docs/zh_cn/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) + - [LLM 推理服务](docs/zh_cn/llm/api_server.md) + - [VLM 推理服务](docs/zh_cn/multi_modal/api_server_vl.md) - [模型量化](./docs/zh_cn/quantization) - 进阶指南 - [推理引擎 - TurboMind](./docs/zh_cn/inference/turbomind.md) @@ -217,7 +217,7 @@ print(response) - [支持新模型](./docs/zh_cn/advance/pytorch_new_model.md) - gemm tuning - [长文本推理](./docs/zh_cn/advance/long_context.md) - - [多模型推理服务](./docs/zh_cn/serving/proxy_server.md) + - [多模型推理服务](docs/zh_cn/llm/proxy_server.md) # 社区项目 diff --git a/docs/en/index.rst b/docs/en/index.rst index 176e8e6aa..3842b54f0 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -56,7 +56,6 @@ Documentation :maxdepth: 1 :caption: Large Language Models(LLMs) Deployment - llm/introduction.md llm/pipeline.md llm/api_server.md llm/api_server_tools.md @@ -68,7 +67,6 @@ Documentation :maxdepth: 1 :caption: Vision-Language Models(VLMs) Deployment - multi_modal/introduction.md multi_modal/vl_pipeline.md multi_modal/api_server_vl.md multi_modal/index.rst diff --git a/docs/zh_cn/benchmark/profile_api_server.md b/docs/zh_cn/benchmark/profile_api_server.md index 01c6fa35c..c87282004 100644 --- a/docs/zh_cn/benchmark/profile_api_server.md +++ b/docs/zh_cn/benchmark/profile_api_server.md @@ -41,7 +41,7 @@ $$ lmdeploy serve api_server internlm/internlm-7b ``` -如果你想改变 server 的端口,或者诸如推理引擎、最大批处理值等参数,请运行 `lmdeploy serve api_server -h` 或者阅读[这篇文档](../serving/api_server.md),查看详细的参数说明。 +如果你想改变 server 的端口,或者诸如推理引擎、最大批处理值等参数,请运行 `lmdeploy serve api_server -h` 或者阅读[这篇文档](../llm/api_server.md),查看详细的参数说明。 ### 测速 diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index 8577f4834..2a2de9fac 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -27,7 +27,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -有关 pipeline 的详细使用说明,请参考[这里](./inference/pipeline.md) +有关 pipeline 的详细使用说明,请参考[这里](llm/pipeline.md) ## 推理服务 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 5a6df1fe4..8691c423b 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -41,25 +41,9 @@ LMDeploy 工具箱提供以下核心功能: :maxdepth: 2 :caption: 快速上手 + installation.md get_started.md -.. _编译和安装: -.. toctree:: - :maxdepth: 1 - :caption: 编译和安装 - - build.md - -.. _测试基准: -.. toctree:: - :maxdepth: 1 - :caption: 测试基准 - - benchmark/profile_generation.md - benchmark/profile_throughput.md - benchmark/profile_api_server.md - benchmark/evaluate_with_opencompass.md - .. _支持的模型: .. 
toctree:: :maxdepth: 1 @@ -67,25 +51,25 @@ LMDeploy 工具箱提供以下核心功能: supported_models/supported_models.md -.. _推理: +.. _llm_部署: .. toctree:: :maxdepth: 1 - :caption: 推理 + :caption: 大语言模型(LLMs)部署 - inference/pipeline.md - inference/vl_pipeline.md + llm/pipeline.md + llm/api_server.md + llm/api_server_tools.md + llm/gradio.md + llm/proxy_server.md - -.. _服务: +.. _vlm_部署: .. toctree:: :maxdepth: 1 - :caption: 服务 + :caption: 视觉-语言模型(VLMs)部署 - serving/api_server.md - serving/api_server_vl.md - serving/api_server_tools.md - serving/gradio.md - serving/proxy_server.md + multi_modal/vl_pipeline.md + multi_modal/api_server_vl.md + multi_modal/index.rst .. _量化: @@ -94,8 +78,18 @@ LMDeploy 工具箱提供以下核心功能: :caption: 量化 quantization/w4a16.md - quantization/kv_quant.md quantization/w8a8.md + quantization/kv_quant.md + +.. _测试基准: +.. toctree:: + :maxdepth: 1 + :caption: 测试基准 + + benchmark/profile_generation.md + benchmark/profile_throughput.md + benchmark/profile_api_server.md + benchmark/evaluate_with_opencompass.md .. toctree:: :maxdepth: 1 @@ -107,7 +101,6 @@ LMDeploy 工具箱提供以下核心功能: advance/long_context.md advance/chat_template.md advance/debug_turbomind.md - serving/qos.md .. toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/serving/api_server.md b/docs/zh_cn/llm/api_server.md similarity index 100% rename from docs/zh_cn/serving/api_server.md rename to docs/zh_cn/llm/api_server.md diff --git a/docs/zh_cn/serving/api_server_tools.md b/docs/zh_cn/llm/api_server_tools.md similarity index 100% rename from docs/zh_cn/serving/api_server_tools.md rename to docs/zh_cn/llm/api_server_tools.md diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/llm/gradio.md similarity index 100% rename from docs/zh_cn/serving/gradio.md rename to docs/zh_cn/llm/gradio.md diff --git a/docs/zh_cn/inference/pipeline.md b/docs/zh_cn/llm/pipeline.md similarity index 100% rename from docs/zh_cn/inference/pipeline.md rename to docs/zh_cn/llm/pipeline.md diff --git a/docs/zh_cn/serving/proxy_server.md b/docs/zh_cn/llm/proxy_server.md similarity index 100% rename from docs/zh_cn/serving/proxy_server.md rename to docs/zh_cn/llm/proxy_server.md diff --git a/docs/zh_cn/serving/qos.md b/docs/zh_cn/llm/qos.md similarity index 100% rename from docs/zh_cn/serving/qos.md rename to docs/zh_cn/llm/qos.md diff --git a/docs/zh_cn/serving/api_server_vl.md b/docs/zh_cn/multi_modal/api_server_vl.md similarity index 99% rename from docs/zh_cn/serving/api_server_vl.md rename to docs/zh_cn/multi_modal/api_server_vl.md index 878edbbc6..fea4d33ef 100644 --- a/docs/zh_cn/serving/api_server_vl.md +++ b/docs/zh_cn/multi_modal/api_server_vl.md @@ -1,6 +1,6 @@ # 部署 VLM 类 openai 服务 -本文主要介绍单个VL模型在单机多卡环境下,部署兼容 openai 接口服务的方式,以及服务接口的用法。为行文方便,我们把该服务名称为 `api_server`。对于多模型的并行服务,请阅读[请求分发服务器](./proxy_server.md)一文。 +本文主要介绍单个VL模型在单机多卡环境下,部署兼容 openai 接口服务的方式,以及服务接口的用法。为行文方便,我们把该服务名称为 `api_server`。对于多模型的并行服务,请阅读[请求分发服务器](../llm/proxy_server.md)一文。 在这篇文章中, 我们首先介绍服务启动的两种方法,你可以根据应用场景,选择合适的。 diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst new file mode 100644 index 000000000..c27b420e2 --- /dev/null +++ b/docs/zh_cn/multi_modal/index.rst @@ -0,0 +1,12 @@ +视觉语言模型 +================================= + +.. 
toctree:: + :maxdepth: 2 + :caption: 示例 + + llava.md + internvl.md + xcomposer2d5.md + cogvlm.md + minicpmv.md diff --git a/docs/zh_cn/inference/vl_pipeline.md b/docs/zh_cn/multi_modal/vl_pipeline.md similarity index 99% rename from docs/zh_cn/inference/vl_pipeline.md rename to docs/zh_cn/multi_modal/vl_pipeline.md index b550d70e7..31533b38f 100644 --- a/docs/zh_cn/inference/vl_pipeline.md +++ b/docs/zh_cn/multi_modal/vl_pipeline.md @@ -1,6 +1,6 @@ # VLM 离线推理 pipeline -LMDeploy 把视觉-语言模型(VLM)复杂的推理过程,抽象为简单好用的 pipeline。它的用法与大语言模型(LLM)推理 [pipeline](./pipeline.md) 类似。 +LMDeploy 把视觉-语言模型(VLM)复杂的推理过程,抽象为简单好用的 pipeline。它的用法与大语言模型(LLM)推理 [pipeline](../llm/pipeline.md) 类似。 目前,VLM pipeline 支持以下模型: diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index 4b42f3970..de6729354 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -87,7 +87,7 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -关于 pipeline 的详细介绍,请参考[这里](../inference/pipeline.md) +关于 pipeline 的详细介绍,请参考[这里](../llm/pipeline.md) 除了推理本地量化模型外,LMDeploy 还支持直接推理 huggingface hub 上的通过 AWQ 量化的 4bit 权重模型,比如 [lmdeploy 空间](https://huggingface.co/lmdeploy)和 [TheBloke 空间](https://huggingface.co/TheBloke)下的模型。 @@ -123,7 +123,7 @@ lmdeploy serve api_server ./internlm2_5-7b-chat-4bit --backend turbomind --model lmdeploy serve api_client http://0.0.0.0:23333 ``` -还可以通过 Swagger UI `http://0.0.0.0:23333` 在线阅读和试用 `api_server` 的各接口,也可直接查阅[文档](../serving/api_server.md),了解各接口的定义和使用方法。 +还可以通过 Swagger UI `http://0.0.0.0:23333` 在线阅读和试用 `api_server` 的各接口,也可直接查阅[文档](../llm/api_server.md),了解各接口的定义和使用方法。 ## 推理性能 diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md index 2a8a86330..b9e881c05 100644 --- a/docs/zh_cn/supported_models/codellama.md +++ b/docs/zh_cn/supported_models/codellama.md @@ -110,4 +110,4 @@ lmdeploy serve api_client api_server_url lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` -关于 RESTful API的详细介绍,请参考[这份](../serving/api_server.md)文档。 +关于 RESTful API的详细介绍,请参考[这份](../llm/api_server.md)文档。 From d169be506fbf2ed3f54f5bfb8e1896d9b41f480b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 6 Aug 2024 17:51:33 +0800 Subject: [PATCH 19/24] update --- docs/en/get_started.md | 13 ++- docs/en/installation.md | 4 +- docs/zh_cn/get_started.md | 204 ++++++++++++++++++++++++++++++------- docs/zh_cn/installation.md | 80 +++++++++++++++ 4 files changed, 260 insertions(+), 41 deletions(-) create mode 100644 docs/zh_cn/installation.md diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 9361e1cc0..2d6b28199 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -48,8 +48,8 @@ pipe = pipeline('internlm/internlm2_5-7b-chat', ``` ```{note} -The parameter "cache_max_entry_count" significantly influences the GPU memory occupation. It means the proportion of FREE GPU memory occupied by the k/v cache after loading the model weight. -The default value is 0.8. Once allocated, the K/V cache memory is reused repeatedly, which is why it is common to observe that the built pipeline consumes a substantial amount of GPU memory. +The parameter "cache_max_entry_count" significantly influences the GPU memory usage. It means the proportion of FREE GPU memory occupied by the K/V cache after the model weights are loaded. +The default value is 0.8. 
Once allocated, the K/V cache memory is reused repeatedly, which is why it is common to observe that the built pipeline and the api_server mentioned later in the next consumes a substantial amount of GPU memory. If you encounter an Out-of-Memory(OOM) error, you may need to consider lowering the value of cache_max_entry_count“. ``` @@ -152,10 +152,13 @@ We encourage you to refer to the detailed guide for more comprehensive informati lmdeploy serve api_server OpenGVLab/InternVL2-8B ``` -LMDeploy reuses the vision component from upstream VLM repository. Consequently, Serving VLMs can vary, as the upstream VLM repo might introduce different dependencies and offer distinct functionalities. -We invite users to explore the serving method of each supported VLM from [here](multi_modal) +```{note} +LMDeploy reuses the vision component from upstream VLM repositories. Each upstream VLM model may have different dependencies. +Consequently, LMDeploy has decided not to include the dependencies of the upstream VLM repositories in its own dependency list. +If you encounter an "ImportError" when using LMDeploy for inference with VLM models, please install the relevant dependencies yourself. +``` -You can access the VLM service in a manner similar to how you would access the `gptv4` service by modifying the `api_key` and `base_url` parameters: +After the service is launched successfully, you can access the VLM service in a manner similar to how you would access the `gptv4` service by modifying the `api_key` and `base_url` parameters: ```python from openai import OpenAI diff --git a/docs/en/installation.md b/docs/en/installation.md index 2b1a23b2d..9e4ad4d95 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -12,9 +12,11 @@ It supports LLMs and VLMs deployment on both Linux and Windows platform, with mi ## Install with pip (Recommend) -You can install lmdeploy using pip (python 3.8 - 3.12) as follows: +It is recommended installing lmdeploy using pip in a conda environment (python 3.8 - 3.12): ```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy pip install lmdeploy ``` diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index 2a2de9fac..5649397a8 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -1,25 +1,19 @@ -# 快速上手 +# 快速开始 LMDeploy提供了快速安装、模型量化、离线批处理、在线推理服务等功能。每个功能只需简单的几行代码或者命令就可以完成。 -## 安装 +本教程将展示 LMDeploy 在以下几方面的使用方法: -使用 pip (python 3.8+) 安装 LMDeploy,或者[源码安装](./build.md) +- LLM 模型和 VLM 模型的离线推理 +- 搭建与 OpenAI 接口兼容的 LLM 或 VLM 模型服务 +- 通过控制台命令行与 LLM 模型进行交互式聊天 -```shell -pip install lmdeploy -``` - -LMDeploy的预编译包默认是基于 CUDA 12 编译的。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令: - -```shell -export LMDEPLOY_VERSION=0.5.2 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +在继续阅读之前,请确保你已经按照[安装指南](installation.md)安装了 lmdeploy。 ## 离线批处理 +### LLM 推理 + ```python import lmdeploy pipe = lmdeploy.pipeline("internlm/internlm2_5-7b-chat") @@ -27,41 +21,181 @@ response = pipe(["Hi, pls intro yourself", "Shanghai is"]) print(response) ``` -有关 pipeline 的详细使用说明,请参考[这里](llm/pipeline.md) +在构造 `pipeline` 时,如果没有指定使用 TurboMind 引擎或 PyTorch 引擎进行推理,LMDeploy 将根据[它们各自的能力](supported_models/supported_models.md)自动分配一个,默认优先使用 TurboMind 引擎。 + +然而,你可以选择手动选择一个引擎。例如, + +```python +from lmdeploy import pipeline, TurbomindEngineConfig +pipe = 
pipeline('internlm/internlm2_5-7b-chat', + backend_config=TurbomindEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) +``` + +或者, + +```python +from lmdeploy import pipeline, PytorchEngineConfig +pipe = pipeline('internlm/internlm2_5-7b-chat', + backend_config=PytorchEngineConfig( + max_batch_size=32, + enable_prefix_caching=True, + cache_max_entry_count=0.8, + session_len=8192, + )) +``` + +```{note} +参数 "cache_max_entry_count" 显著影响 GPU 内存占用。它表示加载模型权重后 K/V 缓存占用的空闲 GPU 内存的比例。 +默认值是 0.8。K/V 缓存分配方式是一次性申请,重复性使用,这就是为什么 pipeline 以及下文中的 api_server 在启动后会消耗大量 GPU 内存。 +如果你遇到内存不足(OOM)错误的错误,可能需要考虑降低 cache_max_entry_count 的值。 +``` + +当使用 `pipe()` 生成提示词的 token 时,你可以通过 `GenerationConfig` 设置采样参数,如下所示: + +```python +from lmdeploy import GenerationConfig, pipeline + +pipe = pipeline('internlm/internlm2_5-7b-chat') +prompts = ['Hi, pls intro yourself', 'Shanghai is'] +response = pipe(prompts, + gen_config=GenerationConfig( + max_new_tokens=1024, + top_p=0.8, + top_k=40, + temperature=0.6 + )) +``` + +在 `GenerationConfig` 中,`top_k=1` 或 `temperature=0.0` 表示贪心搜索。 -## 推理服务 +有关 pipeline 的更多信息,请参考[这里](llm/pipeline.md) -LMDeploy 提供了多种部署模型推理服务的方式,总有一款适合你。 +### VLM 推理 -- [部署类 openai 的服务](https://lmdeploy.readthedocs.io/zh-cn/latest//serving/api_server.html) -- [通过 docker 部署服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/api_server.html#docker) -- [部署 gradio 服务](https://lmdeploy.readthedocs.io/zh-cn/latest/serving/gradio.html) +VLM 推理 pipeline 与 LLM 类似,但增加了使用 pipeline 处理图像数据的能力。例如,你可以使用以下代码片段对 InternVL 模型进行推理: -## 模型量化 +```python +from lmdeploy import pipeline +from lmdeploy.vl import load_image -- [INT4 权重量化](quantization/w4a16.md) -- [K/V 量化](quantization/kv_quant.md) -- [W8A8 量化](quantization/w8a8.md) +pipe = pipeline('OpenGVLab/InternVL2-8B') -## 好用的工具 +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` -LMDeploy CLI 提供了如下便捷的工具,方便用户快速体验模型对话效果 +在 VLM pipeline 中,默认的图像处理批量大小是 1。这可以通过 `VisionConfig` 调整。例如,你可以这样设置: -### 控制台交互式对话 +```python +from lmdeploy import pipeline, VisionConfig +from lmdeploy.vl import load_image + +pipe = pipeline('OpenGVLab/InternVL2-8B', + vision_config=VisionConfig( + max_batch_size=8 + )) + +image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') +response = pipe(('describe this image', image)) +print(response) +``` + +然而,图像批量大小越大,OOM 错误的风险越大,因为 VLM 模型中的 LLM 部分会提前预分配大量的内存。 + +VLM pipeline 对于推理引擎的选择方式与 LLM pipeline 类似。你可以参考 [LLM 推理](#llm-推理)并结合两个引擎支持的 VLM 模型列表,手动选择和配置推理引擎。 + +## 模型服务 + +类似前文[离线批量推理](#离线批处理),我们在本章节介绍 LLM 和 VLM 各自构建服务方法。 + +### LLM 模型服务 ```shell -lmdeploy chat internlm/internlm2_5-7b-chat +lmdeploy serve api_server internlm/internlm2_5-7b-chat ``` -### WebUI 交互式对话 +此命令将在本地主机上的端口 `23333` 启动一个与 OpenAI 接口兼容的模型推理服务。你可以使用 `--server-port` 选项指定不同的服务器端口。 +更多选项,请通过运行 `lmdeploy serve api_server --help` 查阅帮助文档。这些选项大多与引擎配置一致。 -LMDeploy 使用 gradio 开发了在线对话 demo。 +要访问服务,你可以使用官方的 OpenAI Python 包 `pip install openai`。以下是演示如何使用入口点 v1/chat/completions 的示例: + +```python +from openai import OpenAI +client = OpenAI( + api_key='YOUR_API_KEY', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": " provide three suggestions about time management"}, + ], + 
temperature=0.8, + top_p=0.8 +) +print(response) +``` + +我们鼓励你参考详细指南,了解关于[使用 Docker 部署服务](./llm/api_server.md)、[工具调用](llm/api_server_tools.md)和其他更多功能的信息。 + +### VLM 模型服务 ```shell -# 安装依赖 -pip install lmdeploy[serve] -# 启动 -lmdeploy serve gradio internlm/internlm2_5-7b-chat +lmdeploy serve api_server OpenGVLab/InternVL2-8B ``` -![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) +```{note} +LMDeploy 复用了上游 VLM 仓库的视觉组件。而每个上游的 VLM 模型,它们的视觉模型可能互不相同,依赖库也各有区别。 +因此,LMDeploy 决定不在自身的依赖列表中加入上游 VLM 库的依赖。如果你在使用 LMDeploy 推理 VLM 模型时出现 "ImportError" 的问题,请自行安装相关的依赖。 +``` + +服务成功启动后,你可以以类似访问 `gptv4` 服务的方式访问 VLM 服务: + +```python +from openai import OpenAI + +client = OpenAI(api_key='YOUR_API_KEY', # A dummy api_key is required + base_url='http://0.0.0.0:23333/v1') +model_name = client.models.list().data[0].id +response = client.chat.completions.create( + model=model_name, + messages=[{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'Describe the image please', + }, { + 'type': 'image_url', + 'image_url': { + 'url': + 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg', + }, + }], + }], + temperature=0.8, + top_p=0.8) +print(response) +``` + +## 使用命令行与 LLM 模型对话 + +LMDeploy 提供了一个非常方便的 CLI 工具,供用户与 LLM 模型进行本地聊天。例如: + +```shell +lmdeploy chat internlm/internlm2_5-7b-chat --backend turbomind +``` + +它的设计目的是帮助用户检查和验证 LMDeploy 是否支持提供的模型,聊天模板是否被正确应用,以及推理结果是否正确。 + +另外,`lmdeploy check_env` 收集基本的环境信息。在给 LMDeploy 提交问题报告时,这非常重要,因为它有助于我们更有效地诊断和解决问题。 + +如果你对它们的使用方法有任何疑问,你可以尝试使用 `--help` 选项获取详细信息。 diff --git a/docs/zh_cn/installation.md b/docs/zh_cn/installation.md new file mode 100644 index 000000000..c5758a642 --- /dev/null +++ b/docs/zh_cn/installation.md @@ -0,0 +1,80 @@ +# 安装 + +LMDeploy 是一个用于大型语言模型(LLMs)和视觉-语言模型(VLMs)压缩、部署和服务的 Python 库。 +其核心推理引擎包括 TurboMind 引擎和 PyTorch 引擎。前者由 C++ 和 CUDA 开发,致力于推理性能的优化,而后者纯 Python 开发,旨在降低开发者的门槛。 + +LMDeploy 支持在 Linux 和 Windows 平台上部署 LLMs 和 VLMs,最低要求 CUDA 版本为 11.3。此外,它还与以下 NVIDIA GPU 兼容: + +Volta(sm70): V100 +Turing(sm75): 20 系列,T4 +Ampere(sm80,sm86): 30 系列,A10, A16, A30, A100 +Ada Lovelace(sm89): 40 系列 + +## 使用 pip 安装(推荐) + +我们推荐在一个干净的conda环境下(python3.8 - 3.12),安装 lmdeploy: + +```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy +pip install lmdeploy +``` + +默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy: + +```shell +export LMDEPLOY_VERSION=0.5.3 +export PYTHON_VERSION=38 +pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +## 使用 pip 安装夜间构建包 + +LMDeploy 的发布频率大约是每月一次或两次。如果你所需的功能已经被合并到 LMDeploy 的主分支但还没有发布,你可以环境中的 CUDA 和 Python 版本,尝试使用[这里](https://github.com/zhyncs/lmdeploy-build)提供的夜间构建包。 + +## 从源码安装 + +如果你使用 PyTorch 引擎进行推理,从源代码安装非常简单: + +```shell +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +pip install -e . 
+``` + +但如果你使用 TurboMind 引擎,请参考以下说明编译源代码。我们强烈推荐使用 `openmmlab/lmdeploy:{tag}` docker 镜像作为编译安装的环境 + +**步骤 1** - 获取 LMDeploy 的 docker 镜像 + +```shell +docker pull openmmlab/lmdeploy:latest +``` + +```{note} +"openmmlab/lmdeploy:latest" 基于 "nvidia/cuda:12.4.1-devel-ubuntu22.04"。如果你在带有 cuda 11+ 驱动的平台上工作,请使用 "openmmlab/lmdeploy:latest-cu11"。 +从 v0.5.3 开始,LMDeploy docker 镜像标签的模式是 "openmmlab/lmdeploy:{version}-cu(11|12)"。 +``` + +**步骤 2** - 克隆 LMDeploy 源代码 + +```shell +git clone https://github.com/InternLM/lmdeploy.git +cd lmdeploy +``` + +**步骤 3** - 以交互模式启动 docker 容器 + +```shell +docker run --gpus all --net host --shm-size 16g -v $(pwd):/opt/lmdeploy --name lmdeploy -it openmmlab/lmdeploy:latest bin/bash +``` + +**步骤 4** - 编译与安装 + +```shell +cd /opt/lmdeploy +mkdir -p build && cd build +bash ../generate.sh make +make -j$(nproc) && make install +cd .. +pip install -e . +``` From f0a613e062d83025067e2aa55f4eff15e49c3d82 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 6 Aug 2024 18:52:04 +0800 Subject: [PATCH 20/24] fix lint --- README_ja.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README_ja.md b/README_ja.md index 4cf67cbd4..e9bd0e24f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -28,7 +28,7 @@ ______________________________________________________________________ - \[2024/08\] 🔥🔥 LMDeployは[modelscope/swift](https://github.com/modelscope/swift)に統合され、VLMs推論のデフォルトアクセラレータとなりました - \[2024/07\] 🎉🎉 Llama3.1 8B、70Bおよびそのツールコールをサポート -- \[2024/07\] [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e)全シリーズモデル、[InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md)およびInternLM2.5の[ファンクションコール](docs/en/serving/api_server_tools.md)をサポート +- \[2024/07\] [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e)全シリーズモデル、[InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md)およびInternLM2.5の[ファンクションコール](docs/en/llm/api_server_tools.md)をサポート - \[2024/06\] PyTorchエンジンはDeepSeek-V2およびいくつかのVLMs、例えばCogVLM2、Mini-InternVL、LlaVA-Nextをサポート - \[2024/05\] 複数のGPUでVLMsをデプロイする際にビジョンモデルをバランスさせる - \[2024/05\] InternVL v1.5、LLaVa、InternLMXComposer2などのVLMsで4ビットの重みのみの量子化と推論をサポート @@ -39,8 +39,8 @@ ______________________________________________________________________ - \[2024/03\] DeepSeek-VLのオフライン推論パイプラインとサービングをサポート - \[2024/03\] VLMのオフライン推論パイプラインとサービングをサポート - \[2024/02\] Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOEなどをサポート -- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE)が[LMDeployサービングサービス](./docs/en/serving/api_server.md)とシームレスに統合されました -- \[2024/01\] 複数モデル、複数マシン、複数カードの推論サービスをサポート。使用方法は[こちら](./docs/en/serving/proxy_server.md)を参照してください +- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE)が[LMDeployサービングサービス](./docs/en/llm/api_server.md)とシームレスに統合されました +- \[2024/01\] 複数モデル、複数マシン、複数カードの推論サービスをサポート。使用方法は[こちら](./docs/en/llm/proxy_server.md)を参照してください - \[2024/01\] [PyTorch推論エンジン](./docs/en/inference/pytorch.md)をサポートし、完全にPythonで開発されており、開発者の障壁を下げ、新機能や技術の迅速な実験を可能にします @@ -196,7 +196,7 @@ print(response) > > `export LMDEPLOY_USE_MODELSCOPE=True` -推論パイプラインに関する詳細情報は[こちら](./docs/en/inference/pipeline.md)を参照してください。 +推論パイプラインに関する詳細情報は[こちら](./docs/en/llm/pipeline.md)を参照してください。 # チュートリアル @@ -205,10 +205,10 @@ LMDeployの基本的な使用方法については、[getting_started](./docs/en 詳細なユーザーガイドと高度なガイドについては、[チュートリアル](https://lmdeploy.readthedocs.io/en/latest/)を参照してください: - ユーザーガイド - - [LLM推論パイプライン](./docs/en/inference/pipeline.md) [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - - [VLM推論パイプライン](./docs/en/inference/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - - [LLMサービング](docs/en/serving/api_server.md) - - [VLMサービング](docs/en/serving/api_server_vl.md) + - [LLM推論パイプライン](./docs/en/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) + - [VLM推論パイプライン](./docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) + - [LLMサービング](docs/en/llm/api_server.md) + - [VLMサービング](docs/en/llm/api_server_vl.md) - [量子化](docs/en/quantization) - 高度なガイド - [推論エンジン - TurboMind](docs/en/inference/turbomind.md) @@ -217,7 +217,7 @@ LMDeployの基本的な使用方法については、[getting_started](./docs/en - [新しいモデルの追加](docs/en/advance/pytorch_new_model.md) - gemmチューニング - [長文推論](docs/en/advance/long_context.md) - - [マルチモデル推論サービス](docs/en/serving/proxy_server.md) + - [マルチモデル推論サービス](docs/en/llm/proxy_server.md) # サードパーティプロジェクト From 37eca3f6e3028378af290f8f3c0db41894a2c2be Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 6 Aug 2024 18:58:19 +0800 Subject: [PATCH 21/24] fix lint --- README_ja.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_ja.md b/README_ja.md index e9bd0e24f..d03d29e91 100644 --- a/README_ja.md +++ b/README_ja.md @@ -208,7 +208,7 @@ LMDeployの基本的な使用方法については、[getting_started](./docs/en - [LLM推論パイプライン](./docs/en/llm/pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dh-YlSwg78ZO3AlleO441NF_QP2shs95#scrollTo=YALmXnwCG1pQ) - [VLM推論パイプライン](./docs/en/multi_modal/vl_pipeline.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nKLfnPeDA3p-FMNw2NhI-KOpk7-nlNjF?usp=sharing) - [LLMサービング](docs/en/llm/api_server.md) - - [VLMサービング](docs/en/llm/api_server_vl.md) + - [VLMサービング](docs/en/multi_modal/api_server_vl.md) - [量子化](docs/en/quantization) - 高度なガイド - [推論エンジン - TurboMind](docs/en/inference/turbomind.md) From bc7d225850245c631ccc48a712b6eae08dead4ed Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 6 Aug 2024 20:37:24 +0800 Subject: [PATCH 22/24] fix --- docs/en/get_started.md | 2 +- docs/en/multi_modal/internvl.md | 2 ++ docs/en/multi_modal/llava.md | 2 ++ docs/zh_cn/multi_modal/internvl.md | 3 +++ docs/zh_cn/multi_modal/llava.md | 3 +++ 5 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 docs/zh_cn/multi_modal/internvl.md create mode 100644 docs/zh_cn/multi_modal/llava.md diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 2d6b28199..76045aeba 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -50,7 +50,7 @@ pipe = pipeline('internlm/internlm2_5-7b-chat', ```{note} The parameter "cache_max_entry_count" significantly influences the GPU memory usage. It means the proportion of FREE GPU memory occupied by the K/V cache after the model weights are loaded. The default value is 0.8. 
Once allocated, the K/V cache memory is reused repeatedly, which is why it is common to observe that the built pipeline and the api_server mentioned later in the next consumes a substantial amount of GPU memory. -If you encounter an Out-of-Memory(OOM) error, you may need to consider lowering the value of cache_max_entry_count“. +If you encounter an Out-of-Memory(OOM) error, you may need to consider lowering the value of "cache_max_entry_count". ``` When use the callable `pipe()` to perform token generation with given prompts, you can set the sampling parameters via `GenerationConfig` as below: diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index 28c011199..0b204cb92 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -1 +1,3 @@ # InternVL + +TODO diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md index c10506421..cf95e15d5 100644 --- a/docs/en/multi_modal/llava.md +++ b/docs/en/multi_modal/llava.md @@ -1 +1,3 @@ # LLaVA + +TODO diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md new file mode 100644 index 000000000..0b204cb92 --- /dev/null +++ b/docs/zh_cn/multi_modal/internvl.md @@ -0,0 +1,3 @@ +# InternVL + +TODO diff --git a/docs/zh_cn/multi_modal/llava.md b/docs/zh_cn/multi_modal/llava.md new file mode 100644 index 000000000..cf95e15d5 --- /dev/null +++ b/docs/zh_cn/multi_modal/llava.md @@ -0,0 +1,3 @@ +# LLaVA + +TODO From 974b3b8861089ee7525373bb372104ff337416c4 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 7 Aug 2024 14:46:30 +0800 Subject: [PATCH 23/24] remove build.md --- README.md | 13 ++- README_ja.md | 13 ++- README_zh-CN.md | 12 +-- docs/en/advance/debug_turbomind.md | 2 +- .../en/benchmark/evaluate_with_opencompass.md | 6 +- docs/en/build.md | 85 ------------------ docs/en/installation.md | 2 +- docs/zh_cn/advance/debug_turbomind.md | 2 +- .../benchmark/evaluate_with_opencompass.md | 6 +- docs/zh_cn/build.md | 86 ------------------- 10 files changed, 19 insertions(+), 208 deletions(-) delete mode 100644 docs/en/build.md delete mode 100644 docs/zh_cn/build.md diff --git a/README.md b/README.md index 3d33c9a8d..4a6ecdf51 100644 --- a/README.md +++ b/README.md @@ -167,19 +167,16 @@ They differ in the types of supported models and the inference data type. Please ## Installation -Install lmdeploy with pip ( python 3.8+) or [from source](./docs/en/build.md) +It is recommended installing lmdeploy using pip in a conda environment (python 3.8 - 3.12): ```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy pip install lmdeploy ``` -Since v0.3.0, The default prebuilt package is compiled on **CUDA 12**. However, if CUDA 11+ is required, you can install lmdeploy by: - -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +The default prebuilt package is compiled on **CUDA 12** since v0.3.0. +For more information on installing on CUDA 11+ platform, or for instructions on building from source, please refer to the [installation guide](./docs/en/installation.md). 
## Offline Batch Inference diff --git a/README_ja.md b/README_ja.md index ccddd078c..62b77e214 100644 --- a/README_ja.md +++ b/README_ja.md @@ -168,19 +168,16 @@ LMDeployは、[TurboMind](./docs/en/inference/turbomind.md)および[PyTorch](./ ## インストール -pip(python 3.8+)を使用してlmdeployをインストールするか、[ソースからインストール](./docs/en/build.md)します +クリーンなconda環境(Python 3.8 - 3.12)でlmdeployをインストールすることをお勧めします。 ```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy pip install lmdeploy ``` -v0.3.0以降、デフォルトのプリビルドパッケージは**CUDA 12**でコンパイルされています。ただし、CUDA 11+が必要な場合は、次のコマンドでlmdeployをインストールできます: - -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +v0.3.0から、デフォルトの事前構築済みパッケージはCUDA 12でコンパイルされています。 +CUDA 11+プラットフォームでのインストールに関する情報、またはソースからのビルド手順については、[インストールガイドを](docs/en/installation.md)参照してください。 ## オフラインバッチ推論 diff --git a/README_zh-CN.md b/README_zh-CN.md index 2c50ee54e..b7d5634fa 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -168,19 +168,15 @@ LMDeploy 支持 2 种推理引擎: [TurboMind](./docs/zh_cn/inference/turbomin ## 安装 -使用 pip ( python 3.8+) 安装 LMDeploy,或者[源码安装](./docs/zh_cn/build.md) +我们推荐在一个干净的conda环境下(python3.8 - 3.12),安装 lmdeploy: ```shell +conda create -n lmdeploy python=3.8 -y +conda activate lmdeploy pip install lmdeploy ``` -自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令: - -```shell -export LMDEPLOY_VERSION=0.5.3 -export PYTHON_VERSION=38 -pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 -``` +自 v0.3.0 起,LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,或者源码安装 LMDeploy,请参考[安装文档](./docs/zh_cn/installation.md) ## 离线批处理 diff --git a/docs/en/advance/debug_turbomind.md b/docs/en/advance/debug_turbomind.md index 5af559a27..c4c7b32f7 100644 --- a/docs/en/advance/debug_turbomind.md +++ b/docs/en/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind is implemented in C++, which is not as easy to debug as Python. This d ## Prerequisite -First, complete the local compilation according to the commands in [Build in localhost](../build.md). +First, complete the local compilation according to the commands in [Install from source](../installation.md). ## Configure Python debug environment diff --git a/docs/en/benchmark/evaluate_with_opencompass.md b/docs/en/benchmark/evaluate_with_opencompass.md index 9971b9cc6..f078c6e44 100644 --- a/docs/en/benchmark/evaluate_with_opencompass.md +++ b/docs/en/benchmark/evaluate_with_opencompass.md @@ -8,11 +8,7 @@ In this part, we are going to setup the environment for evaluation. ### Install lmdeploy -Install lmdeploy through pip (python 3.8+). If you want to install from source, you can refer to [build.md](../build.md). - -```shell -pip install lmdeploy -``` +Please follow the [installation guide](../installation.md) to install lmdeploy. ### Install OpenCompass diff --git a/docs/en/build.md b/docs/en/build.md deleted file mode 100644 index 51c660b30..000000000 --- a/docs/en/build.md +++ /dev/null @@ -1,85 +0,0 @@ -# Build from source - -LMDeploy provides prebuilt package that can be easily installed by `pip install lmdeploy`. 
- -If you have requests to build lmdeploy from source, please clone lmdeploy repository from GitHub, and follow instructions in next sections - -```shell -git clone --depth=1 https://github.com/InternLM/lmdeploy -``` - -## Build in Docker (recommended) - -We highly advise using the provided docker image for lmdeploy build to circumvent complex environment setup. - -The docker image is `openmmlab/lmdeploy-builder:cuda11.8`. Make sure that docker is installed before using this image. - -In the root directory of the lmdeploy source code, please run the following command: - -```shell -# the home folder of lmdeploy source code -cd lmdeploy -bash builder/manywheel/build_all_wheel.sh -``` - -All the wheel files for lmdeploy under py3.8 - py3.11 will be found in the `builder/manywheel/cuda11.8_dist` directory, such as, - -```text -builder/manywheel/cuda11.8_dist/ -├── lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp311-cp311-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp38-cp38-manylinux2014_x86_64.whl -└── lmdeploy-0.0.12-cp39-cp39-manylinux2014_x86_64.whl -``` - -If the wheel file for a specific Python version is required, such as py3.8, please execute: - -```shell -bash builder/manywheel/build_wheel.sh py38 manylinux2014_x86_64 cuda11.8 cuda11.8_dist -``` - -And the wheel file will be found in the `builder/manywheel/cuda11.8_dist` directory. - -You can use `pip install` to install the wheel file that matches the Python version on your host machine. - -## Build in localhost (optional) - -Firstly, please make sure gcc version is no less than 9, which can be conformed by `gcc --version`. - -Then, follow the steps below to set up the compilation environment: - -- install the dependent packages: - ```shell - pip install -r requirements.txt - apt-get install rapidjson-dev - ``` -- install [nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), and set environment variables: - ```shell - export NCCL_ROOT_DIR=/path/to/nccl - export NCCL_LIBRARIES=/path/to/nccl/lib - ``` -- install openmpi from source: - ```shell - wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz - tar xf openmpi-4.1.5.tar.gz - cd openmpi-4.1.5 - ./configure --prefix=/usr/local/openmpi - make -j$(nproc) && make install - export PATH=$PATH:/usr/local/openmpi/bin - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib - ``` -- build and install lmdeploy libraries: - ```shell - # install ninja - apt install ninja-build - # the home folder of lmdeploy - cd lmdeploy - mkdir build && cd build - sh ../generate.sh - ninja -j$(nproc) && ninja install - ``` -- install lmdeploy python package: - ```shell - cd .. - pip install -e . - ``` diff --git a/docs/en/installation.md b/docs/en/installation.md index 9e4ad4d95..d1333f45a 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -32,7 +32,7 @@ pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_V The release frequency of LMDeploy is approximately once or twice monthly. 
If your desired feature has been merged to LMDeploy main branch but hasn't been published yet, you can experiment with the nightly-built package available [here](https://github.com/zhyncs/lmdeploy-build) according to your CUDA and Python versions -## Install from the source +## Install from source If you are using the PyTorch Engine for inference, the installation from the source is quite simple: diff --git a/docs/zh_cn/advance/debug_turbomind.md b/docs/zh_cn/advance/debug_turbomind.md index 190600c1f..cb95c6ef4 100644 --- a/docs/zh_cn/advance/debug_turbomind.md +++ b/docs/zh_cn/advance/debug_turbomind.md @@ -4,7 +4,7 @@ Turbomind 使用 C++ 实现,不像 Python 一样易于调试。该文档提供 ## 前置工作 -首先,根据构建[命令](../build.md)完成本地编译。 +首先,根据构建[命令](../installation.md)完成源码编译和安装。 ## 配置 Python 调试环境 diff --git a/docs/zh_cn/benchmark/evaluate_with_opencompass.md b/docs/zh_cn/benchmark/evaluate_with_opencompass.md index d12a82f11..d45c8b28a 100644 --- a/docs/zh_cn/benchmark/evaluate_with_opencompass.md +++ b/docs/zh_cn/benchmark/evaluate_with_opencompass.md @@ -8,11 +8,7 @@ LMDeploy设计了TurboMind推理引擎用来加速大模型推理,其推理精 ### 安装 lmdeploy -使用 pip (python 3.8+) 安装 LMDeploy,或者[源码安装](../build.md) - -```shell -pip install lmdeploy -``` +请参考[安装指南](../installation.md)安装 lmdeploy ### 安装 OpenCompass diff --git a/docs/zh_cn/build.md b/docs/zh_cn/build.md deleted file mode 100644 index 48145ec0a..000000000 --- a/docs/zh_cn/build.md +++ /dev/null @@ -1,86 +0,0 @@ -# 编译和安装 - -LMDeploy 提供了预编译包,可以很方便的通过 `pip install lmdeploy` 安装和使用。 - -如果有源码编译的需求,请先下载 lmdeploy 源码: - -```shell -git clone --depth=1 https://github.com/InternLM/lmdeploy -``` - -然后,参考以下章节编译和安装。 - -## 在 docker 内编译安装(强烈推荐) - -LMDeploy 提供了编译镜像 `openmmlab/lmdeploy-builder:cuda11.8`。使用之前,请确保 docker 已安装。 - -在 lmdeploy 源码的根目录下,运行以下命令: - -```shell -# lmdeploy 源码根目录 -cd lmdeploy -bash builder/manywheel/build_all_wheel.sh -``` - -即可在 `builder/manywheel/cuda11.8_dist` 文件夹下,得到 lmdeploy 在 py3.8 - py3.11 下所有的 wheel 文件。比如, - -```text -builder/manywheel/cuda11.8_dist/ -├── lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp311-cp311-manylinux2014_x86_64.whl -├── lmdeploy-0.0.12-cp38-cp38-manylinux2014_x86_64.whl -└── lmdeploy-0.0.12-cp39-cp39-manylinux2014_x86_64.whl -``` - -如果需要固定 python 版本的 wheel 文件,比如 py3.8,可以执行: - -```shell -bash builder/manywheel/build_wheel.sh py38 manylinux2014_x86_64 cuda11.8 cuda11.8_dist -``` - -wheel 文件存放在目录 `builder/manywheel/cuda11.8_dist` 下。 - -在宿主机上,通过 `pip install` 安装和宿主机python版本一致的 wheel 文件,即完成 lmdeploy 整个编译安装过程。 - -## 在物理机上编译安装(可选) - -首先,请确保物理机环境的 gcc 版本不低于 9,可以通过`gcc --version`确认。 - -然后,按如下步骤,配置编译环境: - -- 安装编译和运行依赖包: - ```shell - pip install -r requirements.txt - apt-get install rapidjson-dev - ``` -- 安装 [nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html),设置环境变量 - ```shell - export NCCL_ROOT_DIR=/path/to/nccl - export NCCL_LIBRARIES=/path/to/nccl/lib - ``` -- 源码编译安装 openmpi: - ```shell - wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz - tar xf openmpi-4.1.5.tar.gz - cd openmpi-4.1.5 - ./configure --prefix=/usr/local/openmpi - make -j$(nproc) && make install - export PATH=$PATH:/usr/local/openmpi/bin - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib - ``` -- lmdeploy 编译安装: - ```shell - # 安装更快的 Ninja - apt install ninja-build - # lmdeploy 源码的根目录 - cd lmdeploy - mkdir build && cd build - sh ../generate.sh - ninja && ninja install - ninja -j$(nproc) && ninja install - ``` -- 安装 lmdeploy python package: - ```shell - cd .. - pip install -e . 
- ``` From 51b1df830f1281d41c83ea2802893f9807933388 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Wed, 7 Aug 2024 15:26:09 +0800 Subject: [PATCH 24/24] debug --- docs/en/conf.py | 8 ++++---- docs/zh_cn/conf.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/en/conf.py b/docs/en/conf.py index 18a1b7d1d..c24e6ab6f 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -106,16 +106,16 @@ 'path_to_docs': 'docs/en', 'repository_url': 'https://github.com/InternLM/lmdeploy', 'repository_branch': 'main', - 'show_navbar_depth': 3, - 'max_navbar_depth': 4, - 'collapse_navbar': True, + # 'show_navbar_depth': 3, + # 'navigation_depth': 4, + # 'collapse_navigation': False, 'use_edit_page_button': True, 'use_source_button': True, 'use_issues_button': True, 'use_repository_button': True, 'use_download_button': True, 'use_sidenotes': True, - 'show_toc_level': 2, + # 'show_toc_level': 2, # "icon_links": [ # { # "name": "切换至简体中文", diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 21e96f2dc..6804c626c 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -107,16 +107,16 @@ 'path_to_docs': 'docs/zh_cn', 'repository_url': 'https://github.com/InternLM/lmdeploy', 'repository_branch': 'main', - 'show_navbar_depth': 3, - 'max_navbar_depth': 4, - 'collapse_navbar': True, + # 'show_navbar_depth': 3, + # 'navigation_depth': 4, + # 'collapse_navigation': True, 'use_edit_page_button': True, 'use_source_button': True, 'use_issues_button': True, 'use_repository_button': True, 'use_download_button': True, 'use_sidenotes': True, - 'show_toc_level': 2, + # 'show_toc_level': 2, # "icon_links": [ # { # "name": "Switch to English",
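
To make the final "debug" commit easier to read, the `html_theme_options` excerpt of `docs/en/conf.py` that results from the hunk above is sketched below. It is reassembled from the diff only; keys outside the shown hunk (for example the commented-out `icon_links` entries) are elided.

```python
# docs/en/conf.py (excerpt): state after PATCH 24, reassembled from the hunk above.
# The navigation/TOC depth options are left commented out while the sidebar
# behaviour is being debugged; the repository/edit buttons stay enabled.
html_theme_options = {
    'path_to_docs': 'docs/en',
    'repository_url': 'https://github.com/InternLM/lmdeploy',
    'repository_branch': 'main',
    # 'show_navbar_depth': 3,
    # 'navigation_depth': 4,
    # 'collapse_navigation': False,
    'use_edit_page_button': True,
    'use_source_button': True,
    'use_issues_button': True,
    'use_repository_button': True,
    'use_download_button': True,
    'use_sidenotes': True,
    # 'show_toc_level': 2,
    # ... remaining keys unchanged ...
}
```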