Commit

Merge branch 'main' into mrwyattii/add-sd-tests
loadams authored Feb 26, 2024
2 parents 44e0b49 + 8036370 commit 40dc447
Showing 26 changed files with 955 additions and 51 deletions.
13 changes: 13 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,13 @@
version: "2"

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.10"

python:
  install:
    - requirements: docs/requirements.txt

sphinx:
  configuration: docs/source/conf.py
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
35 changes: 35 additions & 0 deletions docs/make.bat
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
14 changes: 14 additions & 0 deletions docs/requirements.txt
@@ -0,0 +1,14 @@
asyncio
autodoc_pydantic<2.0.0
deepspeed>=0.13.0
grpcio
grpcio-tools
sphinx==7.1.2
sphinx-prompt
sphinx-rtd-theme==1.3.0rc1
sphinx_autodoc_typehints
sphinx_copybutton
torch
transformers
ujson
zmq
25 changes: 25 additions & 0 deletions docs/source/api.rst
@@ -0,0 +1,25 @@
API
===

DeepSpeed-MII provides a very simple API to deploy your LLM:

.. autofunction:: mii.pipeline

The :func:`mii.pipeline` API is a great way to try DeepSpeed-MII with ragged
batching and dynamic splitfuse. The pipeline is non-persistent and only exists
for the lifetime of the Python script where it is used. For examples of how to
use :func:`mii.pipeline`, please see :doc:`pipeline`.
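
For illustration, a minimal sketch of a non-persistent pipeline (mirroring the
usage shown elsewhere in these docs; the model name is only an example):

.. code-block:: python

   import mii

   # The pipeline exists only for the lifetime of this script.
   pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")
   response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
   print(response)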

.. autofunction:: mii.serve

The :func:`mii.serve` API is intended for production use cases, where a
persistent model deployment is necessary. The persistent deployment utilizes
ragged batching and dynamic splitfuse to deliver high throughput and low latency
to multiple clients in parallel. For examples of how to use :func:`mii.serve`,
please see :doc:`deployment`.
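
For illustration, a minimal sketch of standing up a persistent deployment (the
full walkthrough is in :doc:`deployment`):

.. code-block:: python

   import mii

   # Stands up a gRPC server that outlives this script.
   client = mii.serve("mistralai/Mistral-7B-v0.1")
   response = client(["DeepSpeed is"], max_new_tokens=128)
   print(response)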

.. autofunction:: mii.client

The :func:`mii.client` API allows multiple processes to connect to a persistent
deployment created with :func:`mii.serve`. For examples of how to use
:func:`mii.client`, please see :doc:`deployment`.
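
For illustration, a minimal sketch of connecting to an existing deployment from
another process (assuming the deployment above is still running):

.. code-block:: python

   import mii

   # Connects to the already-running gRPC server for this model.
   client = mii.client("mistralai/Mistral-7B-v0.1")
   response = client(["Seattle is"], max_new_tokens=128)
   print(response)
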
79 changes: 79 additions & 0 deletions docs/source/conf.py
@@ -0,0 +1,79 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
# Configuration file for the Sphinx documentation builder.
import os
import sys

sys.path.insert(0, os.path.abspath('../../'))

# -- Project information

project = 'DeepSpeed-MII'
copyright = '2023, Microsoft'
author = 'Microsoft'

with open("../../version.txt", "r") as f:
    release = f.readline().rstrip()

# -- General configuration

extensions = [
    'sphinx.ext.duration',
    'sphinx.ext.doctest',
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.intersphinx',
    'sphinx.ext.viewcode',
    'sphinx_autodoc_typehints',
    'sphinx_copybutton',
    'sphinx-prompt',
    'sphinxcontrib.autodoc_pydantic',
]

intersphinx_mapping = {
    'python': ('https://docs.python.org/3/', None),
    'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
}
intersphinx_disabled_domains = ['std']

# sphinx_autodoc_typehints config
typehints_defaults = "braces"

# autodoc_pydantic config
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_field_signature_prefix = ' '
autodoc_pydantic_model_signature_prefix = 'class'
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_config_member = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_summary_list_order = 'bysource'
autodoc_pydantic_model_member_order = 'bysource'
autodoc_pydantic_field_list_validators = False

# sphinx_copybutton config
copybutton_prompt_text = r">>> |\$ |\(.venv\) \$ "
copybutton_prompt_is_regexp = True

#autodoc_mock_imports = ["deepspeed", "torch"]
autodoc_member_order = 'bysource'
autosummary_generate = True

templates_path = ['_templates']

# -- Options for HTML output

html_theme = 'sphinx_rtd_theme'
html_theme_options = {
    "logo_only": True,
}
html_logo = "../images/mii-dark.svg"
logo_only = True

# -- Options for EPUB output
epub_show_urls = 'footnote'
72 changes: 72 additions & 0 deletions docs/source/config.rst
@@ -0,0 +1,72 @@
Configuration
=============

The config classes described here are used to customize :doc:`pipeline` and :doc:`deployment`.

.. _model_configuration:

Model Configuration
-------------------

The :class:`ModelConfig <mii.config.ModelConfig>` is used to stand up a
DeepSpeed inference engine and provides a large amount of control to users. This
class is automatically generated from user-provided arguments to
:func:`mii.pipeline` and :func:`mii.serve`. The fields can be provided in a
``model_config`` dictionary or as keyword arguments.

For example, to change the default ``max_length`` for token generation of a
pipeline, the following are equivalent:

As a keyword argument:

.. code-block:: python

   pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", max_length=2048)

As a ``model_config`` dictionary:

.. code-block:: python

   pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", model_config={"max_length": 2048})

.. autopydantic_model:: mii.config.ModelConfig

.. _mii_configuration:

MII Server Configuration
------------------------

The :class:`MIIConfig <mii.config.MIIConfig>` is used to stand up a
DeepSpeed-MII `gRPC <https://grpc.io/>`_ server and provides a large amount of
control to users. This class is automatically generated from user-provided
arguments to :func:`mii.serve`. The fields can be provided in a ``mii_config``
dictionary or as keyword arguments.

For example, to change the base port number used to communicate with a
persistent deployment and the default ``max_length`` for token generation, the
following are equivalent:

As keyword arguments:

.. code-block:: python

   client = mii.serve("mistralai/Mistral-7B-v0.1", port_number=50055, max_length=2048)

As ``model_config`` and ``mii_config`` dictionaries:

.. code-block:: python

   client = mii.serve("mistralai/Mistral-7B-v0.1", mii_config={"port_number": 50055}, model_config={"max_length": 2048})

.. autopydantic_model:: mii.config.MIIConfig

Text-Generation Configuration
-----------------------------

The :class:`GenerateParamsConfig <mii.config.GenerateParamsConfig>` is used to
process user-provided keyword arguments passed to :class:`MIIPipeline
<mii.batching.ragged_batching.MIIPipeline>` and :class:`MIIClient
<mii.backend.client.MIIClient>` when doing text-generation.
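
For illustration, a minimal sketch of overriding a generation parameter on a
per-request basis (assuming a client created with :func:`mii.client` as shown
in :doc:`deployment`):

.. code-block:: python

   # Per-request generation options override the deployment defaults.
   response = client(["DeepSpeed is"], max_new_tokens=64)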

.. autopydantic_model:: mii.config.GenerateParamsConfig
   :exclude-members: prompt_length
128 changes: 128 additions & 0 deletions docs/source/deployment.rst
@@ -0,0 +1,128 @@
Persistent Deployments
======================

A persistent model deployment can be created with the :func:`mii.serve` API. This
stands up a gRPC server and returns a :class:`MIIClient
<mii.backend.client.MIIClient>` object that can be used to send generation
requests to the inference server. The inference server will persist after the
Python script exits, until it is explicitly terminated.

To connect to an existing deployment, the :func:`mii.client` API is used. This
will connect with an existing gRPC server and return a :class:`MIIClient
<mii.backend.client.MIIClient>` object.

MIIClient
---------

.. autoclass:: mii.backend.client.MIIClient

   .. automethod:: __call__

   .. automethod:: generate

   .. automethod:: terminate_server

:class:`MIIClient <mii.backend.client.MIIClient>` is a callable class that
provides a simplified interface for generating text from prompt inputs on a
persistent model deployment. To create a persistent deployment, you only need to
provide the HuggingFace model name (or a path to a locally stored model) to the
:func:`mii.serve` API. DeepSpeed-MII will automatically load the model weights,
create an inference engine, stand up a gRPC server, and return the callable
client. An example is provided below:

.. code-block:: python

   import mii
   client = mii.serve("mistralai/Mistral-7B-v0.1")
   response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
   print(response)

Because the deployment is persistent, this server will continue running until it
is explicitly shut down. This allows users to connect to a deployment from other
processes using the :func:`mii.client` API:

.. code-block:: python

   import mii
   client = mii.client("mistralai/Mistral-7B-v0.1")
   response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
   print(response)

When a server needs to be shut down, this can be done from any client object:

.. code-block:: python

   import mii
   client = mii.client("mistralai/Mistral-7B-v0.1")
   client.terminate_server()

Deployment Configuration
------------------------

While we prioritize offering a simple interface for loading models into
production-ready persistent deployments, we also provide many configuration
options for our persistent deployment.

**Any of the fields in** :class:`ModelConfig <mii.config.ModelConfig>` **and**
:class:`MIIConfig <mii.config.MIIConfig>` **can be passed as keyword
arguments or in respective** ``model_config`` **and** ``mii_config``
**dictionaries to the** :func:`mii.serve` **API. Please see** :ref:`Model
Configuration <model_configuration>` **and** :ref:`MII Server Configuration
<mii_configuration>` **for more information.**
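
For illustration, a minimal sketch of passing both dictionaries to
:func:`mii.serve` (mirroring the example in :ref:`MII Server Configuration
<mii_configuration>`):

.. code-block:: python

   client = mii.serve(
       "mistralai/Mistral-7B-v0.1",
       mii_config={"port_number": 50055},
       model_config={"max_length": 2048},
   )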


Generate Options
----------------

Text-generation behavior using the callable :class:`MIIClient
<mii.backend.client.MIIClient>` class can be customized with several keyword
arguments. A full list of the available options can be found in
:class:`GenerateParamsConfig <mii.config.GenerateParamsConfig>`.

The generate options only affect the prompt(s) passed in a given call to the
client. For example, the generation length can be controlled on a per-prompt
basis and override the default ``max_length``:

.. code-block:: python

   response_long = client(prompt, max_length=1024)
   response_short = client(prompt, max_length=128)

.. _deployment_model_parallelism:

Model Parallelism
-----------------

Our persistent deployment supports splitting models across multiple GPUs using
tensor parallelism. To enable model parallelism, pass the ``tensor_parallel``
argument to :func:`mii.serve`:

.. code-block:: python

   client = mii.serve("mistralai/Mistral-7B-v0.1", tensor_parallel=2)

.. _deployment_model_replicas:

Model Replicas
--------------

The persistent deployment can also create multiple model replicas. Passing the
``replica_num`` argument to :func:`mii.serve` enables this feature:

.. code-block:: python

   client = mii.serve("mistralai/Mistral-7B-v0.1", replica_num=2)

With multiple model replicas, incoming requests from clients will be forwarded
to the replicas in round-robin fashion by an intermediate load-balancer process.
For example, if 4 requests with ids ``0, 1, 2, 3`` are sent to the persistent
deployment, then ``replica 0`` will process requests ``0`` and ``2`` while
``replica 1`` will process requests ``1`` and ``3``.

Model replicas also compose with model parallelism. For example, 2 replicas can
be created, each split across 2 GPUs, on a system with 4 GPUs total:

.. code-block:: python

   client = mii.serve("mistralai/Mistral-7B-v0.1", replica_num=2, tensor_parallel=2)