add docs

Signed-off-by: Frost Ming <me@frostming.com>
bentoml · May 16, 2023 · 7bd9b78 · 7bd9b78
1 parent 70b8d5b
commit 7bd9b78
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 13 deletions.
diff --git a/docs/source/concepts/runner.rst b/docs/source/concepts/runner.rst
@@ -418,10 +418,12 @@ can be specified for the ``nvidia.com/gpu`` key. For example, the following conf
 
 For the detailed information on the meaning of each resource allocation configuration, see :doc:`/guides/scheduling`.
 
-Timeout
-^^^^^^^
+Traffic Control
+^^^^^^^^^^^^^^^
 
-Runner timeout defines the amount of time in seconds to wait before calls a runner is timed out on the API server.
+As with the API server, you can configure the traffic settings either for all runners or for an individual runner.
+Specifically, ``traffic.timeout`` defines the amount of time in seconds that the runner will wait for a response from the model before timing out.
+``traffic.max_concurrency`` defines the maximum number of concurrent requests the runner will accept before returning an error.
 
 .. tab-set::
 
@@ -432,7 +434,9 @@ Runner timeout defines the amount of time in seconds to wait before calls a runn
           :caption: ⚙️ `configuration.yml`
 
           runners:
-            timeout: 60
+            traffic:
+              timeout: 60
+              max_concurrency: 10
 
     .. tab-item:: Individual Runner
        :sync: individual_runner
@@ -442,7 +446,9 @@ Runner timeout defines the amount of time in seconds to wait before calls a runn
 
           runners:
             iris_clf:
-              timeout: 60
+              traffic:
+                timeout: 60
+                max_concurrency: 10
 
 Access Logging
 ^^^^^^^^^^^^^^

diff --git a/docs/source/guides/configuration.rst b/docs/source/guides/configuration.rst
@@ -152,7 +152,7 @@ The following options are available for the ``api_server`` section:
 +=============+=============================================================+=================================================+
 | ``workers`` | Number of API workers for to spawn                          | null [#default_workers]_                        |
 +-------------+-------------------------------------------------------------+-------------------------------------------------+
-| ``timeout`` | Timeout for API server in seconds                           | 60                                              |
+| ``traffic`` | Traffic control for API server                              | See :ref:`guides/configuration:\`\`traffic\`\`` |
 +-------------+-------------------------------------------------------------+-------------------------------------------------+
 | ``backlog`` | Maximum number of connections to hold in backlog            | 2048                                            |
 +-------------+-------------------------------------------------------------+-------------------------------------------------+
@@ -169,6 +169,27 @@ The following options are available for the ``api_server`` section:
 | ``tracing`` | Key and values to configure tracing exporter for API server | See :doc:`/guides/tracing`                      |
 +-------------+-------------------------------------------------------------+-------------------------------------------------+
 
+``traffic``
+"""""""""""
+
+You can control the traffic of the API server by setting the ``traffic`` field.
+
+To set the maximum number of seconds to wait before a response is received, set ``api_server.traffic.timeout``. The default value is ``60`` seconds:
+
+.. code-block:: yaml
+
+   api_server:
+     traffic:
+       timeout: 120
+
+To set the maximum number of requests in the process queue across all runners, set ``api_server.traffic.max_concurrency``. By default, there is no limit:
+
+.. code-block:: yaml
+
+   api_server:
+     traffic:
+       max_concurrency: 50
+
 ``metrics``
 """""""""""
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -269,7 +269,6 @@ target-version = "py310"
 convention = "google"
 
 [tool.ruff.isort]
-lines-after-imports = 2
 force-single-line = true
 known-first-party = ["bentoml"]
 

diff --git a/tests/e2e/bento_server_http/tests/test_serve.py b/tests/e2e/bento_server_http/tests/test_serve.py
@@ -1,7 +1,7 @@
-import asyncio
 import os
 import sys
 import time
+import asyncio
 
 import pytest
 

diff --git a/tests/unit/_internal/configuration/test_containers.py b/tests/unit/_internal/configuration/test_containers.py
@@ -145,14 +145,16 @@ def test_runner_gpu_configuration(
 def test_runner_timeouts(container_from_file: t.Callable[[str], ConfigDictType]):
     RUNNER_TIMEOUTS = """\
 runners:
-    timeout: 50
+    traffic:
+        timeout: 50
     test_runner_1:
-        timeout: 100
+        traffic:
+            timeout: 100
     test_runner_2:
         resources: system
 """
     bentoml_cfg = container_from_file(RUNNER_TIMEOUTS)
     runner_cfg = bentoml_cfg["runners"]
-    assert runner_cfg["timeout"] == 50
-    assert runner_cfg["test_runner_1"]["timeout"] == 100
-    assert runner_cfg["test_runner_2"]["timeout"] == 50
+    assert runner_cfg["traffic"]["timeout"] == 50
+    assert runner_cfg["test_runner_1"]["traffic"]["timeout"] == 100
+    assert runner_cfg["test_runner_2"]["traffic"]["timeout"] == 50