update makefile and docs

michaelfeil · Sep 24, 2024 · c71b20d · c71b20d
1 parent 228bf17
commit c71b20d
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 2 deletions.
diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md
@@ -5,3 +5,47 @@ Note: The section below is auto-generated by the makefile.
 
 ```bash
 infinity_emb v2 --help
+
+ Usage: infinity_emb v2 [OPTIONS]                                                                                                                                                                                                                                             
+
+ Infinity API ♾️  cli v2. MIT License. Copyright (c) 2023-now Michael Feil                                                                                                                                                                                                     
+ Multiple Model CLI Playbook:                                                                                                                                                                                                                                                 
+ - 1. cli options can be overloaded, i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4`
+ - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;"                                                                                                                                 
+ - 3. single items are broadcast to `--model-id` length, so with `v2 --model-id model/id1 --model-id model/id2 --batch-size 8` both models have batch-size 8.
+
+╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ --model-id                                             TEXT                                       Huggingface model repo id. Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference& [env var: `INFINITY_MODEL_ID`]                      │
+│                                                                                                   [default: michaelfeil/bge-small-en-v1.5]                                                                                                                                 │
+│ --served-model-name                                    TEXT                                       the nickname for the API, under which the model_id can be selected [env var: `INFINITY_SERVED_MODEL_NAME`]                                                               │
+│ --batch-size                                           INTEGER                                    maximum batch size for inference [env var: `INFINITY_BATCH_SIZE`] [default: 32]                                                                                          │
+│ --revision                                             TEXT                                       huggingface  model repo revision. [env var: `INFINITY_REVISION`]                                                                                                         │
+│ --trust-remote-code       --no-trust-remote-code                                                  if potential remote modeling code from huggingface repo is trusted. [env var: `INFINITY_TRUST_REMOTE_CODE`] [default: trust-remote-code]                                 │
+│ --engine                                               [torch|ctranslate2|optimum|debugengine]    Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU.                       │
+│                                                                                                   [env var: `INFINITY_ENGINE`]                                                                                                                                             │
+│                                                                                                   [default: torch]                                                                                                                                                         │
+│ --model-warmup            --no-model-warmup                                                       if model should be warmed up after startup, and before ready. [env var: `INFINITY_MODEL_WARMUP`] [default: model-warmup]                                                 │
+│ --vector-disk-cache       --no-vector-disk-cache                                                  If hash(request)/results should be cached to SQLite for latency improvement. [env var: `INFINITY_VECTOR_DISK_CACHE`] [default: vector-disk-cache]                        │
+│ --device                                               [cpu|cuda|mps|tensorrt|auto]               device to use for computing the model forward pass. [env var: `INFINITY_DEVICE`] [default: auto]                                                                         │
+│ --lengths-via-tokenize    --no-lengths-via-tokenize                                               if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy. [env var: `INFINITY_LENGTHS_VIA_TOKENIZE`]                              │
+│                                                                                                   [default: lengths-via-tokenize]                                                                                                                                          │
+│ --dtype                                                [float32|float16|int8|fp8|auto]            dtype for the model weights. [env var: `INFINITY_DTYPE`] [default: auto]                                                                                                 │
+│ --embedding-dtype                                      [float32|int8|uint8|binary|ubinary]        dtype post-forward pass. If != `float32`, using Post-Forward Static quantization. [env var: `INFINITY_EMBEDDING_DTYPE`] [default: float32]                               │
+│ --pooling-method                                       [mean|cls|auto]                            overwrite the pooling method if inferred incorrectly. [env var: `INFINITY_POOLING_METHOD`] [default: auto]                                                               │
+│ --compile                 --no-compile                                                            Enable usage of `torch.compile(dynamic=True)` if engine relies on it. [env var: `INFINITY_COMPILE`] [default: compile]                                                   │
+│ --bettertransformer       --no-bettertransformer                                                  Enables varlen flash-attention-2 via the `BetterTransformer` implementation. If available for this model. [env var: `INFINITY_BETTERTRANSFORMER`]                        │
+│                                                                                                   [default: bettertransformer]                                                                                                                                             │
+│ --preload-only            --no-preload-only                                                       If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile. [env var: `INFINITY_PRELOAD_ONLY`]               │
+│                                                                                                   [default: no-preload-only]                                                                                                                                               │
+│ --host                                                 TEXT                                       host for the FastAPI uvicorn server [env var: `INFINITY_HOST`] [default: 0.0.0.0]                                                                                        │
+│ --port                                                 INTEGER                                    port for the FastAPI uvicorn server [env var: `INFINITY_PORT`] [default: 7997]                                                                                           │
+│ --url-prefix                                           TEXT                                       prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API. [env var: `INFINITY_URL_PREFIX`]                                   │
+│ --redirect-slash                                       TEXT                                       where to redirect `/` requests to. [env var: `INFINITY_REDIRECT_SLASH`] [default: /docs]                                                                                 │
+│ --log-level                                            [critical|error|warning|info|debug|trace]  console log level. [env var: `INFINITY_LOG_LEVEL`] [default: info]                                                                                                       │
+│ --permissive-cors         --no-permissive-cors                                                    whether to allow permissive cors. [env var: `INFINITY_PERMISSIVE_CORS`] [default: no-permissive-cors]                                                                    │
+│ --api-key                                              TEXT                                       api_key used for authentication headers. [env var: `INFINITY_API_KEY`]                                                                                                   │
+│ --proxy-root-path                                      TEXT                                       Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/ [env var: `INFINITY_PROXY_ROOT_PATH`]                                       │
+│ --help                                                                                            Show this message and exit.                                                                                                                                              │
+╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+
+```
diff --git a/libs/infinity_emb/Makefile b/libs/infinity_emb/Makefile
@@ -58,13 +58,14 @@ benchmark_embed: tests/data/benchmark/benchmark_embed.json
 	http://127.0.0.1:7997/embeddings
 	# sudo apt-get apache2-utils
 
+# Generate CLI v2 documentation
 cli_v2_docs:
 	@echo 'Generating CLI v2 documentation...'
 	@echo '# CLI v2 Documentation' > ../../docs/docs/cli_v2.md
-	@echo '' >> ../../docs/docs/cli_v2.md
+	@echo >> ../../docs/docs/cli_v2.md
 	@echo 'The current version of Infinity uses the following arguments in its CLI:' >> ../../docs/docs/cli_v2.md
 	@echo 'Note: The section below is auto-generated by the makefile.' >> ../../docs/docs/cli_v2.md
-	@echo '' >> ../../docs/docs/cli_v2.md
+	@echo >> ../../docs/docs/cli_v2.md
 	@echo '```bash' >> ../../docs/docs/cli_v2.md
 	@echo '$ infinity_emb v2 --help' >> ../../docs/docs/cli_v2.md
 	poetry run infinity_emb v2 --help >> ../../docs/docs/cli_v2.md