improve config and .env readability #52

Merged · 3 commits · May 15, 2023
80 changes: 79 additions & 1 deletion .env
@@ -1,18 +1,96 @@
# --------------------------------------- ⚙️ WORDCAB TRANSCRIBE CONFIGURATION ---------------------------------------- #
#
# ---------------------------------------------- GENERAL CONFIGURATION ----------------------------------------------- #
#
# The name of the project, used for API documentation.
PROJECT_NAME="Wordcab Transcribe"
# The version of the project, used for API documentation.
VERSION="0.2.0"
# The description of the project, used for API documentation.
DESCRIPTION="💬 ASR FastAPI server using faster-whisper and NVIDIA NeMo."
# This prefix is applied to all API endpoints except the status and cortex endpoints.
API_PREFIX="/api/v1"
# Debug mode for FastAPI. It allows for hot reloading when code changes in development.
DEBUG=True
#
# ----------------------------------------------- BATCH CONFIGURATION ------------------------------------------------ #
#
# The batch_size parameter is used to control the number of audio files that are processed in parallel.
# If your server GPU has a lot of memory, you can increase this value to improve performance.
# For simplicity, we recommend leaving this value at 1, unless you are sure that your GPU has enough memory (> 40GB).
BATCH_SIZE=1
# The max_wait parameter controls the maximum amount of time (in seconds) the server will wait for additional tasks
# before processing the queue. It only has an effect when batch_size is greater than 1.
MAX_WAIT=0.1
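The batch_size/max_wait interaction described above can be sketched as follows. This is an illustrative Python sketch, not the server's actual implementation; `collect_batch` and the queue-based design are assumptions:

```python
import queue
import time


def collect_batch(task_queue: "queue.Queue", batch_size: int = 1, max_wait: float = 0.1) -> list:
    """Pull up to batch_size items from the queue, waiting at most
    max_wait seconds once the first item has arrived (sketch only)."""
    batch = [task_queue.get()]  # block until at least one task is available
    deadline = time.monotonic() + max_wait
    while len(batch) < batch_size:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            batch.append(task_queue.get(timeout=remaining))
        except queue.Empty:
            break
    return batch
```

With `batch_size=1` the loop never waits, which is why `max_wait` is irrelevant in the default configuration.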
#
# ----------------------------------------------- MODELS CONFIGURATION ----------------------------------------------- #
#
# ----------------------------------------------------- WHISPER ------------------------------------------------------ #
#
# The whisper_model parameter is used to control the model used for ASR.
#
# Cloud models:
# The available models are: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, or large-v2
# You can try different model sizes, but expect a trade-off between accuracy and speed.
#
# Local models:
# You can also link a local folder path to use a custom model. If you do so, you should also mount the folder in the
# docker run command as a volume.
# e.g. WHISPER_MODEL="/app/models/custom"
# docker cmd: -v /path/to/custom/model:/app/models/custom
WHISPER_MODEL="large-v2"
# The compute_type parameter is used to control the precision of the model. You can choose between:
# "int8", "int8_float16", "int16", "float16". The default value is "int8_float16", which is the fastest option with
# minimal loss in accuracy when using the `large-v2` model.
COMPUTE_TYPE="int8_float16"
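A small sketch of how this setting could be read and validated before being handed to the model loader. `VALID_COMPUTE_TYPES` and `resolve_compute_type` are hypothetical names, not part of the project, and the allowed-values list is assumed from the comment above:

```python
import os

# Assumed set of precision options, taken from the .env comment above;
# check your installed backend for the authoritative list.
VALID_COMPUTE_TYPES = {"int8", "int8_float16", "int16", "float16"}


def resolve_compute_type(default: str = "int8_float16") -> str:
    """Read COMPUTE_TYPE from the environment and validate it (sketch)."""
    value = os.getenv("COMPUTE_TYPE", default)
    if value not in VALID_COMPUTE_TYPES:
        raise ValueError(
            f"COMPUTE_TYPE must be one of {sorted(VALID_COMPUTE_TYPES)}, got {value!r}"
        )
    return value
```

Failing fast on a typo like `float_16` is cheaper than discovering it at model-load time.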
#
# --------------------------------------------------- NVIDIA NEMO ---------------------------------------------------- #
#
# The nemo_domain_type parameter defines the configuration file used by the model for diarization. The available
# options are: `general`, `meeting` and `telephonic`. The default value is `telephonic`. If you choose another type,
# you will need to provide a custom model.
NEMO_DOMAIN_TYPE="telephonic"
# The nemo_storage_path parameter is used to control the path where the NeuralDiarizer from the NeMo toolkit will
# store the diarization models.
NEMO_STORAGE_PATH="nemo_storage"
# The nemo_output_path parameter is used to control the path where the NeuralDiarizer from the NeMo toolkit will
# store the diarization outputs.
NEMO_OUTPUT_PATH="nemo_outputs"
#
# ---------------------------------------------- ASR TYPE CONFIGURATION ---------------------------------------------- #
#
# The asr_type parameter is used to control the type of ASR used. The available options are: `async` or `live`.
# * `async` is the default option. It will process the audio files in batches, and return the results when all the
# files are processed.
# * `live` is the option to use when you want to process a live audio stream. It will process the audio in chunks,
#   and return the results as soon as they are available. The live option is still under development.
# Use `live` only if you need live results; otherwise, use `async`.
ASR_TYPE="async"
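The real check lives in the validator in `wordcab_transcribe/config.py` (visible further down in this diff); a hypothetical standalone version of the same rule, for illustration:

```python
def validate_asr_type(value: str) -> str:
    """Reject anything other than the two supported ASR modes (sketch;
    mirrors the asr_type validator in wordcab_transcribe/config.py)."""
    if value not in ("async", "live"):
        raise ValueError(f"asr_type must be 'async' or 'live', got {value!r}")
    return value
```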
#
# --------------------------------------------- ENDPOINTS CONFIGURATION ---------------------------------------------- #
#
# Include the `audio` endpoint in the API. This endpoint is used to process uploaded local audio files.
AUDIO_FILE_ENDPOINT=True
# Include the `audio-url` endpoint in the API. This endpoint is used to process audio files from a URL.
AUDIO_URL_ENDPOINT=True
# Include the cortex endpoint in the API. This endpoint is used to process audio files from the Cortex API.
# Use this only if you deploy the API using Cortex and Kubernetes.
CORTEX_ENDPOINT=True
# Include the `youtube` endpoint in the API. This endpoint is used to process audio files from YouTube URLs.
YOUTUBE_ENDPOINT=True
# Include the `live` endpoint in the API. This endpoint is used to process live audio streams.
LIVE_ENDPOINT=False
#
# ---------------------------------------------- CORTEX CONFIGURATION ------------------------------------------------ #
#
# The cortex_api_key parameter sets the API key used to authenticate requests to the cortex endpoint.
WORDCAB_TRANSCRIBE_API_KEY=
#
# ----------------------------------------------- SVIX CONFIGURATION ------------------------------------------------- #
#
# The svix_api_key parameter is used in the cortex implementation to enable webhooks.
SVIX_API_KEY=
# The svix_app_id parameter is used in the cortex implementation to enable webhooks.
SVIX_APP_ID=
#
# -------------------------------------------------------------------------------------------------------------------- #
14 changes: 14 additions & 0 deletions README.md
@@ -41,6 +41,20 @@ docker run -d --name wordcab-transcribe \
wordcab-transcribe:latest
```

You can mount a volume to the container to load local whisper models.

If you mount a volume, update the `WHISPER_MODEL` environment variable in the `.env` file accordingly.

```bash
docker run -d --name wordcab-transcribe \
--gpus all \
--shm-size 1g \
--restart unless-stopped \
-p 5001:5001 \
-v /path/to/whisper/models:/app/whisper/models \
wordcab-transcribe:latest
```

## Test the API

Once the container is running, you can test the API.
26 changes: 19 additions & 7 deletions wordcab_transcribe/config.py
@@ -26,32 +26,34 @@
class Settings:
"""Configuration settings for the Wordcab Transcribe API."""

# Basic API settings
# General configuration
project_name: str
version: str
description: str
api_prefix: str
debug: bool
# Batch request settings
# Batch configuration
batch_size: int
max_wait: float
# Model settings
# Models configuration
# Whisper
whisper_model: str
compute_type: str
# NVIDIA NeMo
nemo_domain_type: str
nemo_storage_path: str
nemo_output_path: str
# ASR service
# ASR type configuration
asr_type: str
# API endpoints
# Endpoints configuration
audio_file_endpoint: bool
audio_url_endpoint: bool
cortex_endpoint: bool
youtube_endpoint: bool
live_endpoint: bool
# Auth
# Cortex configuration
cortex_api_key: str
# Svix
# Svix configuration
svix_api_key: str
svix_app_id: str

@@ -112,28 +114,38 @@ def asr_type_must_be_valid(cls, value: str):  # noqa: B902, N805

load_dotenv()


settings = Settings(
# General configuration
project_name=getenv("PROJECT_NAME", "Wordcab Transcribe"),
version=getenv("VERSION", "0.2.0"),
description=getenv(
"DESCRIPTION", "💬 ASR FastAPI server using faster-whisper and NVIDIA NeMo."
),
api_prefix=getenv("API_PREFIX", "/api/v1"),
debug=getenv("DEBUG", True),
# Batch configuration
batch_size=getenv("BATCH_SIZE", 1),
max_wait=getenv("MAX_WAIT", 0.1),
# Models configuration
# Whisper
whisper_model=getenv("WHISPER_MODEL", "large-v2"),
compute_type=getenv("COMPUTE_TYPE", "int8_float16"),
# NeMo
nemo_domain_type=getenv("NEMO_DOMAIN_TYPE", "general"),
nemo_storage_path=getenv("NEMO_STORAGE_PATH", "nemo_storage"),
nemo_output_path=getenv("NEMO_OUTPUT_PATH", "nemo_outputs"),
# ASR type
asr_type=getenv("ASR_TYPE", "async"),
# Endpoints configuration
audio_file_endpoint=getenv("AUDIO_FILE_ENDPOINT", True),
audio_url_endpoint=getenv("AUDIO_URL_ENDPOINT", True),
cortex_endpoint=getenv("CORTEX_ENDPOINT", True),
youtube_endpoint=getenv("YOUTUBE_ENDPOINT", True),
live_endpoint=getenv("LIVE_ENDPOINT", False),
# Cortex configuration
cortex_api_key=getenv("WORDCAB_TRANSCRIBE_API_KEY", ""),
# Svix configuration
svix_api_key=getenv("SVIX_API_KEY", ""),
svix_app_id=getenv("SVIX_APP_ID", ""),
)
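One caveat with the `getenv(..., True)` pattern above: `getenv()` returns a string whenever the variable is set, so `getenv("DEBUG", True)` yields the string `"False"` (which is truthy) rather than the boolean `False`, unless the `Settings` class itself coerces types (a pydantic-style validator may handle this). A minimal coercion sketch; `env_flag` is a hypothetical helper name, not part of the project:

```python
from os import getenv


def env_flag(name: str, default: bool) -> bool:
    """Read an environment variable as a boolean.

    getenv() returns a string whenever the variable is set, so truthiness
    alone cannot distinguish "False" from "True"; map common spellings
    explicitly instead.
    """
    raw = getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}
```

With this helper, `DEBUG=False` in the `.env` file would disable debug mode as the comment intends.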