From 294006b041fbe65816770a5d1f04baca9f7a723a Mon Sep 17 00:00:00 2001 From: chainyo Date: Mon, 15 May 2023 10:34:46 +0000 Subject: [PATCH 1/3] improve config and .env readability --- .env | 80 +++++++++++++++++++++++++++++++++++- wordcab_transcribe/config.py | 26 ++++++++---- 2 files changed, 98 insertions(+), 8 deletions(-) diff --git a/.env b/.env index 341316c..590e184 100644 --- a/.env +++ b/.env @@ -1,18 +1,96 @@ +# --------------------------------------- ⚙️ WORDCAB TRANSCRIBE CONFIGURATION ---------------------------------------- # +# +# ---------------------------------------------- GENERAL CONFIGURATION ----------------------------------------------- # +# +# The name of the project, used for API documentation. PROJECT_NAME="Wordcab Transcribe" +# The version of the project, used for API documentation. VERSION="0.2.0" +# The description of the project, used for API documentation. DESCRIPTION="💬 ASR FastAPI server using faster-whisper and NVIDIA NeMo." +# This API prefix is used for all endpoints in the API outside of the status and cortex endpoints. API_PREFIX="/api/v1" +# Debug mode for FastAPI. It allows for hot reloading when code changes in development. DEBUG=True +# +# ----------------------------------------------- BATCH CONFIGURATION ------------------------------------------------ # +# +# The batch_size parameter is used to control the number of audio files that are processed in parallel. +# If your server GPU has a lot of memory, you can increase this value to improve performance. +# For simplicity, we recommend leaving this value at 1, unless you are sure that your GPU has enough memory (> 40GB) BATCH_SIZE=1 +# The max_wait parameter is used to control the maximum amount of time (in seconds) that the server will wait for +# processing the tasks in the queue, if not empty. It's useful only when the batch_size is greater than 1. 
MAX_WAIT=0.1 +# +# ----------------------------------------------- MODELS CONFIGURATION ----------------------------------------------- # +# +# ----------------------------------------------------- WHISPER ------------------------------------------------------ # +# +# The whisper_model parameter is used to control the model used for ASR. +# +# Cloud models: +# The available models are: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, or large-v2 +# You can try different model sizes, but you should see a trade-off between accuracy and speed. +# +# Local models: +# You can also link a local folder path to use a custom model. If you do so, you should also mount the folder in the +# docker run command as a volume. +# e.g. WHISPER_MODEL="/app/models/custom" +# docker cmd: -v /path/to/custom/model:/app/models/custom WHISPER_MODEL="large-v2" +# The compute_type parameter is used to control the precision of the model. You can choose between: +# "int8", "int8_float16", "int16", "float16". The default value is "int8_float16", which is the fastest option with +# minimal loss in accuracy using the `large-v2` model. COMPUTE_TYPE="int8_float16" +# +# --------------------------------------------------- NVIDIA NEMO ---------------------------------------------------- # +# +# The nemo_domain_type define the configuration file used by the model for diarization. The available options are: +# `general`, `meeting` and `telephonic`. The default value is `telephonic`. If you choose another type, you will need +# to provide a custom model +NEMO_DOMAIN_TYPE="telephonic" +# The nemo_storage_path parameter is used to control the path where the NeuralDiarizer from the NeMo toolkit will +# store the diarization models. 
NEMO_STORAGE_PATH="nemo_storage" +# The nemo_output_path parameter is used to control the path where the NeuralDiarizer from the NeMo toolkit will +# store the diarization outputs. NEMO_OUTPUT_PATH="nemo_outputs" +# +# ---------------------------------------------- ASR TYPE CONFIGURATION ---------------------------------------------- # +# +# The asr_type parameter is used to control the type of ASR used. The available options are: `async` or `live`. +# * `async` is the default option. It will process the audio files in batches, and return the results when all the +# files are processed. +# * `live` is the option to use when you want to process a live audio stream. It will process the audio in chunks, +# and return the results as soon as they are available. Live option is still a feature in development. +# Use `live` only if you need live results, otherwise, use `async`. ASR_TYPE="async" +# +# --------------------------------------------- ENDPOINTS CONFIGURATION ---------------------------------------------- # +# +# Include the `audio` endpoint in the API. This endpoint is used to process uploaded local audio files. AUDIO_FILE_ENDPOINT=True +# Include the `audio-url` endpoint in the API. This endpoint is used to process audio files from a URL. AUDIO_URL_ENDPOINT=True +# Include the cortex endpoint in the API. This endpoint is used to process audio files from the Cortex API. +# Use this only if you deploy the API using Cortex and Kubernetes. CORTEX_ENDPOINT=True +# Include the `youtube` endpoint in the API. This endpoint is used to process audio files from YouTube URLs. YOUTUBE_ENDPOINT=True +# Include the `live` endpoint in the API. This endpoint is used to process live audio streams. LIVE_ENDPOINT=False +# +# ---------------------------------------------- CORTEX CONFIGURATION ------------------------------------------------ # +# +# The cortex_api_key parameter is used to control the API key used to authenticate the requests to the cortex endpoint. 
+WORDCAB_TRANSCRIBE_API_KEY= +# +# ----------------------------------------------- SVIX CONFIGURATION ------------------------------------------------- # +# +# The svix_api_key parameter is used in the cortex implementation to enable webhooks. +SVIX_API_KEY= +# The svix_app_id parameter is used in the cortex implementation to enable webhooks. +SVIX_APP_ID= +# +# -------------------------------------------------------------------------------------------------------------------- # \ No newline at end of file diff --git a/wordcab_transcribe/config.py b/wordcab_transcribe/config.py index 4fb7722..419115f 100644 --- a/wordcab_transcribe/config.py +++ b/wordcab_transcribe/config.py @@ -26,32 +26,34 @@ class Settings: """Configuration settings for the Wordcab Transcribe API.""" - # Basic API settings + # General configuration project_name: str version: str description: str api_prefix: str debug: bool - # Batch request settings + # Batch configuration batch_size: int max_wait: float - # Model settings + # Models configuration + # Whisper whisper_model: str compute_type: str + # NVIDIA NeMo nemo_domain_type: str nemo_storage_path: str nemo_output_path: str - # ASR service + # ASR type configuration asr_type: str - # API endpoints + # Endpoints configuration audio_file_endpoint: bool audio_url_endpoint: bool cortex_endpoint: bool youtube_endpoint: bool live_endpoint: bool - # Auth + # Cortex configuration cortex_api_key: str - # Svix + # Svix configuration svix_api_key: str svix_app_id: str @@ -112,7 +114,9 @@ def asr_type_must_be_valid(cls, value: str): # noqa: B902, N805 load_dotenv() + settings = Settings( + # General configuration project_name=getenv("PROJECT_NAME", "Wordcab Transcribe"), version=getenv("VERSION", "0.2.0"), description=getenv( @@ -120,20 +124,28 @@ def asr_type_must_be_valid(cls, value: str): # noqa: B902, N805 ), api_prefix=getenv("API_PREFIX", "/api/v1"), debug=getenv("DEBUG", True), + # Batch configuration batch_size=getenv("BATCH_SIZE", 1), 
max_wait=getenv("MAX_WAIT", 0.1), + # Models configuration + # Whisper whisper_model=getenv("WHISPER_MODEL", "large-v2"), compute_type=getenv("COMPUTE_TYPE", "int8_float16"), + # NeMo nemo_domain_type=getenv("NEMO_DOMAIN_TYPE", "general"), nemo_storage_path=getenv("NEMO_STORAGE_PATH", "nemo_storage"), nemo_output_path=getenv("NEMO_OUTPUT_PATH", "nemo_outputs"), + # ASR type asr_type=getenv("ASR_TYPE", "async"), + # Endpoints configuration audio_file_endpoint=getenv("AUDIO_FILE_ENDPOINT", True), audio_url_endpoint=getenv("AUDIO_URL_ENDPOINT", True), cortex_endpoint=getenv("CORTEX_ENDPOINT", True), youtube_endpoint=getenv("YOUTUBE_ENDPOINT", True), live_endpoint=getenv("LIVE_ENDPOINT", False), + # Cortex configuration cortex_api_key=getenv("WORDCAB_TRANSCRIBE_API_KEY", ""), + # Svix configuration svix_api_key=getenv("SVIX_API_KEY", ""), svix_app_id=getenv("SVIX_APP_ID", ""), ) From 430eba8cf346aab7599acbf608bdb21cd65a258f Mon Sep 17 00:00:00 2001 From: chainyo Date: Mon, 15 May 2023 11:40:29 +0000 Subject: [PATCH 2/3] fix linting --- .env | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.env b/.env index 590e184..d0f86fc 100644 --- a/.env +++ b/.env @@ -48,7 +48,7 @@ COMPUTE_TYPE="int8_float16" # # The nemo_domain_type define the configuration file used by the model for diarization. The available options are: # `general`, `meeting` and `telephonic`. The default value is `telephonic`. If you choose another type, you will need -# to provide a custom model +# to provide a custom model NEMO_DOMAIN_TYPE="telephonic" # The nemo_storage_path parameter is used to control the path where the NeuralDiarizer from the NeMo toolkit will # store the diarization models. @@ -93,4 +93,4 @@ SVIX_API_KEY= # The svix_app_id parameter is used in the cortex implementation to enable webhooks. 
SVIX_APP_ID= # -# -------------------------------------------------------------------------------------------------------------------- # \ No newline at end of file +# -------------------------------------------------------------------------------------------------------------------- # From d487ee904b444d98de7cc046f94fc3b73ea7e9c6 Mon Sep 17 00:00:00 2001 From: chainyo Date: Mon, 15 May 2023 11:48:29 +0000 Subject: [PATCH 3/3] add docker volume mount instructions --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 9438e70..032db10 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,20 @@ docker run -d --name wordcab-transcribe \ wordcab-transcribe:latest ``` +You can mount a volume to the container to load local whisper models. + +If you mount a volume, you need to update the `WHISPER_MODEL` environment variable in the `.env` file. + +```bash +docker run -d --name wordcab-transcribe \ + --gpus all \ + --shm-size 1g \ + --restart unless-stopped \ + -p 5001:5001 \ + -v /path/to/whisper/models:/app/whisper/models \ + wordcab-transcribe:latest +``` + ## Test the API Once the container is running, you can test the API.