diff --git a/faster-whisper/CONTRIBUTING.md b/faster-whisper/CONTRIBUTING.md new file mode 100644 index 0000000..8d6a9c2 --- /dev/null +++ b/faster-whisper/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to faster-whisper + +Contributions are welcome! Here are some pointers to help you install the library for development and validate your changes before submitting a pull request. + +## Install the library for development + +We recommend installing the module in editable mode with the `dev` extra requirements: + +```bash +git clone https://github.com/SYSTRAN/faster-whisper.git +cd faster-whisper/ +pip install -e .[dev] +``` + +## Validate the changes before creating a pull request + +1. Make sure the existing tests are still passing (and consider adding new tests as well!): + +```bash +pytest tests/ +``` + +2. Reformat and validate the code with the following tools: + +```bash +black . +isort . +flake8 . +``` + +These steps are also run automatically in the CI when you open the pull request. diff --git a/faster-whisper/LICENSE b/faster-whisper/LICENSE new file mode 100644 index 0000000..2d92330 --- /dev/null +++ b/faster-whisper/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 SYSTRAN + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/faster-whisper/MANIFEST.in b/faster-whisper/MANIFEST.in new file mode 100644 index 0000000..8a103dd --- /dev/null +++ b/faster-whisper/MANIFEST.in @@ -0,0 +1,4 @@ +include faster_whisper/assets/silero_vad.onnx +include requirements.txt +include requirements.conversion.txt +include faster_whisper/assets/pyannote_vad_model.bin diff --git a/faster-whisper/README.md b/faster-whisper/README.md new file mode 100644 index 0000000..9d319ec --- /dev/null +++ b/faster-whisper/README.md @@ -0,0 +1,320 @@ +[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) + +# Faster Whisper transcription with CTranslate2 + +**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models. + +This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU. 
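+
+As a quick illustration of what that looks like in code, here is a minimal sketch of selecting a quantized `compute_type` (see the Usage section below for complete examples):
+
+```python
+from faster_whisper import WhisperModel
+
+# INT8 weights on CPU; on GPU, compute_type="int8_float16" can be used instead.
+model = WhisperModel("large-v3", device="cpu", compute_type="int8")
+segments, info = model.transcribe("audio.mp3", beam_size=5)
+```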
+
+## Benchmark
+
+### Whisper
+
+For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:
+
+* [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
+* [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
+* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
+
+### Large-v2 model on GPU
+
+| Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
+| --- | --- | --- | --- | --- | --- |
+| openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
+| faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
+| faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |
+
+*Executed with CUDA 11.7.1 on an NVIDIA Tesla V100S.*
+
+### Small model on CPU
+
+| Implementation | Precision | Beam size | Time | Max. memory |
+| --- | --- | --- | --- | --- |
+| openai/whisper | fp32 | 5 | 10m31s | 3101MB |
+| whisper.cpp | fp32 | 5 | 17m42s | 1581MB |
+| whisper.cpp | fp16 | 5 | 12m39s | 873MB |
+| faster-whisper | fp32 | 5 | 2m44s | 1675MB |
+| faster-whisper | int8 | 5 | 2m04s | 995MB |
+
+*Executed with 8 threads on an Intel(R) Xeon(R) Gold 6226R.*
+
+
+### Distil-whisper
+
+| Implementation | Precision | Beam size | Time | Gigaspeech WER |
+| --- | --- | --- | --- | --- |
+| distil-whisper/distil-large-v2 | fp16 | 4 | - | 10.36 |
+| [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 |
+| distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 |
+| [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 |
+
+*Executed with CUDA 11.4 on an NVIDIA 3090.*
+
+<details>
+<summary>testing details (click to expand)</summary>
+
+For `distil-whisper/distil-large-v2`, the WER is tested with the code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). For `faster-distil-whisper`, the WER is tested with the following setting:
+```python
+from faster_whisper import WhisperModel
+
+model_size = "distil-large-v2"
+# model_size = "distil-medium.en"
+# Run on GPU with FP16
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
+segments, info = model.transcribe("audio.mp3", beam_size=5, language="en")
+```
+</details>
+ +## Requirements + +* Python 3.8 or greater + +Unlike openai-whisper, FFmpeg does **not** need to be installed on the system. The audio is decoded with the Python library [PyAV](https://github.com/PyAV-Org/PyAV) which bundles the FFmpeg libraries in its package. + +### GPU + +GPU execution requires the following NVIDIA libraries to be installed: + +* [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas) +* [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn) + +**Note**: Latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (This can be done with `pip install --force-reinstall ctranslate2==3.24.0` or specifying the version in a `requirements.txt`). + +There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below. + +
+<details>
+<summary>Other installation methods (click to expand)</summary>
+
+
+**Note:** For all these methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below.
+
+#### Use Docker
+
+The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`.
+
+#### Install with `pip` (Linux only)
+
+On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python.
+
+```bash
+pip install nvidia-cublas-cu12 nvidia-cudnn-cu12
+
+export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
+```
+
+**Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due to its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8.
+
+#### Download the libraries from Purfview's repository (Windows & Linux)
+
+Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
+
+</details>
+ +## Installation + +The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/): + +```bash +pip install faster-whisper +``` + +
+<details>
+<summary>Other installation methods (click to expand)</summary>
+
+### Install the master branch
+
+```bash
+pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz"
+```
+
+### Install a specific commit
+
+```bash
+pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz"
+```
+
+</details>
+
+## Usage
+
+### Faster-whisper
+
+```python
+from faster_whisper import WhisperModel
+
+model_size = "large-v3"
+
+# Run on GPU with FP16
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+# or run on GPU with INT8
+# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+# or run on CPU with INT8
+# model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+segments, info = model.transcribe("audio.mp3", beam_size=5)
+
+print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+
+for segment in segments:
+    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+```
+
+**Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop:
+
+```python
+segments, _ = model.transcribe("audio.mp3")
+segments = list(segments)  # The transcription will actually run here.
+```
+
+### Multi-segment language detection
+
+To directly use the model for improved language detection, the following code snippet can be used:
+
+```python
+from faster_whisper import WhisperModel
+
+model = WhisperModel("medium", device="cuda", compute_type="float16")
+language_info = model.detect_language_multi_segment("audio.mp3")
+```
+
+### Batched faster-whisper
+
+The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-4-Clause license, and integrates its VAD model into this library. This product includes software developed by Max Bain. We modified this implementation and also added Kaldi-based feature extraction. It improves the speed up to 10-12x compared to the OpenAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.
+
+The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper.
+
+```python
+from faster_whisper import WhisperModel, BatchedInferencePipeline
+
+model = WhisperModel("medium", device="cuda", compute_type="float16")
+batched_model = BatchedInferencePipeline(model=model)
+result = batched_model.transcribe("audio.mp3", batch_size=16)
+
+for segment, info in result:
+    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+```
+
+### Faster Distil-Whisper
+
+The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3)
+checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet
+demonstrates how to run inference with distil-large-v3 on a specified audio file:
+
+```python
+from faster_whisper import WhisperModel
+
+model_size = "distil-large-v3"
+
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
+segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False)
+
+for segment in segments:
+    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+```
+
+For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3).
+
+### Word-level timestamps
+
+```python
+segments, _ = model.transcribe("audio.mp3", word_timestamps=True)
+
+for segment in segments:
+    for word in segment.words:
+        print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
+```
+
+### VAD filter
+
+The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech:
+
+```python
+segments, _ = model.transcribe("audio.mp3", vad_filter=True)
+```
+
+The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`:
+
+```python
+segments, _ = model.transcribe(
+    "audio.mp3",
+    vad_filter=True,
+    vad_parameters=dict(min_silence_duration_ms=500),
+)
+```
+
+### Logging
+
+The library logging level can be configured like this:
+
+```python
+import logging
+
+logging.basicConfig()
+logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
+```
+
+### Going further
+
+See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation.
+
+## Community integrations
+
+Here is a non-exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
+
+
+* [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI-compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, and supports streaming and live transcription.
+* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment.
+* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command-line client based on faster-whisper and compatible with the original client from openai/whisper.
+* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
+* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides standalone CLI executables of faster-whisper for Windows, Linux & macOS.
+* [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end-to-end multi-speaker speech-to-text solution implemented using AzureML pipelines.
+* [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT.
+* [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper. It can export word-level transcripts, which can then be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor).
+* [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization on Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
+* [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
+* [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
+* [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.
+
+## Model conversion
+
+When loading a model from its size, such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
+
+We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
+
+For example, the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
+
+```bash
+pip install "transformers[torch]>=4.23"
+
+ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
+--copy_files tokenizer.json preprocessor_config.json --quantization float16
+```
+
+* The option `--model` accepts a model name on the Hub or a path to a model directory.
+* If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
+
+Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
+
+### Load a converted model
+
+1. Directly load the model from a local directory:
+```python
+model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
+```
+
+2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
+```python
+model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
+```
+
+## Comparing performance against other implementations
+
+If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
+
+* Verify that the same transcription options are used, especially the same beam size. For example, in openai/whisper, `model.transcribe` uses a default beam size of 1 but here we use a default beam size of 5.
+* When running on CPU, make sure to set the same number of threads.
Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script: + +```bash +OMP_NUM_THREADS=4 python3 my_script.py +``` diff --git a/faster-whisper/benchmark/benchmark.m4a b/faster-whisper/benchmark/benchmark.m4a new file mode 100644 index 0000000..66259d7 Binary files /dev/null and b/faster-whisper/benchmark/benchmark.m4a differ diff --git a/faster-whisper/benchmark/memory_benchmark.py b/faster-whisper/benchmark/memory_benchmark.py new file mode 100644 index 0000000..1fbdfbd --- /dev/null +++ b/faster-whisper/benchmark/memory_benchmark.py @@ -0,0 +1,94 @@ +import argparse +import time + +from typing import Callable + +import py3nvml.py3nvml as nvml + +from memory_profiler import memory_usage +from utils import MyThread, get_logger, inference + +logger = get_logger("faster-whisper") +parser = argparse.ArgumentParser(description="Memory benchmark") +parser.add_argument( + "--gpu_memory", action="store_true", help="Measure GPU memory usage" +) +parser.add_argument("--device-index", type=int, default=0, help="GPU device index") +parser.add_argument( + "--interval", + type=float, + default=0.5, + help="Interval at which measurements are collected", +) +args = parser.parse_args() +device_idx = args.device_index +interval = args.interval + + +def measure_memory(func: Callable[[], None]): + if args.gpu_memory: + logger.info( + "Measuring maximum GPU memory usage on GPU device." + " Make sure to not have additional processes running on the same GPU." + ) + # init nvml + nvml.nvmlInit() + handle = nvml.nvmlDeviceGetHandleByIndex(device_idx) + gpu_name = nvml.nvmlDeviceGetName(handle) + gpu_memory_limit = nvml.nvmlDeviceGetMemoryInfo(handle).total >> 20 + gpu_power_limit = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0 + info = {"gpu_memory_usage": [], "gpu_power_usage": []} + + def _get_gpu_info(): + while True: + info["gpu_memory_usage"].append( + nvml.nvmlDeviceGetMemoryInfo(handle).used >> 20 + ) + info["gpu_power_usage"].append( + nvml.nvmlDeviceGetPowerUsage(handle) / 1000 + ) + time.sleep(interval) + + if stop: + break + + return info + + stop = False + thread = MyThread(_get_gpu_info, params=()) + thread.start() + func() + stop = True + thread.join() + result = thread.get_result() + + # shutdown nvml + nvml.nvmlShutdown() + max_memory_usage = max(result["gpu_memory_usage"]) + max_power_usage = max(result["gpu_power_usage"]) + print("GPU name: %s" % gpu_name) + print("GPU device index: %s" % device_idx) + print( + "Maximum GPU memory usage: %dMiB / %dMiB (%.2f%%)" + % ( + max_memory_usage, + gpu_memory_limit, + (max_memory_usage / gpu_memory_limit) * 100, + ) + ) + print( + "Maximum GPU power usage: %dW / %dW (%.2f%%)" + % ( + max_power_usage, + gpu_power_limit, + (max_power_usage / gpu_power_limit) * 100, + ) + ) + else: + logger.info("Measuring maximum increase of memory usage.") + max_usage = memory_usage(func, max_usage=True, interval=interval) + print("Maximum increase of RAM memory usage: %d MiB" % max_usage) + + +if __name__ == "__main__": + measure_memory(inference) diff --git a/faster-whisper/benchmark/normalizer.json b/faster-whisper/benchmark/normalizer.json new file mode 100644 index 0000000..dd6ae81 --- /dev/null +++ b/faster-whisper/benchmark/normalizer.json @@ -0,0 +1,1742 @@ +{ + "accessorise": "accessorize", + "accessorised": "accessorized", + "accessorises": "accessorizes", + "accessorising": "accessorizing", + "acclimatisation": "acclimatization", + "acclimatise": "acclimatize", + "acclimatised": "acclimatized", + 
"acclimatises": "acclimatizes", + "acclimatising": "acclimatizing", + "accoutrements": "accouterments", + "aeon": "eon", + "aeons": "eons", + "aerogramme": "aerogram", + "aerogrammes": "aerograms", + "aeroplane": "airplane", + "aeroplanes": "airplanes", + "aesthete": "esthete", + "aesthetes": "esthetes", + "aesthetic": "esthetic", + "aesthetically": "esthetically", + "aesthetics": "esthetics", + "aetiology": "etiology", + "ageing": "aging", + "aggrandisement": "aggrandizement", + "agonise": "agonize", + "agonised": "agonized", + "agonises": "agonizes", + "agonising": "agonizing", + "agonisingly": "agonizingly", + "almanack": "almanac", + "almanacks": "almanacs", + "aluminium": "aluminum", + "amortisable": "amortizable", + "amortisation": "amortization", + "amortisations": "amortizations", + "amortise": "amortize", + "amortised": "amortized", + "amortises": "amortizes", + "amortising": "amortizing", + "amphitheatre": "amphitheater", + "amphitheatres": "amphitheaters", + "anaemia": "anemia", + "anaemic": "anemic", + "anaesthesia": "anesthesia", + "anaesthetic": "anesthetic", + "anaesthetics": "anesthetics", + "anaesthetise": "anesthetize", + "anaesthetised": "anesthetized", + "anaesthetises": "anesthetizes", + "anaesthetising": "anesthetizing", + "anaesthetist": "anesthetist", + "anaesthetists": "anesthetists", + "anaesthetize": "anesthetize", + "anaesthetized": "anesthetized", + "anaesthetizes": "anesthetizes", + "anaesthetizing": "anesthetizing", + "analogue": "analog", + "analogues": "analogs", + "analyse": "analyze", + "analysed": "analyzed", + "analyses": "analyzes", + "analysing": "analyzing", + "anglicise": "anglicize", + "anglicised": "anglicized", + "anglicises": "anglicizes", + "anglicising": "anglicizing", + "annualised": "annualized", + "antagonise": "antagonize", + "antagonised": "antagonized", + "antagonises": "antagonizes", + "antagonising": "antagonizing", + "apologise": "apologize", + "apologised": "apologized", + "apologises": "apologizes", + "apologising": "apologizing", + "appal": "appall", + "appals": "appalls", + "appetiser": "appetizer", + "appetisers": "appetizers", + "appetising": "appetizing", + "appetisingly": "appetizingly", + "arbour": "arbor", + "arbours": "arbors", + "archaeologically": "archeologically", + "archaeologist": "archeologist", + "archaeologists": "archeologists", + "archaeology": "archeology", + "archeological": "archaeological", + "ardour": "ardor", + "armour": "armor", + "armoured": "armored", + "armourer": "armorer", + "armourers": "armorers", + "armouries": "armories", + "armoury": "armory", + "artefact": "artifact", + "artefacts": "artifacts", + "authorise": "authorize", + "authorised": "authorized", + "authorises": "authorizes", + "authorising": "authorizing", + "axe": "ax", + "backpedalled": "backpedaled", + "backpedalling": "backpedaling", + "bannister": "banister", + "bannisters": "banisters", + "baptise": "baptize", + "baptised": "baptized", + "baptises": "baptizes", + "baptising": "baptizing", + "bastardise": "bastardize", + "bastardised": "bastardized", + "bastardises": "bastardizes", + "bastardising": "bastardizing", + "battleax": "battleaxe", + "baulk": "balk", + "baulked": "balked", + "baulking": "balking", + "baulks": "balks", + "bedevilled": "bedeviled", + "bedevilling": "bedeviling", + "behaviour": "behavior", + "behavioural": "behavioral", + "behaviourism": "behaviorism", + "behaviourist": "behaviorist", + "behaviourists": "behaviorists", + "behaviours": "behaviors", + "behove": "behoove", + "behoved": "behooved", + "behoves": 
"behooves", + "bejewelled": "bejeweled", + "belabour": "belabor", + "belaboured": "belabored", + "belabouring": "belaboring", + "belabours": "belabors", + "bevelled": "beveled", + "bevvies": "bevies", + "bevvy": "bevy", + "biassed": "biased", + "biassing": "biasing", + "bingeing": "binging", + "bougainvillaea": "bougainvillea", + "bougainvillaeas": "bougainvilleas", + "bowdlerise": "bowdlerize", + "bowdlerised": "bowdlerized", + "bowdlerises": "bowdlerizes", + "bowdlerising": "bowdlerizing", + "breathalyse": "breathalyze", + "breathalysed": "breathalyzed", + "breathalyser": "breathalyzer", + "breathalysers": "breathalyzers", + "breathalyses": "breathalyzes", + "breathalysing": "breathalyzing", + "brutalise": "brutalize", + "brutalised": "brutalized", + "brutalises": "brutalizes", + "brutalising": "brutalizing", + "busses": "buses", + "bussing": "busing", + "caesarean": "cesarean", + "caesareans": "cesareans", + "calibre": "caliber", + "calibres": "calibers", + "calliper": "caliper", + "callipers": "calipers", + "callisthenics": "calisthenics", + "canalise": "canalize", + "canalised": "canalized", + "canalises": "canalizes", + "canalising": "canalizing", + "cancelation": "cancellation", + "cancelations": "cancellations", + "cancelled": "canceled", + "cancelling": "canceling", + "candour": "candor", + "cannibalise": "cannibalize", + "cannibalised": "cannibalized", + "cannibalises": "cannibalizes", + "cannibalising": "cannibalizing", + "canonise": "canonize", + "canonised": "canonized", + "canonises": "canonizes", + "canonising": "canonizing", + "capitalise": "capitalize", + "capitalised": "capitalized", + "capitalises": "capitalizes", + "capitalising": "capitalizing", + "caramelise": "caramelize", + "caramelised": "caramelized", + "caramelises": "caramelizes", + "caramelising": "caramelizing", + "carbonise": "carbonize", + "carbonised": "carbonized", + "carbonises": "carbonizes", + "carbonising": "carbonizing", + "carolled": "caroled", + "carolling": "caroling", + "catalogue": "catalog", + "catalogued": "cataloged", + "catalogues": "catalogs", + "cataloguing": "cataloging", + "catalyse": "catalyze", + "catalysed": "catalyzed", + "catalyses": "catalyzes", + "catalysing": "catalyzing", + "categorise": "categorize", + "categorised": "categorized", + "categorises": "categorizes", + "categorising": "categorizing", + "cauterise": "cauterize", + "cauterised": "cauterized", + "cauterises": "cauterizes", + "cauterising": "cauterizing", + "cavilled": "caviled", + "cavilling": "caviling", + "centigramme": "centigram", + "centigrammes": "centigrams", + "centilitre": "centiliter", + "centilitres": "centiliters", + "centimetre": "centimeter", + "centimetres": "centimeters", + "centralise": "centralize", + "centralised": "centralized", + "centralises": "centralizes", + "centralising": "centralizing", + "centre": "center", + "centred": "centered", + "centrefold": "centerfold", + "centrefolds": "centerfolds", + "centrepiece": "centerpiece", + "centrepieces": "centerpieces", + "centres": "centers", + "channelled": "channeled", + "channelling": "channeling", + "characterise": "characterize", + "characterised": "characterized", + "characterises": "characterizes", + "characterising": "characterizing", + "cheque": "check", + "chequebook": "checkbook", + "chequebooks": "checkbooks", + "chequered": "checkered", + "cheques": "checks", + "chilli": "chili", + "chimaera": "chimera", + "chimaeras": "chimeras", + "chiselled": "chiseled", + "chiselling": "chiseling", + "circularise": "circularize", + "circularised": 
"circularized", + "circularises": "circularizes", + "circularising": "circularizing", + "civilise": "civilize", + "civilised": "civilized", + "civilises": "civilizes", + "civilising": "civilizing", + "clamour": "clamor", + "clamoured": "clamored", + "clamouring": "clamoring", + "clamours": "clamors", + "clangour": "clangor", + "clarinettist": "clarinetist", + "clarinettists": "clarinetists", + "collectivise": "collectivize", + "collectivised": "collectivized", + "collectivises": "collectivizes", + "collectivising": "collectivizing", + "colonisation": "colonization", + "colonise": "colonize", + "colonised": "colonized", + "coloniser": "colonizer", + "colonisers": "colonizers", + "colonises": "colonizes", + "colonising": "colonizing", + "colour": "color", + "colourant": "colorant", + "colourants": "colorants", + "coloured": "colored", + "coloureds": "coloreds", + "colourful": "colorful", + "colourfully": "colorfully", + "colouring": "coloring", + "colourize": "colorize", + "colourized": "colorized", + "colourizes": "colorizes", + "colourizing": "colorizing", + "colourless": "colorless", + "colours": "colors", + "commercialise": "commercialize", + "commercialised": "commercialized", + "commercialises": "commercializes", + "commercialising": "commercializing", + "compartmentalise": "compartmentalize", + "compartmentalised": "compartmentalized", + "compartmentalises": "compartmentalizes", + "compartmentalising": "compartmentalizing", + "computerise": "computerize", + "computerised": "computerized", + "computerises": "computerizes", + "computerising": "computerizing", + "conceptualise": "conceptualize", + "conceptualised": "conceptualized", + "conceptualises": "conceptualizes", + "conceptualising": "conceptualizing", + "connexion": "connection", + "connexions": "connections", + "contextualise": "contextualize", + "contextualised": "contextualized", + "contextualises": "contextualizes", + "contextualising": "contextualizing", + "cosier": "cozier", + "cosies": "cozies", + "cosiest": "coziest", + "cosily": "cozily", + "cosiness": "coziness", + "cosy": "cozy", + "councillor": "councilor", + "councillors": "councilors", + "counselled": "counseled", + "counselling": "counseling", + "counsellor": "counselor", + "counsellors": "counselors", + "crenelated": "crenellated", + "criminalise": "criminalize", + "criminalised": "criminalized", + "criminalises": "criminalizes", + "criminalising": "criminalizing", + "criticise": "criticize", + "criticised": "criticized", + "criticises": "criticizes", + "criticising": "criticizing", + "crueller": "crueler", + "cruellest": "cruelest", + "crystallisation": "crystallization", + "crystallise": "crystallize", + "crystallised": "crystallized", + "crystallises": "crystallizes", + "crystallising": "crystallizing", + "cudgelled": "cudgeled", + "cudgelling": "cudgeling", + "customise": "customize", + "customised": "customized", + "customises": "customizes", + "customising": "customizing", + "cypher": "cipher", + "cyphers": "ciphers", + "decentralisation": "decentralization", + "decentralise": "decentralize", + "decentralised": "decentralized", + "decentralises": "decentralizes", + "decentralising": "decentralizing", + "decriminalisation": "decriminalization", + "decriminalise": "decriminalize", + "decriminalised": "decriminalized", + "decriminalises": "decriminalizes", + "decriminalising": "decriminalizing", + "defence": "defense", + "defenceless": "defenseless", + "defences": "defenses", + "dehumanisation": "dehumanization", + "dehumanise": "dehumanize", + "dehumanised": 
"dehumanized", + "dehumanises": "dehumanizes", + "dehumanising": "dehumanizing", + "demeanour": "demeanor", + "demilitarisation": "demilitarization", + "demilitarise": "demilitarize", + "demilitarised": "demilitarized", + "demilitarises": "demilitarizes", + "demilitarising": "demilitarizing", + "demobilisation": "demobilization", + "demobilise": "demobilize", + "demobilised": "demobilized", + "demobilises": "demobilizes", + "demobilising": "demobilizing", + "democratisation": "democratization", + "democratise": "democratize", + "democratised": "democratized", + "democratises": "democratizes", + "democratising": "democratizing", + "demonise": "demonize", + "demonised": "demonized", + "demonises": "demonizes", + "demonising": "demonizing", + "demoralisation": "demoralization", + "demoralise": "demoralize", + "demoralised": "demoralized", + "demoralises": "demoralizes", + "demoralising": "demoralizing", + "denationalisation": "denationalization", + "denationalise": "denationalize", + "denationalised": "denationalized", + "denationalises": "denationalizes", + "denationalising": "denationalizing", + "deodorise": "deodorize", + "deodorised": "deodorized", + "deodorises": "deodorizes", + "deodorising": "deodorizing", + "depersonalise": "depersonalize", + "depersonalised": "depersonalized", + "depersonalises": "depersonalizes", + "depersonalising": "depersonalizing", + "deputise": "deputize", + "deputised": "deputized", + "deputises": "deputizes", + "deputising": "deputizing", + "desensitisation": "desensitization", + "desensitise": "desensitize", + "desensitised": "desensitized", + "desensitises": "desensitizes", + "desensitising": "desensitizing", + "destabilisation": "destabilization", + "destabilise": "destabilize", + "destabilised": "destabilized", + "destabilises": "destabilizes", + "destabilising": "destabilizing", + "dialled": "dialed", + "dialling": "dialing", + "dialogue": "dialog", + "dialogues": "dialogs", + "diarrhoea": "diarrhea", + "digitise": "digitize", + "digitised": "digitized", + "digitises": "digitizes", + "digitising": "digitizing", + "disc": "disk", + "discolour": "discolor", + "discoloured": "discolored", + "discolouring": "discoloring", + "discolours": "discolors", + "discs": "disks", + "disembowelled": "disemboweled", + "disembowelling": "disemboweling", + "disfavour": "disfavor", + "dishevelled": "disheveled", + "dishonour": "dishonor", + "dishonourable": "dishonorable", + "dishonourably": "dishonorably", + "dishonoured": "dishonored", + "dishonouring": "dishonoring", + "dishonours": "dishonors", + "disorganisation": "disorganization", + "disorganised": "disorganized", + "distil": "distill", + "distils": "distills", + "dramatisation": "dramatization", + "dramatisations": "dramatizations", + "dramatise": "dramatize", + "dramatised": "dramatized", + "dramatises": "dramatizes", + "dramatising": "dramatizing", + "draught": "draft", + "draughtboard": "draftboard", + "draughtboards": "draftboards", + "draughtier": "draftier", + "draughtiest": "draftiest", + "draughts": "drafts", + "draughtsman": "draftsman", + "draughtsmanship": "draftsmanship", + "draughtsmen": "draftsmen", + "draughtswoman": "draftswoman", + "draughtswomen": "draftswomen", + "draughty": "drafty", + "drivelled": "driveled", + "drivelling": "driveling", + "duelled": "dueled", + "duelling": "dueling", + "economise": "economize", + "economised": "economized", + "economises": "economizes", + "economising": "economizing", + "editorialise": "editorialize", + "editorialised": "editorialized", + "editorialises": 
"editorializes", + "editorialising": "editorializing", + "edoema": "edema", + "empathise": "empathize", + "empathised": "empathized", + "empathises": "empathizes", + "empathising": "empathizing", + "emphasise": "emphasize", + "emphasised": "emphasized", + "emphasises": "emphasizes", + "emphasising": "emphasizing", + "enamelled": "enameled", + "enamelling": "enameling", + "enamoured": "enamored", + "encyclopaedia": "encyclopedia", + "encyclopaedias": "encyclopedias", + "encyclopaedic": "encyclopedic", + "endeavour": "endeavor", + "endeavoured": "endeavored", + "endeavouring": "endeavoring", + "endeavours": "endeavors", + "energise": "energize", + "energised": "energized", + "energises": "energizes", + "energising": "energizing", + "enrol": "enroll", + "enrols": "enrolls", + "enthral": "enthrall", + "enthrals": "enthralls", + "epaulette": "epaulet", + "epaulettes": "epaulets", + "epicentre": "epicenter", + "epicentres": "epicenters", + "epilogue": "epilog", + "epilogues": "epilogs", + "epitomise": "epitomize", + "epitomised": "epitomized", + "epitomises": "epitomizes", + "epitomising": "epitomizing", + "equalisation": "equalization", + "equalise": "equalize", + "equalised": "equalized", + "equaliser": "equalizer", + "equalisers": "equalizers", + "equalises": "equalizes", + "equalising": "equalizing", + "eulogise": "eulogize", + "eulogised": "eulogized", + "eulogises": "eulogizes", + "eulogising": "eulogizing", + "evangelise": "evangelize", + "evangelised": "evangelized", + "evangelises": "evangelizes", + "evangelising": "evangelizing", + "exorcise": "exorcize", + "exorcised": "exorcized", + "exorcises": "exorcizes", + "exorcising": "exorcizing", + "extemporisation": "extemporization", + "extemporise": "extemporize", + "extemporised": "extemporized", + "extemporises": "extemporizes", + "extemporising": "extemporizing", + "externalisation": "externalization", + "externalisations": "externalizations", + "externalise": "externalize", + "externalised": "externalized", + "externalises": "externalizes", + "externalising": "externalizing", + "factorise": "factorize", + "factorised": "factorized", + "factorises": "factorizes", + "factorising": "factorizing", + "faecal": "fecal", + "faeces": "feces", + "familiarisation": "familiarization", + "familiarise": "familiarize", + "familiarised": "familiarized", + "familiarises": "familiarizes", + "familiarising": "familiarizing", + "fantasise": "fantasize", + "fantasised": "fantasized", + "fantasises": "fantasizes", + "fantasising": "fantasizing", + "favour": "favor", + "favourable": "favorable", + "favourably": "favorably", + "favoured": "favored", + "favouring": "favoring", + "favourite": "favorite", + "favourites": "favorites", + "favouritism": "favoritism", + "favours": "favors", + "feminise": "feminize", + "feminised": "feminized", + "feminises": "feminizes", + "feminising": "feminizing", + "fertilisation": "fertilization", + "fertilise": "fertilize", + "fertilised": "fertilized", + "fertiliser": "fertilizer", + "fertilisers": "fertilizers", + "fertilises": "fertilizes", + "fertilising": "fertilizing", + "fervour": "fervor", + "fibre": "fiber", + "fibreglass": "fiberglass", + "fibres": "fibers", + "fictionalisation": "fictionalization", + "fictionalisations": "fictionalizations", + "fictionalise": "fictionalize", + "fictionalised": "fictionalized", + "fictionalises": "fictionalizes", + "fictionalising": "fictionalizing", + "fillet": "filet", + "filleted": "fileted", + "filleting": "fileting", + "fillets": "filets", + "finalisation": "finalization", + 
"finalise": "finalize", + "finalised": "finalized", + "finalises": "finalizes", + "finalising": "finalizing", + "flautist": "flutist", + "flautists": "flutists", + "flavour": "flavor", + "flavoured": "flavored", + "flavouring": "flavoring", + "flavourings": "flavorings", + "flavourless": "flavorless", + "flavours": "flavors", + "flavoursome": "flavorsome", + "flyer / flier": "flier / flyer", + "foetal": "fetal", + "foetid": "fetid", + "foetus": "fetus", + "foetuses": "fetuses", + "formalisation": "formalization", + "formalise": "formalize", + "formalised": "formalized", + "formalises": "formalizes", + "formalising": "formalizing", + "fossilisation": "fossilization", + "fossilise": "fossilize", + "fossilised": "fossilized", + "fossilises": "fossilizes", + "fossilising": "fossilizing", + "fraternisation": "fraternization", + "fraternise": "fraternize", + "fraternised": "fraternized", + "fraternises": "fraternizes", + "fraternising": "fraternizing", + "fulfil": "fulfill", + "fulfilment": "fulfillment", + "fulfils": "fulfills", + "funnelled": "funneled", + "funnelling": "funneling", + "gage": "gauge", + "gaged": "gauged", + "gages": "gauges", + "gaging": "gauging", + "galvanise": "galvanize", + "galvanised": "galvanized", + "galvanises": "galvanizes", + "galvanising": "galvanizing", + "gambolled": "gamboled", + "gambolling": "gamboling", + "gaol": "jail", + "gaolbird": "jailbird", + "gaolbirds": "jailbirds", + "gaolbreak": "jailbreak", + "gaolbreaks": "jailbreaks", + "gaoled": "jailed", + "gaoler": "jailer", + "gaolers": "jailers", + "gaoling": "jailing", + "gaols": "jails", + "gasses": "gases", + "generalisation": "generalization", + "generalisations": "generalizations", + "generalise": "generalize", + "generalised": "generalized", + "generalises": "generalizes", + "generalising": "generalizing", + "ghettoise": "ghettoize", + "ghettoised": "ghettoized", + "ghettoises": "ghettoizes", + "ghettoising": "ghettoizing", + "gipsies": "gypsies", + "glamor": "glamour", + "glamorise": "glamorize", + "glamorised": "glamorized", + "glamorises": "glamorizes", + "glamorising": "glamorizing", + "globalisation": "globalization", + "globalise": "globalize", + "globalised": "globalized", + "globalises": "globalizes", + "globalising": "globalizing", + "glueing": "gluing", + "goitre": "goiter", + "goitres": "goiters", + "gonorrhoea": "gonorrhea", + "gramme": "gram", + "grammes": "grams", + "gravelled": "graveled", + "grey": "gray", + "greyed": "grayed", + "greying": "graying", + "greyish": "grayish", + "greyness": "grayness", + "greys": "grays", + "grovelled": "groveled", + "grovelling": "groveling", + "groyne": "groin", + "groynes": "groins", + "gruelling": "grueling", + "gruellingly": "gruelingly", + "gryphon": "griffin", + "gryphons": "griffins", + "gynaecological": "gynecological", + "gynaecologist": "gynecologist", + "gynaecologists": "gynecologists", + "gynaecology": "gynecology", + "haematological": "hematological", + "haematologist": "hematologist", + "haematologists": "hematologists", + "haematology": "hematology", + "haemoglobin": "hemoglobin", + "haemophilia": "hemophilia", + "haemophiliac": "hemophiliac", + "haemophiliacs": "hemophiliacs", + "haemorrhage": "hemorrhage", + "haemorrhaged": "hemorrhaged", + "haemorrhages": "hemorrhages", + "haemorrhaging": "hemorrhaging", + "haemorrhoids": "hemorrhoids", + "harbour": "harbor", + "harboured": "harbored", + "harbouring": "harboring", + "harbours": "harbors", + "harmonisation": "harmonization", + "harmonise": "harmonize", + "harmonised": "harmonized", + 
"harmonises": "harmonizes", + "harmonising": "harmonizing", + "homoeopath": "homeopath", + "homoeopathic": "homeopathic", + "homoeopaths": "homeopaths", + "homoeopathy": "homeopathy", + "homogenise": "homogenize", + "homogenised": "homogenized", + "homogenises": "homogenizes", + "homogenising": "homogenizing", + "honour": "honor", + "honourable": "honorable", + "honourably": "honorably", + "honoured": "honored", + "honouring": "honoring", + "honours": "honors", + "hospitalisation": "hospitalization", + "hospitalise": "hospitalize", + "hospitalised": "hospitalized", + "hospitalises": "hospitalizes", + "hospitalising": "hospitalizing", + "humanise": "humanize", + "humanised": "humanized", + "humanises": "humanizes", + "humanising": "humanizing", + "humour": "humor", + "humoured": "humored", + "humouring": "humoring", + "humourless": "humorless", + "humours": "humors", + "hybridise": "hybridize", + "hybridised": "hybridized", + "hybridises": "hybridizes", + "hybridising": "hybridizing", + "hypnotise": "hypnotize", + "hypnotised": "hypnotized", + "hypnotises": "hypnotizes", + "hypnotising": "hypnotizing", + "hypothesise": "hypothesize", + "hypothesised": "hypothesized", + "hypothesises": "hypothesizes", + "hypothesising": "hypothesizing", + "idealisation": "idealization", + "idealise": "idealize", + "idealised": "idealized", + "idealises": "idealizes", + "idealising": "idealizing", + "idolise": "idolize", + "idolised": "idolized", + "idolises": "idolizes", + "idolising": "idolizing", + "immobilisation": "immobilization", + "immobilise": "immobilize", + "immobilised": "immobilized", + "immobiliser": "immobilizer", + "immobilisers": "immobilizers", + "immobilises": "immobilizes", + "immobilising": "immobilizing", + "immortalise": "immortalize", + "immortalised": "immortalized", + "immortalises": "immortalizes", + "immortalising": "immortalizing", + "immunisation": "immunization", + "immunise": "immunize", + "immunised": "immunized", + "immunises": "immunizes", + "immunising": "immunizing", + "impanelled": "impaneled", + "impanelling": "impaneling", + "imperilled": "imperiled", + "imperilling": "imperiling", + "individualise": "individualize", + "individualised": "individualized", + "individualises": "individualizes", + "individualising": "individualizing", + "industrialise": "industrialize", + "industrialised": "industrialized", + "industrialises": "industrializes", + "industrialising": "industrializing", + "inflexion": "inflection", + "inflexions": "inflections", + "initialise": "initialize", + "initialised": "initialized", + "initialises": "initializes", + "initialising": "initializing", + "initialled": "initialed", + "initialling": "initialing", + "instal": "install", + "instalment": "installment", + "instalments": "installments", + "instals": "installs", + "instil": "instill", + "instils": "instills", + "institutionalisation": "institutionalization", + "institutionalise": "institutionalize", + "institutionalised": "institutionalized", + "institutionalises": "institutionalizes", + "institutionalising": "institutionalizing", + "intellectualise": "intellectualize", + "intellectualised": "intellectualized", + "intellectualises": "intellectualizes", + "intellectualising": "intellectualizing", + "internalisation": "internalization", + "internalise": "internalize", + "internalised": "internalized", + "internalises": "internalizes", + "internalising": "internalizing", + "internationalisation": "internationalization", + "internationalise": "internationalize", + "internationalised": 
"internationalized", + "internationalises": "internationalizes", + "internationalising": "internationalizing", + "ionisation": "ionization", + "ionise": "ionize", + "ionised": "ionized", + "ioniser": "ionizer", + "ionisers": "ionizers", + "ionises": "ionizes", + "ionising": "ionizing", + "italicise": "italicize", + "italicised": "italicized", + "italicises": "italicizes", + "italicising": "italicizing", + "itemise": "itemize", + "itemised": "itemized", + "itemises": "itemizes", + "itemising": "itemizing", + "jeopardise": "jeopardize", + "jeopardised": "jeopardized", + "jeopardises": "jeopardizes", + "jeopardising": "jeopardizing", + "jewelled": "jeweled", + "jeweller": "jeweler", + "jewellers": "jewelers", + "jewellery": "jewelry", + "judgement": "judgment", + "kilogramme": "kilogram", + "kilogrammes": "kilograms", + "kilometre": "kilometer", + "kilometres": "kilometers", + "labelled": "labeled", + "labelling": "labeling", + "labour": "labor", + "laboured": "labored", + "labourer": "laborer", + "labourers": "laborers", + "labouring": "laboring", + "labours": "labors", + "lacklustre": "lackluster", + "legalisation": "legalization", + "legalise": "legalize", + "legalised": "legalized", + "legalises": "legalizes", + "legalising": "legalizing", + "legitimise": "legitimize", + "legitimised": "legitimized", + "legitimises": "legitimizes", + "legitimising": "legitimizing", + "leukaemia": "leukemia", + "levelled": "leveled", + "leveller": "leveler", + "levellers": "levelers", + "levelling": "leveling", + "libelled": "libeled", + "libelling": "libeling", + "libellous": "libelous", + "liberalisation": "liberalization", + "liberalise": "liberalize", + "liberalised": "liberalized", + "liberalises": "liberalizes", + "liberalising": "liberalizing", + "licence": "license", + "licenced": "licensed", + "licences": "licenses", + "licencing": "licensing", + "likeable": "likable", + "lionisation": "lionization", + "lionise": "lionize", + "lionised": "lionized", + "lionises": "lionizes", + "lionising": "lionizing", + "liquidise": "liquidize", + "liquidised": "liquidized", + "liquidiser": "liquidizer", + "liquidisers": "liquidizers", + "liquidises": "liquidizes", + "liquidising": "liquidizing", + "litre": "liter", + "litres": "liters", + "localise": "localize", + "localised": "localized", + "localises": "localizes", + "localising": "localizing", + "louvre": "louver", + "louvred": "louvered", + "louvres": "louvers", + "lustre": "luster", + "magnetise": "magnetize", + "magnetised": "magnetized", + "magnetises": "magnetizes", + "magnetising": "magnetizing", + "manoeuvrability": "maneuverability", + "manoeuvrable": "maneuverable", + "manoeuvre": "maneuver", + "manoeuvred": "maneuvered", + "manoeuvres": "maneuvers", + "manoeuvring": "maneuvering", + "manoeuvrings": "maneuverings", + "marginalisation": "marginalization", + "marginalise": "marginalize", + "marginalised": "marginalized", + "marginalises": "marginalizes", + "marginalising": "marginalizing", + "marshalled": "marshaled", + "marshalling": "marshaling", + "marvelled": "marveled", + "marvelling": "marveling", + "marvellous": "marvelous", + "marvellously": "marvelously", + "materialisation": "materialization", + "materialise": "materialize", + "materialised": "materialized", + "materialises": "materializes", + "materialising": "materializing", + "maximisation": "maximization", + "maximise": "maximize", + "maximised": "maximized", + "maximises": "maximizes", + "maximising": "maximizing", + "meagre": "meager", + "mechanisation": "mechanization", + "mechanise": 
"mechanize", + "mechanised": "mechanized", + "mechanises": "mechanizes", + "mechanising": "mechanizing", + "mediaeval": "medieval", + "memorialise": "memorialize", + "memorialised": "memorialized", + "memorialises": "memorializes", + "memorialising": "memorializing", + "memorise": "memorize", + "memorised": "memorized", + "memorises": "memorizes", + "memorising": "memorizing", + "mesmerise": "mesmerize", + "mesmerised": "mesmerized", + "mesmerises": "mesmerizes", + "mesmerising": "mesmerizing", + "metabolise": "metabolize", + "metabolised": "metabolized", + "metabolises": "metabolizes", + "metabolising": "metabolizing", + "metre": "meter", + "metres": "meters", + "mhm": "hmm", + "micrometre": "micrometer", + "micrometres": "micrometers", + "militarise": "militarize", + "militarised": "militarized", + "militarises": "militarizes", + "militarising": "militarizing", + "milligramme": "milligram", + "milligrammes": "milligrams", + "millilitre": "milliliter", + "millilitres": "milliliters", + "millimetre": "millimeter", + "millimetres": "millimeters", + "miniaturisation": "miniaturization", + "miniaturise": "miniaturize", + "miniaturised": "miniaturized", + "miniaturises": "miniaturizes", + "miniaturising": "miniaturizing", + "minibusses": "minibuses", + "minimise": "minimize", + "minimised": "minimized", + "minimises": "minimizes", + "minimising": "minimizing", + "misbehaviour": "misbehavior", + "misdemeanour": "misdemeanor", + "misdemeanours": "misdemeanors", + "misspelt": "misspelled", + "mitre": "miter", + "mitres": "miters", + "mm": "hmm", + "mmm": "hmm", + "mobilisation": "mobilization", + "mobilise": "mobilize", + "mobilised": "mobilized", + "mobilises": "mobilizes", + "mobilising": "mobilizing", + "modelled": "modeled", + "modeller": "modeler", + "modellers": "modelers", + "modelling": "modeling", + "modernise": "modernize", + "modernised": "modernized", + "modernises": "modernizes", + "modernising": "modernizing", + "moisturise": "moisturize", + "moisturised": "moisturized", + "moisturiser": "moisturizer", + "moisturisers": "moisturizers", + "moisturises": "moisturizes", + "moisturising": "moisturizing", + "monologue": "monolog", + "monologues": "monologs", + "monopolisation": "monopolization", + "monopolise": "monopolize", + "monopolised": "monopolized", + "monopolises": "monopolizes", + "monopolising": "monopolizing", + "moralise": "moralize", + "moralised": "moralized", + "moralises": "moralizes", + "moralising": "moralizing", + "motorised": "motorized", + "mould": "mold", + "moulded": "molded", + "moulder": "molder", + "mouldered": "moldered", + "mouldering": "moldering", + "moulders": "molders", + "mouldier": "moldier", + "mouldiest": "moldiest", + "moulding": "molding", + "mouldings": "moldings", + "moulds": "molds", + "mouldy": "moldy", + "moult": "molt", + "moulted": "molted", + "moulting": "molting", + "moults": "molts", + "moustache": "mustache", + "moustached": "mustached", + "moustaches": "mustaches", + "moustachioed": "mustachioed", + "multicoloured": "multicolored", + "nationalisation": "nationalization", + "nationalisations": "nationalizations", + "nationalise": "nationalize", + "nationalised": "nationalized", + "nationalises": "nationalizes", + "nationalising": "nationalizing", + "naturalisation": "naturalization", + "naturalise": "naturalize", + "naturalised": "naturalized", + "naturalises": "naturalizes", + "naturalising": "naturalizing", + "neighbour": "neighbor", + "neighbourhood": "neighborhood", + "neighbourhoods": "neighborhoods", + "neighbouring": "neighboring", 
+ "neighbourliness": "neighborliness", + "neighbourly": "neighborly", + "neighbours": "neighbors", + "neutralisation": "neutralization", + "neutralise": "neutralize", + "neutralised": "neutralized", + "neutralises": "neutralizes", + "neutralising": "neutralizing", + "normalisation": "normalization", + "normalise": "normalize", + "normalised": "normalized", + "normalises": "normalizes", + "normalising": "normalizing", + "odour": "odor", + "odourless": "odorless", + "odours": "odors", + "oesophagus": "esophagus", + "oesophaguses": "esophaguses", + "oestrogen": "estrogen", + "offence": "offense", + "offences": "offenses", + "omelette": "omelet", + "omelettes": "omelets", + "optimise": "optimize", + "optimised": "optimized", + "optimises": "optimizes", + "optimising": "optimizing", + "organisation": "organization", + "organisational": "organizational", + "organisations": "organizations", + "organise": "organize", + "organised": "organized", + "organiser": "organizer", + "organisers": "organizers", + "organises": "organizes", + "organising": "organizing", + "orthopaedic": "orthopedic", + "orthopaedics": "orthopedics", + "ostracise": "ostracize", + "ostracised": "ostracized", + "ostracises": "ostracizes", + "ostracising": "ostracizing", + "outmanoeuvre": "outmaneuver", + "outmanoeuvred": "outmaneuvered", + "outmanoeuvres": "outmaneuvers", + "outmanoeuvring": "outmaneuvering", + "overemphasise": "overemphasize", + "overemphasised": "overemphasized", + "overemphasises": "overemphasizes", + "overemphasising": "overemphasizing", + "oxidisation": "oxidization", + "oxidise": "oxidize", + "oxidised": "oxidized", + "oxidises": "oxidizes", + "oxidising": "oxidizing", + "paederast": "pederast", + "paederasts": "pederasts", + "paediatric": "pediatric", + "paediatrician": "pediatrician", + "paediatricians": "pediatricians", + "paediatrics": "pediatrics", + "paedophile": "pedophile", + "paedophiles": "pedophiles", + "paedophilia": "pedophilia", + "palaeolithic": "paleolithic", + "palaeontologist": "paleontologist", + "palaeontologists": "paleontologists", + "palaeontology": "paleontology", + "panelled": "paneled", + "panelling": "paneling", + "panellist": "panelist", + "panellists": "panelists", + "paralyse": "paralyze", + "paralysed": "paralyzed", + "paralyses": "paralyzes", + "paralysing": "paralyzing", + "parcelled": "parceled", + "parcelling": "parceling", + "parlour": "parlor", + "parlours": "parlors", + "particularise": "particularize", + "particularised": "particularized", + "particularises": "particularizes", + "particularising": "particularizing", + "passivisation": "passivization", + "passivise": "passivize", + "passivised": "passivized", + "passivises": "passivizes", + "passivising": "passivizing", + "pasteurisation": "pasteurization", + "pasteurise": "pasteurize", + "pasteurised": "pasteurized", + "pasteurises": "pasteurizes", + "pasteurising": "pasteurizing", + "patronise": "patronize", + "patronised": "patronized", + "patronises": "patronizes", + "patronising": "patronizing", + "patronisingly": "patronizingly", + "pedalled": "pedaled", + "pedalling": "pedaling", + "pedestrianisation": "pedestrianization", + "pedestrianise": "pedestrianize", + "pedestrianised": "pedestrianized", + "pedestrianises": "pedestrianizes", + "pedestrianising": "pedestrianizing", + "penalise": "penalize", + "penalised": "penalized", + "penalises": "penalizes", + "penalising": "penalizing", + "pencilled": "penciled", + "pencilling": "penciling", + "personalise": "personalize", + "personalised": "personalized", + 
"personalises": "personalizes", + "personalising": "personalizing", + "pharmacopoeia": "pharmacopeia", + "pharmacopoeias": "pharmacopeias", + "philosophise": "philosophize", + "philosophised": "philosophized", + "philosophises": "philosophizes", + "philosophising": "philosophizing", + "philtre": "filter", + "philtres": "filters", + "phoney": "phony", + "plagiarise": "plagiarize", + "plagiarised": "plagiarized", + "plagiarises": "plagiarizes", + "plagiarising": "plagiarizing", + "plough": "plow", + "ploughed": "plowed", + "ploughing": "plowing", + "ploughman": "plowman", + "ploughmen": "plowmen", + "ploughs": "plows", + "ploughshare": "plowshare", + "ploughshares": "plowshares", + "polarisation": "polarization", + "polarise": "polarize", + "polarised": "polarized", + "polarises": "polarizes", + "polarising": "polarizing", + "politicisation": "politicization", + "politicise": "politicize", + "politicised": "politicized", + "politicises": "politicizes", + "politicising": "politicizing", + "popularisation": "popularization", + "popularise": "popularize", + "popularised": "popularized", + "popularises": "popularizes", + "popularising": "popularizing", + "pouffe": "pouf", + "pouffes": "poufs", + "practise": "practice", + "practised": "practiced", + "practises": "practices", + "practising": "practicing", + "praesidium": "presidium", + "praesidiums": "presidiums", + "pressurisation": "pressurization", + "pressurise": "pressurize", + "pressurised": "pressurized", + "pressurises": "pressurizes", + "pressurising": "pressurizing", + "pretence": "pretense", + "pretences": "pretenses", + "primaeval": "primeval", + "prioritisation": "prioritization", + "prioritise": "prioritize", + "prioritised": "prioritized", + "prioritises": "prioritizes", + "prioritising": "prioritizing", + "privatisation": "privatization", + "privatisations": "privatizations", + "privatise": "privatize", + "privatised": "privatized", + "privatises": "privatizes", + "privatising": "privatizing", + "professionalisation": "professionalization", + "professionalise": "professionalize", + "professionalised": "professionalized", + "professionalises": "professionalizes", + "professionalising": "professionalizing", + "programme": "program", + "programmes": "programs", + "prologue": "prolog", + "prologues": "prologs", + "propagandise": "propagandize", + "propagandised": "propagandized", + "propagandises": "propagandizes", + "propagandising": "propagandizing", + "proselytise": "proselytize", + "proselytised": "proselytized", + "proselytiser": "proselytizer", + "proselytisers": "proselytizers", + "proselytises": "proselytizes", + "proselytising": "proselytizing", + "psychoanalyse": "psychoanalyze", + "psychoanalysed": "psychoanalyzed", + "psychoanalyses": "psychoanalyzes", + "psychoanalysing": "psychoanalyzing", + "publicise": "publicize", + "publicised": "publicized", + "publicises": "publicizes", + "publicising": "publicizing", + "pulverisation": "pulverization", + "pulverise": "pulverize", + "pulverised": "pulverized", + "pulverises": "pulverizes", + "pulverising": "pulverizing", + "pummelled": "pummel", + "pummelling": "pummeled", + "pyjama": "pajama", + "pyjamas": "pajamas", + "pzazz": "pizzazz", + "quarrelled": "quarreled", + "quarrelling": "quarreling", + "radicalise": "radicalize", + "radicalised": "radicalized", + "radicalises": "radicalizes", + "radicalising": "radicalizing", + "rancour": "rancor", + "randomise": "randomize", + "randomised": "randomized", + "randomises": "randomizes", + "randomising": "randomizing", + 
"rationalisation": "rationalization", + "rationalisations": "rationalizations", + "rationalise": "rationalize", + "rationalised": "rationalized", + "rationalises": "rationalizes", + "rationalising": "rationalizing", + "ravelled": "raveled", + "ravelling": "raveling", + "realisable": "realizable", + "realisation": "realization", + "realisations": "realizations", + "realise": "realize", + "realised": "realized", + "realises": "realizes", + "realising": "realizing", + "recognisable": "recognizable", + "recognisably": "recognizably", + "recognisance": "recognizance", + "recognise": "recognize", + "recognised": "recognized", + "recognises": "recognizes", + "recognising": "recognizing", + "reconnoitre": "reconnoiter", + "reconnoitred": "reconnoitered", + "reconnoitres": "reconnoiters", + "reconnoitring": "reconnoitering", + "refuelled": "refueled", + "refuelling": "refueling", + "regularisation": "regularization", + "regularise": "regularize", + "regularised": "regularized", + "regularises": "regularizes", + "regularising": "regularizing", + "remodelled": "remodeled", + "remodelling": "remodeling", + "remould": "remold", + "remoulded": "remolded", + "remoulding": "remolding", + "remoulds": "remolds", + "reorganisation": "reorganization", + "reorganisations": "reorganizations", + "reorganise": "reorganize", + "reorganised": "reorganized", + "reorganises": "reorganizes", + "reorganising": "reorganizing", + "revelled": "reveled", + "reveller": "reveler", + "revellers": "revelers", + "revelling": "reveling", + "revitalise": "revitalize", + "revitalised": "revitalized", + "revitalises": "revitalizes", + "revitalising": "revitalizing", + "revolutionise": "revolutionize", + "revolutionised": "revolutionized", + "revolutionises": "revolutionizes", + "revolutionising": "revolutionizing", + "rhapsodise": "rhapsodize", + "rhapsodised": "rhapsodized", + "rhapsodises": "rhapsodizes", + "rhapsodising": "rhapsodizing", + "rigour": "rigor", + "rigours": "rigors", + "ritualised": "ritualized", + "rivalled": "rivaled", + "rivalling": "rivaling", + "romanticise": "romanticize", + "romanticised": "romanticized", + "romanticises": "romanticizes", + "romanticising": "romanticizing", + "rumour": "rumor", + "rumoured": "rumored", + "rumours": "rumors", + "sabre": "saber", + "sabres": "sabers", + "saltpetre": "saltpeter", + "sanitise": "sanitize", + "sanitised": "sanitized", + "sanitises": "sanitizes", + "sanitising": "sanitizing", + "satirise": "satirize", + "satirised": "satirized", + "satirises": "satirizes", + "satirising": "satirizing", + "saviour": "savior", + "saviours": "saviors", + "savour": "savor", + "savoured": "savored", + "savouries": "savories", + "savouring": "savoring", + "savours": "savors", + "savoury": "savory", + "scandalise": "scandalize", + "scandalised": "scandalized", + "scandalises": "scandalizes", + "scandalising": "scandalizing", + "sceptic": "skeptic", + "sceptical": "skeptical", + "sceptically": "skeptically", + "scepticism": "skepticism", + "sceptics": "skeptics", + "sceptre": "scepter", + "sceptres": "scepters", + "scrutinise": "scrutinize", + "scrutinised": "scrutinized", + "scrutinises": "scrutinizes", + "scrutinising": "scrutinizing", + "secularisation": "secularization", + "secularise": "secularize", + "secularised": "secularized", + "secularises": "secularizes", + "secularising": "secularizing", + "sensationalise": "sensationalize", + "sensationalised": "sensationalized", + "sensationalises": "sensationalizes", + "sensationalising": "sensationalizing", + "sensitise": "sensitize", + 
"sensitised": "sensitized", + "sensitises": "sensitizes", + "sensitising": "sensitizing", + "sentimentalise": "sentimentalize", + "sentimentalised": "sentimentalized", + "sentimentalises": "sentimentalizes", + "sentimentalising": "sentimentalizing", + "sepulchre": "sepulcher", + "sepulchres": "sepulchers", + "serialisation": "serialization", + "serialisations": "serializations", + "serialise": "serialize", + "serialised": "serialized", + "serialises": "serializes", + "serialising": "serializing", + "sermonise": "sermonize", + "sermonised": "sermonized", + "sermonises": "sermonizes", + "sermonising": "sermonizing", + "sheikh": "sheik", + "shovelled": "shoveled", + "shovelling": "shoveling", + "shrivelled": "shriveled", + "shrivelling": "shriveling", + "signalise": "signalize", + "signalised": "signalized", + "signalises": "signalizes", + "signalising": "signalizing", + "signalled": "signaled", + "signalling": "signaling", + "smoulder": "smolder", + "smouldered": "smoldered", + "smouldering": "smoldering", + "smoulders": "smolders", + "snivelled": "sniveled", + "snivelling": "sniveling", + "snorkelled": "snorkeled", + "snorkelling": "snorkeling", + "snowplough": "snowplow", + "snowploughs": "snowplow", + "socialisation": "socialization", + "socialise": "socialize", + "socialised": "socialized", + "socialises": "socializes", + "socialising": "socializing", + "sodomise": "sodomize", + "sodomised": "sodomized", + "sodomises": "sodomizes", + "sodomising": "sodomizing", + "solemnise": "solemnize", + "solemnised": "solemnized", + "solemnises": "solemnizes", + "solemnising": "solemnizing", + "sombre": "somber", + "specialisation": "specialization", + "specialisations": "specializations", + "specialise": "specialize", + "specialised": "specialized", + "specialises": "specializes", + "specialising": "specializing", + "spectre": "specter", + "spectres": "specters", + "spiralled": "spiraled", + "spiralling": "spiraling", + "splendour": "splendor", + "splendours": "splendors", + "squirrelled": "squirreled", + "squirrelling": "squirreling", + "stabilisation": "stabilization", + "stabilise": "stabilize", + "stabilised": "stabilized", + "stabiliser": "stabilizer", + "stabilisers": "stabilizers", + "stabilises": "stabilizes", + "stabilising": "stabilizing", + "standardisation": "standardization", + "standardise": "standardize", + "standardised": "standardized", + "standardises": "standardizes", + "standardising": "standardizing", + "stencilled": "stenciled", + "stencilling": "stenciling", + "sterilisation": "sterilization", + "sterilisations": "sterilizations", + "sterilise": "sterilize", + "sterilised": "sterilized", + "steriliser": "sterilizer", + "sterilisers": "sterilizers", + "sterilises": "sterilizes", + "sterilising": "sterilizing", + "stigmatisation": "stigmatization", + "stigmatise": "stigmatize", + "stigmatised": "stigmatized", + "stigmatises": "stigmatizes", + "stigmatising": "stigmatizing", + "storey": "story", + "storeys": "stories", + "subsidisation": "subsidization", + "subsidise": "subsidize", + "subsidised": "subsidized", + "subsidiser": "subsidizer", + "subsidisers": "subsidizers", + "subsidises": "subsidizes", + "subsidising": "subsidizing", + "succour": "succor", + "succoured": "succored", + "succouring": "succoring", + "succours": "succors", + "sulphate": "sulfate", + "sulphates": "sulfates", + "sulphide": "sulfide", + "sulphides": "sulfides", + "sulphur": "sulfur", + "sulphurous": "sulfurous", + "summarise": "summarize", + "summarised": "summarized", + "summarises": "summarizes", + 
"summarising": "summarizing", + "swivelled": "swiveled", + "swivelling": "swiveling", + "symbolise": "symbolize", + "symbolised": "symbolized", + "symbolises": "symbolizes", + "symbolising": "symbolizing", + "sympathise": "sympathize", + "sympathised": "sympathized", + "sympathiser": "sympathizer", + "sympathisers": "sympathizers", + "sympathises": "sympathizes", + "sympathising": "sympathizing", + "synchronisation": "synchronization", + "synchronise": "synchronize", + "synchronised": "synchronized", + "synchronises": "synchronizes", + "synchronising": "synchronizing", + "synthesise": "synthesize", + "synthesised": "synthesized", + "synthesiser": "synthesizer", + "synthesisers": "synthesizers", + "synthesises": "synthesizes", + "synthesising": "synthesizing", + "syphon": "siphon", + "syphoned": "siphoned", + "syphoning": "siphoning", + "syphons": "siphons", + "systematisation": "systematization", + "systematise": "systematize", + "systematised": "systematized", + "systematises": "systematizes", + "systematising": "systematizing", + "tantalise": "tantalize", + "tantalised": "tantalized", + "tantalises": "tantalizes", + "tantalising": "tantalizing", + "tantalisingly": "tantalizingly", + "tasselled": "tasseled", + "technicolour": "technicolor", + "temporise": "temporize", + "temporised": "temporized", + "temporises": "temporizes", + "temporising": "temporizing", + "tenderise": "tenderize", + "tenderised": "tenderized", + "tenderises": "tenderizes", + "tenderising": "tenderizing", + "terrorise": "terrorize", + "terrorised": "terrorized", + "terrorises": "terrorizes", + "terrorising": "terrorizing", + "theatre": "theater", + "theatregoer": "theatergoer", + "theatregoers": "theatergoers", + "theatres": "theaters", + "theorise": "theorize", + "theorised": "theorized", + "theorises": "theorizes", + "theorising": "theorizing", + "tonne": "ton", + "tonnes": "tons", + "towelled": "toweled", + "towelling": "toweling", + "toxaemia": "toxemia", + "tranquillise": "tranquilize", + "tranquillised": "tranquilized", + "tranquilliser": "tranquilizer", + "tranquillisers": "tranquilizers", + "tranquillises": "tranquilizes", + "tranquillising": "tranquilizing", + "tranquillity": "tranquility", + "tranquillize": "tranquilize", + "tranquillized": "tranquilized", + "tranquillizer": "tranquilizer", + "tranquillizers": "tranquilizers", + "tranquillizes": "tranquilizes", + "tranquillizing": "tranquilizing", + "tranquilly": "tranquility", + "transistorised": "transistorized", + "traumatise": "traumatize", + "traumatised": "traumatized", + "traumatises": "traumatizes", + "traumatising": "traumatizing", + "travelled": "traveled", + "traveller": "traveler", + "travellers": "travelers", + "travelling": "traveling", + "travelog": "travelogue", + "travelogs": "travelogues", + "trialled": "trialed", + "trialling": "trialing", + "tricolour": "tricolor", + "tricolours": "tricolors", + "trivialise": "trivialize", + "trivialised": "trivialized", + "trivialises": "trivializes", + "trivialising": "trivializing", + "tumour": "tumor", + "tumours": "tumors", + "tunnelled": "tunneled", + "tunnelling": "tunneling", + "tyrannise": "tyrannize", + "tyrannised": "tyrannized", + "tyrannises": "tyrannizes", + "tyrannising": "tyrannizing", + "tyre": "tire", + "tyres": "tires", + "unauthorised": "unauthorized", + "uncivilised": "uncivilized", + "underutilised": "underutilized", + "unequalled": "unequaled", + "unfavourable": "unfavorable", + "unfavourably": "unfavorably", + "unionisation": "unionization", + "unionise": "unionize", + "unionised": 
"unionized", + "unionises": "unionizes", + "unionising": "unionizing", + "unorganised": "unorganized", + "unravelled": "unraveled", + "unravelling": "unraveling", + "unrecognisable": "unrecognizable", + "unrecognised": "unrecognized", + "unrivalled": "unrivaled", + "unsavoury": "unsavory", + "untrammelled": "untrammeled", + "urbanisation": "urbanization", + "urbanise": "urbanize", + "urbanised": "urbanized", + "urbanises": "urbanizes", + "urbanising": "urbanizing", + "utilisable": "utilizable", + "utilisation": "utilization", + "utilise": "utilize", + "utilised": "utilized", + "utilises": "utilizes", + "utilising": "utilizing", + "valour": "valor", + "vandalise": "vandalize", + "vandalised": "vandalized", + "vandalises": "vandalizes", + "vandalising": "vandalizing", + "vaporisation": "vaporization", + "vaporise": "vaporize", + "vaporised": "vaporized", + "vaporises": "vaporizes", + "vaporising": "vaporizing", + "vapour": "vapor", + "vapours": "vapors", + "verbalise": "verbalize", + "verbalised": "verbalized", + "verbalises": "verbalizes", + "verbalising": "verbalizing", + "victimisation": "victimization", + "victimise": "victimize", + "victimised": "victimized", + "victimises": "victimizes", + "victimising": "victimizing", + "videodisc": "videodisk", + "videodiscs": "videodisks", + "vigour": "vigor", + "visualisation": "visualization", + "visualisations": "visualizations", + "visualise": "visualize", + "visualised": "visualized", + "visualises": "visualizes", + "visualising": "visualizing", + "vocalisation": "vocalization", + "vocalisations": "vocalizations", + "vocalise": "vocalize", + "vocalised": "vocalized", + "vocalises": "vocalizes", + "vocalising": "vocalizing", + "vulcanised": "vulcanized", + "vulgarisation": "vulgarization", + "vulgarise": "vulgarize", + "vulgarised": "vulgarized", + "vulgarises": "vulgarizes", + "vulgarising": "vulgarizing", + "waggon": "wagon", + "waggons": "wagons", + "watercolour": "watercolor", + "watercolours": "watercolors", + "weaselled": "weaseled", + "weaselling": "weaseling", + "westernisation": "westernization", + "westernise": "westernize", + "westernised": "westernized", + "westernises": "westernizes", + "westernising": "westernizing", + "womanise": "womanize", + "womanised": "womanized", + "womaniser": "womanizer", + "womanisers": "womanizers", + "womanises": "womanizes", + "womanising": "womanizing", + "woollen": "woolen", + "woollens": "woolens", + "woollies": "woolies", + "woolly": "wooly", + "worshipped": "worshiped", + "worshipper": "worshiper", + "worshipping": "worshiping", + "yodelled": "yodeled", + "yodelling": "yodeling", + "yoghourt": "yogurt", + "yoghourts": "yogurts", + "yoghurt": "yogurt", + "yoghurts": "yogurts" +} diff --git a/faster-whisper/benchmark/requirements.benchmark.txt b/faster-whisper/benchmark/requirements.benchmark.txt new file mode 100644 index 0000000..2645004 --- /dev/null +++ b/faster-whisper/benchmark/requirements.benchmark.txt @@ -0,0 +1,6 @@ +transformers +jiwer +evaluate +datasets +memory_profiler +py3nvml diff --git a/faster-whisper/benchmark/speed_benchmark.py b/faster-whisper/benchmark/speed_benchmark.py new file mode 100644 index 0000000..a3d6ffb --- /dev/null +++ b/faster-whisper/benchmark/speed_benchmark.py @@ -0,0 +1,31 @@ +import argparse +import timeit + +from typing import Callable + +from utils import inference + +parser = argparse.ArgumentParser(description="Speed benchmark") +parser.add_argument( + "--repeat", + type=int, + default=3, + help="Times an experiment will be run.", +) +args = 
parser.parse_args() + + +def measure_speed(func: Callable[[], None]): + # as written in https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat, + # min should be taken rather than the average + runtimes = timeit.repeat( + func, + repeat=args.repeat, + number=10, + ) + print(runtimes) + print("Min execution time: %.3fs" % (min(runtimes) / 10.0)) + + +if __name__ == "__main__": + measure_speed(inference) diff --git a/faster-whisper/benchmark/utils.py b/faster-whisper/benchmark/utils.py new file mode 100644 index 0000000..8e5ac46 --- /dev/null +++ b/faster-whisper/benchmark/utils.py @@ -0,0 +1,39 @@ +import logging + +from threading import Thread +from typing import Optional + +from faster_whisper import WhisperModel + +model_path = "large-v3" +model = WhisperModel(model_path, device="cuda") + + +def inference(): + segments, info = model.transcribe("benchmark.m4a", language="fr") + for segment in segments: + print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) + + +def get_logger(name: Optional[str] = None) -> logging.Logger: + formatter = logging.Formatter("%(levelname)s: %(message)s") + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler() + handler.setFormatter(formatter) + logger.addHandler(handler) + return logger + + +class MyThread(Thread): + def __init__(self, func, params): + super(MyThread, self).__init__() + self.func = func + self.params = params + self.result = None + + def run(self): + self.result = self.func(*self.params) + + def get_result(self): + return self.result diff --git a/faster-whisper/benchmark/wer_benchmark.py b/faster-whisper/benchmark/wer_benchmark.py new file mode 100644 index 0000000..bf0a1e0 --- /dev/null +++ b/faster-whisper/benchmark/wer_benchmark.py @@ -0,0 +1,61 @@ +import argparse +import json + +from datasets import load_dataset +from evaluate import load +from tqdm import tqdm +from transformers.models.whisper.english_normalizer import EnglishTextNormalizer + +from faster_whisper import WhisperModel + +parser = argparse.ArgumentParser(description="WER benchmark") +parser.add_argument( + "--audio_numb", + type=int, + default=None, + help="Specify the number of validation audio files in the dataset." 
+ " Set to None to retrieve all audio files.", +) +args = parser.parse_args() + +model_path = "large-v3" +model = WhisperModel(model_path, device="cuda") + +# load the dataset with streaming mode +dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True) + +# define the evaluation metric +wer_metric = load("wer") +normalizer = EnglishTextNormalizer(json.load(open("normalizer.json"))) + + +def inference(batch): + batch["transcription"] = [] + for sample in batch["audio"]: + segments, info = model.transcribe(sample["array"], language="en") + batch["transcription"].append("".join([segment.text for segment in segments])) + batch["reference"] = batch["text"] + return batch + + +dataset = dataset.map(function=inference, batched=True, batch_size=16) + +all_transcriptions = [] +all_references = [] + +# iterate over the dataset and run inference +for i, result in tqdm(enumerate(dataset), desc="Evaluating..."): + all_transcriptions.append(result["transcription"]) + all_references.append(result["reference"]) + if args.audio_numb and i == (args.audio_numb - 1): + break + +# normalize predictions and references +all_transcriptions = [normalizer(transcription) for transcription in all_transcriptions] +all_references = [normalizer(reference) for reference in all_references] + +# compute the WER metric +wer = 100 * wer_metric.compute( + predictions=all_transcriptions, references=all_references +) +print("WER: %.3f" % wer) diff --git a/faster-whisper/docker/Dockerfile b/faster-whisper/docker/Dockerfile new file mode 100644 index 0000000..604c8e1 --- /dev/null +++ b/faster-whisper/docker/Dockerfile @@ -0,0 +1,6 @@ +FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04 +WORKDIR /root +RUN apt-get update -y && apt-get install -y python3-pip +COPY infer.py jfk.flac ./ +RUN pip3 install faster-whisper +CMD ["python3", "infer.py"] diff --git a/faster-whisper/docker/infer.py b/faster-whisper/docker/infer.py new file mode 100644 index 0000000..5d6b12c --- /dev/null +++ b/faster-whisper/docker/infer.py @@ -0,0 +1,7 @@ +from faster_whisper import WhisperModel + +jfk_path = "jfk.flac" +model = WhisperModel("tiny", device="cuda") +segments, info = model.transcribe(jfk_path, word_timestamps=True) +for segment in segments: + print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) diff --git a/faster-whisper/docker/jfk.flac b/faster-whisper/docker/jfk.flac new file mode 100644 index 0000000..e44b7c1 Binary files /dev/null and b/faster-whisper/docker/jfk.flac differ diff --git a/faster-whisper/faster_whisper.egg-info/PKG-INFO b/faster-whisper/faster_whisper.egg-info/PKG-INFO new file mode 100644 index 0000000..c9d8e53 --- /dev/null +++ b/faster-whisper/faster_whisper.egg-info/PKG-INFO @@ -0,0 +1,345 @@ +Metadata-Version: 2.1 +Name: faster-whisper +Version: 1.0.2 +Summary: Faster Whisper transcription with CTranslate2 +Home-page: https://github.com/SYSTRAN/faster-whisper +Author: Guillaume Klein +License: MIT +Keywords: openai whisper speech ctranslate2 inference quantization transformer +Classifier: Development Status :: 4 - Beta +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: Science/Research +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 
3.11 +Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence +Requires-Python: >=3.8 +Description-Content-Type: text/markdown +Provides-Extra: conversion +Provides-Extra: dev +License-File: LICENSE + +[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper) + +# Faster Whisper transcription with CTranslate2 + +**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models. + +This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU. + +## Benchmark + +### Whisper + +For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations: + +* [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258) +* [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362) +* [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e) + +### Large-v2 model on GPU + +| Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory | +| --- | --- | --- | --- | --- | --- | +| openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB | +| faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB | +| faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB | + +*Executed with CUDA 11.7.1 on a NVIDIA Tesla V100S.* + +### Small model on CPU + +| Implementation | Precision | Beam size | Time | Max. memory | +| --- | --- | --- | --- | --- | +| openai/whisper | fp32 | 5 | 10m31s | 3101MB | +| whisper.cpp | fp32 | 5 | 17m42s | 1581MB | +| whisper.cpp | fp16 | 5 | 12m39s | 873MB | +| faster-whisper | fp32 | 5 | 2m44s | 1675MB | +| faster-whisper | int8 | 5 | 2m04s | 995MB | + +*Executed with 8 threads on a Intel(R) Xeon(R) Gold 6226R.* + + +### Distil-whisper + +| Implementation | Precision | Beam size | Time | Gigaspeech WER | +| --- | --- | --- | --- | --- | +| distil-whisper/distil-large-v2 | fp16 | 4 |- | 10.36 | +| [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 | +| distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 | +| [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 | + +*Executed with CUDA 11.4 on a NVIDIA 3090.* + +
+<details> +<summary>testing details (click to expand)</summary> + +For `distil-whisper/distil-large-v2`, the WER is tested with the code sample from [this link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). For `faster-distil-whisper`, the WER is tested with the following setting: +```python +from faster_whisper import WhisperModel + +model_size = "distil-large-v2" +# model_size = "distil-medium.en" +# Run on GPU with FP16 +model = WhisperModel(model_size, device="cuda", compute_type="float16") +segments, info = model.transcribe("audio.mp3", beam_size=5, language="en") +``` +
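For reference, the hypothesis text produced by the snippet above is then normalized and scored against the dataset reference before the WER is computed. A condensed sketch of that scoring step, mirroring `benchmark/wer_benchmark.py` from this repository (`reference_text` is a placeholder for the ground-truth transcript):

```python
import json

from evaluate import load
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer

wer_metric = load("wer")
# normalizer.json is the British->American spelling mapping shipped in benchmark/
normalizer = EnglishTextNormalizer(json.load(open("normalizer.json")))

# Apply the same normalization to the hypothesis and the reference.
hypothesis = normalizer("".join(segment.text for segment in segments))
reference = normalizer(reference_text)  # placeholder: ground-truth transcript

wer = 100 * wer_metric.compute(predictions=[hypothesis], references=[reference])
print("WER: %.3f" % wer)
```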
+</details> + +## Requirements + +* Python 3.8 or greater + +Unlike openai-whisper, FFmpeg does **not** need to be installed on the system. The audio is decoded with the Python library [PyAV](https://github.com/PyAV-Org/PyAV), which bundles the FFmpeg libraries in its package. + +### GPU + +GPU execution requires the following NVIDIA libraries to be installed: + +* [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas) +* [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn) + +**Note**: The latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (this can be done with `pip install --force-reinstall ctranslate2==3.24.0` or by specifying the version in a `requirements.txt`). + +There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
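Whichever installation method is used, a quick way to confirm that CTranslate2 can find the CUDA libraries is to query it directly from Python. This is a minimal sketch based on two helpers from the CTranslate2 Python API (`get_cuda_device_count` and `get_supported_compute_types`), not something specific to faster-whisper:

```python
import ctranslate2

# 0 means the CUDA runtime or the cuBLAS/cuDNN libraries above were not found.
num_gpus = ctranslate2.get_cuda_device_count()
print("CUDA devices visible to CTranslate2:", num_gpus)

if num_gpus > 0:
    # Compute types usable on the first GPU, e.g. float16 or int8_float16.
    print(ctranslate2.get_supported_compute_types("cuda"))
```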
+<details> +<summary>Other installation methods (click to expand)</summary> + + +**Note:** For all of the methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below. + +#### Use Docker + +The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`. + +#### Install with `pip` (Linux only) + +On Linux, these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python. + +```bash +pip install nvidia-cublas-cu12 nvidia-cudnn-cu12 + +export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'` +``` + +**Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due to its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8. + +#### Download the libraries from Purfview's repository (Windows & Linux) + +Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`. + +
+</details> + +## Installation + +The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/): + +```bash +pip install faster-whisper +``` + +
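As a quick check that the installation works, the top-level package exposes a version string and a helper listing the model sizes that can be passed to `WhisperModel` (both are re-exported in `faster_whisper/__init__.py`, shown later in this diff); a minimal sketch:

```python
import faster_whisper

print(faster_whisper.__version__)

# Model sizes that can be downloaded automatically, e.g. "tiny", "base", ..., "large-v3".
print(faster_whisper.available_models())
```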
+<details> +<summary>Other installation methods (click to expand)</summary> + +### Install the master branch + +```bash +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz" +``` + +### Install a specific commit + +```bash +pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz" +``` + +</details>
+ +## Usage + +### Faster-whisper + +```python +from faster_whisper import WhisperModel + +model_size = "large-v3" + +# Run on GPU with FP16 +model = WhisperModel(model_size, device="cuda", compute_type="float16") + +# or run on GPU with INT8 +# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16") +# or run on CPU with INT8 +# model = WhisperModel(model_size, device="cpu", compute_type="int8") + +segments, info = model.transcribe("audio.mp3", beam_size=5) + +print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) + +for segment in segments: +    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) +``` + +**Warning:** `segments` is a *generator*, so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop: + +```python +segments, _ = model.transcribe("audio.mp3") +segments = list(segments)  # The transcription will actually run here. +``` + +### Multi-segment language detection + +To directly use the model for improved language detection, the following code snippet can be used: + +```python +from faster_whisper import WhisperModel + +model = WhisperModel("medium", device="cuda", compute_type="float16") +language_info = model.detect_language_multi_segment("audio.mp3") +``` + +### Batched faster-whisper + +The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-4-Clause license, and integrates its VAD model into this library. This product includes software developed by Max Bain. We modified this implementation and also added Kaldi-based feature extraction. It improves the speed by up to 10-12x compared to the OpenAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference. + +The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper. + +```python +from faster_whisper import BatchedInferencePipeline, WhisperModel + +model = WhisperModel("medium", device="cuda", compute_type="float16") +batched_model = BatchedInferencePipeline(model=model) +result = batched_model.transcribe("audio.mp3", batch_size=16) + +for segment, info in result: +    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) +``` + +### Faster Distil-Whisper + +The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3) +checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet +demonstrates how to run inference with distil-large-v3 on a specified audio file: + +```python +from faster_whisper import WhisperModel + +model_size = "distil-large-v3" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") +segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False) + +for segment in segments: +    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)) +``` + +For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3).
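Beyond the language fields used in the examples above, the `info` value returned by `transcribe` is a `TranscriptionInfo` named tuple (defined in `faster_whisper/transcribe.py` later in this diff) that also records the audio duration and the options that were used. A small illustrative snippet reusing the `model` and audio file from the examples above:

```python
segments, info = model.transcribe("audio.mp3", vad_filter=True)

# Fields of the TranscriptionInfo named tuple in faster_whisper/transcribe.py.
print("Language: %s (probability %.2f)" % (info.language, info.language_probability))
print("Audio duration: %.2fs" % info.duration)
print("Duration after VAD: %.2fs" % info.duration_after_vad)
print("Beam size used:", info.transcription_options.beam_size)
```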
+ +### Word-level timestamps + +```python +segments, _ = model.transcribe("audio.mp3", word_timestamps=True) + +for segment in segments: +    for word in segment.words: +        print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word)) +``` + +### VAD filter + +The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech: + +```python +segments, _ = model.transcribe("audio.mp3", vad_filter=True) +``` + +The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`: + +```python +segments, _ = model.transcribe( +    "audio.mp3", +    vad_filter=True, +    vad_parameters=dict(min_silence_duration_ms=500), +) +``` + +### Logging + +The library logging level can be configured like this: + +```python +import logging + +logging.basicConfig() +logging.getLogger("faster_whisper").setLevel(logging.DEBUG) +``` + +### Going further + +See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation. + +## Community integrations + +Here is a non-exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list! + +* [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI-compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, and supports streaming and live transcription. +* [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment. +* [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command-line client based on faster-whisper and compatible with the original client from openai/whisper. +* [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo. +* [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides standalone CLI executables of faster-whisper for Windows, Linux & macOS. +* [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end-to-end multi-speaker speech-to-text solution implemented using AzureML pipelines. +* [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT. +* [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper; it can export word-level transcripts, which can then be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor). +* [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
+* [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art. +* [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time. +* [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface. + +## Model conversion + +When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran). + +We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models. + +For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16: + +```bash +pip install transformers[torch]>=4.23 + +ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 +--copy_files tokenizer.json preprocessor_config.json --quantization float16 +``` + +* The option `--model` accepts a model name on the Hub or a path to a model directory. +* If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later. + +Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html). + +### Load a converted model + +1. Directly load the model from a local directory: +```python +model = faster_whisper.WhisperModel("whisper-large-v3-ct2") +``` + +2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name: +```python +model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2") +``` + +## Comparing performance against other implementations + +If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular: + +* Verify that the same transcription options are used, especially the same beam size. For example in openai/whisper, `model.transcribe` uses a default beam size of 1 but here we use a default beam size of 5. +* When running on CPU, make sure to set the same number of threads. 
Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script: + +```bash +OMP_NUM_THREADS=4 python3 my_script.py +``` diff --git a/faster-whisper/faster_whisper.egg-info/SOURCES.txt b/faster-whisper/faster_whisper.egg-info/SOURCES.txt new file mode 100644 index 0000000..99cda55 --- /dev/null +++ b/faster-whisper/faster_whisper.egg-info/SOURCES.txt @@ -0,0 +1,25 @@ +LICENSE +MANIFEST.in +README.md +requirements.conversion.txt +requirements.txt +setup.cfg +setup.py +faster_whisper/__init__.py +faster_whisper/audio.py +faster_whisper/feature_extractor.py +faster_whisper/tokenizer.py +faster_whisper/transcribe.py +faster_whisper/utils.py +faster_whisper/vad.py +faster_whisper/version.py +faster_whisper.egg-info/PKG-INFO +faster_whisper.egg-info/SOURCES.txt +faster_whisper.egg-info/dependency_links.txt +faster_whisper.egg-info/requires.txt +faster_whisper.egg-info/top_level.txt +faster_whisper/assets/__init__.py +faster_whisper/assets/pyannote_vad_model.bin +faster_whisper/assets/silero_vad.onnx +tests/test_transcribe.py +tests/test_utils.py \ No newline at end of file diff --git a/faster-whisper/faster_whisper.egg-info/dependency_links.txt b/faster-whisper/faster_whisper.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/faster-whisper/faster_whisper.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/faster-whisper/faster_whisper.egg-info/requires.txt b/faster-whisper/faster_whisper.egg-info/requires.txt new file mode 100644 index 0000000..7b9b551 --- /dev/null +++ b/faster-whisper/faster_whisper.egg-info/requires.txt @@ -0,0 +1,17 @@ +ctranslate2<5,>=4.0 +huggingface_hub>=0.13 +tokenizers<1,>=0.13 +onnxruntime<2,>=1.14 +transformers +pyannote-audio>=3.1.1 +torch>=2.1.1 +torchaudio>=2.1.2 + +[conversion] +transformers[torch]>=4.23 + +[dev] +black==23.* +flake8==6.* +isort==5.* +pytest==7.* diff --git a/faster-whisper/faster_whisper.egg-info/top_level.txt b/faster-whisper/faster_whisper.egg-info/top_level.txt new file mode 100644 index 0000000..a0e0f02 --- /dev/null +++ b/faster-whisper/faster_whisper.egg-info/top_level.txt @@ -0,0 +1 @@ +faster_whisper diff --git a/faster-whisper/faster_whisper/__init__.py b/faster-whisper/faster_whisper/__init__.py new file mode 100644 index 0000000..ad69277 --- /dev/null +++ b/faster-whisper/faster_whisper/__init__.py @@ -0,0 +1,14 @@ +from faster_whisper.audio import decode_audio +from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel +from faster_whisper.utils import available_models, download_model, format_timestamp +from faster_whisper.version import __version__ + +__all__ = [ + "available_models", + "decode_audio", + "WhisperModel", + "BatchedInferencePipeline", + "download_model", + "format_timestamp", + "__version__", +] diff --git a/faster-whisper/faster_whisper/assets/__init__.py b/faster-whisper/faster_whisper/assets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/faster-whisper/faster_whisper/assets/pyannote_vad_model.bin b/faster-whisper/faster_whisper/assets/pyannote_vad_model.bin new file mode 100644 index 0000000..75c92f0 Binary files /dev/null and b/faster-whisper/faster_whisper/assets/pyannote_vad_model.bin differ diff --git a/faster-whisper/faster_whisper/assets/silero_vad.onnx b/faster-whisper/faster_whisper/assets/silero_vad.onnx new file mode 100644 index 0000000..d0ccd9d Binary files /dev/null and b/faster-whisper/faster_whisper/assets/silero_vad.onnx differ diff --git 
a/faster-whisper/faster_whisper/audio.py b/faster-whisper/faster_whisper/audio.py new file mode 100644 index 0000000..83b321e --- /dev/null +++ b/faster-whisper/faster_whisper/audio.py @@ -0,0 +1,52 @@ +from typing import BinaryIO, Union + +import torchaudio +import torch + + +def decode_audio( + input_file: Union[str, BinaryIO], + sampling_rate: int = 16000, + split_stereo: bool = False, +): + """Decodes the audio. + + Args: + input_file: Path to the input file or a file-like object. + sampling_rate: Resample the audio to this sample rate. + split_stereo: Return separate left and right channels. + + Returns: + A float32 Torch Tensor. + + If `split_stereo` is enabled, the function returns a 2-tuple with the + separated left and right channels. + """ + + waveform, audio_sf = torchaudio.load(input_file) # waveform: channels X T + + if audio_sf != sampling_rate: + waveform = torchaudio.functional.resample( + waveform, orig_freq=audio_sf, new_freq=sampling_rate + ) + if split_stereo: + return waveform[0], waveform[1] + + return waveform.mean(0) + + +def pad_or_trim(array, length: int, *, axis: int = -1): + """ + Pad or trim the audio array to N_SAMPLES, as expected by the encoder. + """ + axis = axis % array.ndim + if array.shape[axis] > length: + idx = [Ellipsis] * axis + [slice(length)] + [Ellipsis] * (array.ndim - axis - 1) + return array[idx] + + if array.shape[axis] < length: + pad_widths = [0,] * array.ndim * 2 + pad_widths[2 * axis] = length - array.shape[axis] + array = torch.nn.functional.pad(array, tuple(pad_widths[::-1])) + + return array diff --git a/faster-whisper/faster_whisper/feature_extractor.py b/faster-whisper/faster_whisper/feature_extractor.py new file mode 100644 index 0000000..74e74dd --- /dev/null +++ b/faster-whisper/faster_whisper/feature_extractor.py @@ -0,0 +1,60 @@ +import torch +import torchaudio.compliance.kaldi as ta_kaldi + + +# Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py # noqa: E501 +class FeatureExtractor: + def __init__( + self, + device: str = "auto", + feature_size=80, + sampling_rate=16000, + hop_length=160, + chunk_length=30, + n_fft=400, + ): + if device == "auto": + self.device = "cuda" if torch.cuda.is_available() else "cpu" + else: + self.device = device + self.n_fft = n_fft + self.hop_length = hop_length + self.chunk_length = chunk_length + self.n_samples = chunk_length * sampling_rate + self.nb_max_frames = self.n_samples // hop_length + self.time_per_frame = hop_length / sampling_rate + self.sampling_rate = sampling_rate + self.n_mels = feature_size + + def __call__(self, waveform, padding=True, chunk_length=None, to_cpu=False): + """ + Compute the log-Mel spectrogram of the provided audio, gives similar results + whisper's original torch implementation with 1e-5 tolerance. 
+ """ + if chunk_length is not None: + self.n_samples = chunk_length * self.sampling_rate + self.nb_max_frames = self.n_samples // self.hop_length + + if padding: + waveform = torch.nn.functional.pad(waveform, (0, self.n_samples)) + + waveform = waveform.to(self.device) if self.device == "cuda" else waveform + + fbank = ta_kaldi.fbank( + waveform.unsqueeze(0), + sample_frequency=self.sampling_rate, + window_type="hanning", + num_mel_bins=self.n_mels, + ) + log_spec = fbank.T + + # normalize + + # Audioset values as default mean and std for audio + mean_val = -4.2677393 + std_val = 4.5689974 + scaled_features = (log_spec - mean_val) / (std_val * 2) + + # When the model is running on multiple GPUs, the output should be moved + # to the CPU since we don't know which GPU will handle the next job. + return scaled_features.cpu() if to_cpu else scaled_features diff --git a/faster-whisper/faster_whisper/tokenizer.py b/faster-whisper/faster_whisper/tokenizer.py new file mode 100644 index 0000000..c3b13b4 --- /dev/null +++ b/faster-whisper/faster_whisper/tokenizer.py @@ -0,0 +1,278 @@ +import string + +from functools import cached_property +from typing import List, Optional, Tuple + +import tokenizers + + +class Tokenizer: + """Simple wrapper around a tokenizers.Tokenizer.""" + + def __init__( + self, + tokenizer: tokenizers.Tokenizer, + multilingual: bool, + task: Optional[str] = None, + language: Optional[str] = None, + ): + self.tokenizer = tokenizer + + if multilingual: + if task not in _TASKS: + raise ValueError( + "'%s' is not a valid task (accepted tasks: %s)" + % (task, ", ".join(_TASKS)) + ) + + if language not in _LANGUAGE_CODES: + raise ValueError( + "'%s' is not a valid language code (accepted language codes: %s)" + % (language, ", ".join(_LANGUAGE_CODES)) + ) + + self.task = self.tokenizer.token_to_id("<|%s|>" % task) + self.language = self.tokenizer.token_to_id("<|%s|>" % language) + self.language_code = language + else: + self.task = None + self.language = None + self.language_code = "en" + + @cached_property + def transcribe(self) -> int: + return self.tokenizer.token_to_id("<|transcribe|>") + + @cached_property + def translate(self) -> int: + return self.tokenizer.token_to_id("<|translate|>") + + @cached_property + def sot(self) -> int: + return self.tokenizer.token_to_id("<|startoftranscript|>") + + @cached_property + def sot_lm(self) -> int: + return self.tokenizer.token_to_id("<|startoflm|>") + + @cached_property + def sot_prev(self) -> int: + return self.tokenizer.token_to_id("<|startofprev|>") + + @cached_property + def eot(self) -> int: + return self.tokenizer.token_to_id("<|endoftext|>") + + @cached_property + def no_timestamps(self) -> int: + return self.tokenizer.token_to_id("<|notimestamps|>") + + @property + def timestamp_begin(self) -> int: + return self.no_timestamps + 1 + + @property + def sot_sequence(self) -> List[int]: + sequence = [self.sot] + + if self.language is not None: + sequence.append(self.language) + + if self.task is not None: + sequence.append(self.task) + + return sequence + + def encode(self, text: str) -> List[int]: + return self.tokenizer.encode(text, add_special_tokens=False).ids + + def decode(self, tokens: List[int]) -> str: + text_tokens = [token for token in tokens if token < self.eot] + return self.tokenizer.decode(text_tokens) + + def decode_with_timestamps(self, tokens: List[int]) -> str: + outputs = [[]] + + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + 
outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + + return "".join( + [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + ) + + def split_to_word_tokens( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: + # These languages don't typically use spaces, so it is difficult to split words + # without morpheme analysis. Here, we instead split words at any + # position where the tokens are decoded as valid unicode points + return self.split_tokens_on_unicode(tokens) + + return self.split_tokens_on_spaces(tokens) + + def split_tokens_on_unicode( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + decoded_full = self.decode_with_timestamps(tokens) + replacement_char = "\ufffd" + + words = [] + word_tokens = [] + current_tokens = [] + unicode_offset = 0 + + for token in tokens: + current_tokens.append(token) + decoded = self.decode_with_timestamps(current_tokens) + + try: + replacement_char_index = decoded.index(replacement_char) + replacement_char_index += unicode_offset + except ValueError: + replacement_char_index = None + + if replacement_char_index is None or ( + replacement_char_index < len(decoded_full) + and decoded_full[replacement_char_index] == replacement_char + ): + words.append(decoded) + word_tokens.append(current_tokens) + current_tokens = [] + unicode_offset += len(decoded) + + return words, word_tokens + + def split_tokens_on_spaces( + self, tokens: List[int] + ) -> Tuple[List[str], List[List[int]]]: + subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens) + words = [] + word_tokens = [] + + for subword, subword_tokens in zip(subwords, subword_tokens_list): + special = subword_tokens[0] >= self.eot + with_space = subword.startswith(" ") + punctuation = subword.strip() in string.punctuation + if special or with_space or punctuation or len(words) == 0: + words.append(subword) + word_tokens.append(subword_tokens) + else: + words[-1] = words[-1] + subword + word_tokens[-1].extend(subword_tokens) + + return words, word_tokens + + +_TASKS = ( + "transcribe", + "translate", +) + +_LANGUAGE_CODES = ( + "af", + "am", + "ar", + "as", + "az", + "ba", + "be", + "bg", + "bn", + "bo", + "br", + "bs", + "ca", + "cs", + "cy", + "da", + "de", + "el", + "en", + "es", + "et", + "eu", + "fa", + "fi", + "fo", + "fr", + "gl", + "gu", + "ha", + "haw", + "he", + "hi", + "hr", + "ht", + "hu", + "hy", + "id", + "is", + "it", + "ja", + "jw", + "ka", + "kk", + "km", + "kn", + "ko", + "la", + "lb", + "ln", + "lo", + "lt", + "lv", + "mg", + "mi", + "mk", + "ml", + "mn", + "mr", + "ms", + "mt", + "my", + "ne", + "nl", + "nn", + "no", + "oc", + "pa", + "pl", + "ps", + "pt", + "ro", + "ru", + "sa", + "sd", + "si", + "sk", + "sl", + "sn", + "so", + "sq", + "sr", + "su", + "sv", + "sw", + "ta", + "te", + "tg", + "th", + "tk", + "tl", + "tr", + "tt", + "uk", + "ur", + "uz", + "vi", + "yi", + "yo", + "zh", + "yue", +) diff --git a/faster-whisper/faster_whisper/transcribe.py b/faster-whisper/faster_whisper/transcribe.py new file mode 100644 index 0000000..cdce118 --- /dev/null +++ b/faster-whisper/faster_whisper/transcribe.py @@ -0,0 +1,2343 @@ +import itertools +import json +import logging +import os +import random +import zlib + +from collections import Counter, defaultdict +from inspect import signature +from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union + +import ctranslate2 +import numpy as np +import tokenizers 
+import torch + +from pyannote.audio import Model +from transformers import Pipeline +from transformers.pipelines.pt_utils import PipelineIterator + +from faster_whisper.audio import decode_audio, pad_or_trim +from faster_whisper.feature_extractor import FeatureExtractor +from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer +from faster_whisper.utils import ( + download_model, + format_timestamp, + get_assets_path, + get_end, + get_logger, +) +from faster_whisper.vad import ( + SpeechTimestampsMap, + VadOptions, + VoiceActivitySegmentation, + collect_chunks, + get_speech_timestamps, + merge_chunks, +) + + +class Word(NamedTuple): + start: float + end: float + word: str + probability: float + + +class Segment(NamedTuple): + id: int + seek: int + start: float + end: float + text: str + tokens: List[int] + avg_logprob: float + compression_ratio: float + no_speech_prob: float + words: Optional[List[Word]] + temperature: Optional[float] = 1.0 + + +# Added additional parameters for multilingual videos and fixes below +class TranscriptionOptions(NamedTuple): + beam_size: int + best_of: int + patience: float + length_penalty: float + repetition_penalty: float + no_repeat_ngram_size: int + log_prob_threshold: Optional[float] + log_prob_low_threshold: Optional[float] + no_speech_threshold: Optional[float] + compression_ratio_threshold: Optional[float] + condition_on_previous_text: bool + prompt_reset_on_temperature: float + temperatures: List[float] + initial_prompt: Optional[Union[str, Iterable[int]]] + prefix: Optional[str] + suppress_blank: bool + suppress_tokens: Optional[List[int]] + without_timestamps: bool + max_initial_timestamp: float + word_timestamps: bool + prepend_punctuations: str + append_punctuations: str + multilingual: bool + output_language: Optional[str] + max_new_tokens: Optional[int] + clip_timestamps: Union[str, List[float]] + hallucination_silence_threshold: Optional[float] + hotwords: Optional[str] + + +class TranscriptionInfo(NamedTuple): + language: str + language_probability: float + duration: float + duration_after_vad: float + all_language_probs: Optional[List[Tuple[str, float]]] + transcription_options: TranscriptionOptions + vad_options: VadOptions + + +# The code below is copied from whisper-x (https://github.com/m-bain/whisperX) +# and adapted for faster_whisper + + +class BatchedInferencePipeline(Pipeline): + + """ + Huggingface Pipeline wrapper for WhisperModel. + Copyright (c) 2022, Max Bain + All rights reserved. 
+ Modified by Mobius Labs GmbH + """ + + def __init__( + self, + model, + use_vad_model: bool = True, + options: Optional[NamedTuple] = None, + tokenizer=None, + device: Union[int, str, "torch.device"] = -1, + chunk_size: int = 30, + vad_device: Union[int, str, "torch.device"] = "auto", + framework="pt", + language: Optional[str] = None, + **kwargs, + ): + self.model: WhisperModel = model + self.tokenizer = tokenizer + self.options = options + self.preset_language = language + self._batch_size = kwargs.pop("batch_size", None) + self._num_workers = 0 + self.use_vad_model = use_vad_model + self.vad_onset = 0.500 + self.vad_offset = 0.363 + self.vad_model_path = os.path.join(get_assets_path(), "pyannote_vad_model.bin") + + ( + self._preprocess_params, + self._forward_params, + self._postprocess_params, + ) = self._sanitize_parameters(**kwargs) + self.call_count = 0 + self.framework = framework + if self.framework == "pt": + self.device = self.get_device(device) + else: + self.device = device + + if self.use_vad_model: + self.vad_device = self.get_device(vad_device) + + # load vad model and perform VAD preprocessing if needed + self.vad_model = self.load_vad_model( + vad_onset=self.vad_onset, vad_offset=self.vad_offset + ) + self.chunk_size = chunk_size # VAD merging size + self.last_speech_timestamp = 0.0 + super(Pipeline, self).__init__() + + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "tokenizer" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} + + def get_device(self, device: Union[int, str, "torch.device"]): + """ + Converts the input device into a torch.device object. + + The input can be an integer, a string, or a `torch.device` object. + + The function handles a special case where the input device is "auto". + When "auto" is specified, the device will default to the + device of the model (self.model.device). If the model's device is also "auto", + it selects "cuda" if a CUDA-capable device is available; otherwise, it selects "cpu". 
+ """ + if isinstance(device, torch.device): + return device + elif isinstance(device, str): + if device == "auto" and self.model.device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + elif device == "auto": + device = self.model.device + return torch.device(device) + elif device < 0: + return torch.device("cpu") + else: + return torch.device(f"cuda:{device}") + + def preprocess(self, inputs): + audio = inputs["inputs"] + to_cpu = ( + self.model.model.device == "cuda" and len(self.model.model.device_index) > 1 + ) + features = self.model.feature_extractor(audio, padding=True, to_cpu=to_cpu)[ + :, : self.model.feature_extractor.nb_max_frames + ] + + inputs["features"] = features + return inputs + + def _forward(self, model_inputs, **forward_params): + encoder_output, outputs = self.model.generate_segment_batched( + model_inputs["features"], self.tokenizer, forward_params + ) + + segment_size = encoder_output.shape[1] * 2 + segmented_outputs = [] + for segment_metadata, output in zip(model_inputs["seg_metadata"], outputs): + subsegments, seek, single_timestamp_ending = ( + self.model._split_segments_by_timestamps( + tokenizer=self.tokenizer, + tokens=output["tokens"], + time_offset=segment_metadata["start_time"], + segment_size=segment_size, + segment_duration=segment_metadata["end_time"] + - segment_metadata["start_time"], + seek=0, + ) + ) + segmented_outputs.append( + [ + dict( + text=self.tokenizer.decode(subsegment["tokens"]), + avg_logprob=output["avg_logprob"], + no_speech_prob=output["no_speech_prob"], + tokens=subsegment["tokens"], + start=subsegment["start"], + end=subsegment["end"], + compression_ratio=get_compression_ratio( + self.tokenizer.decode(subsegment["tokens"]) + ), + ) + for subsegment in subsegments + ] + ) + if forward_params["word_timestamps"]: + self.last_speech_timestamp = self.model.add_word_timestamps( + segmented_outputs, + self.tokenizer, + encoder_output, + segment_size, + forward_params["prepend_punctuations"], + forward_params["append_punctuations"], + self.last_speech_timestamp, + ) + + return {"output": segmented_outputs} + + def __call__(self, inputs, options, batch_size=None, **kwargs): + + if batch_size is None: + if self._batch_size is None: + batch_size = 1 + else: + batch_size = self._batch_size + + ( + preprocess_params, + forward_params, + postprocess_params, + ) = self._sanitize_parameters(**kwargs) + + # Fuse __init__ params and __call__ params without modifying the __init__ ones. + preprocess_params = { + **self._preprocess_params, + **preprocess_params, + } + options_dict = options._asdict() + forward_params = {**self._forward_params, **forward_params, **options_dict} + postprocess_params = {**self._postprocess_params, **postprocess_params} + + self.call_count += 1 + if ( + self.call_count > 10 + and self.framework == "pt" + and self.device.type == "cuda" + ): + logging.warning( + "You seem to be using the pipelines sequentially on GPU. 
Please use a Dataset" + ) + + return self.get_iterator( + inputs, + batch_size, + preprocess_params, + forward_params, + postprocess_params, + ) + + def postprocess(self, model_outputs): + return model_outputs + + def get_iterator( + self, + inputs, + batch_size: int, + preprocess_params=None, + forward_params=None, + postprocess_params=None, + ): + + def stack(items): + return { + "inputs": [x["inputs"] for x in items], + "seg_metadata": [x["seg_metadata"] for x in items], + "features": torch.stack([x["features"] for x in items]), + } + + if "TOKENIZERS_PARALLELISM" not in os.environ: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + # TODO hack by collating feature_extractor and image_processor + dataset = PipelineIterator(inputs, self.preprocess, preprocess_params) + dataloader = torch.utils.data.DataLoader( + dataset, num_workers=self._num_workers, batch_size=batch_size, collate_fn=stack + ) + model_iterator = PipelineIterator( + dataloader, self.forward, forward_params, loader_batch_size=batch_size + ) + final_iterator = PipelineIterator( + model_iterator, self.postprocess, postprocess_params + ) + return final_iterator + + def get_language_and_tokenizer( + self, audio, task: Optional[str] = None, language: Optional[str] = None + ): + all_language_probs = None + language_probability = 1.0 + if self.tokenizer is None: + if not language: + language, language_probability, all_language_probs = self.detect_language(audio) + task = task or "transcribe" + self.tokenizer = Tokenizer( + self.model.hf_tokenizer, + self.model.model.is_multilingual, + task=task, + language=language, + ) + else: + if task is not None: + self.tokenizer.task = self.tokenizer.token_to_id(f"<|{task}|>") + + if language is not None: + self.tokenizer.language = self.tokenizer.token_to_id(f"<|{language}|>") + self.tokenizer.language_code = language + + return language, language_probability, task, all_language_probs + + def audio_split(self, audio, segments, sampling_rate): + "Returns splitted audio chunks as iterator" + + for seg in segments: + f1 = int(seg["start"] * sampling_rate) + f2 = int(seg["end"] * sampling_rate) + seg_metadata = { + "start_time": seg["start"], + "end_time": seg["end"], + "stitched_seg": seg["segments"], + } + yield {"inputs": audio[f1:f2], "seg_metadata": seg_metadata} + + def load_vad_model(self, vad_onset=0.500, vad_offset=0.363): + vad_model = Model.from_pretrained(self.vad_model_path) + hyperparameters = { + "onset": vad_onset, + "offset": vad_offset, + "min_duration_on": 0.1, + "min_duration_off": 0.1, + } + + vad_pipeline = VoiceActivitySegmentation( + segmentation=vad_model, device=torch.device(self.vad_device) + ) + vad_pipeline.instantiate(hyperparameters) + return vad_pipeline + + def transcribe( + self, + audio: Union[str, torch.Tensor, np.ndarray], + vad_segments: Optional[List[dict]] = None, + batch_size: int = 16, + language: Optional[str] = None, + task: str = None, + log_progress: bool = False, + beam_size: int = 5, + best_of: int = 5, + patience: float = 1, + length_penalty: float = 1, + repetition_penalty: float = 1, + no_repeat_ngram_size: int = 0, + temperature: Union[float, List[float], Tuple[float, ...]] = [ + 0.0, + 0.2, + 0.4, + 0.6, + 0.8, + 1.0, + ], + compression_ratio_threshold: Optional[float] = 2.4, + log_prob_threshold: Optional[float] = -1.0, + log_prob_low_threshold: Optional[float] = None, + no_speech_threshold: Optional[float] = 0.6, + initial_prompt: Optional[Union[str, Iterable[int]]] = None, + prefix: Optional[str] = None, + suppress_blank: bool = True, + 
suppress_tokens: Optional[List[int]] = [-1], + prepend_punctuations: str = "\"'“¿([{-", + append_punctuations: str = "\"'.。,,!!??::”)]}、", + max_new_tokens: Optional[int] = None, + hotwords: Optional[str] = None, + word_timestamps: bool = False, + without_timestamps: bool = True, + ) -> Tuple[Iterable[Segment], TranscriptionInfo]: + """transcribe audio in chunks in batched fashion and return with language info. + + Arguments: + audio: audio file as numpy array/path for batched transcription. + vad_segments: Optionally provide list of dictionaries each containing "start", "end", + and "segments" keys. + "start" and "end" keys specify the start and end of the voiced region within + 30 sec boundary. An additional key "segments" contains all the start + and end of voiced regions within that 30sec boundary as a list of tuples. + If no vad_segments specified, it uses internal vad model automatically segment them. + batch_size: the maximum number of parallel requests to model for decoding. + language: The language spoken in the audio. + task: either "transcribe" or "translate". + log_progress: whether to show progress bar or not. + beam_size: Beam size to use for decoding. + best_of: Number of candidates when sampling with non-zero temperature. + patience: Beam search patience factor. + length_penalty: Exponential length penalty constant. + repetition_penalty: Penalty applied to the score of previously generated tokens + (set > 1 to penalize). + no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable). + temperature: Temperature for sampling. It can be a tuple of temperatures, + which will be successively used upon failures according to either + `compression_ratio_threshold` or `log_prob_threshold`. + compression_ratio_threshold: If the gzip compression ratio is above this value, + treat as failed. + log_prob_threshold: If the average log probability over sampled tokens is + below this value, treat as failed. + log_prob_low_threshold: This parameter alone is sufficient to skip an output text, + whereas log_prob_threshold also looks for appropriate no_speech_threshold value. + This value should be less than log_prob_threshold. + no_speech_threshold: If the no_speech probability is higher than this value AND + the average log probability over sampled tokens is below `log_prob_threshold`, + consider the segment as silent. + initial_prompt: Optional text string or iterable of token ids to provide as a + prompt for the first window. + prefix: Optional text to provide as a prefix for the first window. + suppress_blank: Suppress blank outputs at the beginning of the sampling. + suppress_tokens: List of token IDs to suppress. -1 will suppress a default set + of symbols as defined in the model config.json file. + prepend_punctuations: If word_timestamps is True, merge these punctuation symbols + with the next word + append_punctuations: If word_timestamps is True, merge these punctuation symbols + with the previous word + max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set, + the maximum will be set by the default max_length. + hotwords: + Hotwords/hint phrases to the model. Has no effect if prefix is not None. + word_timestamps: Extract word-level timestamps using the cross-attention pattern + and dynamic time warping, and include the timestamps for each word in each segment. + Set as False. + without_timestamps: Only sample text tokens. 
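+
+            An illustrative `vad_segments` value (format only, made-up timestamps):
+                [{"start": 0.0, "end": 30.0, "segments": [(0.0, 12.3), (14.0, 29.5)]}]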
+ + Static params: (Fixed for batched version) + max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0. + multilingual: If True, perform transcription on multilingual videos. Set as False. + output_language: Valid only if multilingual is set to True. + Specifies the string representing the output language. One of + 'en' (English) or 'hybrid' (code-switched transcription). set as None. + condition_on_previous_text: If True, the previous output of the model is provided + as a prompt for the next window; disabling may make the text inconsistent across + windows, but the model becomes less prone to getting stuck in a failure loop, + such as repetition looping or timestamps going out of sync. Set as False + prompt_reset_on_temperature: Resets prompt if temperature is above this value. + Arg has effect only if condition_on_previous_text is True. Set at 0.5 + #TODO: support "hallucination_silence_threshold" when "word_timestamps=True" + hallucination_silence_threshold: Optional[float] + When word_timestamps is True, skip silent periods longer than this threshold + (in seconds) when a possible hallucination is detected. set as None. + clip_timestamps: + Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to + process. The last end timestamp defaults to the end of the file. Set as "0". + + unused: + language_detection_threshold: If the maximum probability of the language tokens is + higher than this value, the language is detected. + language_detection_segments: Number of segments to consider for the language detection. + vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio + without speech. This step is using the Silero VAD model + https://github.com/snakers4/silero-vad. + vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available + parameters and default values in the class `VadOptions`). + chunk_length: The length of audio segments. If it is not None, it will overwrite the + default chunk_length of the FeatureExtractor. + + + Returns: + A tuple with: + + - a generator over transcribed batched segments. + - an instance of TranscriptionInfo. + """ + + sampling_rate = self.model.feature_extractor.sampling_rate + + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + elif not isinstance(audio, torch.Tensor): + audio = decode_audio(audio, sampling_rate=sampling_rate) + duration = audio.shape[0] / sampling_rate + + # if no segment split is provided, use vad_model and generate segments + if not vad_segments: + # run the audio if it is less than 30 sec even without vad_segments + if self.use_vad_model: + vad_segments = self.vad_model( + { + "waveform": audio.unsqueeze(0), + "sample_rate": 16000, + } + ) + vad_segments = merge_chunks( + vad_segments, + self.chunk_size, + onset=self.vad_onset, + offset=self.vad_offset, + ) + elif duration < self.chunk_size: + vad_segments = [ + {"start": 0.0, "end": duration, "segments": [(0.0, duration)]} + ] + else: + raise RuntimeError( + "No vad segments found. 
Set 'use_vad_model' to True while loading the model" + ) + + language, language_probability, task, all_language_probs = self.get_language_and_tokenizer( + audio, task, language + ) + batch_size = batch_size or self._batch_size + + duration_after_vad = sum(segment['end'] - segment['start'] for segment in vad_segments) + + # batched options: see the difference with default options in WhisperModel + batched_options = TranscriptionOptions( + beam_size=beam_size, + best_of=best_of, + patience=patience, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + log_prob_threshold=log_prob_threshold, + log_prob_low_threshold=log_prob_low_threshold, + no_speech_threshold=no_speech_threshold, + compression_ratio_threshold=compression_ratio_threshold, + temperatures=( + temperature if isinstance(temperature, (list, tuple)) else [temperature] + ), + initial_prompt=initial_prompt, + prefix=prefix, + suppress_blank=suppress_blank, + suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens), + prepend_punctuations=prepend_punctuations, + append_punctuations=append_punctuations, + max_new_tokens=max_new_tokens, + hotwords=hotwords, + word_timestamps=word_timestamps, + hallucination_silence_threshold=None, + condition_on_previous_text=False, + clip_timestamps="0", + prompt_reset_on_temperature=0.5, + multilingual=False, + output_language=None, + without_timestamps=without_timestamps, + max_initial_timestamp=0.0, + ) + + info = TranscriptionInfo( + language=language, + language_probability=language_probability, + duration=duration, + duration_after_vad=duration_after_vad, + transcription_options=batched_options, + vad_options=None, + all_language_probs=all_language_probs, + ) + + segments = self._batched_segments_generator( + audio, + vad_segments, + sampling_rate, + batch_size, + batched_options, + log_progress, + ) + + return segments, info + + def _batched_segments_generator( + self, audio, vad_segments, sampling_rate, batch_size, options, log_progress + ): + seg_idx = 0 + total_segments = len(vad_segments) + for idx, out in enumerate( + self.__call__( + self.audio_split(audio, vad_segments, sampling_rate), + batch_size=batch_size, + options=options, + ) + ): + if log_progress: + percent_complete = ((idx + 1) / total_segments) * 100 + self.model.logger.info(f"Progress: {percent_complete:.2f}%...") + + responses = out["output"] + if batch_size == 1: + responses = responses[0] + + for response in responses: + seg_idx += 1 + segments = Segment( + seek=int(responses[-1]["end"] * self.model.frames_per_second), + id=seg_idx, + text=response["text"], + start=round(response["start"], 3), + end=round(response["end"], 3), + words=(None if not options.word_timestamps else response["words"]), + tokens=response["tokens"], + avg_logprob=response["avg_logprob"], + no_speech_prob=response["no_speech_prob"], + compression_ratio=response["compression_ratio"], + ) + yield segments + + # revert the tokenizer if multilingual inference is enabled + if self.preset_language is None: + self.tokenizer = None + self.last_speech_timestamp = 0.0 + + def detect_language(self, audio: torch.Tensor): + to_cpu = ( + self.model.model.device == "cuda" and len(self.model.model.device_index) > 1 + ) + segment = self.model.feature_extractor(audio, padding=True, to_cpu=to_cpu)[ + :, : self.model.feature_extractor.nb_max_frames + ] + encoder_output = self.model.encode(segment) + results = self.model.model.detect_language(encoder_output) + language_token, 
language_probability = results[0][0] + language = language_token[2:-2] + self.model.logger.info( + f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio..." + ) + all_language_probs = [(token[2:-2], prob) for (token, prob) in results[0]] + return language, language_probability, all_language_probs + + def detect_language_multi_segment( + self, audio: Union[str, BinaryIO, torch.Tensor], params: Optional[dict] = None + ): + return self.model.detect_language_multi_segment(audio, params) + + +class WhisperModel: + def __init__( + self, + model_size_or_path: str, + device: str = "auto", + device_index: Union[int, List[int]] = 0, + compute_type: str = "default", + cpu_threads: int = 16, + num_workers: int = 1, + download_root: Optional[str] = None, + local_files_only: bool = False, + files: dict = None, + **model_kwargs, + ): + """Initializes the Whisper model. + + Args: + model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en, + small, small.en, distil-small.en, medium, medium.en, distil-medium.en, large-v1, + large-v2, large-v3, large, distil-large-v2 or distil-large-v3), a path to a + converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub. + When a size or a model ID is configured, the converted model is downloaded + from the Hugging Face Hub. + device: Device to use for computation ("cpu", "cuda", "auto"). + device_index: Device ID to use. + The model can also be loaded on multiple GPUs by passing a list of IDs + (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel + when transcribe() is called from multiple Python threads (see also num_workers). + compute_type: Type to use for computation. + See https://opennmt.net/CTranslate2/quantization.html. + cpu_threads: Number of threads to use when running on CPU (4 by default). + A non zero value overrides the OMP_NUM_THREADS environment variable. + num_workers: When transcribe() is called from multiple Python threads, + having multiple workers enables true parallelism when running the model + (concurrent calls to self.model.generate() will run in parallel). + This can improve the global throughput at the cost of increased memory usage. + download_root: Directory where the models should be saved. If not set, the models + are saved in the standard Hugging Face cache directory. + local_files_only: If True, avoid downloading the file and return the path to the + local cached file if it exists. + files: Load model files from the memory. This argument is a dictionary mapping file names + to file contents as file-like or bytes objects. If this is set, model_path acts as an + identifier for this model. 
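+
+        Example (illustrative):
+            model = WhisperModel("large-v3", device="cuda", compute_type="float16")
+            segments, info = model.transcribe("audio.wav", beam_size=5)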
+ """ + self.logger = get_logger() + + tokenizer_bytes, preprocessor_bytes = None, None + if files: + model_path = model_size_or_path + tokenizer_bytes = files.pop("tokenizer.json", None) + preprocessor_bytes = files.pop("preprocessor_config.json", None) + elif os.path.isdir(model_size_or_path): + model_path = model_size_or_path + else: + model_path = download_model( + model_size_or_path, + local_files_only=local_files_only, + cache_dir=download_root, + ) + self.device = device + # set the random seed to make sure consistency across runs + ctranslate2.set_random_seed(42) + self.model = ctranslate2.models.Whisper( + model_path, + device=self.device, + device_index=device_index, + compute_type=compute_type, + intra_threads=cpu_threads, + inter_threads=num_workers, + files=files, + **model_kwargs, + ) + + tokenizer_file = os.path.join(model_path, "tokenizer.json") + if tokenizer_bytes: + self.hf_tokenizer = tokenizers.Tokenizer.from_buffer(tokenizer_bytes) + elif os.path.isfile(tokenizer_file): + self.hf_tokenizer = tokenizers.Tokenizer.from_file(tokenizer_file) + else: + self.hf_tokenizer = tokenizers.Tokenizer.from_pretrained( + "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en") + ) + self.feat_kwargs = self._get_feature_kwargs(model_path, preprocessor_bytes) + self.feature_extractor = FeatureExtractor( + **self.feat_kwargs, device=self.device + ) + self.input_stride = 2 + self.num_samples_per_token = ( + self.feature_extractor.hop_length * self.input_stride + ) + self.frames_per_second = ( + self.feature_extractor.sampling_rate // self.feature_extractor.hop_length + ) + self.tokens_per_second = ( + self.feature_extractor.sampling_rate // self.num_samples_per_token + ) + self.time_precision = 0.02 + self.max_length = 448 + + @property + def supported_languages(self) -> List[str]: + """The languages supported by the model.""" + return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"] + + def _get_feature_kwargs(self, model_path, preprocessor_bytes=None) -> dict: + config = {} + try: + config_path = os.path.join(model_path, "preprocessor_config.json") + if preprocessor_bytes: + config = json.loads(preprocessor_bytes) + elif os.path.isfile(config_path): + with open(config_path, "r", encoding="utf-8") as file: + config = json.load(file) + else: + return config + valid_keys = signature(FeatureExtractor.__init__).parameters.keys() + return {k: v for k, v in config.items() if k in valid_keys} + except json.JSONDecodeError as e: + self.logger.warning("Could not load preprocessor config: %s", e) + + return config + + def transcribe( + self, + audio: Union[str, BinaryIO, torch.Tensor, np.ndarray], + language: Optional[str] = None, + task: str = "transcribe", + beam_size: int = 5, + best_of: int = 5, + patience: float = 1, + length_penalty: float = 1, + repetition_penalty: float = 1, + no_repeat_ngram_size: int = 0, + temperature: Union[float, List[float], Tuple[float, ...]] = [ + 0.0, + 0.2, + 0.4, + 0.6, + 0.8, + 1.0, + ], + compression_ratio_threshold: Optional[float] = 2.4, + log_prob_threshold: Optional[float] = -1.0, + log_prob_low_threshold: Optional[float] = None, + no_speech_threshold: Optional[float] = 0.6, + condition_on_previous_text: bool = True, + prompt_reset_on_temperature: float = 0.5, + initial_prompt: Optional[Union[str, Iterable[int]]] = None, + prefix: Optional[str] = None, + suppress_blank: bool = True, + suppress_tokens: Optional[List[int]] = [-1], + without_timestamps: bool = False, + max_initial_timestamp: float = 1.0, + word_timestamps: bool = 
False, + prepend_punctuations: str = "\"'“¿([{-", + append_punctuations: str = "\"'.。,,!!??::”)]}、", + multilingual: bool = False, + output_language: Optional[str] = None, + vad_filter: bool = False, + vad_parameters: Optional[Union[dict, VadOptions]] = None, + max_new_tokens: Optional[int] = None, + chunk_length: Optional[int] = None, + clip_timestamps: Union[str, List[float]] = "0", + hallucination_silence_threshold: Optional[float] = None, + hotwords: Optional[str] = None, + language_detection_threshold: Optional[float] = None, + language_detection_segments: int = 1, + ) -> Tuple[Iterable[Segment], TranscriptionInfo]: + """Transcribes an input file. + + Arguments: + audio: Path to the input file (or a file-like object), or the audio waveform. + language: The language spoken in the audio. It should be a language code such + as "en" or "fr". If not set, the language will be detected in the first 30 seconds + of audio. + task: Task to execute (transcribe or translate). + beam_size: Beam size to use for decoding. + best_of: Number of candidates when sampling with non-zero temperature. + patience: Beam search patience factor. + length_penalty: Exponential length penalty constant. + repetition_penalty: Penalty applied to the score of previously generated tokens + (set > 1 to penalize). + no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable). + temperature: Temperature for sampling. It can be a tuple of temperatures, + which will be successively used upon failures according to either + `compression_ratio_threshold` or `log_prob_threshold`. + compression_ratio_threshold: If the gzip compression ratio is above this value, + treat as failed. + log_prob_threshold: If the average log probability over sampled tokens is + below this value, treat as failed. + log_prob_low_threshold: This parameter alone is sufficient to skip an output text, + wheras log_prob_threshold also looks for appropriate no_speech_threshold value. + This value should be less than log_prob_threshold. + no_speech_threshold: If the no_speech probability is higher than this value AND + the average log probability over sampled tokens is below `log_prob_threshold`, + consider the segment as silent. + condition_on_previous_text: If True, the previous output of the model is provided + as a prompt for the next window; disabling may make the text inconsistent across + windows, but the model becomes less prone to getting stuck in a failure loop, + such as repetition looping or timestamps going out of sync. + prompt_reset_on_temperature: Resets prompt if temperature is above this value. + Arg has effect only if condition_on_previous_text is True. + initial_prompt: Optional text string or iterable of token ids to provide as a + prompt for the first window. + prefix: Optional text to provide as a prefix for the first window. + suppress_blank: Suppress blank outputs at the beginning of the sampling. + suppress_tokens: List of token IDs to suppress. -1 will suppress a default set + of symbols as defined in the model config.json file. + without_timestamps: Only sample text tokens. + max_initial_timestamp: The initial timestamp cannot be later than this. + word_timestamps: Extract word-level timestamps using the cross-attention pattern + and dynamic time warping, and include the timestamps for each word in each segment. 
+            prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
+                with the next word.
+            append_punctuations: If word_timestamps is True, merge these punctuation symbols
+                with the previous word.
+            multilingual: If True, perform transcription on multilingual videos and return the
+                transcript based on the 'output_language' flag.
+            output_language: Valid only if multilingual is set to True.
+                Specifies the string representing the output language. One of
+                'en' (English) or 'hybrid' (code-switched transcription).
+            vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
+                without speech. This step uses the Silero VAD model
+                https://github.com/snakers4/silero-vad.
+            vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
+                parameters and default values in the class `VadOptions`).
+            max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
+                the maximum will be set by the default max_length.
+            chunk_length: The length of audio segments. If it is not None, it will overwrite the
+                default chunk_length of the FeatureExtractor.
+            clip_timestamps:
+                Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
+                process. The last end timestamp defaults to the end of the file.
+                vad_filter will be ignored if clip_timestamps is used.
+            hallucination_silence_threshold:
+                When word_timestamps is True, skip silent periods longer than this threshold
+                (in seconds) when a possible hallucination is detected.
+            hotwords:
+                Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
+            language_detection_threshold: If the maximum probability of the language tokens is higher
+                than this value, the language is detected.
+            language_detection_segments: Number of segments to consider for the language detection.
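+
+        Example (illustrative):
+            segments, info = model.transcribe("audio.wav", vad_filter=True)
+            for segment in segments:
+                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+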
+ Returns: + A tuple with: + + - a generator over transcribed segments + - an instance of TranscriptionInfo + """ + + sampling_rate = self.feature_extractor.sampling_rate + + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + elif not isinstance(audio, torch.Tensor): + audio = decode_audio(audio, sampling_rate=sampling_rate) + + duration = audio.shape[0] / sampling_rate + duration_after_vad = duration + + self.logger.info( + "Processing audio with duration %s", format_timestamp(duration) + ) + + if vad_filter and clip_timestamps == "0": + if vad_parameters is None: + vad_parameters = VadOptions() + elif isinstance(vad_parameters, dict): + vad_parameters = VadOptions(**vad_parameters) + speech_chunks = get_speech_timestamps(audio, vad_parameters) + audio = collect_chunks(audio, speech_chunks) + duration_after_vad = audio.shape[0] / sampling_rate + + self.logger.info( + "VAD filter removed %s of audio", + format_timestamp(duration - duration_after_vad), + ) + + if self.logger.isEnabledFor(logging.DEBUG): + self.logger.debug( + "VAD filter kept the following audio segments: %s", + ", ".join( + "[%s -> %s]" + % ( + format_timestamp(chunk["start"] / sampling_rate), + format_timestamp(chunk["end"] / sampling_rate), + ) + for chunk in speech_chunks + ), + ) + + else: + speech_chunks = None + + to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1 + features = self.feature_extractor( + audio, chunk_length=chunk_length, to_cpu=to_cpu + ) + + encoder_output = None + all_language_probs = None + + # setting output_language for multilingual videos + if multilingual: + if output_language is None: + output_language = "en" + elif output_language not in ["en", "hybrid"]: + raise ValueError("Output language needs to be one of 'en'/'hybrid'.") + + # detecting the language if not provided + if language is None: + if not self.model.is_multilingual: + language = "en" + language_probability = 1 + else: + if ( + language_detection_segments is None + or language_detection_segments < 1 + ): + language_detection_segments = 1 + start_timestamp = ( + float(clip_timestamps.split(",")[0]) + if isinstance(clip_timestamps, str) + else clip_timestamps[0] + ) + content_frames = ( + features.shape[-1] - self.feature_extractor.nb_max_frames + ) + seek = ( + int(start_timestamp * self.frames_per_second) + if start_timestamp * self.frames_per_second < content_frames + else 0 + ) + end_frames = min( + seek + + self.feature_extractor.nb_max_frames + * language_detection_segments, + content_frames, + ) + detected_language_info = {} + while seek < end_frames: + segment = features[ + :, seek : seek + self.feature_extractor.nb_max_frames + ] + encoder_output = self.encode(segment) + # results is a list of tuple[str, float] with language names and + # probabilities. + results = self.model.detect_language(encoder_output)[0] + # Parse language names to strip out markers + all_language_probs = [ + (token[2:-2], prob) for (token, prob) in results + ] + # Get top language token and probability + language, language_probability = all_language_probs[0] + if ( + language_detection_threshold is None + or language_probability > language_detection_threshold + ): + break + detected_language_info.setdefault(language, []).append( + language_probability + ) + seek += segment.shape[-1] + else: + # If no language detected for all segments, the majority vote of the highest + # projected languages for all segments is used to determine the language. 
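+                    # Illustrative example:
+                    #   detected_language_info == {"en": [0.42, 0.45], "fr": [0.48]}
+                    #   -> "en" wins (detected in more segments); 0.45 is reported.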
+ language = max( + detected_language_info, + key=lambda lang: len(detected_language_info[lang]), + ) + language_probability = max(detected_language_info[language]) + + self.logger.info( + "Detected language '%s' with probability %.2f", + language, + language_probability, + ) + else: + if not self.model.is_multilingual and language != "en": + self.logger.warning( + "The current model is English-only but the language parameter is set to '%s'; " + "using 'en' instead." % language + ) + language = "en" + + language_probability = 1 + + tokenizer = Tokenizer( + self.hf_tokenizer, + self.model.is_multilingual, + task=task, + language=language, + ) + + options = TranscriptionOptions( + beam_size=beam_size, + best_of=best_of, + patience=patience, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + log_prob_threshold=log_prob_threshold, + log_prob_low_threshold=log_prob_low_threshold, + no_speech_threshold=no_speech_threshold, + compression_ratio_threshold=compression_ratio_threshold, + condition_on_previous_text=condition_on_previous_text, + prompt_reset_on_temperature=prompt_reset_on_temperature, + temperatures=( + temperature if isinstance(temperature, (list, tuple)) else [temperature] + ), + initial_prompt=initial_prompt, + prefix=prefix, + suppress_blank=suppress_blank, + suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens), + without_timestamps=without_timestamps, + max_initial_timestamp=max_initial_timestamp, + word_timestamps=word_timestamps, + prepend_punctuations=prepend_punctuations, + append_punctuations=append_punctuations, + multilingual=multilingual, + output_language=output_language, + max_new_tokens=max_new_tokens, + clip_timestamps=clip_timestamps, + hallucination_silence_threshold=hallucination_silence_threshold, + hotwords=hotwords, + ) + + segments = self.generate_segments(features, tokenizer, options, encoder_output) + + if speech_chunks: + segments = restore_speech_timestamps(segments, speech_chunks, sampling_rate) + + info = TranscriptionInfo( + language=language, + language_probability=language_probability, + duration=duration, + duration_after_vad=duration_after_vad, + transcription_options=options, + vad_options=vad_parameters, + all_language_probs=all_language_probs, + ) + + return segments, info + + def _split_segments_by_timestamps( + self, + tokenizer: Tokenizer, + tokens: List[int], + time_offset: float, + segment_size: int, + segment_duration: float, + seek: int, + ) -> List[List[int]]: + current_segments = [] + single_timestamp_ending = ( + len(tokens) >= 2 and tokens[-2] < tokenizer.timestamp_begin <= tokens[-1] + ) + + consecutive_timestamps = [ + i + for i in range(len(tokens)) + if i > 0 + and tokens[i] >= tokenizer.timestamp_begin + and tokens[i - 1] >= tokenizer.timestamp_begin + ] + + if len(consecutive_timestamps) > 0: + slices = list(consecutive_timestamps) + if single_timestamp_ending: + slices.append(len(tokens)) + + last_slice = 0 + for current_slice in slices: + sliced_tokens = tokens[last_slice:current_slice] + start_timestamp_position = sliced_tokens[0] - tokenizer.timestamp_begin + end_timestamp_position = sliced_tokens[-1] - tokenizer.timestamp_begin + start_time = ( + time_offset + start_timestamp_position * self.time_precision + ) + end_time = time_offset + end_timestamp_position * self.time_precision + + current_segments.append( + dict( + seek=seek, + start=start_time, + end=end_time, + tokens=sliced_tokens, + ) + ) + last_slice = current_slice + + if 
single_timestamp_ending: + # single timestamp at the end means no speech after the last timestamp. + seek += segment_size + else: + # otherwise, ignore the unfinished segment and seek to the last timestamp + last_timestamp_position = ( + tokens[last_slice - 1] - tokenizer.timestamp_begin + ) + seek += last_timestamp_position * self.input_stride + + else: + duration = segment_duration + timestamps = [ + token for token in tokens if token >= tokenizer.timestamp_begin + ] + if len(timestamps) > 0 and timestamps[-1] != tokenizer.timestamp_begin: + last_timestamp_position = timestamps[-1] - tokenizer.timestamp_begin + duration = last_timestamp_position * self.time_precision + + current_segments.append( + dict( + seek=seek, + start=time_offset, + end=time_offset + duration, + tokens=tokens, + ) + ) + + seek += segment_size + + return current_segments, seek, single_timestamp_ending + + def generate_segments( + self, + features: torch.Tensor, + tokenizer: Tokenizer, + options: TranscriptionOptions, + encoder_output: Optional[ctranslate2.StorageView] = None, + ) -> Iterable[Segment]: + content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames + content_duration = float(content_frames * self.feature_extractor.time_per_frame) + + if isinstance(options.clip_timestamps, str): + options = options._replace( + clip_timestamps=[ + float(ts) + for ts in ( + options.clip_timestamps.split(",") + if options.clip_timestamps + else [] + ) + ] + ) + seek_points: List[int] = [ + round(ts * self.frames_per_second) for ts in options.clip_timestamps + ] + if len(seek_points) == 0: + seek_points.append(0) + if len(seek_points) % 2 == 1: + seek_points.append(content_frames) + seek_clips: List[Tuple[int, int]] = list( + zip(seek_points[::2], seek_points[1::2]) + ) + + punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、" + + idx = 0 + clip_idx = 0 + seek = seek_clips[clip_idx][0] + all_tokens = [] + all_prompt_text = [] + prompt_reset_since = 0 + + if options.initial_prompt is not None: + if isinstance(options.initial_prompt, str): + initial_prompt = " " + options.initial_prompt.strip() + initial_prompt_tokens = tokenizer.encode(initial_prompt) + all_tokens.extend(initial_prompt_tokens) + else: + all_tokens.extend(options.initial_prompt) + + last_speech_timestamp = 0.0 + # NOTE: This loop is obscurely flattened to make the diff readable. + # A later commit should turn this into a simpler nested loop. 
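+        # (Illustrative: with clip_timestamps="10,20" and frames_per_second=100,
+        # seek_clips == [(1000, 2000)] and the loop below handles that single clip.)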
+ # for seek_clip_start, seek_clip_end in seek_clips: + # while seek < seek_clip_end + while clip_idx < len(seek_clips): + seek_clip_start, seek_clip_end = seek_clips[clip_idx] + if seek_clip_end > content_frames: + seek_clip_end = content_frames + if seek < seek_clip_start: + seek = seek_clip_start + if seek >= seek_clip_end: + clip_idx += 1 + if clip_idx < len(seek_clips): + seek = seek_clips[clip_idx][0] + continue + time_offset = seek * self.feature_extractor.time_per_frame + window_end_time = float( + (seek + self.feature_extractor.nb_max_frames) + * self.feature_extractor.time_per_frame + ) + segment_size = min( + self.feature_extractor.nb_max_frames, + content_frames - seek, + seek_clip_end - seek, + ) + segment = features[:, seek : seek + segment_size] + segment_duration = segment_size * self.feature_extractor.time_per_frame + segment = pad_or_trim(segment, self.feature_extractor.nb_max_frames) + + if self.logger.isEnabledFor(logging.DEBUG): + self.logger.debug( + "Processing segment at %s", format_timestamp(time_offset) + ) + + previous_tokens = all_tokens[prompt_reset_since:] + + if encoder_output is None: + encoder_output = self.encode(segment) + + # Perform language detection at every segment to update task based on output language, + # if the language is english, task is transcribe, + # else the task is translate to english (default) + # or transcribe if 'output_language' is 'hybrid'. + if options.multilingual: + results = self.model.detect_language(encoder_output) + language_token, language_probability = results[0][0] + language = language_token[2:-2] + if options.output_language == "en" and language != "en": + task = "translate" + else: + task = "transcribe" + + # Update tokenizer based on task and language + tokenizer.task = tokenizer.token_to_id(f"<|{task}|>") + tokenizer.language = tokenizer.token_to_id(language_token) + tokenizer.language_code = language + # Update prompt based on task and language + prompt = self.get_prompt( + tokenizer, + previous_tokens, + without_timestamps=options.without_timestamps, + prefix=options.prefix if seek == 0 else None, + hotwords=options.hotwords, + ) + + if seek > 0 or encoder_output is None: + encoder_output = self.encode(segment) + + ( + result, + avg_logprob, + temperature, + compression_ratio, + ) = self.generate_with_fallback(encoder_output, prompt, tokenizer, options) + + if options.no_speech_threshold is not None: + # no voice activity check + should_skip = result.no_speech_prob > options.no_speech_threshold + + if ( + options.log_prob_threshold is not None + and avg_logprob > options.log_prob_threshold + ): + # don't skip if the logprob is high enough, despite the no_speech_prob + should_skip = False + + if should_skip: + self.logger.debug( + "No speech threshold is met (%f > %f)", + result.no_speech_prob, + options.no_speech_threshold, + ) + + # Skip if the logprob is very low (below the threshold value), + # despite no_speech_prob being low (ex: Too ambiguous outputs) + if options.log_prob_low_threshold: + if avg_logprob < options.log_prob_low_threshold: + should_skip = True + self.logger.debug( + "log prob low threshold is met (%f > %f)", + avg_logprob, + options.log_prob_low_threshold, + ) + + if should_skip: + # fast-forward to the next segment boundary + seek += segment_size + continue + + tokens = result.sequences_ids[0] + + previous_seek = seek + + # anomalous words are very long/short/improbable + def word_anomaly_score(word: dict) -> float: + probability = word.get("probability", 0.0) + duration = word["end"] - 
word["start"] + score = 0.0 + if probability < 0.15: + score += 1.0 + if duration < 0.133: + score += (0.133 - duration) * 15 + if duration > 2.0: + score += duration - 2.0 + return score + + def is_segment_anomaly(segment: Optional[dict]) -> bool: + if segment is None or not segment["words"]: + return False + words = [w for w in segment["words"] if w["word"] not in punctuation] + words = words[:8] + score = sum(word_anomaly_score(w) for w in words) + return score >= 3 or score + 0.01 >= len(words) + + def next_words_segment(segments: List[dict]) -> Optional[dict]: + return next((s for s in segments if s["words"]), None) + + current_segments, seek, single_timestamp_ending = ( + self._split_segments_by_timestamps( + tokenizer=tokenizer, + tokens=tokens, + time_offset=time_offset, + segment_size=segment_size, + segment_duration=segment_duration, + seek=seek, + ) + ) + + if options.word_timestamps: + self.add_word_timestamps( + [current_segments], + tokenizer, + encoder_output, + segment_size, + options.prepend_punctuations, + options.append_punctuations, + last_speech_timestamp=last_speech_timestamp, + ) + + if not single_timestamp_ending: + last_word_end = get_end(current_segments) + if last_word_end is not None and last_word_end > time_offset: + seek = round(last_word_end * self.frames_per_second) + + # skip silence before possible hallucinations + if options.hallucination_silence_threshold is not None: + threshold = options.hallucination_silence_threshold + + # if first segment might be a hallucination, skip leading silence + first_segment = next_words_segment(current_segments) + if first_segment is not None and is_segment_anomaly(first_segment): + gap = first_segment["start"] - time_offset + if gap > threshold: + seek = previous_seek + round(gap * self.frames_per_second) + continue + + # skip silence before any possible hallucination that is surrounded + # by silence or more hallucinations + hal_last_end = last_speech_timestamp + for si in range(len(current_segments)): + segment = current_segments[si] + if not segment["words"]: + continue + if is_segment_anomaly(segment): + next_segment = next_words_segment( + current_segments[si + 1 :] + ) + if next_segment is not None: + hal_next_start = next_segment["words"][0]["start"] + else: + hal_next_start = time_offset + segment_duration + silence_before = ( + segment["start"] - hal_last_end > threshold + or segment["start"] < threshold + or segment["start"] - time_offset < 2.0 + ) + silence_after = ( + hal_next_start - segment["end"] > threshold + or is_segment_anomaly(next_segment) + or window_end_time - segment["end"] < 2.0 + ) + if silence_before and silence_after: + seek = round( + max(time_offset + 1, segment["start"]) + * self.frames_per_second + ) + if content_duration - segment["end"] < threshold: + seek = content_frames + current_segments[si:] = [] + break + hal_last_end = segment["end"] + + last_word_end = get_end(current_segments) + if last_word_end is not None: + last_speech_timestamp = last_word_end + + for segment in current_segments: + tokens = segment["tokens"] + text = tokenizer.decode(tokens) + + if segment["start"] == segment["end"] or not text.strip(): + continue + + check_prompt_num = 1 + if all( + [ + text.strip() != i.strip() + for i in all_prompt_text[-check_prompt_num:] + ] + ): + all_tokens.extend(tokens) + all_prompt_text.append(text) + idx += 1 + + yield Segment( + id=idx, + seek=seek, + start=segment["start"], + end=segment["end"], + text=text, + tokens=tokens, + temperature=temperature, + avg_logprob=avg_logprob, + 
compression_ratio=compression_ratio, + no_speech_prob=result.no_speech_prob, + words=( + [Word(**word) for word in segment["words"]] + if options.word_timestamps + else None + ), + ) + + if ( + not options.condition_on_previous_text + or temperature > options.prompt_reset_on_temperature + ): + if options.condition_on_previous_text: + self.logger.debug( + "Reset prompt. prompt_reset_on_temperature threshold is met %f > %f", + temperature, + options.prompt_reset_on_temperature, + ) + + prompt_reset_since = len(all_tokens) + + def encode(self, features: torch.Tensor) -> ctranslate2.StorageView: + # When the model is running on multiple GPUs, the encoder output should be moved + # to the CPU since we don't know which GPU will handle the next job. + to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1 + + if features.ndim == 2: + features = features.unsqueeze(0) + features = get_ctranslate2_storage(features) + + return self.model.encode(features, to_cpu=to_cpu) + + def generate_with_fallback( + self, + encoder_output: ctranslate2.StorageView, + prompt: List[int], + tokenizer: Tokenizer, + options: TranscriptionOptions, + ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]: + decode_result = None + all_results = [] + below_cr_threshold_results = [] + + max_initial_timestamp_index = int( + round(options.max_initial_timestamp / self.time_precision) + ) + if options.max_new_tokens is not None: + max_length = len(prompt) + options.max_new_tokens + else: + max_length = self.max_length + + if max_length > self.max_length: + raise ValueError( + f"The length of the prompt is {len(prompt)}, and the `max_new_tokens` " + f"{max_length - len(prompt)}. Thus, the combined length of the prompt " + f"and `max_new_tokens` is: {max_length}. This exceeds the " + f"`max_length` of the Whisper model: {self.max_length}. " + "You should either reduce the length of your prompt, or " + "reduce the value of `max_new_tokens`, " + f"so that their combined length is less that {self.max_length}." + ) + + for temperature in options.temperatures: + if temperature > 0: + kwargs = { + "beam_size": 1, + "num_hypotheses": options.best_of, + "sampling_topk": 0, + "sampling_temperature": temperature, + } + else: + kwargs = { + "beam_size": options.beam_size, + "patience": options.patience, + } + + result = self.model.generate( + encoder_output, + [prompt], + length_penalty=options.length_penalty, + repetition_penalty=options.repetition_penalty, + no_repeat_ngram_size=options.no_repeat_ngram_size, + max_length=max_length, + return_scores=True, + return_no_speech_prob=True, + suppress_blank=options.suppress_blank, + suppress_tokens=options.suppress_tokens, + max_initial_timestamp_index=max_initial_timestamp_index, + **kwargs, + )[0] + + tokens = result.sequences_ids[0] + + # Recover the average log prob from the returned score. 
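+            # The returned score is normalized by seq_len**length_penalty, so undo that
+            # normalization, then average over seq_len + 1 tokens (counting the end token),
+            # mirroring openai/whisper's avg_logprob computation.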
+ seq_len = len(tokens) + cum_logprob = result.scores[0] * (seq_len**options.length_penalty) + avg_logprob = cum_logprob / (seq_len + 1) + + text = tokenizer.decode(tokens).strip() + compression_ratio = get_compression_ratio(text) + + decode_result = ( + result, + avg_logprob, + temperature, + compression_ratio, + ) + all_results.append(decode_result) + + needs_fallback = False + + if options.compression_ratio_threshold is not None: + if compression_ratio > options.compression_ratio_threshold: + needs_fallback = True # too repetitive + + self.logger.debug( + "Compression ratio threshold is not met with temperature %.1f (%f > %f)", + temperature, + compression_ratio, + options.compression_ratio_threshold, + ) + else: + below_cr_threshold_results.append(decode_result) + + if ( + options.log_prob_threshold is not None + and avg_logprob < options.log_prob_threshold + ): + needs_fallback = True # average log probability is too low + + self.logger.debug( + "Log probability threshold is not met with temperature %.1f (%f < %f)", + temperature, + avg_logprob, + options.log_prob_threshold, + ) + + if ( + options.no_speech_threshold is not None + and result.no_speech_prob > options.no_speech_threshold + and options.log_prob_threshold is not None + and avg_logprob < options.log_prob_threshold + ): + needs_fallback = False # silence + + if not needs_fallback: + break + else: + # all failed, select the result with the highest average log probability + decode_result = max( + below_cr_threshold_results or all_results, key=lambda x: x[1] + ) + # to pass final temperature for prompt_reset_on_temperature + decode_result = ( + decode_result[0], + decode_result[1], + temperature, + decode_result[3], + ) + + return decode_result + + def get_prompt( + self, + tokenizer: Tokenizer, + previous_tokens: List[int], + without_timestamps: bool = False, + prefix: Optional[str] = None, + hotwords: Optional[str] = None, + ) -> List[int]: + prompt = [] + + if previous_tokens or (hotwords and not prefix): + prompt.append(tokenizer.sot_prev) + if hotwords and not prefix: + hotwords_tokens = tokenizer.encode(" " + hotwords.strip()) + if len(hotwords_tokens) >= self.max_length // 2: + hotwords_tokens = hotwords_tokens[: self.max_length // 2 - 1] + prompt.extend(hotwords_tokens) + if previous_tokens: + prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :]) + + prompt.extend(tokenizer.sot_sequence) + + if without_timestamps: + prompt.append(tokenizer.no_timestamps) + + if prefix: + prefix_tokens = tokenizer.encode(" " + prefix.strip()) + if len(prefix_tokens) >= self.max_length // 2: + prefix_tokens = prefix_tokens[: self.max_length // 2 - 1] + if not without_timestamps: + prompt.append(tokenizer.timestamp_begin) + prompt.extend(prefix_tokens) + + return prompt + + def add_word_timestamps( + self, + segments: List[dict], + tokenizer: Tokenizer, + encoder_output: ctranslate2.StorageView, + num_frames: int, + prepend_punctuations: str, + append_punctuations: str, + last_speech_timestamp: float, + ) -> float: + if len(segments) == 0: + return + + text_tokens = [] + text_tokens_per_segment = [] + for segment in segments: + segment_tokens = [ + [token for token in subsegment["tokens"] if token < tokenizer.eot] + for subsegment in segment + ] + text_tokens.append(list(itertools.chain.from_iterable(segment_tokens))) + text_tokens_per_segment.append(segment_tokens) + + alignments = self.find_alignment( + tokenizer, text_tokens, encoder_output, num_frames + ) + median_max_durations = [] + for alignment in alignments: + 
word_durations = np.array( + [word["end"] - word["start"] for word in alignment] + ) + word_durations = word_durations[word_durations.nonzero()] + median_duration = ( + np.median(word_durations) if len(word_durations) > 0 else 0.0 + ) + median_duration = min(0.7, float(median_duration)) + max_duration = median_duration * 2 + + # hack: truncate long words at sentence boundaries. + # a better segmentation algorithm based on VAD should be able to replace this. + if len(word_durations) > 0: + sentence_end_marks = ".。!!??" + # ensure words at sentence boundaries + # are not longer than twice the median word duration. + for i in range(1, len(alignment)): + if alignment[i]["end"] - alignment[i]["start"] > max_duration: + if alignment[i]["word"] in sentence_end_marks: + alignment[i]["end"] = alignment[i]["start"] + max_duration + elif alignment[i - 1]["word"] in sentence_end_marks: + alignment[i]["start"] = alignment[i]["end"] - max_duration + + merge_punctuations(alignment, prepend_punctuations, append_punctuations) + median_max_durations.append((median_duration, max_duration)) + + for segment_idx, segment in enumerate(segments): + word_index = 0 + time_offset = segment[0]["start"] + median_duration, max_duration = median_max_durations[segment_idx] + for subsegment_idx, subsegment in enumerate(segment): + saved_tokens = 0 + words = [] + + while word_index < len( + alignments[segment_idx] + ) and saved_tokens < len( + text_tokens_per_segment[segment_idx][subsegment_idx] + ): + timing = alignments[segment_idx][word_index] + + if timing["word"]: + words.append( + dict( + word=timing["word"], + start=round(time_offset + timing["start"], 2), + end=round(time_offset + timing["end"], 2), + probability=timing["probability"], + ) + ) + + saved_tokens += len(timing["tokens"]) + word_index += 1 + + # hack: truncate long words at segment boundaries. + # a better segmentation algorithm based on VAD should be able to replace this. + if len(words) > 0: + # ensure the first and second word after a pause is not longer than + # twice the median word duration. + if words[0][ + "end" + ] - last_speech_timestamp > median_duration * 4 and ( + words[0]["end"] - words[0]["start"] > max_duration + or ( + len(words) > 1 + and words[1]["end"] - words[0]["start"] > max_duration * 2 + ) + ): + if ( + len(words) > 1 + and words[1]["end"] - words[1]["start"] > max_duration + ): + boundary = max( + words[1]["end"] / 2, words[1]["end"] - max_duration + ) + words[0]["end"] = words[1]["start"] = boundary + words[0]["start"] = max(0, words[0]["end"] - max_duration) + + # prefer the segment-level start timestamp if the first word is too long. + if ( + subsegment["start"] < words[0]["end"] + and subsegment["start"] - 0.5 > words[0]["start"] + ): + words[0]["start"] = max( + 0, + min(words[0]["end"] - median_duration, subsegment["start"]), + ) + else: + subsegment["start"] = words[0]["start"] + + # prefer the segment-level end timestamp if the last word is too long. 
+ if ( + subsegment["end"] > words[-1]["start"] + and subsegment["end"] + 0.5 < words[-1]["end"] + ): + words[-1]["end"] = max( + words[-1]["start"] + median_duration, subsegment["end"] + ) + else: + subsegment["end"] = words[-1]["end"] + + last_speech_timestamp = subsegment["end"] + segments[segment_idx][subsegment_idx]["words"] = words + return last_speech_timestamp + + def find_alignment( + self, + tokenizer: Tokenizer, + text_tokens: List[int], + encoder_output: ctranslate2.StorageView, + num_frames: int, + median_filter_width: int = 7, + ) -> List[dict]: + if len(text_tokens) == 0: + return [] + + results = self.model.align( + encoder_output, + tokenizer.sot_sequence, + text_tokens, + num_frames, + median_filter_width=median_filter_width, + ) + return_list = [] + for result, text_token in zip(results, text_tokens): + text_token_probs = result.text_token_probs + alignments = result.alignments + text_indices = np.array([pair[0] for pair in alignments]) + time_indices = np.array([pair[1] for pair in alignments]) + + words, word_tokens = tokenizer.split_to_word_tokens( + text_token + [tokenizer.eot] + ) + if len(word_tokens) <= 1: + # return on eot only + # >>> np.pad([], (1, 0)) + # array([0.]) + # This results in crashes when we lookup jump_times with float, like + # IndexError: arrays used as indices must be of integer (or boolean) type + return [] + word_boundaries = np.pad( + np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0) + ) + if len(word_boundaries) <= 1: + return [] + + jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype( + bool + ) + jump_times = time_indices[jumps] / self.tokens_per_second + start_times = jump_times[word_boundaries[:-1]] + end_times = jump_times[word_boundaries[1:]] + word_probabilities = [ + np.mean(text_token_probs[i:j]) + for i, j in zip(word_boundaries[:-1], word_boundaries[1:]) + ] + + return_list.append( + [ + dict( + word=word, + tokens=tokens, + start=start, + end=end, + probability=probability, + ) + for word, tokens, start, end, probability in zip( + words, word_tokens, start_times, end_times, word_probabilities + ) + ] + ) + return return_list + + def generate_segment_batched( + self, + features: torch.Tensor, + tokenizer: Tokenizer, + options: dict, + ): + batch_size = features.shape[0] + all_tokens = [] + prompt_reset_since = 0 + + if options["initial_prompt"] is not None: + initial_prompt = " " + options["initial_prompt"].strip() + initial_prompt_tokens = tokenizer.encode(initial_prompt) + all_tokens.extend(initial_prompt_tokens) + previous_tokens = all_tokens[prompt_reset_since:] + prompt = self.get_prompt( + tokenizer, + previous_tokens, + without_timestamps=options["without_timestamps"], + prefix=options["prefix"], + ) + + encoder_output = self.encode(features) + + result = self.model.generate( + encoder_output, + [prompt] * batch_size, + beam_size=options["beam_size"], + patience=options["patience"], + length_penalty=options["length_penalty"], + max_length=self.max_length, + suppress_blank=options["suppress_blank"], + suppress_tokens=options["suppress_tokens"], + return_scores=True, + return_no_speech_prob=True, + ) + + output = [] + for res in result: + output.append({}) + # return scores + seq_len = len(res.sequences_ids[0]) + cum_logprob = res.scores[0] * (seq_len ** options["length_penalty"]) + output[-1]["avg_logprob"] = cum_logprob / (seq_len + 1) + + # return no speech prob + output[-1]["no_speech_prob"] = res.no_speech_prob + output[-1]["tokens"] = res.sequences_ids[0] + + return encoder_output, output + + def 
detect_language_multi_segment(
+        self, audio: Union[str, BinaryIO, torch.Tensor], params: Optional[dict] = None
+    ):
+        """
+        Detect language based on N highly-confident segments of a language.
+        """
+        # The threshold is used to decide if the audio is silence or not.
+        # The default is 0.02 (2%), i.e., if the speech detected by the VAD amounts to
+        # less than 2% of the original audio duration, the audio is treated as silence.
+        if not params:
+            params = {
+                "multilingual": False,
+                "speech_percentage_threshold": 0.02,
+                "language_detection_segments": 4,
+                "vad_filter": True,
+                "vad_min_silence_duration": 2500,
+                "language_threshold": 0.7,
+            }
+
+        if params.get("multilingual", False):
+            logging.warning(
+                "lang_id is not supported for multilingual audio, detecting the major language."
+            )
+
+        speech_percentage_threshold = params.get("speech_percentage_threshold", 0.02)
+        language_threshold = params.get("language_threshold", 0.7)
+        num_detection_segments = params.get("language_detection_segments", 4)
+        vad_filter_enabled = params.get("vad_filter", True)
+        vad_params = dict(
+            min_silence_duration_ms=params.get("vad_min_silence_duration", 2500)
+        )
+
+        if vad_filter_enabled:
+            vad_params = VadOptions(**vad_params)
+
+        # decode audio if it is not decoded already
+        sampling_rate = self.feature_extractor.sampling_rate
+        if not isinstance(audio, torch.Tensor):
+            audio: torch.Tensor = decode_audio(audio, sampling_rate=sampling_rate)
+
+        # calculate duration of audio in seconds:
+        # audio.shape[0] is the number of samples and sampling_rate is the number of
+        # samples per second, so dividing the two gives the duration in seconds
+        duration = audio.shape[0] / sampling_rate
+
+        # Check if vad is enabled, and collect voiced segments
+        if vad_filter_enabled:
+            # get chunks of audio that contain speech
+            speech_chunks = get_speech_timestamps(audio, vad_params)
+            # merge chunks of audio that contain speech into a single array
+            audio = collect_chunks(audio, speech_chunks)
+
+            # calculate new duration of audio without silence
+            duration_vad = audio.shape[0] / sampling_rate
+
+            logging.debug(
+                f"Lang ID: VAD filter removed {duration - duration_vad} sec of audio"
+            )
+
+            # if the audio after VAD is less than 2% of the original audio, consider it as silence
+            if duration_vad / duration < speech_percentage_threshold:
+                return {"language_code": "silence", "language_confidence": 1.0}
+
+            # update duration to be the duration after VAD
+            duration = duration_vad
+
+        # if the duration of the audio is less than 1 second, consider it as silence
+        if duration < 1.0:
+            return {"language_code": "silence", "language_confidence": 1.0}
+
+        # number of feature frames in 30 seconds of audio is 3000
+        nb_max_frames = self.feature_extractor.nb_max_frames
+
+        # extract features from audio with padding (default)
+        to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
+        features = self.feature_extractor(audio, to_cpu=to_cpu)
+
+        # number of 30-second segments in the audio
+        num_segments = features.shape[-1] // nb_max_frames
+        # more detection segments were requested than the audio duration allows
+        if num_detection_segments > num_segments:
+            logging.warning(
+                f"Lang ID: Cannot have more segments, setting {num_segments} segments."
+            )
+            num_detection_segments = num_segments
+
+        # create a list of indices to randomly select segments from
+        indices = list(range(num_detection_segments))
+
+        # fix seed to get deterministic results
+        random.seed(0)
+        random.shuffle(indices)
+
+        detected_languages = []
+        all_language_probabilities = defaultdict(list)
+        confident_language_probabilities = defaultdict(list)
+        num_confident_segments_per_language = defaultdict(int)
+
+        # Iterate over the randomly selected indices of the segments.
+        #
+        # For each segment, extract features and detect language.
+        #
+        # If the language is confident, add it to the list of confident segments for that language.
+        #
+        # If the number of confident segments for a language
+        # is greater than or equal to the number of detection segments,
+        # return the language and the average probability of the language.
+        #
+        # If we are unable to get a sufficient number of confident predictions,
+        # return the most frequently detected language with maximum probability.
+        #
+        # We need a sufficient number of confident predictions per language, not in total.
+
+        for i in indices:
+            segment_features = features[:, i * nb_max_frames : (i + 1) * nb_max_frames]
+            try:
+                encoder_output = self.encode(segment_features)
+                results = self.model.detect_language(encoder_output)[0]
+
+            except ValueError as e:  # or RuntimeError
+                logging.error(f"Inference error: {e}")
+
+            # results is the list of classes (languages) and their probabilities (descending),
+            # e.g.: [('<|de|>', 0.482177734375), ('<|en|>', 0.283447265625), ...]
+
+            # take the top language token and probability,
+            # and parse the language token to strip out the markers,
+            # e.g. '<|de|>' -> 'de'
+
+            language_token = results[0][0]
+            language = language_token[2:-2]
+
+            language_probability = results[0][1]
+
+            detected_languages.append(language)
+            all_language_probabilities[language].append(language_probability)
+
+            # only consider if the language prediction is confident
+            if language_probability > language_threshold:
+                num_confident_segments_per_language[language] += 1
+
+                # Add language and probability to the list of languages when it is confident
+                confident_language_probabilities[language].append(language_probability)
+
+                # return the language when a sufficient number of confident segments is reached
+                if (
+                    num_confident_segments_per_language[language]
+                    >= num_detection_segments
+                ):
+                    # Considering the average probability of only confident segments
+                    mean = sum(confident_language_probabilities[language]) / len(
+                        confident_language_probabilities[language]
+                    )
+                    return {
+                        "language_code": language,
+                        "language_confidence": mean,
+                    }
+
+        # if we are unable to get a sufficient number of confident predictions,
+        # return the most frequently detected language.
+        # if there is a tie, return the one with maximum average probability.
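+        # (note: key_func below returns a (frequency, average probability) tuple, and
+        # tuples compare element-wise, so frequency is compared first and the average
+        # probability only breaks ties)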
+ counter = Counter(detected_languages) + + # Define the key function to select frequent language with attached probabilities + def key_func(language): + # Calculate the frequency of the language + frequency = counter[language] + + # Calculate the average probability of the language + prob_avg = sum(all_language_probabilities[language]) / len( + all_language_probabilities[language] + ) + + return (frequency, prob_avg) + + max_language = None + + if detected_languages: + # Use the key function to find the language with maximum frequency and probability + max_language = max(detected_languages, key=key_func) + max_probability = sum(all_language_probabilities[max_language]) / len( + all_language_probabilities[max_language] + ) + + # Do additional checks for silence for non-confident case + # calculate RMS amplitude and DC offset + dc_offset = audio.mean() + audio_minus_dc_offset = audio - dc_offset + is_silent = ( + torch.all(audio.abs() < 0.01) + or torch.sqrt(torch.mean(audio_minus_dc_offset**2)) < 0.01 + ) + + if is_silent: + return {"language_code": "silence", "language_confidence": 1.0} + + if max_language is not None: + return { + "language_code": max_language, + "language_confidence": max_probability, + } + + # Language is not detected for any segment and none of prev conditions met + return {"language_code": "silence", "language_confidence": 1.0} + + +default_batched_asr_options = { + "beam_size": 5, + "best_of": 5, + "patience": 1, + "length_penalty": 1, + "repetition_penalty": 1, + "no_repeat_ngram_size": 0, + "temperatures": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0], + "compression_ratio_threshold": 2.4, + "log_prob_threshold": -1.0, + "no_speech_threshold": 0.6, + "condition_on_previous_text": False, + "prompt_reset_on_temperature": 0.5, + "initial_prompt": None, + "prefix": None, + "suppress_blank": True, + "suppress_tokens": [-1], + "max_new_tokens": None, + "clip_timestamps": "0", + "hallucination_silence_threshold": None, + "without_timestamps": True, # False for timings + "max_initial_timestamp": 0.0, + "word_timestamps": False, + "prepend_punctuations": "\"'“¿([{-", + "append_punctuations": "\"'.。,,!!??::”)]}、", + "log_prob_low_threshold": None, + "multilingual": False, + "output_language": "en", + "hotwords": None, +} + + +def load_model_batch( + whisper_arch, + device, + device_index=0, + compute_type="float16", + asr_options=None, + language: Optional[str] = None, + model=None, + task="transcribe", + download_root=None, + threads=4, +): + """Load a Whisper model for inference. + Args: + whisper_arch: str - The name of the Whisper model to load. + device: str - The device to load the model on. + compute_type: str - The compute type to use for the model. + options: dict - A dictionary of options to use for the model. + language: str - The language of the model. (use English for now) + download_root: Optional[str] - The root directory to download the model to. + threads: int - The number of cpu threads to use per worker. + Returns: + A Whisper pipeline. + """ + + if whisper_arch.endswith(".en"): + language = "en" + + model = WhisperModel( + whisper_arch, + device=device, + device_index=device_index, + compute_type=compute_type, + download_root=download_root, + cpu_threads=threads, + ) + if language is not None: + tokenizer = Tokenizer( + model.hf_tokenizer, + model.model.is_multilingual, + task=task, + language=language, + ) + else: + model.logger.warning( + "No language specified, it will be detected causing increase in inference time." 
+ ) + tokenizer = None + + if asr_options is not None: + default_batched_asr_options.update(asr_options) + + batched_asr_options = TranscriptionOptions(**default_batched_asr_options) + + return BatchedInferencePipeline( + model=model, + options=batched_asr_options, + tokenizer=tokenizer, + language=language, + ) + + +def restore_speech_timestamps( + segments: Iterable[Segment], + speech_chunks: List[dict], + sampling_rate: int, +) -> Iterable[Segment]: + ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate) + + for segment in segments: + if segment.words: + words = [] + for word in segment.words: + # Ensure the word start and end times are resolved to the same chunk. + middle = (word.start + word.end) / 2 + chunk_index = ts_map.get_chunk_index(middle) + word = word._replace( + start=ts_map.get_original_time(word.start, chunk_index), + end=ts_map.get_original_time(word.end, chunk_index), + ) + words.append(word) + + segment = segment._replace( + start=words[0].start, + end=words[-1].end, + words=words, + ) + + else: + segment = segment._replace( + start=ts_map.get_original_time(segment.start), + end=ts_map.get_original_time(segment.end), + ) + + yield segment + + +def get_ctranslate2_storage(segment: torch.Tensor) -> ctranslate2.StorageView: + segment = segment.contiguous() + segment = ctranslate2.StorageView.from_array( + segment if segment.is_cuda else segment.numpy() + ) # torch cpu tensors don't implement __array_interface__ + # https://github.com/pytorch/pytorch/issues/51156 + return segment + + +def get_compression_ratio(text: str) -> float: + text_bytes = text.encode("utf-8") + return len(text_bytes) / len(zlib.compress(text_bytes)) + + +def get_suppressed_tokens( + tokenizer: Tokenizer, + suppress_tokens: Optional[List[int]], +) -> Optional[List[int]]: + if not suppress_tokens or -1 in suppress_tokens: + return suppress_tokens + + suppress_tokens = list(suppress_tokens) + + # Ensure the following special tokens are suppressed when the user does + # not use the default set (-1). 
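+    # (these are Whisper's task and start-of-transcript control tokens; suppressing
+    # them keeps the decoder from emitting them inside the transcript text)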
+ suppress_tokens.extend( + [ + tokenizer.transcribe, + tokenizer.translate, + tokenizer.sot, + tokenizer.sot_prev, + tokenizer.sot_lm, + ] + ) + + return sorted(set(suppress_tokens)) + + +def merge_punctuations(alignment: List[dict], prepended: str, appended: str) -> None: + # merge prepended punctuations + i = len(alignment) - 2 + j = len(alignment) - 1 + while i >= 0: + previous = alignment[i] + following = alignment[j] + if previous["word"].startswith(" ") and previous["word"].strip() in prepended: + # prepend it to the following word + following["word"] = previous["word"] + following["word"] + following["tokens"] = previous["tokens"] + following["tokens"] + previous["word"] = "" + previous["tokens"] = [] + else: + j = i + i -= 1 + + # merge appended punctuations + i = 0 + j = 1 + while j < len(alignment): + previous = alignment[i] + following = alignment[j] + if not previous["word"].endswith(" ") and following["word"] in appended: + # append it to the previous word + previous["word"] = previous["word"] + following["word"] + previous["tokens"] = previous["tokens"] + following["tokens"] + following["word"] = "" + following["tokens"] = [] + else: + i = j + j += 1 diff --git a/faster-whisper/faster_whisper/utils.py b/faster-whisper/faster_whisper/utils.py new file mode 100644 index 0000000..481bd74 --- /dev/null +++ b/faster-whisper/faster_whisper/utils.py @@ -0,0 +1,157 @@ +import logging +import os +import re + +from typing import List, Optional + +import huggingface_hub +import requests + +from tqdm.auto import tqdm + +_MODELS = { + "tiny.en": "Systran/faster-whisper-tiny.en", + "tiny": "Systran/faster-whisper-tiny", + "base.en": "Systran/faster-whisper-base.en", + "base": "Systran/faster-whisper-base", + "small.en": "Systran/faster-whisper-small.en", + "small": "Systran/faster-whisper-small", + "medium.en": "Systran/faster-whisper-medium.en", + "medium": "Systran/faster-whisper-medium", + "large-v1": "Systran/faster-whisper-large-v1", + "large-v2": "Systran/faster-whisper-large-v2", + "large-v3": "Systran/faster-whisper-large-v3", + "large": "Systran/faster-whisper-large-v3", + "distil-large-v2": "Systran/faster-distil-whisper-large-v2", + "distil-medium.en": "Systran/faster-distil-whisper-medium.en", + "distil-small.en": "Systran/faster-distil-whisper-small.en", + "distil-large-v3": "Systran/faster-distil-whisper-large-v3", +} + + +def available_models() -> List[str]: + """Returns the names of available models.""" + return list(_MODELS.keys()) + + +def get_assets_path(): + """Returns the path to the assets directory.""" + return os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets") + + +def get_logger(): + """Returns the module logger.""" + return logging.getLogger("faster_whisper") + + +def download_model( + size_or_id: str, + output_dir: Optional[str] = None, + local_files_only: bool = False, + cache_dir: Optional[str] = None, +): + """Downloads a CTranslate2 Whisper model from the Hugging Face Hub. + + Args: + size_or_id: Size of the model to download from https://huggingface.co/Systran + (tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en, + distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2, + distil-large-v3), or a CTranslate2-converted model ID from the Hugging Face Hub + (e.g. Systran/faster-whisper-large-v3). + output_dir: Directory where the model should be saved. If not set, the model is saved in + the cache directory. 
+ local_files_only: If True, avoid downloading the file and return the path to the local + cached file if it exists. + cache_dir: Path to the folder where cached files are stored. + + Returns: + The path to the downloaded model. + + Raises: + ValueError: if the model size is invalid. + """ + if re.match(r".*/.*", size_or_id): + repo_id = size_or_id + else: + repo_id = _MODELS.get(size_or_id) + if repo_id is None: + raise ValueError( + "Invalid model size '%s', expected one of: %s" + % (size_or_id, ", ".join(_MODELS.keys())) + ) + + allow_patterns = [ + "config.json", + "preprocessor_config.json", + "model.bin", + "tokenizer.json", + "vocabulary.*", + ] + + kwargs = { + "local_files_only": local_files_only, + "allow_patterns": allow_patterns, + "tqdm_class": disabled_tqdm, + } + + if output_dir is not None: + kwargs["local_dir"] = output_dir + kwargs["local_dir_use_symlinks"] = False + + if cache_dir is not None: + kwargs["cache_dir"] = cache_dir + + try: + return huggingface_hub.snapshot_download(repo_id, **kwargs) + except ( + huggingface_hub.utils.HfHubHTTPError, + requests.exceptions.ConnectionError, + ) as exception: + logger = get_logger() + logger.warning( + "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s", + repo_id, + exception, + ) + logger.warning( + "Trying to load the model directly from the local cache, if it exists." + ) + + kwargs["local_files_only"] = True + return huggingface_hub.snapshot_download(repo_id, **kwargs) + + +def format_timestamp( + seconds: float, + always_include_hours: bool = False, + decimal_marker: str = ".", +) -> str: + assert seconds >= 0, "non-negative timestamp expected" + milliseconds = round(seconds * 1000.0) + + hours = milliseconds // 3_600_000 + milliseconds -= hours * 3_600_000 + + minutes = milliseconds // 60_000 + milliseconds -= minutes * 60_000 + + seconds = milliseconds // 1_000 + milliseconds -= seconds * 1_000 + + hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" + return ( + f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" + ) + + +class disabled_tqdm(tqdm): + def __init__(self, *args, **kwargs): + kwargs["disable"] = True + super().__init__(*args, **kwargs) + + +def get_end(segments: List[dict]) -> Optional[float]: + return next( + (w["end"] for s in reversed(segments) for w in reversed(s["words"])), + segments[-1]["end"] if segments else None, + ) diff --git a/faster-whisper/faster_whisper/vad.py b/faster-whisper/faster_whisper/vad.py new file mode 100644 index 0000000..5d289b9 --- /dev/null +++ b/faster-whisper/faster_whisper/vad.py @@ -0,0 +1,579 @@ +import bisect +import functools +import os +import warnings + +from collections.abc import Callable +from typing import List, NamedTuple, Optional, Union + +import numpy as np +import torch + +from pyannote.audio.core.io import AudioFile +from pyannote.audio.pipelines import VoiceActivityDetection +from pyannote.audio.pipelines.utils import PipelineModel +from pyannote.core import Annotation, Segment, SlidingWindowFeature + +from faster_whisper.utils import get_assets_path + + +# The code below is adapted from https://github.com/snakers4/silero-vad. +class VadOptions(NamedTuple): + """VAD options. + + Attributes: + threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, + probabilities ABOVE this value are considered as SPEECH. It is better to tune this + parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 
+      min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
+      max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
+        than max_speech_duration_s will be split at the timestamp of the last silence that
+        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
+        split aggressively just before max_speech_duration_s.
+      min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
+        before separating it.
+      window_size_samples: Audio chunks of window_size_samples size are fed to the Silero VAD model.
+        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
+        Values other than these may affect model performance!
+      speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
+    """
+
+    threshold: float = 0.5
+    min_speech_duration_ms: int = 250
+    max_speech_duration_s: float = float("inf")
+    min_silence_duration_ms: int = 2000
+    window_size_samples: int = 1024
+    speech_pad_ms: int = 400
+
+
+def get_speech_timestamps(
+    audio: torch.Tensor,
+    vad_options: Optional[VadOptions] = None,
+    **kwargs,
+) -> List[dict]:
+    """This method is used for splitting long audio into speech chunks using Silero VAD.
+
+    Args:
+      audio: One dimensional float array.
+      vad_options: Options for VAD processing.
+      kwargs: VAD options passed as keyword arguments for backward compatibility.
+
+    Returns:
+      List of dicts containing begin and end samples of each speech chunk.
+    """
+    if vad_options is None:
+        vad_options = VadOptions(**kwargs)
+
+    threshold = vad_options.threshold
+    min_speech_duration_ms = vad_options.min_speech_duration_ms
+    max_speech_duration_s = vad_options.max_speech_duration_s
+    min_silence_duration_ms = vad_options.min_silence_duration_ms
+    window_size_samples = 512
+    speech_pad_ms = vad_options.speech_pad_ms
+
+    if window_size_samples not in [512, 1024, 1536]:
+        warnings.warn(
+            "Unusual window_size_samples! 
Supported window_size_samples:\n" + " - [512, 1024, 1536] for 16000 sampling_rate" + ) + + sampling_rate = 16000 + min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 + speech_pad_samples = sampling_rate * speech_pad_ms / 1000 + max_speech_samples = ( + sampling_rate * max_speech_duration_s + - window_size_samples + - 2 * speech_pad_samples + ) + min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 + min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 + + audio_length_samples = len(audio) + + model = get_vad_model() + state, context = model.get_initial_states(batch_size=1) + + speech_probs = [] + for current_start_sample in range(0, audio_length_samples, window_size_samples): + chunk = audio[current_start_sample : current_start_sample + window_size_samples] + if len(chunk) < window_size_samples: + chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk)))) + speech_prob, state, context = model(chunk, state, context, sampling_rate) + speech_probs.append(speech_prob) + + triggered = False + speeches = [] + current_speech = {} + neg_threshold = threshold - 0.15 + + # to save potential segment end (and tolerate some silence) + temp_end = 0 + # to save potential segment limits in case of maximum segment size reached + prev_end = next_start = 0 + + for i, speech_prob in enumerate(speech_probs): + if (speech_prob >= threshold) and temp_end: + temp_end = 0 + if next_start < prev_end: + next_start = window_size_samples * i + + if (speech_prob >= threshold) and not triggered: + triggered = True + current_speech["start"] = window_size_samples * i + continue + + if ( + triggered + and (window_size_samples * i) - current_speech["start"] > max_speech_samples + ): + if prev_end: + current_speech["end"] = prev_end + speeches.append(current_speech) + current_speech = {} + # previously reached silence (< neg_thres) and is still not speech (< thres) + if next_start < prev_end: + triggered = False + else: + current_speech["start"] = next_start + prev_end = next_start = temp_end = 0 + else: + current_speech["end"] = window_size_samples * i + speeches.append(current_speech) + current_speech = {} + prev_end = next_start = temp_end = 0 + triggered = False + continue + + if (speech_prob < neg_threshold) and triggered: + if not temp_end: + temp_end = window_size_samples * i + # condition to avoid cutting in very short silence + if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech: + prev_end = temp_end + if (window_size_samples * i) - temp_end < min_silence_samples: + continue + else: + current_speech["end"] = temp_end + if ( + current_speech["end"] - current_speech["start"] + ) > min_speech_samples: + speeches.append(current_speech) + current_speech = {} + prev_end = next_start = temp_end = 0 + triggered = False + continue + + if ( + current_speech + and (audio_length_samples - current_speech["start"]) > min_speech_samples + ): + current_speech["end"] = audio_length_samples + speeches.append(current_speech) + + for i, speech in enumerate(speeches): + if i == 0: + speech["start"] = int(max(0, speech["start"] - speech_pad_samples)) + if i != len(speeches) - 1: + silence_duration = speeches[i + 1]["start"] - speech["end"] + if silence_duration < 2 * speech_pad_samples: + speech["end"] += int(silence_duration // 2) + speeches[i + 1]["start"] = int( + max(0, speeches[i + 1]["start"] - silence_duration // 2) + ) + else: + speech["end"] = int( + min(audio_length_samples, speech["end"] + speech_pad_samples) + ) + speeches[i + 1]["start"] = int( + max(0, 
speeches[i + 1]["start"] - speech_pad_samples) + ) + else: + speech["end"] = int( + min(audio_length_samples, speech["end"] + speech_pad_samples) + ) + + return speeches + + +def collect_chunks(audio: torch.Tensor, chunks: List[dict]) -> torch.Tensor: + """Collects and concatenates audio chunks.""" + if not chunks: + return torch.tensor([], dtype=torch.float32) + + return torch.cat([audio[chunk["start"] : chunk["end"]] for chunk in chunks]) + + +class SpeechTimestampsMap: + """Helper class to restore original speech timestamps.""" + + def __init__(self, chunks: List[dict], sampling_rate: int, time_precision: int = 2): + self.sampling_rate = sampling_rate + self.time_precision = time_precision + self.chunk_end_sample = [] + self.total_silence_before = [] + + previous_end = 0 + silent_samples = 0 + + for chunk in chunks: + silent_samples += chunk["start"] - previous_end + previous_end = chunk["end"] + + self.chunk_end_sample.append(chunk["end"] - silent_samples) + self.total_silence_before.append(silent_samples / sampling_rate) + + def get_original_time( + self, + time: float, + chunk_index: Optional[int] = None, + ) -> float: + if chunk_index is None: + chunk_index = self.get_chunk_index(time) + + total_silence_before = self.total_silence_before[chunk_index] + return round(total_silence_before + time, self.time_precision) + + def get_chunk_index(self, time: float) -> int: + sample = int(time * self.sampling_rate) + return min( + bisect.bisect(self.chunk_end_sample, sample), + len(self.chunk_end_sample) - 1, + ) + + +@functools.lru_cache +def get_vad_model(): + """Returns the VAD model instance.""" + path = os.path.join(get_assets_path(), "silero_vad.onnx") + return SileroVADModel(path) + + +class SileroVADModel: + def __init__(self, path): + try: + import onnxruntime + except ImportError as e: + raise RuntimeError( + "Applying the VAD filter requires the onnxruntime package" + ) from e + + opts = onnxruntime.SessionOptions() + opts.inter_op_num_threads = 1 + opts.intra_op_num_threads = 1 + opts.log_severity_level = 4 + + self.session = onnxruntime.InferenceSession( + path, + providers=["CPUExecutionProvider"], + sess_options=opts, + ) + + def get_initial_states(self, batch_size: int): + state = np.zeros((2, batch_size, 128), dtype=np.float32) + context = np.zeros((batch_size, 64), dtype=np.float32) + return state, context + + def __call__(self, x, state, context, sr: int): + if len(x.shape) == 1: + x = np.expand_dims(x, 0) + if len(x.shape) > 2: + raise ValueError( + f"Too many dimensions for input audio chunk {len(x.shape)}" + ) + if sr / x.shape[1] > 31.25: + raise ValueError("Input audio chunk is too short") + + x = np.concatenate([context, x], axis=1) + + ort_inputs = { + "input": x, + "state": state, + "sr": np.array(sr, dtype="int64"), + } + + out, state = self.session.run(None, ort_inputs) + context = x[..., -64:] + + return out, state, context + + +# The code below is copied from whisper-x (https://github.com/m-bain/whisperX) +# and adapted for faster_whisper. 
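+# It defines the SegmentX container, a pyannote-based VoiceActivitySegmentation
+# pipeline, the BinarizeVadScores hysteresis binarizer, and the merge_chunks helper
+# that groups detected speech regions into chunks of roughly chunk_size seconds.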
+class SegmentX: + def __init__(self, start, end, speaker=None): + self.start = start + self.end = end + self.speaker = speaker + + +class VoiceActivitySegmentation(VoiceActivityDetection): + """Pipeline wrapper class for Voice Activity Segmentation based on VAD scores.""" + + def __init__( + self, + segmentation: PipelineModel = "pyannote/segmentation", + device: Optional[Union[str, torch.device]] = None, + fscore: bool = False, + use_auth_token: Optional[str] = None, + **inference_kwargs, + ): + """Initialize the pipeline with the model name and the optional device. + + Args: + dict parameters of VoiceActivityDetection class from pyannote: + segmentation (PipelineModel): Loaded model name. + device (torch.device or None): Device to perform the segmentation. + fscore (bool): Flag indicating whether to compute F-score during inference. + use_auth_token (str or None): Optional authentication token for model access. + inference_kwargs (dict): Additional arguments from VoiceActivityDetection pipeline. + """ + super().__init__( + segmentation=segmentation, + device=device, + fscore=fscore, + use_auth_token=use_auth_token, + **inference_kwargs, + ) + + def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation: + """Apply voice activity detection on the audio file. + + Args: + file (AudioFile): Processed file. + hook (callable): Hook called with signature: hook("step_name", step_artefact, file=file) + + Returns: + segmentations (Annotation): Voice activity segmentation. + """ + # setup hook (e.g. for debugging purposes) + hook = self.setup_hook(file, hook=hook) + + # apply segmentation model if needed + # output shape is (num_chunks, num_frames, 1) + if self.training: + if self.CACHED_SEGMENTATION in file: + segmentations = file[self.CACHED_SEGMENTATION] + else: + segmentations = self._segmentation(file) + file[self.CACHED_SEGMENTATION] = segmentations + else: + segmentations: SlidingWindowFeature = self._segmentation(file) + + return segmentations + + +class BinarizeVadScores: + """Binarize detection scores using hysteresis thresholding. + + Reference: + Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of + RNN-based Voice Activity Detection", InterSpeech 2015. + + Modified by Max Bain to include WhisperX's min-cut operation + https://arxiv.org/abs/2303.00747 + + """ + + def __init__( + self, + onset: float = 0.5, + offset: Optional[float] = None, + min_duration_on: float = 0.0, + min_duration_off: float = 0.0, + pad_onset: float = 0.0, + pad_offset: float = 0.0, + max_duration: float = float("inf"), + ): + """Initializes the parameters for Binarizing the VAD scores. + + Args: + onset (float, optional): + Onset threshold. Defaults to 0.5. + offset (float, optional): + Offset threshold. Defaults to `onset`. + min_duration_on (float, optional): + Remove active regions shorter than that many seconds. Defaults to 0s. + min_duration_off (float, optional): + Fill inactive regions shorter than that many seconds. Defaults to 0s. + pad_onset (float, optional): + Extend active regions by moving their start time by that many seconds. + Defaults to 0s. + pad_offset (float, optional): + Extend active regions by moving their end time by that many seconds. + Defaults to 0s. + max_duration (float): + The maximum length of an active segment. 
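+                Defaults to infinity (i.e. no maximum length).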
+ """ + super().__init__() + + self.onset = onset + self.offset = offset or onset + + self.pad_onset = pad_onset + self.pad_offset = pad_offset + + self.min_duration_on = min_duration_on + self.min_duration_off = min_duration_off + + self.max_duration = max_duration + + def __get_active_regions(self, scores: SlidingWindowFeature) -> Annotation: + """Extract active regions from VAD scores. + + Args: + scores (SlidingWindowFeature): Detection scores. + + Returns: + active (Annotation): Active regions. + """ + num_frames, num_classes = scores.data.shape + frames = scores.sliding_window + timestamps = [frames[i].middle for i in range(num_frames)] + # annotation meant to store 'active' regions + active = Annotation() + for k, k_scores in enumerate(scores.data.T): + label = k if scores.labels is None else scores.labels[k] + + # initial state + start = timestamps[0] + is_active = k_scores[0] > self.onset + curr_scores = [k_scores[0]] + curr_timestamps = [start] + t = start + # optionally add `strict=False` for python 3.10 or later + for t, y in zip(timestamps[1:], k_scores[1:]): + # currently active + if is_active: + curr_duration = t - start + if curr_duration > self.max_duration: + search_after = len(curr_scores) // 2 + # divide segment + min_score_div_idx = search_after + np.argmin( + curr_scores[search_after:] + ) + min_score_t = curr_timestamps[min_score_div_idx] + region = Segment( + start - self.pad_onset, min_score_t + self.pad_offset + ) + active[region, k] = label + start = curr_timestamps[min_score_div_idx] + curr_scores = curr_scores[min_score_div_idx + 1 :] + curr_timestamps = curr_timestamps[min_score_div_idx + 1 :] + # switching from active to inactive + elif y < self.offset: + region = Segment(start - self.pad_onset, t + self.pad_offset) + active[region, k] = label + start = t + is_active = False + curr_scores = [] + curr_timestamps = [] + curr_scores.append(y) + curr_timestamps.append(t) + # currently inactive + else: + # switching from inactive to active + if y > self.onset: + start = t + is_active = True + + # if active at the end, add final region + if is_active: + region = Segment(start - self.pad_onset, t + self.pad_offset) + active[region, k] = label + + return active + + def __call__(self, scores: SlidingWindowFeature) -> Annotation: + """Binarize detection scores. + + Args: + scores (SlidingWindowFeature): Detection scores. + + Returns: + active (Annotation): Binarized scores. + """ + active = self.__get_active_regions(scores) + # because of padding, some active regions might be overlapping: merge them. 
+ # also: fill same speaker gaps shorter than min_duration_off + if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0: + if self.max_duration < float("inf"): + raise NotImplementedError("This would break current max_duration param") + active = active.support(collar=self.min_duration_off) + + # remove tracks shorter than min_duration_on + if self.min_duration_on > 0: + for segment, track in list(active.itertracks()): + if segment.duration < self.min_duration_on: + del active[segment, track] + + return active + + +def merge_chunks( + segments, + chunk_size, + onset: float = 0.5, + offset: Optional[float] = None, + edge_padding: float = 0.1, +): + """ + Merge operation described in whisper-x paper + """ + curr_end = 0 + merged_segments = [] + seg_idxs = [] + speaker_idxs = [] + + assert chunk_size > 0 + binarize = BinarizeVadScores(max_duration=chunk_size, onset=onset, offset=offset) + segments = binarize(segments) + segments_list = [] + for speech_turn in segments.get_timeline(): + segments_list.append( + SegmentX( + max(0.0, speech_turn.start - edge_padding), + speech_turn.end + edge_padding, + "UNKNOWN", + ) + ) # 100ms edge padding to account for edge errors + + if len(segments_list) == 0: + print("No active speech found in audio") + return [] + + # Make sur the starting point is the start of the segment. + curr_start = segments_list[0].start + + for idx, seg in enumerate(segments_list): + # if any segment start timing is less than previous segment end timing, + # reset the edge padding. Similarly for end timing. + if idx > 0: + if seg.start < segments_list[idx - 1].end: + seg.start = seg.start + edge_padding + if idx < len(segments_list) - 1: + if seg.end > segments_list[idx + 1].start: + seg.end = seg.end - edge_padding + + if seg.end - curr_start > chunk_size and curr_end - curr_start > 0: + merged_segments.append( + { + "start": curr_start, + "end": curr_end, + "segments": seg_idxs, + } + ) + curr_start = seg.start + seg_idxs = [] + speaker_idxs = [] + curr_end = seg.end + seg_idxs.append((seg.start, seg.end)) + speaker_idxs.append(seg.speaker) + # add final + merged_segments.append( + { + "start": curr_start, + "end": curr_end, + "segments": seg_idxs, + } + ) + return merged_segments diff --git a/faster-whisper/faster_whisper/version.py b/faster-whisper/faster_whisper/version.py new file mode 100644 index 0000000..94155e5 --- /dev/null +++ b/faster-whisper/faster_whisper/version.py @@ -0,0 +1,3 @@ +"""Version information.""" + +__version__ = "1.0.2" diff --git a/faster-whisper/requirements.conversion.txt b/faster-whisper/requirements.conversion.txt new file mode 100644 index 0000000..56fdf5f --- /dev/null +++ b/faster-whisper/requirements.conversion.txt @@ -0,0 +1 @@ +transformers[torch]>=4.23 diff --git a/faster-whisper/requirements.txt b/faster-whisper/requirements.txt new file mode 100644 index 0000000..6516f96 --- /dev/null +++ b/faster-whisper/requirements.txt @@ -0,0 +1,8 @@ +ctranslate2>=4.0,<5 +huggingface_hub>=0.13 +tokenizers>=0.13,<1 +onnxruntime>=1.14,<2 +transformers +pyannote-audio>=3.1.1 +torch>=2.1.1 +torchaudio>=2.1.2 \ No newline at end of file diff --git a/faster-whisper/setup.cfg b/faster-whisper/setup.cfg new file mode 100644 index 0000000..bf2da86 --- /dev/null +++ b/faster-whisper/setup.cfg @@ -0,0 +1,9 @@ +[flake8] +max-line-length = 100 +ignore = + E203, + W503, + +[isort] +profile=black +lines_between_types=1 diff --git a/faster-whisper/setup.py b/faster-whisper/setup.py new file mode 100644 index 0000000..782f1b2 --- /dev/null +++ 
b/faster-whisper/setup.py @@ -0,0 +1,68 @@ +import os + +from setuptools import find_packages, setup + +base_dir = os.path.dirname(os.path.abspath(__file__)) + + +def get_long_description(): + readme_path = os.path.join(base_dir, "README.md") + with open(readme_path, encoding="utf-8") as readme_file: + return readme_file.read() + + +def get_project_version(): + version_path = os.path.join(base_dir, "faster_whisper", "version.py") + version = {} + with open(version_path, encoding="utf-8") as fp: + exec(fp.read(), version) + return version["__version__"] + + +def get_requirements(path): + with open(path, encoding="utf-8") as requirements: + return [requirement.strip() for requirement in requirements] + + +install_requires = get_requirements(os.path.join(base_dir, "requirements.txt")) +conversion_requires = get_requirements( + os.path.join(base_dir, "requirements.conversion.txt") +) + +setup( + name="faster-whisper", + version=get_project_version(), + license="MIT", + description="Faster Whisper transcription with CTranslate2", + long_description=get_long_description(), + long_description_content_type="text/markdown", + author="Guillaume Klein", + url="https://github.com/SYSTRAN/faster-whisper", + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + ], + keywords="openai whisper speech ctranslate2 inference quantization transformer", + python_requires=">=3.8", + install_requires=install_requires, + extras_require={ + "conversion": conversion_requires, + "dev": [ + "black==23.*", + "flake8==6.*", + "isort==5.*", + "pytest==7.*", + ], + }, + packages=find_packages(), + include_package_data=True, +) diff --git a/faster-whisper/tests/conftest.py b/faster-whisper/tests/conftest.py new file mode 100644 index 0000000..0c0f424 --- /dev/null +++ b/faster-whisper/tests/conftest.py @@ -0,0 +1,18 @@ +import os + +import pytest + + +@pytest.fixture +def data_dir(): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + + +@pytest.fixture +def jfk_path(data_dir): + return os.path.join(data_dir, "jfk.flac") + + +@pytest.fixture +def physcisworks_path(data_dir): + return os.path.join(data_dir, "physicsworks.wav") diff --git a/faster-whisper/tests/data/jfk.flac b/faster-whisper/tests/data/jfk.flac new file mode 100644 index 0000000..e44b7c1 Binary files /dev/null and b/faster-whisper/tests/data/jfk.flac differ diff --git a/faster-whisper/tests/data/physicsworks.wav b/faster-whisper/tests/data/physicsworks.wav new file mode 100644 index 0000000..885b6c1 Binary files /dev/null and b/faster-whisper/tests/data/physicsworks.wav differ diff --git a/faster-whisper/tests/data/stereo_diarization.wav b/faster-whisper/tests/data/stereo_diarization.wav new file mode 100644 index 0000000..3f5ae75 Binary files /dev/null and b/faster-whisper/tests/data/stereo_diarization.wav differ diff --git a/faster-whisper/tests/test_transcribe.py b/faster-whisper/tests/test_transcribe.py new file mode 100644 index 0000000..5403622 --- /dev/null +++ b/faster-whisper/tests/test_transcribe.py @@ -0,0 +1,150 @@ +import os + +from faster_whisper import BatchedInferencePipeline, 
WhisperModel, decode_audio + + +def test_supported_languages(): + model = WhisperModel("tiny.en") + assert model.supported_languages == ["en"] + + +def test_transcribe(jfk_path): + model = WhisperModel("tiny") + segments, info = model.transcribe(jfk_path, word_timestamps=True) + assert info.all_language_probs is not None + + assert info.language == "en" + assert info.language_probability > 0.9 + assert info.duration == 11 + + # Get top language info from all results, which should match the + # already existing metadata + top_lang, top_lang_score = info.all_language_probs[0] + assert info.language == top_lang + assert abs(info.language_probability - top_lang_score) < 1e-16 + + segments = list(segments) + + assert len(segments) == 1 + + segment = segments[0] + + assert segment.text == ( + " And so, my fellow Americans, ask not what your country can do for you, " + "ask what you can do for your country." + ) + + assert segment.text == "".join(word.word for word in segment.words) + assert segment.start == segment.words[0].start + assert segment.end == segment.words[-1].end + batched_model = BatchedInferencePipeline(model=model, use_vad_model=False) + result, info = batched_model.transcribe(jfk_path, word_timestamps=True) + assert info.language == "en" + assert info.language_probability > 0.7 + segments = [] + for segment in result: + segments.append( + {"start": segment.start, "end": segment.end, "text": segment.text} + ) + + assert len(segments) == 1 + assert segment.text == ( + " And so my fellow Americans ask not what your country can do for you, " + "ask what you can do for your country." + ) + + +def test_batched_transcribe(physcisworks_path): + model = WhisperModel("tiny") + batched_model = BatchedInferencePipeline(model=model) + result, info = batched_model.transcribe(physcisworks_path, batch_size=16) + assert info.language == "en" + assert info.language_probability > 0.7 + segments = [] + for segment in result: + segments.append( + {"start": segment.start, "end": segment.end, "text": segment.text} + ) + # number of near 30 sec segments + assert len(segments) == 8 + + result, info = batched_model.transcribe( + physcisworks_path, + batch_size=16, + without_timestamps=False, + word_timestamps=True, + ) + segments = [] + for segment in result: + assert segment.words is not None + segments.append( + {"start": segment.start, "end": segment.end, "text": segment.text} + ) + assert len(segments) > 8 + + +def test_prefix_with_timestamps(jfk_path): + model = WhisperModel("tiny") + segments, _ = model.transcribe(jfk_path, prefix="And so my fellow Americans") + segments = list(segments) + + assert len(segments) == 1 + + segment = segments[0] + + assert segment.text == ( + " And so my fellow Americans ask not what your country can do for you, " + "ask what you can do for your country." + ) + + assert segment.start == 0 + assert 10 < segment.end < 11 + + +def test_vad(jfk_path): + model = WhisperModel("tiny") + segments, info = model.transcribe( + jfk_path, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200), + ) + segments = list(segments) + + assert len(segments) == 1 + segment = segments[0] + + assert segment.text == ( + " And so my fellow Americans ask not what your country can do for you, " + "ask what you can do for your country." 
+ ) + + assert 0 < segment.start < 1 + assert 10 < segment.end < 11 + + assert info.vad_options.min_silence_duration_ms == 500 + assert info.vad_options.speech_pad_ms == 200 + + +def test_stereo_diarization(data_dir): + model = WhisperModel("tiny") + + audio_path = os.path.join(data_dir, "stereo_diarization.wav") + left, right = decode_audio(audio_path, split_stereo=True) + + segments, _ = model.transcribe(left) + transcription = "".join(segment.text for segment in segments).strip() + assert transcription == ( + "He began a confused complaint against the wizard " + "who had vanished behind the curtain on the left." + ) + + segments, _ = model.transcribe(right) + transcription = "".join(segment.text for segment in segments).strip() + assert transcription == "The horizon seems extremely distant." + + +def test_multisegment_lang_id(physcisworks_path): + model = WhisperModel("tiny") + language_info = model.detect_language_multi_segment(physcisworks_path) + assert language_info["language_code"] == "en" + assert language_info["language_confidence"] > 0.8 diff --git a/faster-whisper/tests/test_utils.py b/faster-whisper/tests/test_utils.py new file mode 100644 index 0000000..bb488fe --- /dev/null +++ b/faster-whisper/tests/test_utils.py @@ -0,0 +1,29 @@ +import os + +from faster_whisper import available_models, download_model + + +def test_available_models(): + models = available_models() + assert isinstance(models, list) + assert "tiny" in models + + +def test_download_model(tmpdir): + output_dir = str(tmpdir.join("model")) + + model_dir = download_model("tiny", output_dir=output_dir) + + assert model_dir == output_dir + assert os.path.isdir(model_dir) + assert not os.path.islink(model_dir) + + for filename in os.listdir(model_dir): + path = os.path.join(model_dir, filename) + assert not os.path.islink(path) + + +def test_download_model_in_cache(tmpdir): + cache_dir = str(tmpdir.join("model")) + download_model("tiny", cache_dir=cache_dir) + assert os.path.isdir(cache_dir) diff --git a/pre_requirements.txt b/pre_requirements.txt index 341faa8..faf003e 100644 --- a/pre_requirements.txt +++ b/pre_requirements.txt @@ -11,4 +11,4 @@ Cython==3.0.10 youtokentome @ git+https://github.com/gburlet/YouTokenToMe.git@dependencies deepmultilingualpunctuation==1.0.1 pyannote.audio==3.2.0 -ipython==8.24.0 \ No newline at end of file +ipython==8.24.0