From 8035e9ef4890a06824a9a56a54d0feea9588f23b Mon Sep 17 00:00:00 2001
From: petterreinholdtsen
Date: Thu, 4 May 2023 19:53:59 +0200
Subject: [PATCH] Drop ffmpeg-python dependency and call ffmpeg directly.
 (#1242)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Drop ffmpeg-python dependency and call ffmpeg directly.

The last ffmpeg-python module release was in 2019[1], upstream seems to be
unavailable[2], and the project's development seems to have stagnated[3].

As the features it provides are trivial to replace using Python's native
subprocess module, drop the dependency.

[1]
[2]
[3]

* Rewrote to use subprocess.run() instead of subprocess.Popen().

* formatting changes

* formatting update

* isort fix

* Error checking

* isort 🤦🏻

* flake8 fix

* minor spelling changes

---------

Co-authored-by: Jong Wook Kim
---
 README.md        |  4 +---
 requirements.txt |  1 -
 whisper/audio.py | 28 +++++++++++++++++++---------
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 648d0c18..b4d3998c 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,7 @@ A Transformer sequence-to-sequence model is trained on various speech processing
 
 ## Setup
 
-
-We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.11 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command:
-
+We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.11 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation. You can download and install (or update to) the latest release of Whisper with the following command:
 
     pip install -U openai-whisper
 
diff --git a/requirements.txt b/requirements.txt
index 995977a5..3c11ac32 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,3 @@ torch
 tqdm
 more-itertools
 tiktoken==0.3.3
-ffmpeg-python==0.2.0
diff --git a/whisper/audio.py b/whisper/audio.py
index 513ab7c9..4f5b6e07 100644
--- a/whisper/audio.py
+++ b/whisper/audio.py
@@ -1,8 +1,8 @@
 import os
 from functools import lru_cache
+from subprocess import CalledProcessError, run
 from typing import Optional, Union
 
-import ffmpeg
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -39,15 +39,25 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
     -------
     A NumPy array containing the audio waveform, in float32 dtype.
     """
+
+    # This launches a subprocess to decode audio while down-mixing
+    # and resampling as necessary. Requires the ffmpeg CLI in PATH.
+    # fmt: off
+    cmd = [
+        "ffmpeg",
+        "-nostdin",
+        "-threads", "0",
+        "-i", file,
+        "-f", "s16le",
+        "-ac", "1",
+        "-acodec", "pcm_s16le",
+        "-ar", str(sr),
+        "-"
+    ]
+    # fmt: on
     try:
-        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-        out, _ = (
-            ffmpeg.input(file, threads=0)
-            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
-            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
-        )
-    except ffmpeg.Error as e:
+        out = run(cmd, capture_output=True, check=True).stdout
+    except CalledProcessError as e:
         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
 
     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
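
A minimal usage sketch of the patched decode path, assuming the ffmpeg CLI is
on PATH and a hypothetical local file "audio.wav"; load_audio() still returns
a mono float32 waveform at SAMPLE_RATE scaled to [-1.0, 1.0), so downstream
callers are unaffected by the switch to subprocess:

    import numpy as np

    from whisper.audio import SAMPLE_RATE, load_audio

    # Decodes via the new subprocess-based ffmpeg invocation in whisper/audio.py.
    audio = load_audio("audio.wav")  # hypothetical input file
    assert audio.dtype == np.float32 and audio.ndim == 1
    print(f"{audio.shape[0] / SAMPLE_RATE:.2f} seconds of audio")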