openai · jongwook · May 4, 2023 · Apr 16, 2023 · Apr 18, 2023 · May 4, 2023
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ A Transformer sequence-to-sequence model is trained on various speech processing
 
 ## Setup
 
-We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.10 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command:
+We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.10 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for its fast tokenizer implementation. You can download and install (or update to) the latest release of Whisper with the following command:
 
     pip install -U openai-whisper
 

diff --git a/requirements.txt b/requirements.txt
@@ -4,4 +4,3 @@ torch
 tqdm
 more-itertools
 tiktoken==0.3.1
-ffmpeg-python==0.2.0
diff --git a/whisper/audio.py b/whisper/audio.py
@@ -1,8 +1,8 @@
 import os
 from functools import lru_cache
+from subprocess import Popen, PIPE, DEVNULL
 from typing import Optional, Union
 
-import ffmpeg
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -39,19 +39,26 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
     -------
     A NumPy array containing the audio waveform, in float32 dtype.
     """
-    try:
-        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-        out, _ = (
-            ffmpeg.input(file, threads=0)
-            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
-            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
-        )
-    except ffmpeg.Error as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
-
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
 
+    p = None
+    try:
+        # This launches a subprocess to decode audio while down-mixing
+        # and resampling as necessary. Requires the ffmpeg CLI on PATH.
+        cmd = ['ffmpeg',
+               '-nostdin',
+               '-threads', '0',
+               '-i', file,
+               '-f', 's16le',
+               '-ac', '1',
+               '-acodec', 'pcm_s16le',
+               '-ar', str(sr),
+               '-']
+        p = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
+        return np.frombuffer(p.stdout.read(),
+                             np.int16).flatten().astype(np.float32) / 32768.0
+    finally:
+        if p:
+            p.terminate()
 
 def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
     """