feat: add VAD (voice activity detection) filter flag

See #3
geekodour · Nov 9, 2023 · 205bb19
1 parent b028255
commit 205bb19
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 3 deletions.
diff --git a/src/wscribe/backends/fasterwhisper.py b/src/wscribe/backends/fasterwhisper.py
@@ -37,7 +37,11 @@ def load(self) -> None:
         )
 
     def transcribe(
-        self, input: np.ndarray, language: Optional[str] = None, silent: bool = False
+        self,
+        input: np.ndarray,
+        language: Optional[str] = None,
+        silent: bool = False,
+        vad: bool = False,
     ) -> list[TranscribedData]:
         """
         Return word level transcription data.
@@ -50,6 +54,7 @@ def transcribe(
             beam_size=DEFAULT_BEAM,
             word_timestamps=True,
             language=language,
+            vad_filter=vad,
         )
         # ps = playback seconds
         with tqdm(

diff --git a/src/wscribe/cli/main.py b/src/wscribe/cli/main.py
@@ -55,7 +55,16 @@ def cli():
 @click.option("-d", "--debug", help="show debug logs", default=False, is_flag=True)
 @click.option("-s", "--stats", help="print stats", default=False, is_flag=True)
 @click.option("-q", "--quiet", help="no progress bar", default=False, is_flag=True)
-def transcribe(source, destination, format, model, gpu, language, debug, stats, quiet):
+@click.option(
+    "-v",
+    "--vad",
+    help="use vad filter(better results, slower)",
+    default=False,
+    is_flag=True,
+)
+def transcribe(
+    source, destination, format, model, gpu, language, debug, stats, quiet, vad
+):
     """
     Transcribes SOURCE to DESTINATION. Where SOURCE can be local path to an audio/video file and
     DESTINATION needs to be a local path to a non-existing file.
@@ -76,7 +85,7 @@ def transcribe(source, destination, format, model, gpu, language, debug, stats,
     audio_end_time = time.perf_counter()
 
     ts_start_time = time.perf_counter()
-    result = m.transcribe(input=audio, language=language, silent=quiet)
+    result = m.transcribe(input=audio, language=language, silent=quiet, vad=vad)
     ts_end_time = time.perf_counter()
 
     writer = WRITERS[format](result=result, destination=destination)