-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtranscribe_hotkey.py
133 lines (109 loc) · 3.91 KB
/
transcribe_hotkey.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import speech_recognition as sr
import asyncio
import torch
import numpy as np
import whisperx
from typing import *
async def record_audio(
    audio_queue: asyncio.Queue[np.ndarray],
    is_listening: asyncio.Event,
    energy: int,
    pause: float,
    dynamic_energy: bool,
    stop_future: asyncio.Future,
):
    """Capture microphone phrases and push normalized float32 audio onto audio_queue.

    Args:
        audio_queue: receives mono float32 arrays in [-1.0, 1.0) at 16 kHz.
        is_listening: phrases captured while this event is clear are discarded
            (push-to-talk gating driven by the hotkey backend).
        energy: speech_recognition energy threshold for phrase detection.
        pause: seconds of silence that end a phrase.
        dynamic_energy: whether the recognizer adapts its energy threshold.
        stop_future: resolving this future ends the capture loop.
    """
    recognizer = sr.Recognizer()
    recognizer.energy_threshold = energy
    recognizer.pause_threshold = pause
    recognizer.dynamic_energy_threshold = dynamic_energy
    loop = asyncio.get_running_loop()
    print("[LISTEN] Starting listener")
    with sr.Microphone(sample_rate=16000) as source:
        print("[LISTEN] found microphone")
        while not stop_future.done():
            # listen() blocks until a phrase completes; run it off the event loop.
            # NOTE(review): it has no timeout here, so after stop_future resolves
            # shutdown can lag by one utterance — confirm acceptable.
            audio = await loop.run_in_executor(None, recognizer.listen, source)
            if is_listening.is_set():
                # int16 PCM -> float32 in [-1.0, 1.0), the format whisper expects.
                np_audio = np.frombuffer(audio.get_raw_data(), np.int16).flatten().astype(np.float32) / 32768.0
                await audio_queue.put(np_audio)
    print("[LISTEN] Listener finished")
async def transcribe_audio(
    audio_queue: asyncio.Queue[np.ndarray],
    result_queue: asyncio.Queue[dict],
    audio_model: Any,
    stop_future: asyncio.Future,
):
    """Consume audio chunks from audio_queue and publish transcription results.

    Args:
        audio_queue: float32 audio arrays produced by record_audio.
        result_queue: receives each raw result from audio_model.transcribe
            (a dict — downstream code reads result["segments"]).
        audio_model: object exposing transcribe(audio, batch_size=...) — a
            whisperx model in production.
        stop_future: resolving this future ends the loop (checked after each
            item; a pending queue.get still has to complete first).
    """
    print("[TRANS] Starting transcriber")
    loop = asyncio.get_running_loop()
    while not stop_future.done():
        audio_data = await audio_queue.get()
        # transcribe() is compute-bound; run it in the default executor so the
        # event loop (and the recorder's queue puts) are not stalled.
        result = await loop.run_in_executor(
            None, lambda data=audio_data: audio_model.transcribe(data, batch_size=16)
        )
        print(f"[TRANS] Got result {result}")
        await result_queue.put(result)
    print("[TRANS] Transcriber finished")
async def start_audio_transcription_backend(
    is_listening: asyncio.Event,
    stop_future: asyncio.Future,
    model: str = "large-v2",
    device: str = "cuda",
):
    """Load the whisperx model and launch the recorder + transcriber tasks.

    Args:
        is_listening: push-to-talk gate shared with the keyboard backend.
        stop_future: resolving this future shuts both background tasks down.
        model: whisperx model name (default matches the original hard-coded value).
        device: torch device for inference (default matches original).

    Returns:
        The queue on which transcription result dicts appear.
    """
    audio_model = whisperx.load_model(model, device=device, language="en")
    energy = 100          # speech_recognition energy threshold
    pause = 0.8           # seconds of silence ending a phrase
    dynamic_energy = False
    audio_queue: asyncio.Queue[np.ndarray] = asyncio.Queue()
    result_queue: asyncio.Queue[dict] = asyncio.Queue()
    tasks = [
        asyncio.create_task(
            record_audio(
                audio_queue,
                is_listening,
                energy,
                pause,
                dynamic_energy,
                stop_future,
            )
        ),
        asyncio.create_task(
            transcribe_audio(
                audio_queue,
                result_queue,
                audio_model,
                stop_future,
            )
        ),
    ]
    # The event loop keeps only weak references to tasks; hold strong refs on
    # the returned queue so the background tasks cannot be garbage-collected
    # mid-run (the original dropped the create_task return values).
    result_queue._background_tasks = tasks  # type: ignore[attr-defined]
    return result_queue
async def start_keyboard_backend(is_listening: asyncio.Event, stop_future: asyncio.Future):
    """Watch the F23 key and gate is_listening while it is held.

    Starts a pynput listener thread plus a watchdog thread that stops the
    listener once stop_future resolves. Returns immediately; the threads are
    daemonized so they cannot keep the process alive on exit.

    Args:
        is_listening: set while F23 is held, cleared on release.
        stop_future: resolving this future stops the key listener.
    """
    from pynput import keyboard
    import threading
    import time

    def on_press(key):
        if key == keyboard.Key.f23 and not is_listening.is_set():
            print("[HOTKEY] F23 pressed, starting transcription.")
            is_listening.set()

    def on_release(key):
        if key == keyboard.Key.f23:
            print("[HOTKEY] F23 released, stopping transcription.")
            is_listening.clear()

    # pynput's Listener runs its own thread; start it directly instead of
    # parking an extra thread inside `listener.join()`.
    listener = keyboard.Listener(on_press=on_press, on_release=on_release)
    listener.start()

    def stop_check():
        # Poll the stop future, then actually stop the listener — the original
        # never called listener.stop(), so its join() blocked forever.
        while not stop_future.done():
            time.sleep(0.5)
        listener.stop()
        listener.join()

    threading.Thread(target=stop_check, daemon=True).start()
async def main():
    """Wire up the audio and keyboard backends and print segments until interrupted."""
    stop_future: asyncio.Future = asyncio.Future()
    is_listening = asyncio.Event()
    print("[MAIN] starting audio backend")
    result_queue = await start_audio_transcription_backend(is_listening, stop_future)
    print("[MAIN] starting keyboard backend")
    # Keep a strong reference — asyncio holds tasks only weakly.
    keyboard_task = asyncio.create_task(start_keyboard_backend(is_listening, stop_future))
    print("[MAIN] Beginning main loop - hold F23 to perform transcription")
    try:
        while True:
            result = await result_queue.get()
            segments = result["segments"]
            print("[MAIN] Transcribing...", segments)
    except (KeyboardInterrupt, asyncio.CancelledError):
        # Under asyncio.run(), Ctrl-C cancels this task rather than raising
        # KeyboardInterrupt at the await point — handle both so shutdown runs.
        print("[MAIN] Stopping...")
    finally:
        # Signal the backends to stop regardless of how the loop exited.
        if not stop_future.done():
            stop_future.set_result(True)
        del keyboard_task


if __name__ == "__main__":
    asyncio.run(main())