Skip to content

Commit

Permalink
Feat: Demo feature for remote streaming speech to text (#111)
Browse files Browse the repository at this point in the history
* add argument parser to allow --demo that triggers pyaudio's audio recording

* rework class hierarchy and file structure

* recording works, although facing output & missing audio issues

* test on predetermined audio first, although problem with cutting off end

* fix hallucination using threading and queue

* add VAD to process silences

* fix audio getting stuck mid-sentence by feeding in dummy audio sample. fix silence count logic.

* add pyaudio to github pip install

* install pyaudio dependencies

* git install silero-vad

* add documentation and remove unnecessary comments

* remove import pyaudio and dependencies in the beginning to keep simuleval lightweight

* fix stuck after silence by system reset
  • Loading branch information
Epic-Eric authored Sep 13, 2024
1 parent 6101ba1 commit 536de82
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,14 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install libsndfile1
sudo apt-get install portaudio19-dev
python -m pip install --upgrade pip==24.0
pip install flake8 pytest black
pip install g2p-en
pip install huggingface-hub
pip install fairseq
pip install sentencepiece
pip install openai-whisper editdistance
pip install openai-whisper editdistance pyaudio silero-vad
pip install -e .
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')"
Expand Down
14 changes: 14 additions & 0 deletions examples/speech_to_text/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,17 @@ WER LAAL AL AP DAL ATD
```

This agent can also perform S2T task, by adding `--task translate`.

### Streaming Speech-to-Text Demo

This demo performs streaming speech-to-text: it takes input from the user's microphone, sends it to Whisper's wait-k model, and displays the predicted text in the terminal.

1. Kick off a remote agent. For more information, see [Remote_agent](../../docs/tutorials/remote_evaluation.rst).
2. Enter demo mode by providing a desired segment size (usually 500ms):

```bash
simuleval --remote-eval --demo --source-segment-size 500 --remote-port 8888
```

3. Speak into the microphone and watch the live transcription!
4. Press ^C (Ctrl+C) in the terminal to exit the program.
4 changes: 3 additions & 1 deletion simuleval/data/dataloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ def build_dataloader(args: Namespace) -> GenericDataloader:
assert dataloader_key in DATALOADER_DICT, f"{dataloader_key} is not defined"
logger.info(f"Evaluating from dataloader {dataloader_key}.")
return DATALOADER_DICT[dataloader_key].from_args(args)

if args.demo:
args.source_type = "speech"
args.target_type = "text"
assert args.source_type in SUPPORTED_SOURCE_MEDIUM
assert args.target_type in SUPPORTED_TARGET_MEDIUM

Expand Down
2 changes: 2 additions & 0 deletions simuleval/data/dataloader/s2t_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ def from_files(
def from_args(cls, args: Namespace):
    """Build the speech-to-text dataloader from parsed arguments.

    Sets ``args.source_type`` to ``"speech"`` and ``args.target_type`` to
    ``"text"`` as a side effect.

    Args:
        cls: The dataloader class to instantiate.
        args: Parsed options. ``source``, ``target`` and ``tgt_lang`` are read
            unless demo mode is on; ``demo`` is optional.

    Returns:
        An empty dataloader (three empty lists) in demo mode, otherwise the
        result of ``cls.from_files``.
    """
    args.source_type = "speech"
    args.target_type = "text"
    # Namespaces built by callers that predate the --demo flag have no `demo`
    # attribute; treat a missing flag as "not a demo" instead of raising.
    if getattr(args, "demo", False):
        # Demo audio comes live from the microphone, so there are no files.
        return cls([], [], [])
    return cls.from_files(args.source, args.target, args.tgt_lang)


Expand Down
3 changes: 3 additions & 0 deletions simuleval/evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@

from .evaluator import SentenceLevelEvaluator
from .remote import RemoteEvaluator
from .remote import DemoRemote


def build_evaluator(args):
    """Create the sentence-level evaluator described by ``args``."""
    sentence_evaluator = SentenceLevelEvaluator.from_args(args)
    return sentence_evaluator


def build_remote_evaluator(args):
    """Create the remote evaluator that wraps a sentence-level evaluator.

    Args:
        args: Parsed options. ``demo`` is optional; when truthy the live
            microphone demo evaluator is returned.

    Returns:
        A ``DemoRemote`` in demo mode, otherwise a ``RemoteEvaluator``.
    """
    evaluator = build_evaluator(args)
    # A namespace created without the --demo flag has no `demo` attribute;
    # default to the regular remote evaluator rather than raising.
    if getattr(args, "demo", False):
        return DemoRemote(evaluator)
    return RemoteEvaluator(evaluator)
142 changes: 141 additions & 1 deletion simuleval/evaluator/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,28 @@
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import sys
import logging
from simuleval.data.segments import Segment, segment_from_json_string
import threading
import time
from queue import Queue
import numpy as np

# Optional dependencies that are only needed by the live microphone demo.
# When any of them cannot be loaded, every name is bound to None so the rest
# of the module still imports; DemoRemote checks for None before using them.
try:
    import wave
    import pyaudio
    from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
except Exception:
    # Deliberately not a bare `except:` so KeyboardInterrupt and SystemExit
    # still propagate. Kept broader than ImportError because packages backed
    # by native libraries may fail to load with other errors (e.g. OSError).
    wave = pyaudio = load_silero_vad = read_audio = get_speech_timestamps = None

from simuleval.data.segments import (
Segment,
segment_from_json_string,
SpeechSegment,
EmptySegment,
)
from simuleval.evaluator import SentenceLevelEvaluator
import requests

Expand Down Expand Up @@ -40,8 +60,128 @@ def remote_eval(self):
self.system_reset()
while not instance.finish_prediction:
self.send_source(instance.send_source(self.source_segment_size))
# instance.py line 275, returns a segment object with all the floats in the 500 ms range

output_segment = self.receive_prediction()
# gets the prediction in text! like "This"...
# refreshes each time. "This" for the 1st, "is" for the second

instance.receive_prediction(output_segment)
# instance.py line 190
# processes data, gets in a prediction list with ["This", "is"] on 2nd iteration
self.evaluator.write_log(instance)

self.evaluator.dump_results()


class DemoRemote(RemoteEvaluator):
    """Live microphone demo that streams speech to a remote agent.

    A background thread records audio from the default input device and queues
    one chunk of raw 16-bit PCM roughly every ``source_segment_size``
    milliseconds (attribute presumably set by ``RemoteEvaluator.__init__``).
    The main loop runs voice-activity detection on each chunk, sends it to the
    remote agent and prints every non-empty prediction to the terminal.
    """

    # A chunk is still forwarded while at most this many consecutive chunks
    # contained no speech; one more silent chunk closes the utterance.
    MAX_SILENT_SEGMENTS = 4
    # Number of zero samples in the dummy segment that closes an utterance.
    FLUSH_SAMPLES = 8192
    # Seconds to wait for a chunk before re-checking whether recording ended.
    QUEUE_POLL_SECONDS = 1.0

    def __init__(self, evaluator: SentenceLevelEvaluator) -> None:
        """Load the VAD model and prepare the recording state.

        Args:
            evaluator: Sentence-level evaluator passed to ``RemoteEvaluator``.

        Raises:
            ImportError: If any optional demo dependency failed to import.
        """
        demo_deps = (wave, pyaudio, load_silero_vad, read_audio, get_speech_timestamps)
        if any(dep is None for dep in demo_deps):
            # ImportError is a subclass of Exception, so callers that caught
            # the previous generic Exception keep working.
            raise ImportError(
                "Please install wave, pyaudio, and silero_vad to run the demo"
            )
        super().__init__(evaluator)
        self.float_array = np.asarray([])  # unused here; kept for compatibility
        self.sample_rate = 16000
        # The original chose two channels everywhere except macOS.
        self.channels = 1 if sys.platform == "darwin" else 2
        self.finished = False
        self.queue = Queue(maxsize=0)
        self.VADmodel = load_silero_vad()
        self.silence_count = 0
        self._stop_recording = threading.Event()

    def record_audio(self):
        """Record from the microphone until stopped; runs in a worker thread.

        Every frame is appended to ``output.wav`` and accumulated into a
        buffer that is pushed onto ``self.queue`` once more than
        ``source_segment_size`` milliseconds of wall-clock time have passed.
        ``self.finished`` is set when recording ends for any reason, including
        an error, so the consumer loop can terminate.
        """
        frames_per_read = 1024
        sample_format = pyaudio.paInt16
        rate = self.sample_rate
        record_seconds = 10000  # effectively "until interrupted"
        segment_seconds = self.source_segment_size / 1000.0

        audio = pyaudio.PyAudio()
        stream = None
        try:
            with wave.open("output.wav", "wb") as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(audio.get_sample_size(sample_format))
                wf.setframerate(rate)

                stream = audio.open(
                    format=sample_format,
                    channels=self.channels,
                    rate=rate,
                    input=True,
                )

                pending = bytearray()
                started = time.time()
                for _ in range(round(rate // frames_per_read * record_seconds)):
                    if self._stop_recording.is_set():
                        break
                    data = stream.read(frames_per_read)
                    wf.writeframes(data)
                    pending += data
                    if time.time() - started > segment_seconds:
                        self.queue.put(pending)
                        pending = bytearray()
                        started = time.time()

                # Hand over whatever was captured since the last full segment.
                self.queue.put(pending)
        finally:
            # Release the audio device even when opening or reading failed.
            if stream is not None:
                stream.close()
            audio.terminate()
            self.finished = True

    def _to_mono(self, samples):
        """Average interleaved channels so ``samples`` is a mono signal."""
        if self.channels > 1:
            # A multi-channel stream delivers interleaved frames; passing them
            # on as-is would present N-times too many samples as mono audio.
            return samples.reshape(-1, self.channels).mean(axis=1)
        return samples

    def remote_eval(self):
        """Stream microphone audio to the remote agent and print predictions.

        Chunks are forwarded with ``finished=False`` while speech was seen
        recently. After more than ``MAX_SILENT_SEGMENTS`` consecutive silent
        chunks a dummy all-zero segment with ``finished=True`` is sent, its
        response is discarded, and the remote system is reset.
        """
        from queue import Empty

        self.system_reset()
        # Daemon thread: a blocked audio read must never keep the process alive.
        recording = threading.Thread(target=self.record_audio, daemon=True)
        recording.start()

        print("Recording...")
        try:
            while not self.finished or not self.queue.empty():
                try:
                    # The timeout lets the loop condition be re-checked, so a
                    # recorder that ended without queuing more data cannot
                    # leave this call blocked forever.
                    raw = self.queue.get(timeout=self.QUEUE_POLL_SECONDS)
                except Empty:
                    continue
                if not raw:
                    # The recorder's final hand-over may be an empty buffer.
                    continue

                data = self._to_mono(byte_to_float(raw)).tolist()
                speech_timestamps = get_speech_timestamps(
                    audio=data, model=self.VADmodel, sampling_rate=self.sample_rate
                )

                if speech_timestamps:  # chunk contains speech
                    self.silence_count = 0
                else:
                    self.silence_count += 1

                if self.silence_count <= self.MAX_SILENT_SEGMENTS:
                    segment = SpeechSegment(
                        index=self.source_segment_size,
                        content=data,
                        sample_rate=self.sample_rate,
                        finished=False,
                    )
                    self.send_source(segment)
                    output_segment = self.receive_prediction()
                    if len(output_segment.content) == 0:
                        continue
                    prediction = output_segment.content.replace(" ", "")
                    print(prediction, end=" ", flush=True)
                else:
                    # Long silence: close the utterance with a dummy sample so
                    # the agent does not stay stuck mid-sentence, then reset.
                    segment = SpeechSegment(
                        index=self.source_segment_size,
                        content=[0.0] * self.FLUSH_SAMPLES,
                        sample_rate=self.sample_rate,
                        finished=True,
                    )
                    self.send_source(segment)
                    self.receive_prediction()  # response intentionally unused
                    self.silence_count = 0
                    self.system_reset()
        finally:
            # Also reached on Ctrl-C: ask the recorder to stop and give it a
            # moment to close output.wav and the audio device.
            self._stop_recording.set()
            recording.join(timeout=5.0)


def pcm2float(sig, dtype="float32"):
    """Convert integer PCM samples to floating point in [-1.0, 1.0).

    Args:
        sig: Array-like of signed or unsigned integer samples.
        dtype: Floating point dtype of the result.

    Returns:
        Array of ``dtype`` holding the samples scaled by half the integer
        range (32768 for 16-bit input, so -32768 maps to -1.0).

    Raises:
        TypeError: If ``sig`` is not of integer type or ``dtype`` is not a
            floating point type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind not in ("i", "u"):
        raise TypeError("'sig' must be an array of integers")

    target = np.dtype(dtype)
    if target.kind != "f":
        raise TypeError("'dtype' must be a floating point type")

    info = np.iinfo(samples.dtype)
    half_range = 2 ** (info.bits - 1)
    # Zero for signed input; the mid-point of the range for unsigned input.
    zero_point = info.min + half_range
    centred = samples.astype(target) - zero_point
    return centred / half_range


def byte_to_float(byte):
    """Decode a buffer of raw 16-bit PCM into float32 samples.

    The bytes are viewed as ``int16`` values and scaled by ``pcm2float``.
    """
    pcm_samples = np.frombuffer(byte, dtype=np.int16)
    return pcm2float(pcm_samples, dtype="float32")
6 changes: 6 additions & 0 deletions simuleval/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,12 @@ def general_parser(
default=False,
help="Create visualization graphs",
)
parser.add_argument(
"--demo",
action="store_true",
default=False,
help="Live remote speech to text demonstration in terminal",
)

return parser

Expand Down

0 comments on commit 536de82

Please sign in to comment.