From eb1ee46c26ee6a9f53ddcbe470aa3134c438e097 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 4 Dec 2023 17:34:41 +0000
Subject: [PATCH 01/13] feat: actionlib server for speech recognition

---
 .../CMakeLists.txt                            |  11 +-
 .../action/TranscribeSpeech.action            |   6 +
 .../lasr_speech_recognition_msgs/package.xml  |   2 +
 .../CMakeLists.txt                            |  10 +-
 .../nodes/transcribe_microphone_server        | 236 ++++++++++++++++++
 .../package.xml                               |   4 +
 6 files changed, 264 insertions(+), 5 deletions(-)
 create mode 100644 common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action
 create mode 100644 common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server

diff --git a/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt b/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt
index ae6bac4d8..10e0472f1 100644
--- a/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt
@@ -7,7 +7,11 @@ project(lasr_speech_recognition_msgs)
 ## Find catkin macros and libraries
 ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
 ## is used, also find other catkin packages
-find_package(catkin REQUIRED COMPONENTS message_generation)
+find_package(catkin REQUIRED COMPONENTS message_generation genmsg actionlib_msgs actionlib std_msgs)
+add_action_files(
+  DIRECTORY action
+  FILES TranscribeSpeech.action
+)
 
 ## System dependencies are found with CMake's conventions
 # find_package(Boost REQUIRED COMPONENTS system)
@@ -63,8 +67,9 @@ add_service_files(
 
 ## Generate added messages and services with any dependencies listed here
 generate_messages(
-#   DEPENDENCIES
-#   std_msgs  # Or other packages containing msgs
+  DEPENDENCIES
+  std_msgs  # Or other packages containing msgs
+  actionlib_msgs
 )
 
 ################################################
diff --git a/common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action b/common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action
new file mode 100644
index 000000000..486b0cb19
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action
@@ -0,0 +1,6 @@
+---
+#result definition
+string sequence
+---
+#feedback
+string sequence
\ No newline at end of file
diff --git a/common/speech/lasr_speech_recognition_msgs/package.xml b/common/speech/lasr_speech_recognition_msgs/package.xml
index e319ec5db..6f00b03f4 100644
--- a/common/speech/lasr_speech_recognition_msgs/package.xml
+++ b/common/speech/lasr_speech_recognition_msgs/package.xml
@@ -51,6 +51,8 @@
   <buildtool_depend>catkin</buildtool_depend>
   <build_depend>message_generation</build_depend>
   <exec_depend>message_runtime</exec_depend>
+  <build_depend>actionlib_msgs</build_depend>
+  <exec_depend>actionlib_msgs</exec_depend>
diff --git a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
index d2cce16f7..75240eddb 100644
--- a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
@@ -7,7 +7,11 @@ project(lasr_speech_recognition_whisper)
 ## Find catkin macros and libraries
 ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
 ## is used, also find other catkin packages
-find_package(catkin REQUIRED catkin_virtualenv)
+find_package(catkin REQUIRED catkin_virtualenv genmsg actionlib_msgs actionlib std_msgs)
+# add_action_files(
+#   DIRECTORY action
+#   FILES TranscribeSpeech.action
+# )
 
 ## System dependencies are found with CMake's conventions
 # find_package(Boost REQUIRED COMPONENTS system)
@@ -70,7 +74,8 @@ catkin_generate_virtualenv(
 
 ## Generate added messages and services with any dependencies listed here
 # generate_messages(
 #   DEPENDENCIES
-#   std_msgs  # Or other packages containing msgs
+#   std_msgs
+#   actionlib_msgs  # Or other packages containing msgs
 # )
 
@@ -162,6 +167,7 @@ include_directories(
 catkin_install_python(PROGRAMS
   nodes/simple_transcribe_microphone
   nodes/transcribe_microphone
+  nodes/transcribe_microphone_server
   scripts/list_microphones.py
   scripts/test_microphones.py
   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
new file mode 100644
index 000000000..ee0df9578
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+from typing import Optional, Union
+from dataclasses import dataclass
+from pathlib import Path
+
+import rospy
+import numpy as np
+import torch
+import actionlib
+import speech_recognition as sr  # type: ignore
+import lasr_speech_recognition_msgs.msg  # type: ignore
+from lasr_speech_recognition_whisper import load_model  # type: ignore
+
+
+@dataclass
+class speech_model_params:
+    """Class for storing speech recognition model parameters."""
+
+    model_name: str = "medium.en"
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+    start_timeout: float = 5.0
+    end_timeout: Optional[float] = None
+    sample_rate: int = 16000
+    mic_device: Optional[Union[int, str]] = None
+
+
+class TranscribeSpeechAction(object):
+    # create messages that are used to publish feedback/result
+    _feedback = lasr_speech_recognition_msgs.msg.TranscribeSpeechFeedback()
+    _result = lasr_speech_recognition_msgs.msg.TranscribeSpeechResult()
+
+    def __init__(
+        self,
+        action_name: str,
+        model_params: speech_model_params,
+    ) -> None:
+        """Starts an action server for transcribing speech.
+
+        Args:
+            action_name (str): Name of the action server.
+        """
+
+        self._action_name = action_name
+        self._model_params = model_params
+        self._model = load_model(
+            self._model_params.model_name, device=self._model_params.device
+        )
+        self.recogniser = self._configure_recogniser()
+        self._feedback_sequence: Optional[list[str]] = None
+        self._action_server = actionlib.SimpleActionServer(
+            self._action_name,
+            lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
+            execute_cb=self.execute_cb,
+            auto_start=False,
+        )
+        self._action_server.register_preempt_callback(self.preempt_cb)
+        self._action_server.start()
+
+    def _configure_microphone(self) -> sr.Microphone:
+        """Configures the microphone for listening to speech based on the
+        microphone device index or name.
+
+        Returns: microphone object
+        """
+
+        if (
+            isinstance(self._model_params.mic_device, int)
+            or self._model_params.mic_device.isdigit()
+        ):
+            return sr.Microphone(
+                device_index=int(self._model_params.mic_device),
+                sample_rate=self._model_params.sample_rate,
+            )
+        elif isinstance(self._model_params.mic_device, str):
+            microphones = enumerate(sr.Microphone.list_microphone_names())
+            for index, name in microphones:
+                if self._model_params.mic_device in name:
+                    return sr.Microphone(
+                        device_index=index,
+                        sample_rate=self._model_params.sample_rate,
+                    )
+            raise ValueError(
+                f"Could not find microphone with name: {self._model_params.mic_device}"
+            )
+        # If no microphone device is specified, use the system default microphone
+        return sr.Microphone(sample_rate=self._model_params.sample_rate)
+
+    def _configure_recogniser(self, ambient_adj: bool = True) -> sr.Recognizer:
+        """Configures the speech recogniser object.
+
+        Args:
+            ambient_adj (bool, optional): Whether to adjust for ambient noise. Defaults to True.
+
+        Returns:
+            sr.Recognizer: speech recogniser object.
+        """
+
+        recogniser = sr.Recognizer()
+        if ambient_adj:
+            with self._configure_microphone() as source:
+                recogniser.adjust_for_ambient_noise(source)
+        return recogniser
+
+    def preempt_cb(self) -> None:
+        """Callback for preempting the action server.
+
+        Resets the feedback sequence and sets server to preempted.
+        """
+        self._action_server.set_preempted()
+
+    def execute_cb(self, goal) -> None:
+        self._feedback_sequence = []
+        with self._configure_microphone() as src:
+            wav_data = self.recogniser.listen(
+                src,
+                timeout=self._model_params.start_timeout,
+                phrase_time_limit=self._model_params.end_timeout,
+            ).get_wav_data()
+        # Dividing by 32768.0 scales 16-bit signed PCM samples to [-1.0, 1.0)
+        float_data = (
+            np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
+            / 32768.0
+        )
+
+        # Cast to fp16 if using GPU
+        phrase = self._model.transcribe(
+            float_data,
+            fp16=self._model_params.device == "cuda",
+        )["text"]
+
+        self._feedback_sequence.append(phrase)
+        self._result.sequence = phrase
+        rospy.loginfo(f"Transcribed phrase: {phrase}")
+        rospy.loginfo(f"{self._action_name} has succeeded")
+        self._action_server.set_succeeded(self._result)
+
+
+def parse_args() -> dict:
+    """Parses the command line arguments into a name: value dictionary.
+
+    Returns:
+        dict: Dictionary of name: value pairs of command line arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description="Starts an action server for transcribing speech."
+    )
+
+    parser.add_argument(
+        "--action-name",
+        type=str,
+        default="transcribe_speech",
+        help="Name of the action server.",
+    )
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="medium.en",
+        help="Name of the speech recognition model.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device to run the model on.",
+    )
+    parser.add_argument(
+        "--start_timeout",
+        type=float,
+        default=5.0,
+        help="Timeout for listening for the start of a phrase.",
+    )
+    parser.add_argument(
+        "--end_timeout",
+        type=float,
+        default=None,
+        help="Timeout for listening for the end of a phrase.",
+    )
+    parser.add_argument(
+        "--sample_rate",
+        type=int,
+        default=16000,
+        help="Sample rate of the microphone.",
+    )
+    parser.add_argument(
+        "--mic_device",
+        default=None,
+        help="Microphone device index or name. Can be a string or an integer.",
+    )
+
+    return vars(parser.parse_args())
+
+
+def configure_model_params(config: dict) -> speech_model_params:
+    """Configures the speech model parameters based on the provided
+    command line parameters.
+
+    Args:
+        config (dict): Command line parameters parsed in dictionary form.
+
+    Returns:
+        speech_model_params: dataclass containing the speech model parameters
+    """
+    model_params = speech_model_params()
+    if config["model_name"]:
+        model_params.model_name = config["model_name"]
+    if config["device"]:
+        model_params.device = config["device"]
+    if config["start_timeout"]:
+        model_params.start_timeout = config["start_timeout"]
+    if config["end_timeout"]:
+        model_params.end_timeout = config["end_timeout"]
+    if config["sample_rate"]:
+        model_params.sample_rate = config["sample_rate"]
+    if config["mic_device"]:
+        model_params.mic_device = config["mic_device"]
+
+    return model_params
+
+
+def configure_whisper_cache() -> None:
+    """Configures the whisper cache directory."""
+    whisper_cache = os.path.join(str(Path.home()), ".cache", "whisper")
+    os.makedirs(whisper_cache, exist_ok=True)
+    # Environmental variable required to run whisper locally
+    os.environ["TIKTOKEN_CACHE_DIR"] = whisper_cache
+
+
+if __name__ == "__main__":
+    configure_whisper_cache()
+    config = parse_args()
+    rospy.init_node(config["action_name"])
+    server = TranscribeSpeechAction(rospy.get_name(), configure_model_params(config))
+    rospy.spin()
diff --git a/common/speech/lasr_speech_recognition_whisper/package.xml b/common/speech/lasr_speech_recognition_whisper/package.xml
index 39935c089..4c6f49965 100644
--- a/common/speech/lasr_speech_recognition_whisper/package.xml
+++ b/common/speech/lasr_speech_recognition_whisper/package.xml
@@ -51,6 +51,10 @@
   <buildtool_depend>catkin</buildtool_depend>
   <build_depend>catkin_virtualenv</build_depend>
   <depend>lasr_speech_recognition_msgs</depend>
+  <build_depend>actionlib</build_depend>
+  <build_depend>actionlib_msgs</build_depend>
+  <exec_depend>actionlib</exec_depend>
+  <exec_depend>actionlib_msgs</exec_depend>
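An aside on the sample conversion inside `execute_cb`: `speech_recognition` hands back 16-bit signed PCM bytes, while Whisper expects `float32` samples in `[-1.0, 1.0]`, hence the division by 32768.0. The same conversion in isolation, as a sketch (the synthetic `wav_data` below is a hypothetical stand-in for `listen(...).get_wav_data()`):

```python
import numpy as np

# Stand-in for the bytes returned by recogniser.listen(...).get_wav_data():
# one second of a 440 Hz tone encoded as 16-bit signed PCM at 16 kHz.
t = np.linspace(0, 1, 16000, endpoint=False)
wav_data = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()

# The node's conversion: reinterpret the bytes as int16 samples,
# then scale them into [-1.0, 1.0] for Whisper.
float_data = np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C") / 32768.0

assert float_data.dtype == np.float32
assert float_data.min() >= -1.0 and float_data.max() <= 1.0
```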
From e931db00707f8f8d08da94ecb817c2b8122593ad Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 4 Dec 2023 17:49:34 +0000
Subject: [PATCH 02/13] docs: add documentation for speech action server

---
 .../lasr_speech_recognition_whisper/doc/USAGE.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
index 6c0649478..8730a6645 100644
--- a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
+++ b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
@@ -26,3 +26,19 @@ Stop listening whenever:
 ```bash
 rosservice call /whisper/stop_listening "{}"
 ```
+
+Run an actionlib server to transcribe the microphone:
+
+```bash
+rosrun lasr_speech_recognition_whisper transcribe_microphone_server
+```
+
+The response from the request is a `string` containing the transcribed text.
+
+Several command line configuration options exist, which can be viewed with:
+
+```bash
+rosrun lasr_speech_recognition_whisper transcribe_microphone_server --help
+```
+
+
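For reference, a minimal Python client for the server documented above might look like the sketch below; it assumes the server is running under its default action name, `transcribe_speech`. (A fuller version of this pattern is added later in the series as `scripts/test_speech_server.py`.)

```python
#!/usr/bin/env python3
import rospy
import actionlib
from lasr_speech_recognition_msgs.msg import (  # type: ignore
    TranscribeSpeechAction,
    TranscribeSpeechGoal,
)

rospy.init_node("transcribe_speech_client")
client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
client.wait_for_server()

# The goal carries no fields; sending it triggers one listen/transcribe cycle.
client.send_goal(TranscribeSpeechGoal())
client.wait_for_result()
rospy.loginfo(f"Transcribed text: {client.get_result().sequence}")
```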
From e1d7069ce7b692f2ba344906452741bd8b41210a Mon Sep 17 00:00:00 2001
From: m-barker
Date: Tue, 5 Dec 2023 21:22:04 +0000
Subject: [PATCH 03/13] fix: incorrect typing of mic name arg

---
 .../nodes/transcribe_microphone_server | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index ee0df9578..aab5ded63 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -17,14 +17,24 @@ from lasr_speech_recognition_whisper import load_model  # type: ignore
 
 @dataclass
 class speech_model_params:
-    """Class for storing speech recognition model parameters."""
+    """Class for storing speech recognition model parameters.
+
+    Args:
+        model_name (str, optional): Name of the speech recognition model. Defaults to "medium.en".
+            Must be a valid Whisper model name.
+        device (str, optional): Device to run the model on. Defaults to "cuda" if available, otherwise "cpu".
+        start_timeout (float): Maximum number of seconds to wait for a phrase to start before timing out. Defaults to 5.0.
+        end_timeout (Optional[float]): Maximum number of seconds to allow a phrase to continue before cutting it off. Defaults to None (no limit).
+        sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
+        mic_device (Optional[str]): Microphone device index or name. Defaults to None.
+    """
 
     model_name: str = "medium.en"
     device: str = "cuda" if torch.cuda.is_available() else "cpu"
     start_timeout: float = 5.0
     end_timeout: Optional[float] = None
     sample_rate: int = 16000
-    mic_device: Optional[Union[int, str]] = None
+    mic_device: Optional[str] = None
 
 
 class TranscribeSpeechAction(object):
@@ -66,15 +76,15 @@ class TranscribeSpeechAction(object):
         Returns: microphone object
         """
 
-        if (
-            isinstance(self._model_params.mic_device, int)
-            or self._model_params.mic_device.isdigit()
-        ):
+        if self._model_params.mic_device is None:
+            # If no microphone device is specified, use the system default microphone
+            return sr.Microphone(sample_rate=self._model_params.sample_rate)
+        elif self._model_params.mic_device.isdigit():
             return sr.Microphone(
                 device_index=int(self._model_params.mic_device),
                 sample_rate=self._model_params.sample_rate,
             )
-        elif isinstance(self._model_params.mic_device, str):
+        else:
             microphones = enumerate(sr.Microphone.list_microphone_names())
             for index, name in microphones:
                 if self._model_params.mic_device in name:
@@ -85,8 +95,6 @@ class TranscribeSpeechAction(object):
             raise ValueError(
                 f"Could not find microphone with name: {self._model_params.mic_device}"
             )
-        # If no microphone device is specified, use the system default microphone
-        return sr.Microphone(sample_rate=self._model_params.sample_rate)
 
     def _configure_recogniser(self, ambient_adj: bool = True) -> sr.Recognizer:
         """Configures the speech recogniser object.
@@ -186,8 +194,9 @@ def parse_args() -> dict:
     )
     parser.add_argument(
         "--mic_device",
+        type=str,
         default=None,
-        help="Microphone device index or name. Can be a string or an integer.",
+        help="Microphone device index or name",
     )
 
     return vars(parser.parse_args())
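Since `--mic_device` now accepts either a device index (as a digit string) or a substring of a device name, it helps to know what devices are visible. A quick sketch using the same `speech_recognition` API (the package's existing `scripts/list_microphones.py` presumably serves a similar purpose):

```python
#!/usr/bin/env python3
import speech_recognition as sr

# Print every audio device PyAudio can see, alongside the index that
# --mic_device accepts as a digit string.
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print(f"{index}: {name}")
```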
From 199eae4cdaae4262142f8e45f5af3f52951aca54 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Tue, 5 Dec 2023 21:24:33 +0000
Subject: [PATCH 04/13] fix: remove unused feedback list

---
 .../nodes/transcribe_microphone_server | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index aab5ded63..7b4e68159 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -59,7 +59,6 @@ class TranscribeSpeechAction(object):
             self._model_params.model_name, device=self._model_params.device
         )
         self.recogniser = self._configure_recogniser()
-        self._feedback_sequence: Optional[list[str]] = None
         self._action_server = actionlib.SimpleActionServer(
             self._action_name,
             lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
@@ -120,7 +119,6 @@ class TranscribeSpeechAction(object):
         self._action_server.set_preempted()
 
     def execute_cb(self, goal) -> None:
-        self._feedback_sequence = []
         with self._configure_microphone() as src:
             wav_data = self.recogniser.listen(
                 src,
@@ -139,7 +137,6 @@ class TranscribeSpeechAction(object):
             fp16=self._model_params.device == "cuda",
         )["text"]
 
-        self._feedback_sequence.append(phrase)
         self._result.sequence = phrase
         rospy.loginfo(f"Transcribed phrase: {phrase}")
         rospy.loginfo(f"{self._action_name} has succeeded")

From faf94f7815ada8f312b442c1d6b2be3751a337d2 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Tue, 5 Dec 2023 22:09:27 +0000
Subject: [PATCH 05/13] fix: correctly check and handle preemption

---
 .../nodes/transcribe_microphone_server | 36 +++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 7b4e68159..70f8ee712 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -5,6 +5,7 @@ import argparse
 from typing import Optional, Union
 from dataclasses import dataclass
 from pathlib import Path
+from timeit import default_timer as timer
 
 import rospy
 import numpy as np
@@ -58,6 +59,9 @@ class TranscribeSpeechAction(object):
         self._model = load_model(
             self._model_params.model_name, device=self._model_params.device
         )
+        rospy.loginfo(
+            f"Loaded model: {self._model_params.model_name} on {self._model_params.device}"
+        )
         self.recogniser = self._configure_recogniser()
         self._action_server = actionlib.SimpleActionServer(
             self._action_name,
@@ -114,11 +118,26 @@ class TranscribeSpeechAction(object):
     def preempt_cb(self) -> None:
         """Callback for preempting the action server.
 
-        Resets the feedback sequence and sets server to preempted.
+        Sets server to preempted state.
         """
-        self._action_server.set_preempted()
+        preempted_str = f"{self._action_name} has been preempted"
+        rospy.loginfo(preempted_str)
+        self._result.sequence = preempted_str
+        self._action_server.set_preempted(result=self._result, text=preempted_str)
 
     def execute_cb(self, goal) -> None:
+        """Callback for executing the action server.
+
+        Checks for preemption before listening and before and after transcribing, returning
+        if preemption is requested.
+
+        Args:
+            goal: UNUSED - actionlib requires a goal argument in the execute callback, but
+                this action server does not use a goal.
+        """
+
+        if self._action_server.is_preempt_requested():
+            return
         with self._configure_microphone() as src:
             wav_data = self.recogniser.listen(
                 src,
@@ -131,11 +150,24 @@ class TranscribeSpeechAction(object):
             / 32768.0
         )
 
+        if self._action_server.is_preempt_requested():
+            return
+
+        rospy.loginfo(f"Transcribing phrase with Whisper...")
+        transcription_start_time = timer()
         # Cast to fp16 if using GPU
         phrase = self._model.transcribe(
             float_data,
             fp16=self._model_params.device == "cuda",
         )["text"]
+        transcription_end_time = timer()
+        rospy.loginfo(f"Transcription finished!")
+        rospy.loginfo(
+            f"Time taken: {transcription_end_time - transcription_start_time:.2f}s"
+        )
+
+        if self._action_server.is_preempt_requested():
+            return
 
         self._result.sequence = phrase
         rospy.loginfo(f"Transcribed phrase: {phrase}")
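On the client side, this preemption path is exercised by cancelling an in-flight goal. A hedged sketch, again assuming the default `transcribe_speech` action name (note the result may only arrive once the server's blocking `listen()` call returns):

```python
import rospy
import actionlib
from lasr_speech_recognition_msgs.msg import (  # type: ignore
    TranscribeSpeechAction,
    TranscribeSpeechGoal,
)

rospy.init_node("preempt_demo")
client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
client.wait_for_server()

client.send_goal(TranscribeSpeechGoal())
rospy.sleep(1.0)      # give the server time to start listening
client.cancel_goal()  # triggers the server's preempt callback
client.wait_for_result()
# The result's `sequence` field should carry the "... has been preempted" message.
print(client.get_result().sequence)
```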
From c3fb186069c3d1d98d3ab026c6020a12d638209e Mon Sep 17 00:00:00 2001
From: m-barker
Date: Wed, 13 Dec 2023 19:05:46 +0000
Subject: [PATCH 06/13] feat: timer for adjusting mic for ambient noise

---
 .../nodes/transcribe_microphone_server | 31 +++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 70f8ee712..1f15aa9d6 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -28,6 +28,7 @@ class speech_model_params:
         end_timeout (Optional[float]): Maximum number of seconds to allow a phrase to continue before cutting it off. Defaults to None (no limit).
         sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
         mic_device (Optional[str]): Microphone device index or name. Defaults to None.
+        timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
""" model_name: str = "medium.en" @@ -36,6 +37,7 @@ class speech_model_params: end_timeout: Optional[float] = None sample_rate: int = 16000 mic_device: Optional[str] = None + timer_duration: Optional[int] = 20 class TranscribeSpeechAction(object): @@ -62,7 +64,9 @@ class TranscribeSpeechAction(object): rospy.loginfo( f"Loaded model: {self._model_params.model_name} on {self._model_params.device}" ) + # Configure the speech recogniser object and adjust for ambient noise self.recogniser = self._configure_recogniser() + # Setup the action server and register execution callback self._action_server = actionlib.SimpleActionServer( self._action_name, lasr_speech_recognition_msgs.msg.TranscribeSpeechAction, @@ -70,8 +74,25 @@ class TranscribeSpeechAction(object): auto_start=False, ) self._action_server.register_preempt_callback(self.prempt_cb) + # Setup the timer for adjusting the microphone for ambient noise every x seconds + self._timer_duration = self._model_params.timer_duration + self._timer = rospy.Timer(rospy.Duration(self._timer_duration), self._timer_cb) + self._listening = False + self._action_server.start() + def _timer_cb(self) -> None: + """Adjusts the microphone for ambient noise, unless the action server is listening.""" + if self._listening: + return + with self._configure_microphone() as source: + self.recogniser.adjust_for_ambient_noise(source) + + def _reset_timer(self) -> None: + """Resets the timer for adjusting the microphone for ambient noise.""" + self._timer.shutdown() + self._timer = rospy.Timer(rospy.Duration(self._timer_duration), self._timer_cb) + def _configure_microphone(self) -> sr.Microphone: """Configures the microphone for listening to speech based on the microphone device index or name. @@ -108,11 +129,12 @@ class TranscribeSpeechAction(object): Returns: sr.Recognizer: speech recogniser object. """ - + self._listening = True recogniser = sr.Recognizer() if ambient_adj: with self._configure_microphone() as source: recogniser.adjust_for_ambient_noise(source) + self._listening = False return recogniser def prempt_cb(self) -> None: @@ -135,10 +157,10 @@ class TranscribeSpeechAction(object): goal: UNUSED - actionlib requires a goal argument in the execute callback, but this action server does not use a goal. """ - if self._action_server.is_preempt_requested(): return with self._configure_microphone() as src: + self._listening = True wav_data = self.recogniser.listen( src, timeout=self._model_params.start_timeout, @@ -151,6 +173,7 @@ class TranscribeSpeechAction(object): ) if self._action_server.is_preempt_requested(): + self._listening = False return rospy.loginfo(f"Transcribing phrase with Whisper...") @@ -167,6 +190,7 @@ class TranscribeSpeechAction(object): ) if self._action_server.is_preempt_requested(): + self._listening = False return self._result.sequence = phrase @@ -174,6 +198,9 @@ class TranscribeSpeechAction(object): rospy.loginfo(f"{self._action_name} has succeeded") self._action_server.set_succeeded(self._result) + # Have this at the very end to not disrupt the action server + self._listening = False + def parse_args() -> dict: """Parses the command line arguments into a name: value dictinoary. 
From 271b5f3ac6ffdf2430e47eef4cd363f88af76bb6 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Thu, 14 Dec 2023 20:54:35 +0000
Subject: [PATCH 07/13] feat: ambient timer duration cli

---
 .../nodes/transcribe_microphone_server | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 1f15aa9d6..6bf797da0 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -85,6 +85,7 @@ class TranscribeSpeechAction(object):
         """Adjusts the microphone for ambient noise, unless the action server is listening."""
         if self._listening:
             return
+        rospy.loginfo("Adjusting microphone for ambient noise...")
         with self._configure_microphone() as source:
             self.recogniser.adjust_for_ambient_noise(source)
 
@@ -254,6 +255,12 @@ def parse_args() -> dict:
         default=None,
         help="Microphone device index or name",
     )
+    parser.add_argument(
+        "--timer_duration",
+        type=int,
+        default=20,
+        help="Number of seconds of silence before the ambient noise adjustment is called.",
+    )
 
     return vars(parser.parse_args())
 
@@ -281,6 +288,8 @@ def configure_model_params(config: dict) -> speech_model_params:
         model_params.sample_rate = config["sample_rate"]
     if config["mic_device"]:
         model_params.mic_device = config["mic_device"]
+    if config["timer_duration"]:
+        model_params.timer_duration = config["timer_duration"]
 
     return model_params

From 443538e105993cb5370a43409e63a4b714d6dae5 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Thu, 14 Dec 2023 21:13:10 +0000
Subject: [PATCH 08/13] feat: reset the timer upon entering exc callback

---
 .../nodes/transcribe_microphone_server | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 6bf797da0..63d4559af 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -2,7 +2,7 @@
 
 import os
 import argparse
-from typing import Optional, Union
+from typing import Optional
 from dataclasses import dataclass
 from pathlib import Path
 from timeit import default_timer as timer
@@ -160,6 +160,9 @@ class TranscribeSpeechAction(object):
         """
         if self._action_server.is_preempt_requested():
             return
+        # Since we are about to listen, reset the timer for adjusting the microphone for ambient noise
+        # as this assumes self._timer_duration seconds of silence before adjusting
+        self._reset_timer()
         with self._configure_microphone() as src:
             self._listening = True
             wav_data = self.recogniser.listen(
From dd6cb56fc6995194ef921c8b8d6f8e3634fddc19 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Thu, 14 Dec 2023 21:38:27 +0000
Subject: [PATCH 09/13] feat: warmup model by transcribing test file

---
 .../nodes/transcribe_microphone_server       | 16 ++++++--
 .../lasr_speech_recognition_whisper/cache.py | 40 +++++++++++++++----
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 63d4559af..2c538e535 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -29,6 +29,7 @@ class speech_model_params:
         sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
         mic_device (Optional[str]): Microphone device index or name. Defaults to None.
         timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
+        warmup (bool): Whether to warmup the model by running inference on a test file. Defaults to True.
     """
 
     model_name: str = "medium.en"
@@ -38,6 +39,7 @@ class speech_model_params:
     sample_rate: int = 16000
     mic_device: Optional[str] = None
    timer_duration: Optional[int] = 20
+    warmup: bool = True
 
 
 class TranscribeSpeechAction(object):
@@ -59,10 +61,9 @@ class TranscribeSpeechAction(object):
         self._action_name = action_name
         self._model_params = model_params
         self._model = load_model(
-            self._model_params.model_name, device=self._model_params.device
-        )
-        rospy.loginfo(
-            f"Loaded model: {self._model_params.model_name} on {self._model_params.device}"
+            self._model_params.model_name,
+            self._model_params.device,
+            self._model_params.warmup,
         )
         # Configure the speech recogniser object and adjust for ambient noise
         self.recogniser = self._configure_recogniser()
@@ -264,6 +265,11 @@ def parse_args() -> dict:
         default=20,
         help="Number of seconds of silence before the ambient noise adjustment is called.",
     )
+    parser.add_argument(
+        "--no_warmup",
+        action="store_true",
+        help="Disable warming up the model by running inference on a test file.",
+    )
 
     return vars(parser.parse_args())
 
@@ -293,6 +299,8 @@ def configure_model_params(config: dict) -> speech_model_params:
         model_params.mic_device = config["mic_device"]
     if config["timer_duration"]:
         model_params.timer_duration = config["timer_duration"]
+    if config["no_warmup"]:
+        model_params.warmup = False
 
     return model_params
 
diff --git a/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py b/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py
index d0ec731fc..42ec44785 100644
--- a/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py
+++ b/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py
@@ -1,17 +1,43 @@
-import whisper
+import os
+import whisper  # type: ignore
+import rospkg  # type: ignore
 import rospy
 
 # Keep all loaded models in memory
 MODEL_CACHE = {}
 
-def load_model(name: str, device: str = 'cpu'):
-    '''
-    Load a given Whisper model
-    '''
+
+def load_model(
+    name: str, device: str = "cpu", load_test_file: bool = False
+) -> whisper.Whisper:
+    """Loads a whisper model from disk, or from cache if it has already been loaded.
+
+    Args:
+        name (str): Name of the whisper model. Must be the name of an official whisper
+            model, or the path to a model checkpoint.
+        device (str, optional): Pytorch device to put the model on. Defaults to 'cpu'.
+        load_test_file (bool, optional): Whether to run inference on a test audio file
+            after loading the model (if model is not in cache). Defaults to False. Test file
+            is assumed to be called "test.m4a" and be in the root of the package directory.
+
+    Returns:
+        whisper.Whisper: Whisper model instance
+    """
     global MODEL_CACHE
 
     if name not in MODEL_CACHE:
-        rospy.loginfo(f'Load model {name}')
+        rospy.loginfo(f"Loading model {name}")
         MODEL_CACHE[name] = whisper.load_model(name, device=device)
-    
+        rospy.loginfo(f"Successfully loaded model {name} on {device}")
+        if load_test_file:
+            package_root = rospkg.RosPack().get_path("lasr_speech_recognition_whisper")
+            example_fp = os.path.join(package_root, "test.m4a")
+            rospy.loginfo(
+                "Running transcription on example file to ensure model is loaded..."
+            )
+            test_result = MODEL_CACHE[name].transcribe(
+                example_fp, fp16=device == "cuda"
+            )
+            rospy.loginfo(f"Transcription test result: {test_result}")
+
     return MODEL_CACHE[name]
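To sanity-check the warmup path outside of the action server, the extended `load_model` can be exercised directly. A sketch (the model name and CPU device here are arbitrary choices, and `load_test_file=True` expects the package's `test.m4a` to exist):

```python
import rospy
from lasr_speech_recognition_whisper import load_model  # type: ignore

rospy.init_node("warmup_check")

# First call loads from disk and, with load_test_file=True, runs one
# transcription of test.m4a so later requests avoid the cold-start cost.
model = load_model("tiny.en", device="cpu", load_test_file=True)

# A second call with the same name is served from MODEL_CACHE.
same_model = load_model("tiny.en", device="cpu")
assert model is same_model
```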
From 669506bc86b4d3c7eb51065c20461484fe05cf17 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 10:46:57 +0000
Subject: [PATCH 10/13] feat: suppress ALSA warning msgs

---
 .../nodes/transcribe_microphone_server | 90 ++++++++++++-------
 1 file changed, 60 insertions(+), 30 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 2c538e535..95f3b1e36 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -15,6 +15,29 @@ import speech_recognition as sr  # type: ignore
 import lasr_speech_recognition_msgs.msg  # type: ignore
 from lasr_speech_recognition_whisper import load_model  # type: ignore
 
+# Error handler to remove ALSA error messages taken from:
+# https://stackoverflow.com/questions/7088672/pyaudio-working-but-spits-out-error-messages-each-time/17673011#17673011
+
+from ctypes import *
+from contextlib import contextmanager
+
+ERROR_HANDLER_FUNC = CFUNCTYPE(None, c_char_p, c_int, c_char_p, c_int, c_char_p)
+
+
+def py_error_handler(filename, line, function, err, fmt):
+    pass
+
+
+c_error_handler = ERROR_HANDLER_FUNC(py_error_handler)
+
+
+@contextmanager
+def noalsaerr():
+    asound = cdll.LoadLibrary("libasound.so")
+    asound.snd_lib_error_set_handler(c_error_handler)
+    yield
+    asound.snd_lib_error_set_handler(None)
+
 
 @dataclass
 class speech_model_params:
@@ -60,35 +83,40 @@ class TranscribeSpeechAction(object):
 
         self._action_name = action_name
         self._model_params = model_params
-        self._model = load_model(
-            self._model_params.model_name,
-            self._model_params.device,
-            self._model_params.warmup,
-        )
-        # Configure the speech recogniser object and adjust for ambient noise
-        self.recogniser = self._configure_recogniser()
-        # Setup the action server and register execution callback
-        self._action_server = actionlib.SimpleActionServer(
-            self._action_name,
-            lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
-            execute_cb=self.execute_cb,
-            auto_start=False,
-        )
-        self._action_server.register_preempt_callback(self.preempt_cb)
-        # Setup the timer for adjusting the microphone for ambient noise every x seconds
-        self._timer_duration = self._model_params.timer_duration
-        self._timer = rospy.Timer(rospy.Duration(self._timer_duration), self._timer_cb)
-        self._listening = False
+        with noalsaerr():
+            self._model = load_model(
+                self._model_params.model_name,
+                self._model_params.device,
+                self._model_params.warmup,
+            )
+            # Configure the speech recogniser object and adjust for ambient noise
+            self.recogniser = self._configure_recogniser()
+            # Setup the action server and register execution callback
+            self._action_server = actionlib.SimpleActionServer(
+                self._action_name,
+                lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
+                execute_cb=self.execute_cb,
+                auto_start=False,
+            )
+            self._action_server.register_preempt_callback(self.preempt_cb)
+            # Setup the timer for adjusting the microphone for ambient noise every x seconds
+            self._timer_duration = self._model_params.timer_duration
+            self._timer = rospy.Timer(
+                rospy.Duration(self._timer_duration), self._timer_cb
+            )
+            self._listening = False
 
         self._action_server.start()
 
-    def _timer_cb(self) -> None:
+    def _timer_cb(self, _) -> None:
         """Adjusts the microphone for ambient noise, unless the action server is listening."""
         if self._listening:
             return
         rospy.loginfo("Adjusting microphone for ambient noise...")
-        with self._configure_microphone() as source:
-            self.recogniser.adjust_for_ambient_noise(source)
+        with noalsaerr():
+            with self._configure_microphone() as source:
+                self.recogniser.adjust_for_ambient_noise(source)
 
     def _reset_timer(self) -> None:
         """Resets the timer for adjusting the microphone for ambient noise."""
@@ -159,18 +187,20 @@ class TranscribeSpeechAction(object):
             goal: UNUSED - actionlib requires a goal argument in the execute callback, but
                 this action server does not use a goal.
         """
+        rospy.loginfo("Request Received")
         if self._action_server.is_preempt_requested():
             return
         # Since we are about to listen, reset the timer for adjusting the microphone for ambient noise
         # as this assumes self._timer_duration seconds of silence before adjusting
         self._reset_timer()
-        with self._configure_microphone() as src:
-            self._listening = True
-            wav_data = self.recogniser.listen(
-                src,
-                timeout=self._model_params.start_timeout,
-                phrase_time_limit=self._model_params.end_timeout,
-            ).get_wav_data()
+        with noalsaerr():
+            with self._configure_microphone() as src:
+                self._listening = True
+                wav_data = self.recogniser.listen(
+                    src,
+                    timeout=self._model_params.start_timeout,
+                    phrase_time_limit=self._model_params.end_timeout,
+                ).get_wav_data()
         # Dividing by 32768.0 scales 16-bit signed PCM samples to [-1.0, 1.0)
         float_data = (
             np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
             / 32768.0

From ae8e9e189f5440ce6dfeb4700eb4234f499400e6 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 10:47:26 +0000
Subject: [PATCH 11/13] feat: tiago repeat after me demo script

---
 .../CMakeLists.txt                     |  1 +
 .../scripts/repeat_after_me.py         | 58 +++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py

diff --git a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
index 75240eddb..9c2fe1f57 100644
--- a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
@@ -170,6 +170,7 @@ catkin_install_python(PROGRAMS
   nodes/transcribe_microphone_server
   scripts/list_microphones.py
   scripts/test_microphones.py
+  scripts/repeat_after_me.py
   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
 )
 
diff --git a/common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py b/common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py
new file mode 100644
index 000000000..2e6b20622
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+import rospy
+import actionlib
+from lasr_voice import Voice  # type: ignore
+from lasr_speech_recognition_msgs.srv import TranscribeAudio, TranscribeAudioResponse  # type: ignore
+from lasr_speech_recognition_msgs.msg import (  # type: ignore
+    TranscribeSpeechAction,
+    TranscribeSpeechGoal,
+)
+
+# import actionlib
+rospy.init_node("repeat")
+
+USE_ACTIONLIB = True
+
+voice = Voice()
+
+
+if USE_ACTIONLIB:
+    client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
+    client.wait_for_server()
+    repeating = False
+    rospy.loginfo("Done waiting")
+    while not rospy.is_shutdown():
+        goal = TranscribeSpeechGoal()
+        client.send_goal(goal)
+        client.wait_for_result()
+        result = client.get_result()
+        text = result.sequence
+        print(text)
+        if "tiago" in text.lower().strip():
+            if "repeat" in text.lower().strip():
+                repeating = True
+                voice.sync_tts("Okay, I'll start repeating now.")
+                continue
+            elif "stop" in text.lower().strip():
+                repeating = False
+                voice.sync_tts("Okay, I'll stop repeating now.")
+                break
+        if repeating:
+            voice.sync_tts(f"I heard {text}")
+else:
+    transcribe = rospy.ServiceProxy("/whisper/transcribe_audio", TranscribeAudio)
+    repeating = False
+    while not rospy.is_shutdown():
+        text = transcribe().phrase
+        print(text)
+        if "tiago" in text.lower().strip():
+            if "repeat" in text.lower().strip():
+                repeating = True
+                voice.sync_tts("Okay, I'll start repeating now.")
+                continue
+            elif "stop" in text.lower().strip():
+                repeating = False
+                voice.sync_tts("Okay, I'll stop repeating now.")
+                break
+        if repeating:
+            voice.sync_tts(f"I heard {text}")

From 8745bf41d79341b0c5300fc3b73461074f9cd761 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 11:16:01 +0000
Subject: [PATCH 12/13] feat: test speech server locally with microphones

---
 .../CMakeLists.txt                     |  1 +
 .../scripts/test_speech_server.py      | 21 +++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py

diff --git a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
index 9c2fe1f57..a11465954 100644
--- a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
@@ -171,6 +171,7 @@ catkin_install_python(PROGRAMS
   scripts/list_microphones.py
   scripts/test_microphones.py
   scripts/repeat_after_me.py
+  scripts/test_speech_server.py
   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
 )
 
diff --git a/common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py b/common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py
new file mode 100644
index 000000000..fef16eb0c
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+import rospy
+import actionlib
+from lasr_speech_recognition_msgs.srv import TranscribeAudio, TranscribeAudioResponse  # type: ignore
+from lasr_speech_recognition_msgs.msg import (  # type: ignore
+    TranscribeSpeechAction,
+    TranscribeSpeechGoal,
+)
+
+
+rospy.init_node("test_speech_server")
+client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
+client.wait_for_server()
+rospy.loginfo("Done waiting")
+while not rospy.is_shutdown():
+    goal = TranscribeSpeechGoal()
+    client.send_goal(goal)
+    client.wait_for_result()
+    result = client.get_result()
+    text = result.sequence
+    print(f"Transcribed Speech: {text}")
From 1b4136314995d8c551a7bdb915ad58f625ec5c28 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 11:18:19 +0000
Subject: [PATCH 13/13] docs: update usage with two new scripts

---
 .../lasr_speech_recognition_whisper/doc/USAGE.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
index 8730a6645..2bb966c13 100644
--- a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
+++ b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
@@ -41,4 +41,16 @@ Several command line configuration options exist, which can be viewed with:
 
 ```bash
 rosrun lasr_speech_recognition_whisper transcribe_microphone_server --help
 ```
+
+Get TIAGo to repeat the transcribed speech back using TTS; it will begin repeating after hearing "tiago, repeat ..." and stop once it hears "tiago, stop ...":
+
+```bash
+rosrun lasr_speech_recognition_whisper repeat_after_me.py
+```
+
+To constantly listen and print the transcribed speech in the command line (by repeatedly sending requests to the actionlib server), run the following script:
+
+```bash
+rosrun lasr_speech_recognition_whisper test_speech_server.py
+```