From eb1ee46c26ee6a9f53ddcbe470aa3134c438e097 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 4 Dec 2023 17:34:41 +0000
Subject: [PATCH 01/13] feat: actionlib server for speech recognition

---
 .../CMakeLists.txt                            |  11 +-
 .../action/TranscribeSpeech.action            |   6 +
 .../lasr_speech_recognition_msgs/package.xml  |   2 +
 .../CMakeLists.txt                            |  10 +-
 .../nodes/transcribe_microphone_server        | 236 ++++++++++++++++++
 .../package.xml                               |   4 +
 6 files changed, 264 insertions(+), 5 deletions(-)
 create mode 100644 common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action
 create mode 100644 common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server

diff --git a/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt b/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt
index ae6bac4d8..10e0472f1 100644
--- a/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_msgs/CMakeLists.txt
@@ -7,7 +7,11 @@ project(lasr_speech_recognition_msgs)
 ## Find catkin macros and libraries
 ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
 ## is used, also find other catkin packages
-find_package(catkin REQUIRED COMPONENTS message_generation)
+find_package(catkin REQUIRED COMPONENTS message_generation genmsg actionlib_msgs actionlib std_msgs)
+add_action_files(
+  DIRECTORY action
+  FILES TranscribeSpeech.action
+)
 
 ## System dependencies are found with CMake's conventions
 # find_package(Boost REQUIRED COMPONENTS system)
@@ -63,8 +67,9 @@ add_service_files(
 
 ## Generate added messages and services with any dependencies listed here
 generate_messages(
-#   DEPENDENCIES
-#   std_msgs  # Or other packages containing msgs
+  DEPENDENCIES
+  std_msgs  # Or other packages containing msgs
+  actionlib_msgs
 )
 
 ################################################
diff --git a/common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action b/common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action
new file mode 100644
index 000000000..486b0cb19
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_msgs/action/TranscribeSpeech.action
@@ -0,0 +1,6 @@
+---
+#result definition
+string sequence
+---
+#feedback
+string sequence
\ No newline at end of file
diff --git a/common/speech/lasr_speech_recognition_msgs/package.xml b/common/speech/lasr_speech_recognition_msgs/package.xml
index e319ec5db..6f00b03f4 100644
--- a/common/speech/lasr_speech_recognition_msgs/package.xml
+++ b/common/speech/lasr_speech_recognition_msgs/package.xml
@@ -51,6 +51,8 @@
   <buildtool_depend>catkin</buildtool_depend>
   <build_depend>message_generation</build_depend>
   <exec_depend>message_runtime</exec_depend>
+  <build_depend>actionlib_msgs</build_depend>
+  <exec_depend>actionlib_msgs</exec_depend>
diff --git a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
index d2cce16f7..75240eddb 100644
--- a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
@@ -7,7 +7,11 @@ project(lasr_speech_recognition_whisper)
 ## Find catkin macros and libraries
 ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz)
 ## is used, also find other catkin packages
-find_package(catkin REQUIRED catkin_virtualenv)
+find_package(catkin REQUIRED catkin_virtualenv genmsg actionlib_msgs actionlib std_msgs)
+# add_action_files(
+#   DIRECTORY action
+#   FILES TranscribeSpeech.action
+# )
 
 ## System dependencies are found with CMake's conventions
 # find_package(Boost REQUIRED COMPONENTS system)
@@ -70,7 +74,8 @@ catkin_generate_virtualenv(
 
 ## Generate added messages and services with any dependencies listed here
 # generate_messages(
 #   DEPENDENCIES
-#   std_msgs  # Or other packages containing msgs
+#   std_msgs
+#   actionlib_msgs  # Or other packages containing msgs
 # )
 
@@ -162,6 +167,7 @@ include_directories(
 catkin_install_python(PROGRAMS
   nodes/simple_transcribe_microphone
   nodes/transcribe_microphone
+  nodes/transcribe_microphone_server
   scripts/list_microphones.py
   scripts/test_microphones.py
   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
new file mode 100644
index 000000000..ee0df9578
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+from typing import Optional, Union
+from dataclasses import dataclass
+from pathlib import Path
+
+import rospy
+import numpy as np
+import torch
+import actionlib
+import speech_recognition as sr  # type: ignore
+import lasr_speech_recognition_msgs.msg  # type: ignore
+from lasr_speech_recognition_whisper import load_model  # type: ignore
+
+
+@dataclass
+class speech_model_params:
+    """Class for storing speech recognition model parameters."""
+
+    model_name: str = "medium.en"
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+    start_timeout: float = 5.0
+    end_timeout: Optional[float] = None
+    sample_rate: int = 16000
+    mic_device: Optional[Union[int, str]] = None
+
+
+class TranscribeSpeechAction(object):
+    # create messages that are used to publish feedback/result
+    _feedback = lasr_speech_recognition_msgs.msg.TranscribeSpeechFeedback()
+    _result = lasr_speech_recognition_msgs.msg.TranscribeSpeechResult()
+
+    def __init__(
+        self,
+        action_name: str,
+        model_params: speech_model_params,
+    ) -> None:
+        """Starts an action server for transcribing speech.
+
+        Args:
+            action_name (str): Name of the action server.
+        """
+
+        self._action_name = action_name
+        self._model_params = model_params
+        self._model = load_model(
+            self._model_params.model_name, device=self._model_params.device
+        )
+        self.recogniser = self._configure_recogniser()
+        self._feedback_sequence: Optional[list[str]] = None
+        self._action_server = actionlib.SimpleActionServer(
+            self._action_name,
+            lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
+            execute_cb=self.execute_cb,
+            auto_start=False,
+        )
+        self._action_server.register_preempt_callback(self.preempt_cb)
+        self._action_server.start()
+
+    def _configure_microphone(self) -> sr.Microphone:
+        """Configures the microphone for listening to speech based on the
+        microphone device index or name.
+
+        Returns: microphone object
+        """
+
+        if (
+            isinstance(self._model_params.mic_device, int)
+            or self._model_params.mic_device.isdigit()
+        ):
+            return sr.Microphone(
+                device_index=int(self._model_params.mic_device),
+                sample_rate=self._model_params.sample_rate,
+            )
+        elif isinstance(self._model_params.mic_device, str):
+            microphones = enumerate(sr.Microphone.list_microphone_names())
+            for index, name in microphones:
+                if self._model_params.mic_device in name:
+                    return sr.Microphone(
+                        device_index=index,
+                        sample_rate=self._model_params.sample_rate,
+                    )
+            raise ValueError(
+                f"Could not find microphone with name: {self._model_params.mic_device}"
+            )
+        # If no microphone device is specified, use the system default microphone
+        return sr.Microphone(sample_rate=self._model_params.sample_rate)
+
+    def _configure_recogniser(self, ambient_adj: bool = True) -> sr.Recognizer:
+        """Configures the speech recogniser object.
+
+        Args:
+            ambient_adj (bool, optional): Whether to adjust for ambient noise. Defaults to True.
+
+        Returns:
+            sr.Recognizer: speech recogniser object.
+        """
+
+        recogniser = sr.Recognizer()
+        if ambient_adj:
+            with self._configure_microphone() as source:
+                recogniser.adjust_for_ambient_noise(source)
+        return recogniser
+
+    def preempt_cb(self) -> None:
+        """Callback for preempting the action server.
+
+        Resets the feedback sequence and sets server to preempted.
+        """
+        self._action_server.set_preempted()
+
+    def execute_cb(self, goal) -> None:
+        self._feedback_sequence = []
+        with self._configure_microphone() as src:
+            wav_data = self.recogniser.listen(
+                src,
+                timeout=self._model_params.start_timeout,
+                phrase_time_limit=self._model_params.end_timeout,
+            ).get_wav_data()
+        # Dividing by 32768.0 scales 16-bit signed PCM samples to [-1.0, 1.0)
+        float_data = (
+            np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
+            / 32768.0
+        )
+
+        # Cast to fp16 if using GPU
+        phrase = self._model.transcribe(
+            float_data,
+            fp16=self._model_params.device == "cuda",
+        )["text"]
+
+        self._feedback_sequence.append(phrase)
+        self._result.sequence = phrase
+        rospy.loginfo(f"Transcribed phrase: {phrase}")
+        rospy.loginfo(f"{self._action_name} has succeeded")
+        self._action_server.set_succeeded(self._result)
+
+
+def parse_args() -> dict:
+    """Parses the command line arguments into a name: value dictionary.
+
+    Returns:
+        dict: Dictionary of name: value pairs of command line arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description="Starts an action server for transcribing speech."
+    )
+
+    parser.add_argument(
+        "--action-name",
+        type=str,
+        default="transcribe_speech",
+        help="Name of the action server.",
+    )
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="medium.en",
+        help="Name of the speech recognition model.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda" if torch.cuda.is_available() else "cpu",
+        help="Device to run the model on.",
+    )
+    parser.add_argument(
+        "--start_timeout",
+        type=float,
+        default=5.0,
+        help="Timeout for listening for the start of a phrase.",
+    )
+    parser.add_argument(
+        "--end_timeout",
+        type=float,
+        default=None,
+        help="Timeout for listening for the end of a phrase.",
+    )
+    parser.add_argument(
+        "--sample_rate",
+        type=int,
+        default=16000,
+        help="Sample rate of the microphone.",
+    )
+    parser.add_argument(
+        "--mic_device",
+        default=None,
+        help="Microphone device index or name. Can be a string or an integer.",
+    )
+
+    return vars(parser.parse_args())
+
+
+def configure_model_params(config: dict) -> speech_model_params:
+    """Configures the speech model parameters based on the provided
+    command line parameters.
+
+    Args:
+        config (dict): Command line parameters parsed in dictionary form.
+
+    Returns:
+        speech_model_params: dataclass containing the speech model parameters
+    """
+    model_params = speech_model_params()
+    if config["model_name"]:
+        model_params.model_name = config["model_name"]
+    if config["device"]:
+        model_params.device = config["device"]
+    if config["start_timeout"]:
+        model_params.start_timeout = config["start_timeout"]
+    if config["end_timeout"]:
+        model_params.end_timeout = config["end_timeout"]
+    if config["sample_rate"]:
+        model_params.sample_rate = config["sample_rate"]
+    if config["mic_device"]:
+        model_params.mic_device = config["mic_device"]
+
+    return model_params
+
+
+def configure_whisper_cache() -> None:
+    """Configures the whisper cache directory."""
+    whisper_cache = os.path.join(str(Path.home()), ".cache", "whisper")
+    os.makedirs(whisper_cache, exist_ok=True)
+    # Environmental variable required to run whisper locally
+    os.environ["TIKTOKEN_CACHE_DIR"] = whisper_cache
+
+
+if __name__ == "__main__":
+    configure_whisper_cache()
+    config = parse_args()
+    rospy.init_node(config["action_name"])
+    server = TranscribeSpeechAction(rospy.get_name(), configure_model_params(config))
+    rospy.spin()
diff --git a/common/speech/lasr_speech_recognition_whisper/package.xml b/common/speech/lasr_speech_recognition_whisper/package.xml
index 39935c089..4c6f49965 100644
--- a/common/speech/lasr_speech_recognition_whisper/package.xml
+++ b/common/speech/lasr_speech_recognition_whisper/package.xml
@@ -51,6 +51,10 @@
   <buildtool_depend>catkin</buildtool_depend>
   <build_depend>catkin_virtualenv</build_depend>
   <depend>lasr_speech_recognition_msgs</depend>
+  <build_depend>actionlib</build_depend>
+  <build_depend>actionlib_msgs</build_depend>
+  <exec_depend>actionlib</exec_depend>
+  <exec_depend>actionlib_msgs</exec_depend>
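An aside on the sample conversion inside `execute_cb`: `speech_recognition` hands back 16-bit signed PCM bytes, while Whisper expects `float32` samples in `[-1.0, 1.0]`, hence the division by 32768.0. The same conversion in isolation, as a sketch (the synthetic `wav_data` below is a hypothetical stand-in for `listen(...).get_wav_data()`):

```python
import numpy as np

# Stand-in for the bytes returned by recogniser.listen(...).get_wav_data():
# one second of a 440 Hz tone encoded as 16-bit signed PCM at 16 kHz.
t = np.linspace(0, 1, 16000, endpoint=False)
wav_data = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()

# The node's conversion: reinterpret the bytes as int16 samples,
# then scale them into [-1.0, 1.0] for Whisper.
float_data = np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C") / 32768.0

assert float_data.dtype == np.float32
assert float_data.min() >= -1.0 and float_data.max() <= 1.0
```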
From e931db00707f8f8d08da94ecb817c2b8122593ad Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 4 Dec 2023 17:49:34 +0000
Subject: [PATCH 02/13] docs: add documentation for speech action server

---
 .../lasr_speech_recognition_whisper/doc/USAGE.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
index 6c0649478..8730a6645 100644
--- a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
+++ b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
@@ -26,3 +26,19 @@ Stop listening whenever:
 ```bash
 rosservice call /whisper/stop_listening "{}"
 ```
+
+Run an actionlib server to transcribe the microphone:
+
+```bash
+rosrun lasr_speech_recognition_whisper transcribe_microphone_server
+```
+
+The response from the request is a `string` containing the transcribed text.
+
+Several command line configuration options exist, which can be viewed with:
+
+```bash
+rosrun lasr_speech_recognition_whisper transcribe_microphone_server --help
+```
+
+
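For reference, a minimal Python client for the server documented above might look like the sketch below; it assumes the server is running under its default action name, `transcribe_speech`. (A fuller version of this pattern is added later in the series as `scripts/test_speech_server.py`.)

```python
#!/usr/bin/env python3
import rospy
import actionlib
from lasr_speech_recognition_msgs.msg import (  # type: ignore
    TranscribeSpeechAction,
    TranscribeSpeechGoal,
)

rospy.init_node("transcribe_speech_client")
client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
client.wait_for_server()

# The goal carries no fields; sending it triggers one listen/transcribe cycle.
client.send_goal(TranscribeSpeechGoal())
client.wait_for_result()
rospy.loginfo(f"Transcribed text: {client.get_result().sequence}")
```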
From e1d7069ce7b692f2ba344906452741bd8b41210a Mon Sep 17 00:00:00 2001
From: m-barker
Date: Tue, 5 Dec 2023 21:22:04 +0000
Subject: [PATCH 03/13] fix: incorrect typing of mic name arg

---
 .../nodes/transcribe_microphone_server | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index ee0df9578..aab5ded63 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -17,14 +17,24 @@ from lasr_speech_recognition_whisper import load_model  # type: ignore
 
 @dataclass
 class speech_model_params:
-    """Class for storing speech recognition model parameters."""
+    """Class for storing speech recognition model parameters.
+
+    Args:
+        model_name (str, optional): Name of the speech recognition model. Defaults to "medium.en".
+            Must be a valid Whisper model name.
+        device (str, optional): Device to run the model on. Defaults to "cuda" if available, otherwise "cpu".
+        start_timeout (float): Maximum number of seconds to wait for a phrase to start before timing out. Defaults to 5.0.
+        end_timeout (Optional[float]): Maximum number of seconds to allow a phrase to continue before cutting it off. Defaults to None (no limit).
+        sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
+        mic_device (Optional[str]): Microphone device index or name. Defaults to None.
+    """
 
     model_name: str = "medium.en"
     device: str = "cuda" if torch.cuda.is_available() else "cpu"
     start_timeout: float = 5.0
     end_timeout: Optional[float] = None
     sample_rate: int = 16000
-    mic_device: Optional[Union[int, str]] = None
+    mic_device: Optional[str] = None
 
 
 class TranscribeSpeechAction(object):
@@ -66,15 +76,15 @@ class TranscribeSpeechAction(object):
         Returns: microphone object
         """
 
-        if (
-            isinstance(self._model_params.mic_device, int)
-            or self._model_params.mic_device.isdigit()
-        ):
+        if self._model_params.mic_device is None:
+            # If no microphone device is specified, use the system default microphone
+            return sr.Microphone(sample_rate=self._model_params.sample_rate)
+        elif self._model_params.mic_device.isdigit():
             return sr.Microphone(
                 device_index=int(self._model_params.mic_device),
                 sample_rate=self._model_params.sample_rate,
             )
-        elif isinstance(self._model_params.mic_device, str):
+        else:
             microphones = enumerate(sr.Microphone.list_microphone_names())
             for index, name in microphones:
                 if self._model_params.mic_device in name:
@@ -85,8 +95,6 @@ class TranscribeSpeechAction(object):
             raise ValueError(
                 f"Could not find microphone with name: {self._model_params.mic_device}"
             )
-        # If no microphone device is specified, use the system default microphone
-        return sr.Microphone(sample_rate=self._model_params.sample_rate)
 
     def _configure_recogniser(self, ambient_adj: bool = True) -> sr.Recognizer:
         """Configures the speech recogniser object.
@@ -186,8 +194,9 @@ def parse_args() -> dict:
     )
     parser.add_argument(
         "--mic_device",
+        type=str,
         default=None,
-        help="Microphone device index or name. Can be a string or an integer.",
+        help="Microphone device index or name",
     )
 
     return vars(parser.parse_args())
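Since `--mic_device` now accepts either a device index (as a digit string) or a substring of a device name, it helps to know what devices are visible. A quick sketch using the same `speech_recognition` API (the package's existing `scripts/list_microphones.py` presumably serves a similar purpose):

```python
#!/usr/bin/env python3
import speech_recognition as sr

# Print every audio device PyAudio can see, alongside the index that
# --mic_device accepts as a digit string.
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print(f"{index}: {name}")
```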
From 199eae4cdaae4262142f8e45f5af3f52951aca54 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Tue, 5 Dec 2023 21:24:33 +0000
Subject: [PATCH 04/13] fix: remove unused feedback list

---
 .../nodes/transcribe_microphone_server | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index aab5ded63..7b4e68159 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -59,7 +59,6 @@ class TranscribeSpeechAction(object):
             self._model_params.model_name, device=self._model_params.device
         )
         self.recogniser = self._configure_recogniser()
-        self._feedback_sequence: Optional[list[str]] = None
         self._action_server = actionlib.SimpleActionServer(
             self._action_name,
             lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
@@ -120,7 +119,6 @@ class TranscribeSpeechAction(object):
         self._action_server.set_preempted()
 
     def execute_cb(self, goal) -> None:
-        self._feedback_sequence = []
         with self._configure_microphone() as src:
             wav_data = self.recogniser.listen(
                 src,
@@ -139,7 +137,6 @@ class TranscribeSpeechAction(object):
             fp16=self._model_params.device == "cuda",
         )["text"]
 
-        self._feedback_sequence.append(phrase)
         self._result.sequence = phrase
         rospy.loginfo(f"Transcribed phrase: {phrase}")
         rospy.loginfo(f"{self._action_name} has succeeded")

From faf94f7815ada8f312b442c1d6b2be3751a337d2 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Tue, 5 Dec 2023 22:09:27 +0000
Subject: [PATCH 05/13] fix: correctly check and handle preemption

---
 .../nodes/transcribe_microphone_server | 36 +++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 7b4e68159..70f8ee712 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -5,6 +5,7 @@ import argparse
 from typing import Optional, Union
 from dataclasses import dataclass
 from pathlib import Path
+from timeit import default_timer as timer
 
 import rospy
 import numpy as np
@@ -58,6 +59,9 @@ class TranscribeSpeechAction(object):
         self._model = load_model(
             self._model_params.model_name, device=self._model_params.device
         )
+        rospy.loginfo(
+            f"Loaded model: {self._model_params.model_name} on {self._model_params.device}"
+        )
         self.recogniser = self._configure_recogniser()
         self._action_server = actionlib.SimpleActionServer(
             self._action_name,
@@ -114,11 +118,26 @@ class TranscribeSpeechAction(object):
     def preempt_cb(self) -> None:
         """Callback for preempting the action server.
 
-        Resets the feedback sequence and sets server to preempted.
+        Sets server to preempted state.
         """
-        self._action_server.set_preempted()
+        preempted_str = f"{self._action_name} has been preempted"
+        rospy.loginfo(preempted_str)
+        self._result.sequence = preempted_str
+        self._action_server.set_preempted(result=self._result, text=preempted_str)
 
     def execute_cb(self, goal) -> None:
+        """Callback for executing the action server.
+
+        Checks for preemption before listening and before and after transcribing, returning
+        if preemption is requested.
+
+        Args:
+            goal: UNUSED - actionlib requires a goal argument in the execute callback, but
+                this action server does not use a goal.
+        """
+
+        if self._action_server.is_preempt_requested():
+            return
         with self._configure_microphone() as src:
             wav_data = self.recogniser.listen(
                 src,
@@ -131,11 +150,24 @@ class TranscribeSpeechAction(object):
             / 32768.0
         )
 
+        if self._action_server.is_preempt_requested():
+            return
+
+        rospy.loginfo(f"Transcribing phrase with Whisper...")
+        transcription_start_time = timer()
         # Cast to fp16 if using GPU
         phrase = self._model.transcribe(
             float_data,
             fp16=self._model_params.device == "cuda",
         )["text"]
+        transcription_end_time = timer()
+        rospy.loginfo(f"Transcription finished!")
+        rospy.loginfo(
+            f"Time taken: {transcription_end_time - transcription_start_time:.2f}s"
+        )
+
+        if self._action_server.is_preempt_requested():
+            return
 
         self._result.sequence = phrase
         rospy.loginfo(f"Transcribed phrase: {phrase}")
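On the client side, this preemption path is exercised by cancelling an in-flight goal. A hedged sketch, again assuming the default `transcribe_speech` action name (note the result may only arrive once the server's blocking `listen()` call returns):

```python
import rospy
import actionlib
from lasr_speech_recognition_msgs.msg import (  # type: ignore
    TranscribeSpeechAction,
    TranscribeSpeechGoal,
)

rospy.init_node("preempt_demo")
client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
client.wait_for_server()

client.send_goal(TranscribeSpeechGoal())
rospy.sleep(1.0)      # give the server time to start listening
client.cancel_goal()  # triggers the server's preempt callback
client.wait_for_result()
# The result's `sequence` field should carry the "... has been preempted" message.
print(client.get_result().sequence)
```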
From c3fb186069c3d1d98d3ab026c6020a12d638209e Mon Sep 17 00:00:00 2001
From: m-barker
Date: Wed, 13 Dec 2023 19:05:46 +0000
Subject: [PATCH 06/13] feat: timer for adjusting mic for ambient noise

---
 .../nodes/transcribe_microphone_server | 31 +++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 70f8ee712..1f15aa9d6 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -28,6 +28,7 @@ class speech_model_params:
         end_timeout (Optional[float]): Maximum number of seconds to allow a phrase to continue before cutting it off. Defaults to None (no limit).
         sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
         mic_device (Optional[str]): Microphone device index or name. Defaults to None.
+        timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
""" model_name: str = "medium.en" @@ -36,6 +37,7 @@ class speech_model_params: end_timeout: Optional[float] = None sample_rate: int = 16000 mic_device: Optional[str] = None + timer_duration: Optional[int] = 20 class TranscribeSpeechAction(object): @@ -62,7 +64,9 @@ class TranscribeSpeechAction(object): rospy.loginfo( f"Loaded model: {self._model_params.model_name} on {self._model_params.device}" ) + # Configure the speech recogniser object and adjust for ambient noise self.recogniser = self._configure_recogniser() + # Setup the action server and register execution callback self._action_server = actionlib.SimpleActionServer( self._action_name, lasr_speech_recognition_msgs.msg.TranscribeSpeechAction, @@ -70,8 +74,25 @@ class TranscribeSpeechAction(object): auto_start=False, ) self._action_server.register_preempt_callback(self.prempt_cb) + # Setup the timer for adjusting the microphone for ambient noise every x seconds + self._timer_duration = self._model_params.timer_duration + self._timer = rospy.Timer(rospy.Duration(self._timer_duration), self._timer_cb) + self._listening = False + self._action_server.start() + def _timer_cb(self) -> None: + """Adjusts the microphone for ambient noise, unless the action server is listening.""" + if self._listening: + return + with self._configure_microphone() as source: + self.recogniser.adjust_for_ambient_noise(source) + + def _reset_timer(self) -> None: + """Resets the timer for adjusting the microphone for ambient noise.""" + self._timer.shutdown() + self._timer = rospy.Timer(rospy.Duration(self._timer_duration), self._timer_cb) + def _configure_microphone(self) -> sr.Microphone: """Configures the microphone for listening to speech based on the microphone device index or name. @@ -108,11 +129,12 @@ class TranscribeSpeechAction(object): Returns: sr.Recognizer: speech recogniser object. """ - + self._listening = True recogniser = sr.Recognizer() if ambient_adj: with self._configure_microphone() as source: recogniser.adjust_for_ambient_noise(source) + self._listening = False return recogniser def prempt_cb(self) -> None: @@ -135,10 +157,10 @@ class TranscribeSpeechAction(object): goal: UNUSED - actionlib requires a goal argument in the execute callback, but this action server does not use a goal. """ - if self._action_server.is_preempt_requested(): return with self._configure_microphone() as src: + self._listening = True wav_data = self.recogniser.listen( src, timeout=self._model_params.start_timeout, @@ -151,6 +173,7 @@ class TranscribeSpeechAction(object): ) if self._action_server.is_preempt_requested(): + self._listening = False return rospy.loginfo(f"Transcribing phrase with Whisper...") @@ -167,6 +190,7 @@ class TranscribeSpeechAction(object): ) if self._action_server.is_preempt_requested(): + self._listening = False return self._result.sequence = phrase @@ -174,6 +198,9 @@ class TranscribeSpeechAction(object): rospy.loginfo(f"{self._action_name} has succeeded") self._action_server.set_succeeded(self._result) + # Have this at the very end to not disrupt the action server + self._listening = False + def parse_args() -> dict: """Parses the command line arguments into a name: value dictinoary. 
From 271b5f3ac6ffdf2430e47eef4cd363f88af76bb6 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Thu, 14 Dec 2023 20:54:35 +0000
Subject: [PATCH 07/13] feat: ambient timer duration cli

---
 .../nodes/transcribe_microphone_server | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 1f15aa9d6..6bf797da0 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -85,6 +85,7 @@ class TranscribeSpeechAction(object):
         """Adjusts the microphone for ambient noise, unless the action server is listening."""
         if self._listening:
             return
+        rospy.loginfo("Adjusting microphone for ambient noise...")
         with self._configure_microphone() as source:
             self.recogniser.adjust_for_ambient_noise(source)
 
@@ -254,6 +255,12 @@ def parse_args() -> dict:
         default=None,
         help="Microphone device index or name",
     )
+    parser.add_argument(
+        "--timer_duration",
+        type=int,
+        default=20,
+        help="Number of seconds of silence before the ambient noise adjustment is called.",
+    )
 
     return vars(parser.parse_args())
 
@@ -281,6 +288,8 @@ def configure_model_params(config: dict) -> speech_model_params:
         model_params.sample_rate = config["sample_rate"]
     if config["mic_device"]:
         model_params.mic_device = config["mic_device"]
+    if config["timer_duration"]:
+        model_params.timer_duration = config["timer_duration"]
 
     return model_params

From 443538e105993cb5370a43409e63a4b714d6dae5 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Thu, 14 Dec 2023 21:13:10 +0000
Subject: [PATCH 08/13] feat: reset the timer upon entering exc callback

---
 .../nodes/transcribe_microphone_server | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 6bf797da0..63d4559af 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -2,7 +2,7 @@
 
 import os
 import argparse
-from typing import Optional, Union
+from typing import Optional
 from dataclasses import dataclass
 from pathlib import Path
 from timeit import default_timer as timer
@@ -160,6 +160,9 @@ class TranscribeSpeechAction(object):
         """
         if self._action_server.is_preempt_requested():
             return
+        # Since we are about to listen, reset the timer for adjusting the microphone for ambient noise
+        # as this assumes self._timer_duration seconds of silence before adjusting
+        self._reset_timer()
         with self._configure_microphone() as src:
             self._listening = True
             wav_data = self.recogniser.listen(
From dd6cb56fc6995194ef921c8b8d6f8e3634fddc19 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Thu, 14 Dec 2023 21:38:27 +0000
Subject: [PATCH 09/13] feat: warmup model by transcribing test file

---
 .../nodes/transcribe_microphone_server       | 16 ++++++--
 .../lasr_speech_recognition_whisper/cache.py | 40 +++++++++++++++----
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 63d4559af..2c538e535 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -29,6 +29,7 @@ class speech_model_params:
         sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
         mic_device (Optional[str]): Microphone device index or name. Defaults to None.
         timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
+        warmup (bool): Whether to warmup the model by running inference on a test file. Defaults to True.
     """
 
     model_name: str = "medium.en"
@@ -38,6 +39,7 @@ class speech_model_params:
     sample_rate: int = 16000
     mic_device: Optional[str] = None
    timer_duration: Optional[int] = 20
+    warmup: bool = True
 
 
 class TranscribeSpeechAction(object):
@@ -59,10 +61,9 @@ class TranscribeSpeechAction(object):
         self._action_name = action_name
         self._model_params = model_params
         self._model = load_model(
-            self._model_params.model_name, device=self._model_params.device
-        )
-        rospy.loginfo(
-            f"Loaded model: {self._model_params.model_name} on {self._model_params.device}"
+            self._model_params.model_name,
+            self._model_params.device,
+            self._model_params.warmup,
         )
         # Configure the speech recogniser object and adjust for ambient noise
         self.recogniser = self._configure_recogniser()
@@ -264,6 +265,11 @@ def parse_args() -> dict:
         default=20,
         help="Number of seconds of silence before the ambient noise adjustment is called.",
     )
+    parser.add_argument(
+        "--no_warmup",
+        action="store_true",
+        help="Disable warming up the model by running inference on a test file.",
+    )
 
     return vars(parser.parse_args())
 
@@ -293,6 +299,8 @@ def configure_model_params(config: dict) -> speech_model_params:
         model_params.mic_device = config["mic_device"]
     if config["timer_duration"]:
         model_params.timer_duration = config["timer_duration"]
+    if config["no_warmup"]:
+        model_params.warmup = False
 
     return model_params
 
diff --git a/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py b/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py
index d0ec731fc..42ec44785 100644
--- a/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py
+++ b/common/speech/lasr_speech_recognition_whisper/src/lasr_speech_recognition_whisper/cache.py
@@ -1,17 +1,43 @@
-import whisper
+import os
+import whisper  # type: ignore
+import rospkg  # type: ignore
 import rospy
 
 # Keep all loaded models in memory
 MODEL_CACHE = {}
 
-def load_model(name: str, device: str = 'cpu'):
-    '''
-    Load a given Whisper model
-    '''
+
+def load_model(
+    name: str, device: str = "cpu", load_test_file: bool = False
+) -> whisper.Whisper:
+    """Loads a whisper model from disk, or from cache if it has already been loaded.
+
+    Args:
+        name (str): Name of the whisper model. Must be the name of an official whisper
+            model, or the path to a model checkpoint.
+        device (str, optional): Pytorch device to put the model on. Defaults to 'cpu'.
+        load_test_file (bool, optional): Whether to run inference on a test audio file
+            after loading the model (if model is not in cache). Defaults to False. Test file
+            is assumed to be called "test.m4a" and be in the root of the package directory.
+
+    Returns:
+        whisper.Whisper: Whisper model instance
+    """
     global MODEL_CACHE
 
     if name not in MODEL_CACHE:
-        rospy.loginfo(f'Load model {name}')
+        rospy.loginfo(f"Loading model {name}")
         MODEL_CACHE[name] = whisper.load_model(name, device=device)
-    
+        rospy.loginfo(f"Successfully loaded model {name} on {device}")
+        if load_test_file:
+            package_root = rospkg.RosPack().get_path("lasr_speech_recognition_whisper")
+            example_fp = os.path.join(package_root, "test.m4a")
+            rospy.loginfo(
+                "Running transcription on example file to ensure model is loaded..."
+            )
+            test_result = MODEL_CACHE[name].transcribe(
+                example_fp, fp16=device == "cuda"
+            )
+            rospy.loginfo(f"Transcription test result: {test_result}")
+
     return MODEL_CACHE[name]
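To sanity-check the warmup path outside of the action server, the extended `load_model` can be exercised directly. A sketch (the model name and CPU device here are arbitrary choices, and `load_test_file=True` expects the package's `test.m4a` to exist):

```python
import rospy
from lasr_speech_recognition_whisper import load_model  # type: ignore

rospy.init_node("warmup_check")

# First call loads from disk and, with load_test_file=True, runs one
# transcription of test.m4a so later requests avoid the cold-start cost.
model = load_model("tiny.en", device="cpu", load_test_file=True)

# A second call with the same name is served from MODEL_CACHE.
same_model = load_model("tiny.en", device="cpu")
assert model is same_model
```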
From 669506bc86b4d3c7eb51065c20461484fe05cf17 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 10:46:57 +0000
Subject: [PATCH 10/13] feat: suppress ALSA warning msgs

---
 .../nodes/transcribe_microphone_server | 90 ++++++++++++-------
 1 file changed, 60 insertions(+), 30 deletions(-)

diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
index 2c538e535..95f3b1e36 100644
--- a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
+++ b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -15,6 +15,29 @@ import speech_recognition as sr  # type: ignore
 import lasr_speech_recognition_msgs.msg  # type: ignore
 from lasr_speech_recognition_whisper import load_model  # type: ignore
 
+# Error handler to remove ALSA error messages taken from:
+# https://stackoverflow.com/questions/7088672/pyaudio-working-but-spits-out-error-messages-each-time/17673011#17673011
+
+from ctypes import *
+from contextlib import contextmanager
+
+ERROR_HANDLER_FUNC = CFUNCTYPE(None, c_char_p, c_int, c_char_p, c_int, c_char_p)
+
+
+def py_error_handler(filename, line, function, err, fmt):
+    pass
+
+
+c_error_handler = ERROR_HANDLER_FUNC(py_error_handler)
+
+
+@contextmanager
+def noalsaerr():
+    asound = cdll.LoadLibrary("libasound.so")
+    asound.snd_lib_error_set_handler(c_error_handler)
+    yield
+    asound.snd_lib_error_set_handler(None)
+
 
 @dataclass
 class speech_model_params:
@@ -60,35 +83,40 @@ class TranscribeSpeechAction(object):
 
         self._action_name = action_name
         self._model_params = model_params
-        self._model = load_model(
-            self._model_params.model_name,
-            self._model_params.device,
-            self._model_params.warmup,
-        )
-        # Configure the speech recogniser object and adjust for ambient noise
-        self.recogniser = self._configure_recogniser()
-        # Setup the action server and register execution callback
-        self._action_server = actionlib.SimpleActionServer(
-            self._action_name,
-            lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
-            execute_cb=self.execute_cb,
-            auto_start=False,
-        )
-        self._action_server.register_preempt_callback(self.preempt_cb)
-        # Setup the timer for adjusting the microphone for ambient noise every x seconds
-        self._timer_duration = self._model_params.timer_duration
-        self._timer = rospy.Timer(rospy.Duration(self._timer_duration), self._timer_cb)
-        self._listening = False
+        with noalsaerr():
+            self._model = load_model(
+                self._model_params.model_name,
+                self._model_params.device,
+                self._model_params.warmup,
+            )
+            # Configure the speech recogniser object and adjust for ambient noise
+            self.recogniser = self._configure_recogniser()
+            # Setup the action server and register execution callback
+            self._action_server = actionlib.SimpleActionServer(
+                self._action_name,
+                lasr_speech_recognition_msgs.msg.TranscribeSpeechAction,
+                execute_cb=self.execute_cb,
+                auto_start=False,
+            )
+            self._action_server.register_preempt_callback(self.preempt_cb)
+            # Setup the timer for adjusting the microphone for ambient noise every x seconds
+            self._timer_duration = self._model_params.timer_duration
+            self._timer = rospy.Timer(
+                rospy.Duration(self._timer_duration), self._timer_cb
+            )
+            self._listening = False
 
         self._action_server.start()
 
-    def _timer_cb(self) -> None:
+    def _timer_cb(self, _) -> None:
         """Adjusts the microphone for ambient noise, unless the action server is listening."""
         if self._listening:
             return
         rospy.loginfo("Adjusting microphone for ambient noise...")
-        with self._configure_microphone() as source:
-            self.recogniser.adjust_for_ambient_noise(source)
+        with noalsaerr():
+            with self._configure_microphone() as source:
+                self.recogniser.adjust_for_ambient_noise(source)
 
     def _reset_timer(self) -> None:
         """Resets the timer for adjusting the microphone for ambient noise."""
@@ -159,18 +187,20 @@ class TranscribeSpeechAction(object):
             goal: UNUSED - actionlib requires a goal argument in the execute callback, but
                 this action server does not use a goal.
         """
+        rospy.loginfo("Request Received")
         if self._action_server.is_preempt_requested():
             return
         # Since we are about to listen, reset the timer for adjusting the microphone for ambient noise
         # as this assumes self._timer_duration seconds of silence before adjusting
         self._reset_timer()
-        with self._configure_microphone() as src:
-            self._listening = True
-            wav_data = self.recogniser.listen(
-                src,
-                timeout=self._model_params.start_timeout,
-                phrase_time_limit=self._model_params.end_timeout,
-            ).get_wav_data()
+        with noalsaerr():
+            with self._configure_microphone() as src:
+                self._listening = True
+                wav_data = self.recogniser.listen(
+                    src,
+                    timeout=self._model_params.start_timeout,
+                    phrase_time_limit=self._model_params.end_timeout,
+                ).get_wav_data()
         # Dividing by 32768.0 scales 16-bit signed PCM samples to [-1.0, 1.0)
         float_data = (
             np.frombuffer(wav_data, dtype=np.int16).astype(np.float32, order="C")
             / 32768.0

From ae8e9e189f5440ce6dfeb4700eb4234f499400e6 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 10:47:26 +0000
Subject: [PATCH 11/13] feat: tiago repeat after me demo script

---
 .../CMakeLists.txt                     |  1 +
 .../scripts/repeat_after_me.py         | 58 +++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py

diff --git a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
index 75240eddb..9c2fe1f57 100644
--- a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
@@ -170,6 +170,7 @@ catkin_install_python(PROGRAMS
   nodes/transcribe_microphone_server
   scripts/list_microphones.py
   scripts/test_microphones.py
+  scripts/repeat_after_me.py
   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
 )
 
diff --git a/common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py b/common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py
new file mode 100644
index 000000000..2e6b20622
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_whisper/scripts/repeat_after_me.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+import rospy
+import actionlib
+from lasr_voice import Voice  # type: ignore
+from lasr_speech_recognition_msgs.srv import TranscribeAudio, TranscribeAudioResponse  # type: ignore
+from lasr_speech_recognition_msgs.msg import (  # type: ignore
+    TranscribeSpeechAction,
+    TranscribeSpeechGoal,
+)
+
+# import actionlib
+rospy.init_node("repeat")
+
+USE_ACTIONLIB = True
+
+voice = Voice()
+
+
+if USE_ACTIONLIB:
+    client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
+    client.wait_for_server()
+    repeating = False
+    rospy.loginfo("Done waiting")
+    while not rospy.is_shutdown():
+        goal = TranscribeSpeechGoal()
+        client.send_goal(goal)
+        client.wait_for_result()
+        result = client.get_result()
+        text = result.sequence
+        print(text)
+        if "tiago" in text.lower().strip():
+            if "repeat" in text.lower().strip():
+                repeating = True
+                voice.sync_tts("Okay, I'll start repeating now.")
+                continue
+            elif "stop" in text.lower().strip():
+                repeating = False
+                voice.sync_tts("Okay, I'll stop repeating now.")
+                break
+        if repeating:
+            voice.sync_tts(f"I heard {text}")
+else:
+    transcribe = rospy.ServiceProxy("/whisper/transcribe_audio", TranscribeAudio)
+    repeating = False
+    while not rospy.is_shutdown():
+        text = transcribe().phrase
+        print(text)
+        if "tiago" in text.lower().strip():
+            if "repeat" in text.lower().strip():
+                repeating = True
+                voice.sync_tts("Okay, I'll start repeating now.")
+                continue
+            elif "stop" in text.lower().strip():
+                repeating = False
+                voice.sync_tts("Okay, I'll stop repeating now.")
+                break
+        if repeating:
+            voice.sync_tts(f"I heard {text}")

From 8745bf41d79341b0c5300fc3b73461074f9cd761 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 11:16:01 +0000
Subject: [PATCH 12/13] feat: test speech server locally with microphones

---
 .../CMakeLists.txt                     |  1 +
 .../scripts/test_speech_server.py      | 21 +++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py

diff --git a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
index 9c2fe1f57..a11465954 100644
--- a/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
+++ b/common/speech/lasr_speech_recognition_whisper/CMakeLists.txt
@@ -171,6 +171,7 @@ catkin_install_python(PROGRAMS
   scripts/list_microphones.py
   scripts/test_microphones.py
   scripts/repeat_after_me.py
+  scripts/test_speech_server.py
   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
 )
 
diff --git a/common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py b/common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py
new file mode 100644
index 000000000..fef16eb0c
--- /dev/null
+++ b/common/speech/lasr_speech_recognition_whisper/scripts/test_speech_server.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+import rospy
+import actionlib
+from lasr_speech_recognition_msgs.srv import TranscribeAudio, TranscribeAudioResponse  # type: ignore
+from lasr_speech_recognition_msgs.msg import (  # type: ignore
+    TranscribeSpeechAction,
+    TranscribeSpeechGoal,
+)
+
+
+rospy.init_node("test_speech_server")
+client = actionlib.SimpleActionClient("transcribe_speech", TranscribeSpeechAction)
+client.wait_for_server()
+rospy.loginfo("Done waiting")
+while not rospy.is_shutdown():
+    goal = TranscribeSpeechGoal()
+    client.send_goal(goal)
+    client.wait_for_result()
+    result = client.get_result()
+    text = result.sequence
+    print(f"Transcribed Speech: {text}")
From 1b4136314995d8c551a7bdb915ad58f625ec5c28 Mon Sep 17 00:00:00 2001
From: m-barker
Date: Mon, 29 Jan 2024 11:18:19 +0000
Subject: [PATCH 13/13] docs: update usage with two new scripts

---
 .../lasr_speech_recognition_whisper/doc/USAGE.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
index 8730a6645..2bb966c13 100644
--- a/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
+++ b/common/speech/lasr_speech_recognition_whisper/doc/USAGE.md
@@ -41,4 +41,16 @@ Several command line configuration options exist, which can be viewed with:
 
 ```bash
 rosrun lasr_speech_recognition_whisper transcribe_microphone_server --help
 ```
+
+Get TIAGo to repeat the transcribed speech back using TTS; it will begin repeating after hearing "tiago, repeat ..." and stop once it hears "tiago, stop ...":
+
+```bash
+rosrun lasr_speech_recognition_whisper repeat_after_me.py
+```
+
+To constantly listen and print the transcribed speech in the command line (by repeatedly sending requests to the actionlib server), run the following script:
+
+```bash
+rosrun lasr_speech_recognition_whisper test_speech_server.py
+```