Merge remote-tracking branch 'upstream/main'

MBTMBTMBT · Jun 24, 2024 · 90d8c3c · 90d8c3c
2 parents c90c6bb + 1eee0bb
commit 90d8c3c
Show file tree

Hide file tree

Showing 7 changed files with 98 additions and 62 deletions.
diff --git a/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server b/common/speech/lasr_speech_recognition_whisper/nodes/transcribe_microphone_server
@@ -26,23 +26,25 @@ class speech_model_params:
         Must be a valid Whisper model name.
         device (str, optional): Device to run the model on. Defaults to "cuda" if available, otherwise "cpu".
         start_timeout (float): Max number of seconds of silence when starting listening before stopping. Defaults to 5.0.
-        end_timeout (Optional[float]): Max number of seconds of silence after starting listening before stopping. Defaults to None.
+        phrase_duration (Optional[float]): Max number of seconds of the phrase. Defaults to 10 seconds.
         sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
         mic_device (Optional[str]): Microphone device index or name. Defaults to None.
         timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
         warmup (bool): Whether to warmup the model by running inference on a test file. Defaults to True.
         energy_threshold (Optional[int]): Energy threshold for silence detection. Using this disables automatic adjustment. Defaults to None.
+        pause_threshold (Optional[float]): Seconds of non-speaking audio before a phrase is considered complete. Defaults to 0.8 seconds.
     """
 
     model_name: str = "medium.en"
     device: str = "cuda" if torch.cuda.is_available() else "cpu"
     start_timeout: float = 5.0
-    end_timeout: Optional[float] = None
+    phrase_duration: Optional[float] = 10
     sample_rate: int = 16000
     mic_device: Optional[str] = None
     timer_duration: Optional[int] = 20
     warmup: bool = True
-    energy_threshold: Optional[int] = None
+    energy_threshold: Optional[int] = 600
+    pause_threshold: Optional[float] = 2.0
 
 
 class TranscribeSpeechAction(object):
@@ -122,6 +124,9 @@ class TranscribeSpeechAction(object):
         self._listening = True
         recogniser = sr.Recognizer()
 
+        if self._model_params.pause_threshold:
+            recogniser.pause_threshold = self._model_params.pause_threshold
+
         if self._model_params.energy_threshold:
             recogniser.dynamic_energy_threshold = False
             recogniser.energy_threshold = self._model_params.energy_threshold
@@ -160,7 +165,7 @@ class TranscribeSpeechAction(object):
             wav_data = self.recogniser.listen(
                 src,
                 timeout=self._model_params.start_timeout,
-                phrase_time_limit=self._model_params.end_timeout,
+                phrase_time_limit=self._model_params.phrase_duration,
             ).get_wav_data()
         # Magic number 32768.0 is the maximum value of a 16-bit signed integer
         float_data = (
@@ -233,10 +238,10 @@ def parse_args() -> dict:
         help="Timeout for listening for the start of a phrase.",
     )
     parser.add_argument(
-        "--end_timeout",
+        "--phrase_duration",
         type=float,
-        default=None,
-        help="Timeout for listening for the end of a phrase.",
+        default=10,
+        help="Maximum phrase duration after starting listening in seconds.",
     )
     parser.add_argument(
         "--sample_rate",
@@ -259,10 +264,17 @@ def parse_args() -> dict:
     parser.add_argument(
         "--energy_threshold",
         type=int,
-        default=None,
+        default=600,
         help="Energy threshold for silence detection. Using this disables automatic adjustment",
     )
 
+    parser.add_argument(
+        "--pause_threshold",
+        type=float,
+        default=2.0,
+        help="Seconds of non-speaking audio before a phrase is considered complete.",
+    )
+
     args, unknown = parser.parse_known_args()
     return vars(args)
 
@@ -284,14 +296,18 @@ def configure_model_params(config: dict) -> speech_model_params:
         model_params.device = config["device"]
     if config["start_timeout"]:
         model_params.start_timeout = config["start_timeout"]
-    if config["end_timeout"]:
-        model_params.end_timeout = config["end_timeout"]
+    if config["phrase_duration"]:
+        model_params.phrase_duration = config["phrase_duration"]
     if config["sample_rate"]:
         model_params.sample_rate = config["sample_rate"]
     if config["mic_device"]:
         model_params.mic_device = config["mic_device"]
     if config["no_warmup"]:
         model_params.warmup = False
+    if config["energy_threshold"]:
+        model_params.energy_threshold = config["energy_threshold"]
+    if config["pause_threshold"]:
+        model_params.pause_threshold = config["pause_threshold"]
 
     return model_params
 

diff --git a/skills/src/lasr_skills/describe_people.py b/skills/src/lasr_skills/describe_people.py
@@ -40,7 +40,12 @@ def __init__(self):
             )
             smach.StateMachine.add(
                 "GET_IMAGE",
-                GetCroppedImage(object_name="person", crop_method=crop_method, rgb_topic=rgb_topic, use_mask=True),
+                GetCroppedImage(
+                    object_name="person",
+                    crop_method=crop_method,
+                    rgb_topic=rgb_topic,
+                    use_mask=True,
+                ),
                 transitions={
                     "succeeded": "CONVERT_IMAGE",
                     "failed": "SAY_GET_IMAGE_AGAIN",
@@ -234,9 +239,14 @@ def execute(self, userdata):
                         rospy.logdebug(f"|> Person does not have {part} visible")
                         continue
 
-                    if part.name == "torso_front" or part.name == "torso_back":
+                    if (
+                        part.part_name == "torso_front"
+                        or part.part_name == "torso_back"
+                    ):
                         torso_mask = np.logical_or(torso_mask, part_mask)
-                    elif part.name == "left_face" or part.name == "right_face":
+                    elif (
+                        part.part_name == "left_face" or part.part_name == "right_face"
+                    ):
                         head_mask = np.logical_or(head_mask, part_mask)
 
                 torso_mask_data, torso_mask_shape, torso_mask_dtype = numpy2message(

diff --git a/skills/src/lasr_skills/look_at_person.py b/skills/src/lasr_skills/look_at_person.py
@@ -2,8 +2,10 @@
 from geometry_msgs.msg import PointStamped
 import smach
 from .vision import GetPointCloud
-from lasr_vision_msgs.srv import BodyPixDetection, BodyPixDetectionRequest
-from lasr_vision_msgs.msg import BodyPixMaskRequest
+from lasr_vision_msgs.srv import (
+    BodyPixKeypointDetection,
+    BodyPixKeypointDetectionRequest,
+)
 from lasr_skills import LookToPoint, DetectFaces
 import cv2
 import rospy
@@ -155,38 +157,39 @@ def execute(self, userdata):
 
             return "failed"
 
-    @smach.cb_interface(input_keys=["poses", "detections"], output_keys=["bbox_eyes"])
+    @smach.cb_interface(
+        input_keys=["keypoints", "detections"], output_keys=["bbox_eyes"]
+    )
     def match_poses_and_detections(ud):
         bbox_eyes = []
-        for pose in ud.poses:
+        for keypoint in ud.keypoints:
             for detection in ud.detections.detections:
                 temp = {
                     "bbox": detection.xywh,
                 }
-                for keypoint in pose.keypoints:
-                    if (
-                        keypoint.part == "leftEye"
-                        and detection.xywh[0]
-                        < keypoint.xy[0]
-                        < detection.xywh[0] + detection.xywh[2]
-                        and detection.xywh[1]
-                        < keypoint.xy[1]
-                        < detection.xywh[1] + detection.xywh[3]
-                    ):
-                        temp["left_eye"] = keypoint.xy
-                    if (
-                        keypoint.part == "rightEye"
-                        and detection.xywh[0]
-                        < keypoint.xy[0]
-                        < detection.xywh[0] + detection.xywh[2]
-                        and detection.xywh[1]
-                        < keypoint.xy[1]
-                        < detection.xywh[1] + detection.xywh[3]
-                    ):
-                        temp["right_eye"] = keypoint.xy
+                if (
+                    keypoint.keypoint_name == "leftEye"
+                    and detection.xywh[0]
+                    < keypoint.x
+                    < detection.xywh[0] + detection.xywh[2]
+                    and detection.xywh[1]
+                    < keypoint.y
+                    < detection.xywh[1] + detection.xywh[3]
+                ):
+                    temp["left_eye"] = keypoint.x, keypoint.y
+                if (
+                    keypoint.keypoint_name == "rightEye"
+                    and detection.xywh[0]
+                    < keypoint.x
+                    < detection.xywh[0] + detection.xywh[2]
+                    and detection.xywh[1]
+                    < keypoint.y
+                    < detection.xywh[1] + detection.xywh[3]
+                ):
+                    temp["right_eye"] = keypoint.x, keypoint.y
 
-                    if "left_eye" in temp and "right_eye" in temp:
-                        bbox_eyes.append(temp)
+                if "left_eye" in temp and "right_eye" in temp:
+                    bbox_eyes.append(temp)
 
         ud.bbox_eyes = bbox_eyes
 
@@ -226,21 +229,17 @@ def __init__(self, filter=False):
                         },
                     )
 
-            eyes = BodyPixMaskRequest()
-            eyes.parts = ["left_eye", "right_eye"]
-            masks = [eyes]
-
             smach.StateMachine.add(
                 "SEGMENT_PERSON",
                 smach_ros.ServiceState(
-                    "/bodypix/detect",
-                    BodyPixDetection,
-                    request_cb=lambda ud, _: BodyPixDetectionRequest(
-                        pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.7, masks
+                    "/bodypix/keypoint_detection",
+                    BodyPixKeypointDetection,
+                    request_cb=lambda ud, _: BodyPixKeypointDetectionRequest(
+                        pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.2
                     ),
-                    response_slots=["masks", "poses"],
+                    response_slots=["keypoints"],
                     input_keys=["pcl_msg"],
-                    output_keys=["masks", "poses"],
+                    output_keys=["keypoints"],
                 ),
                 transitions={
                     "succeeded": "DETECT_FACES",
@@ -262,8 +261,8 @@ def __init__(self, filter=False):
                 "MATCH_POSES_AND_DETECTIONS",
                 CBState(
                     self.match_poses_and_detections,
-                    input_keys=["poses", "detections"],
-                    output_keys=["poses"],
+                    input_keys=["keypoints", "detections"],
+                    output_keys=["keypoints"],
                     outcomes=["succeeded", "failed"],
                 ),
                 transitions={"succeeded": "CHECK_EYES", "failed": "failed"},

diff --git a/tasks/carry_my_luggage/launch/setup.launch b/tasks/carry_my_luggage/launch/setup.launch
@@ -15,5 +15,5 @@
 
     <!--      PERCEPTION     -->
     <include file="$(find lasr_vision_yolov8)/launch/service.launch" />
-    <include file="$(find lasr_vision_bodypix)/launch/service.launch" />
+    <include file="$(find lasr_vision_bodypix)/launch/mask_service.launch" />
 </launch>
diff --git a/tasks/receptionist/launch/setup.launch b/tasks/receptionist/launch/setup.launch
@@ -4,7 +4,7 @@
     <arg name="config" default="lab"/>
 
     <!--     INTERACTION     -->
-    <arg name="whisper_device_param" default="13" />
+    <arg name="whisper_device_param" default="9" />
     <node pkg="lasr_speech_recognition_whisper" type="transcribe_microphone_server" name="transcribe_speech" output="screen" args="--mic_device $(arg whisper_device_param)"/>
 
     <!--     STATIC POINTS     -->
@@ -21,7 +21,7 @@
 
     <node pkg="lasr_vision_feature_extraction" type="service" name="torch_service" output="screen"/>
     <arg name="debug" default="true" />
-    <node pkg="lasr_vision_bodypix" type="service" name="bodypix_service" output="screen" args="--debug $(arg debug)"/>
+    <node pkg="lasr_vision_bodypix" type="mask_service.py" name="bodypix_service" output="screen" args="--debug $(arg debug)"/>
     <node pkg="lasr_vision_deepface" type="service" name="deepface_service" output="screen"/>
 
 </launch>
diff --git a/tasks/receptionist/scripts/main.py b/tasks/receptionist/scripts/main.py
@@ -10,25 +10,21 @@
 if __name__ == "__main__":
     rospy.init_node("receptionist_robocup")
     wait_pose_param = rospy.get_param("/receptionist/wait_pose")
-    # wait_pose_param = rospy.get_param("/wait_pose/")
 
     wait_pose = Pose(
         position=Point(**wait_pose_param["position"]),
         orientation=Quaternion(**wait_pose_param["orientation"]),
     )
 
-    # wait_area_param = rospy.get_param("/wait_area")
     wait_area_param = rospy.get_param("/receptionist/wait_area")
     wait_area = Polygon(wait_area_param)
 
-    # seat_pose_param = rospy.get_param("/seat_pose")
     seat_pose_param = rospy.get_param("/receptionist/seat_pose")
     seat_pose = Pose(
         position=Point(**seat_pose_param["position"]),
         orientation=Quaternion(**seat_pose_param["orientation"]),
     )
 
-    # seat_area_param = rospy.get_param("/seat_area")
     seat_area_param = rospy.get_param("/receptionist/seat_area")
 
     seat_area = Polygon(seat_area_param)

diff --git a/tasks/receptionist/src/receptionist/states/introduce.py b/tasks/receptionist/src/receptionist/states/introduce.py
@@ -46,7 +46,7 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:
     known_attributes = {}
 
     for attribute, value in relevant_guest_data["attributes"].items():
-        if value != "unknown" and value != False and value != "No_Beard":
+        if value != "unknown":
             known_attributes[attribute] = value
     print("These are the known attributes")
     print(known_attributes)
@@ -98,11 +98,26 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:
                 )
                 detection = True
         if attribute == "facial_hair":
-            guest_str += f"They have facial hair."
             detection = True
+            if relevant_guest_data["attributes"][attribute] != "No_Beard":
+                guest_str += f"They have facial hair."
+            else:
+                guest_str += f"They do not have facial hair."
+        if attribute == "glasses":
+            detection = True
+            if relevant_guest_data["attributes"][attribute]:
+                guest_str += f"They are wearing glasses."
+            else:
+                guest_str += f"They are not wearing glasses."
+        if attribute == "hat":
+            detection = True
+            if relevant_guest_data["attributes"][attribute]:
+                guest_str += f"They are wearing a hat."
+            else:
+                guest_str += f"They are not wearing a hat."
         if not detection:
             if isSingular(attribute):
-                guest_str += f"They are a wearing {attribute}."
+                guest_str += f"They are wearing a {attribute}."
             else:
                 guest_str += f"They are wearing {attribute}."