Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
Benteng Ma committed Jun 24, 2024
2 parents c90c6bb + 1eee0bb commit 90d8c3c
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 62 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,25 @@ class speech_model_params:
Must be a valid Whisper model name.
device (str, optional): Device to run the model on. Defaults to "cuda" if available, otherwise "cpu".
start_timeout (float): Max number of seconds of silence when starting listening before stopping. Defaults to 5.0.
end_timeout (Optional[float]): Max number of seconds of silence after starting listening before stopping. Defaults to None.
phrase_duration (Optional[float]): Max number of seconds of the phrase. Defaults to 10 seconds.
sample_rate (int): Sample rate of the microphone. Defaults to 16000Hz.
mic_device (Optional[str]): Microphone device index or name. Defaults to None.
timer_duration (Optional[int]): Duration of the timer for adjusting the microphone for ambient noise. Defaults to 20 seconds.
warmup (bool): Whether to warmup the model by running inference on a test file. Defaults to True.
energy_threshold (Optional[int]): Energy threshold for silence detection. Using this disables automatic adjustment. Defaults to None.
pause_threshold (Optional[float]): Seconds of non-speaking audio before a phrase is considered complete. Defaults to 0.8 seconds.
"""

model_name: str = "medium.en"
device: str = "cuda" if torch.cuda.is_available() else "cpu"
start_timeout: float = 5.0
end_timeout: Optional[float] = None
phrase_duration: Optional[float] = 10
sample_rate: int = 16000
mic_device: Optional[str] = None
timer_duration: Optional[int] = 20
warmup: bool = True
energy_threshold: Optional[int] = None
energy_threshold: Optional[int] = 600
pause_threshold: Optional[float] = 2.0


class TranscribeSpeechAction(object):
Expand Down Expand Up @@ -122,6 +124,9 @@ class TranscribeSpeechAction(object):
self._listening = True
recogniser = sr.Recognizer()

if self._model_params.pause_threshold:
recogniser.pause_threshold = self._model_params.pause_threshold

if self._model_params.energy_threshold:
recogniser.dynamic_energy_threshold = False
recogniser.energy_threshold = self._model_params.energy_threshold
Expand Down Expand Up @@ -160,7 +165,7 @@ class TranscribeSpeechAction(object):
wav_data = self.recogniser.listen(
src,
timeout=self._model_params.start_timeout,
phrase_time_limit=self._model_params.end_timeout,
phrase_time_limit=self._model_params.phrase_duration,
).get_wav_data()
# Magic number 32768.0 is the maximum value of a 16-bit signed integer
float_data = (
Expand Down Expand Up @@ -233,10 +238,10 @@ def parse_args() -> dict:
help="Timeout for listening for the start of a phrase.",
)
parser.add_argument(
"--end_timeout",
"--phrase_duration",
type=float,
default=None,
help="Timeout for listening for the end of a phrase.",
default=10,
help="Maximum phrase duration after starting listening in seconds.",
)
parser.add_argument(
"--sample_rate",
Expand All @@ -259,10 +264,17 @@ def parse_args() -> dict:
parser.add_argument(
"--energy_threshold",
type=int,
default=None,
default=600,
help="Energy threshold for silence detection. Using this disables automatic adjustment",
)

parser.add_argument(
"--pause_threshold",
type=float,
default=2.0,
help="Seconds of non-speaking audio before a phrase is considered complete.",
)

args, unknown = parser.parse_known_args()
return vars(args)

Expand All @@ -284,14 +296,18 @@ def configure_model_params(config: dict) -> speech_model_params:
model_params.device = config["device"]
if config["start_timeout"]:
model_params.start_timeout = config["start_timeout"]
if config["end_timeout"]:
model_params.end_timeout = config["end_timeout"]
if config["phrase_duration"]:
model_params.phrase_duration = config["phrase_duration"]
if config["sample_rate"]:
model_params.sample_rate = config["sample_rate"]
if config["mic_device"]:
model_params.mic_device = config["mic_device"]
if config["no_warmup"]:
model_params.warmup = False
if config["energy_threshold"]:
model_params.energy_threshold = config["energy_threshold"]
if config["pause_threshold"]:
model_params.pause_threshold = config["pause_threshold"]

return model_params

Expand Down
16 changes: 13 additions & 3 deletions skills/src/lasr_skills/describe_people.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,12 @@ def __init__(self):
)
smach.StateMachine.add(
"GET_IMAGE",
GetCroppedImage(object_name="person", crop_method=crop_method, rgb_topic=rgb_topic, use_mask=True),
GetCroppedImage(
object_name="person",
crop_method=crop_method,
rgb_topic=rgb_topic,
use_mask=True,
),
transitions={
"succeeded": "CONVERT_IMAGE",
"failed": "SAY_GET_IMAGE_AGAIN",
Expand Down Expand Up @@ -234,9 +239,14 @@ def execute(self, userdata):
rospy.logdebug(f"|> Person does not have {part} visible")
continue

if part.name == "torso_front" or part.name == "torso_back":
if (
part.part_name == "torso_front"
or part.part_name == "torso_back"
):
torso_mask = np.logical_or(torso_mask, part_mask)
elif part.name == "left_face" or part.name == "right_face":
elif (
part.part_name == "left_face" or part.part_name == "right_face"
):
head_mask = np.logical_or(head_mask, part_mask)

torso_mask_data, torso_mask_shape, torso_mask_dtype = numpy2message(
Expand Down
77 changes: 38 additions & 39 deletions skills/src/lasr_skills/look_at_person.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
from geometry_msgs.msg import PointStamped
import smach
from .vision import GetPointCloud
from lasr_vision_msgs.srv import BodyPixDetection, BodyPixDetectionRequest
from lasr_vision_msgs.msg import BodyPixMaskRequest
from lasr_vision_msgs.srv import (
BodyPixKeypointDetection,
BodyPixKeypointDetectionRequest,
)
from lasr_skills import LookToPoint, DetectFaces
import cv2
import rospy
Expand Down Expand Up @@ -155,38 +157,39 @@ def execute(self, userdata):

return "failed"

@smach.cb_interface(input_keys=["poses", "detections"], output_keys=["bbox_eyes"])
@smach.cb_interface(
input_keys=["keypoints", "detections"], output_keys=["bbox_eyes"]
)
def match_poses_and_detections(ud):
bbox_eyes = []
for pose in ud.poses:
for keypoint in ud.keypoints:
for detection in ud.detections.detections:
temp = {
"bbox": detection.xywh,
}
for keypoint in pose.keypoints:
if (
keypoint.part == "leftEye"
and detection.xywh[0]
< keypoint.xy[0]
< detection.xywh[0] + detection.xywh[2]
and detection.xywh[1]
< keypoint.xy[1]
< detection.xywh[1] + detection.xywh[3]
):
temp["left_eye"] = keypoint.xy
if (
keypoint.part == "rightEye"
and detection.xywh[0]
< keypoint.xy[0]
< detection.xywh[0] + detection.xywh[2]
and detection.xywh[1]
< keypoint.xy[1]
< detection.xywh[1] + detection.xywh[3]
):
temp["right_eye"] = keypoint.xy
if (
keypoint.keypoint_name == "leftEye"
and detection.xywh[0]
< keypoint.x
< detection.xywh[0] + detection.xywh[2]
and detection.xywh[1]
< keypoint.y
< detection.xywh[1] + detection.xywh[3]
):
temp["left_eye"] = keypoint.x, keypoint.y
if (
keypoint.keypoint_name == "rightEye"
and detection.xywh[0]
< keypoint.x
< detection.xywh[0] + detection.xywh[2]
and detection.xywh[1]
< keypoint.y
< detection.xywh[1] + detection.xywh[3]
):
temp["right_eye"] = keypoint.x, keypoint.y

if "left_eye" in temp and "right_eye" in temp:
bbox_eyes.append(temp)
if "left_eye" in temp and "right_eye" in temp:
bbox_eyes.append(temp)

ud.bbox_eyes = bbox_eyes

Expand Down Expand Up @@ -226,21 +229,17 @@ def __init__(self, filter=False):
},
)

eyes = BodyPixMaskRequest()
eyes.parts = ["left_eye", "right_eye"]
masks = [eyes]

smach.StateMachine.add(
"SEGMENT_PERSON",
smach_ros.ServiceState(
"/bodypix/detect",
BodyPixDetection,
request_cb=lambda ud, _: BodyPixDetectionRequest(
pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.7, masks
"/bodypix/keypoint_detection",
BodyPixKeypointDetection,
request_cb=lambda ud, _: BodyPixKeypointDetectionRequest(
pcl_to_img_msg(ud.pcl_msg), "resnet50", 0.2
),
response_slots=["masks", "poses"],
response_slots=["keypoints"],
input_keys=["pcl_msg"],
output_keys=["masks", "poses"],
output_keys=["keypoints"],
),
transitions={
"succeeded": "DETECT_FACES",
Expand All @@ -262,8 +261,8 @@ def __init__(self, filter=False):
"MATCH_POSES_AND_DETECTIONS",
CBState(
self.match_poses_and_detections,
input_keys=["poses", "detections"],
output_keys=["poses"],
input_keys=["keypoints", "detections"],
output_keys=["keypoints"],
outcomes=["succeeded", "failed"],
),
transitions={"succeeded": "CHECK_EYES", "failed": "failed"},
Expand Down
2 changes: 1 addition & 1 deletion tasks/carry_my_luggage/launch/setup.launch
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@

<!-- PERCEPTION -->
<include file="$(find lasr_vision_yolov8)/launch/service.launch" />
<include file="$(find lasr_vision_bodypix)/launch/service.launch" />
<include file="$(find lasr_vision_bodypix)/launch/mask_service.launch" />
</launch>
4 changes: 2 additions & 2 deletions tasks/receptionist/launch/setup.launch
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<arg name="config" default="lab"/>

<!-- INTERACTION -->
<arg name="whisper_device_param" default="13" />
<arg name="whisper_device_param" default="9" />
<node pkg="lasr_speech_recognition_whisper" type="transcribe_microphone_server" name="transcribe_speech" output="screen" args="--mic_device $(arg whisper_device_param)"/>

<!-- STATIC POINTS -->
Expand All @@ -21,7 +21,7 @@

<node pkg="lasr_vision_feature_extraction" type="service" name="torch_service" output="screen"/>
<arg name="debug" default="true" />
<node pkg="lasr_vision_bodypix" type="service" name="bodypix_service" output="screen" args="--debug $(arg debug)"/>
<node pkg="lasr_vision_bodypix" type="mask_service.py" name="bodypix_service" output="screen" args="--debug $(arg debug)"/>
<node pkg="lasr_vision_deepface" type="service" name="deepface_service" output="screen"/>

</launch>
4 changes: 0 additions & 4 deletions tasks/receptionist/scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,21 @@
if __name__ == "__main__":
rospy.init_node("receptionist_robocup")
wait_pose_param = rospy.get_param("/receptionist/wait_pose")
# wait_pose_param = rospy.get_param("/wait_pose/")

wait_pose = Pose(
position=Point(**wait_pose_param["position"]),
orientation=Quaternion(**wait_pose_param["orientation"]),
)

# wait_area_param = rospy.get_param("/wait_area")
wait_area_param = rospy.get_param("/receptionist/wait_area")
wait_area = Polygon(wait_area_param)

# seat_pose_param = rospy.get_param("/seat_pose")
seat_pose_param = rospy.get_param("/receptionist/seat_pose")
seat_pose = Pose(
position=Point(**seat_pose_param["position"]),
orientation=Quaternion(**seat_pose_param["orientation"]),
)

# seat_area_param = rospy.get_param("/seat_area")
seat_area_param = rospy.get_param("/receptionist/seat_area")

seat_area = Polygon(seat_area_param)
Expand Down
21 changes: 18 additions & 3 deletions tasks/receptionist/src/receptionist/states/introduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:
known_attributes = {}

for attribute, value in relevant_guest_data["attributes"].items():
if value != "unknown" and value != False and value != "No_Beard":
if value != "unknown":
known_attributes[attribute] = value
print("These are the known attributes")
print(known_attributes)
Expand Down Expand Up @@ -98,11 +98,26 @@ def stringify_guest_data(guest_data: Dict[str, Any], guest_id: str) -> str:
)
detection = True
if attribute == "facial_hair":
guest_str += f"They have facial hair."
detection = True
if relevant_guest_data["attributes"][attribute] != "No_Beard":
guest_str += f"They have facial hair."
else:
guest_str += f"They do not have facial hair."
if attribute == "glasses":
detection = True
if relevant_guest_data["attributes"][attribute]:
guest_str += f"They are wearing glasses."
else:
guest_str += f"They are not wearing glasses."
if attribute == "hat":
detection = True
if relevant_guest_data["attributes"][attribute]:
guest_str += f"They are wearing a hat."
else:
guest_str += f"They are not wearing a hat."
if not detection:
if isSingular(attribute):
guest_str += f"They are a wearing {attribute}."
guest_str += f"They are wearing a {attribute}."
else:
guest_str += f"They are wearing {attribute}."

Expand Down

0 comments on commit 90d8c3c

Please sign in to comment.