src/waymo_open_dataset/label.proto

/* Copyright 2019 The Waymo Open Dataset Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto2";

package waymo.open_dataset;

import "waymo_open_dataset/protos/keypoint.proto";

message Label {
  // Upright box, zero pitch and roll.
  message Box {
    // Box coordinates in vehicle frame.
    optional double center_x = 1;
    optional double center_y = 2;
    optional double center_z = 3;

    // Dimensions of the box. length: dim x. width: dim y. height: dim z.
    optional double length = 5;
    optional double width = 4;
    optional double height = 6;

    // The heading of the bounding box (in radians).  The heading is the angle
    // required to rotate +x to the surface normal of the box front face. It is
    // normalized to [-pi, pi).
    optional double heading = 7;

    enum Type {
      TYPE_UNKNOWN = 0;
      // 7-DOF 3D (a.k.a upright 3D box).
      TYPE_3D = 1;
      // 5-DOF 2D. Mostly used for laser top down representation.
      TYPE_2D = 2;
      // Axis aligned 2D. Mostly used for image.
      TYPE_AA_2D = 3;
    }
  }

  optional Box box = 1;

  message Metadata {
    optional double speed_x = 1;
    optional double speed_y = 2;
    optional double speed_z = 5;
    optional double accel_x = 3;
    optional double accel_y = 4;
    optional double accel_z = 6;
  }
  optional Metadata metadata = 2;

  enum Type {
    TYPE_UNKNOWN = 0;
    TYPE_VEHICLE = 1;
    TYPE_PEDESTRIAN = 2;
    TYPE_SIGN = 3;
    TYPE_CYCLIST = 4;
  }
  optional Type type = 3;
  // Object ID.
  optional string id = 4;

  // The difficulty level of this label. The higher the level, the harder it is.
  enum DifficultyLevel {
    UNKNOWN = 0;
    LEVEL_1 = 1;
    LEVEL_2 = 2;
  }

  // Difficulty level for detection problem.
  optional DifficultyLevel detection_difficulty_level = 5;
  // Difficulty level for tracking problem.
  optional DifficultyLevel tracking_difficulty_level = 6;

  // The total number of lidar points in this box.
  optional int32 num_lidar_points_in_box = 7;
  // The total number of top lidar points in this box.
  optional int32 num_top_lidar_points_in_box = 13;

  oneof keypoints_oneof {
    // Used if the Label is a part of `Frame.laser_labels`.
    keypoints.LaserKeypoints laser_keypoints = 8;
    // Used if the Label is a part of `Frame.camera_labels`.
    keypoints.CameraKeypoints camera_keypoints = 9;
  }

  // Information to cross reference between labels for different modalities.
  message Association {
    // Currently only CameraLabels with class `TYPE_PEDESTRIAN` store
    // information about associated lidar objects.
    optional string laser_object_id = 1;
  }
  optional Association association = 10;

  // Used by Lidar labels to store in which camera it is mostly visible.
  optional string most_visible_camera_name = 11;
  // Used by Lidar labels to store a camera-synchronized box corresponding to
  // the camera indicated by `most_visible_camera_name`. Currently, the boxes
  // are shifted to the time when the most visible camera captures the center of
  // the box, taking into account the rolling shutter of that camera.
  // Specifically, given the object box living at the start of the Open Dataset
  // frame (t_frame) with center position (c) and velocity (v), we aim to find
  // the camera capture time (t_capture), when the camera indicated by
  // `most_visible_camera_name` captures the center of the object. To this end,
  //  we solve the rolling shutter optimization considering both ego and object
  //  motion:
  //   t_capture = image_column_to_time(
  //       camera_projection(c + v * (t_capture - t_frame),
  //                         transform_vehicle(t_capture - t_ref),
  //                         cam_params)),
  // where transform_vehicle(t_capture - t_frame) is the vehicle transform from
  // a pose reference time t_ref to t_capture considering the ego motion, and
  // cam_params is the camera extrinsic and intrinsic parameters.
  // We then move the label box to t_capture by updating the center of the box
  // as follows:
  //   c_camra_synced = c + v * (t_capture - t_frame),
  // while keeping the box dimensions and heading direction.
  // We use the camera_synced_box as the ground truth box for the 3D Camera-Only
  // Detection Challenge. This makes the assumption that the users provide the
  // detection at the same time as the most visible camera captures the object
  // center.
  optional Box camera_synced_box = 12;


}

// Non-self-intersecting 2d polygons. This polygon is not necessarily convex.
message Polygon2dProto {
  repeated double x = 1;
  repeated double y = 2;

  // A globally unique ID.
  optional string id = 3;
}