feat: add text detection and object mapping support (#197)

googleapis · Feb 21, 2019 · b4d993c · b4d993c
1 parent e33937e
commit b4d993c
Show file tree

Hide file tree

Showing 3 changed files with 372 additions and 7 deletions.
diff --git a/...cloud-videointelligence/protos/google/cloud/videointelligence/v1/video_intelligence.proto b/...cloud-videointelligence/protos/google/cloud/videointelligence/v1/video_intelligence.proto
@@ -1,4 +1,4 @@
-// Copyright 2017 Google Inc.
+// Copyright 2018 Google LLC.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+//
 
 syntax = "proto3";
 
@@ -101,6 +102,9 @@ message VideoContext {
 
   // Config for SPEECH_TRANSCRIPTION.
   SpeechTranscriptionConfig speech_transcription_config = 6;
+
+  // Config for TEXT_DETECTION.
+  TextDetectionConfig text_detection_config = 8;
 }
 
 // Config for LABEL_DETECTION.
@@ -148,6 +152,16 @@ message FaceDetectionConfig {
   bool include_bounding_boxes = 2;
 }
 
+// Config for TEXT_DETECTION.
+message TextDetectionConfig {
+  // Language hints can be specified if the language to be detected is known a
+  // priori. They can increase the accuracy of the detection. Each language
+  // hint must be a language code in BCP-47 format.
+  //
+  // Automatic language detection is performed if no hint is provided.
+  repeated string language_hints = 1;
+}
+
 // Video segment.
 message VideoSegment {
   // Time-offset, relative to the beginning of the video,
@@ -305,6 +319,14 @@ message VideoAnnotationResults {
   // Speech transcription.
   repeated SpeechTranscription speech_transcriptions = 11;
 
+  // OCR text detection and tracking.
+  // Annotations for the list of detected text snippets. Each will have a list
+  // of frame information associated with it.
+  repeated TextAnnotation text_annotations = 12;
+
+  // Annotations for list of objects detected and tracked in video.
+  repeated ObjectTrackingAnnotation object_annotations = 14;
+
   // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
   // some videos may succeed and some may fail.
   google.rpc.Status error = 9;
@@ -479,6 +501,115 @@ message WordInfo {
   int32 speaker_tag = 5;
 }
 
+// A vertex represents a 2D point in the image.
+// NOTE: the normalized vertex coordinates are relative to the original image
+// and range from 0 to 1.
+message NormalizedVertex {
+  // X coordinate.
+  float x = 1;
+
+  // Y coordinate.
+  float y = 2;
+}
+
+// Normalized bounding polygon for text (that might not be axis-aligned).
+// Contains the list of corner points in clockwise order starting from the
+// top-left corner. For example, for a rectangular bounding box:
+// When the text is horizontal it might look like:
+//         0----1
+//         |    |
+//         3----2
+//
+// When it's clockwise rotated 180 degrees around the top-left corner it
+// becomes:
+//         2----3
+//         |    |
+//         1----0
+//
+// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
+// than 0, or greater than 1 due to trigonometric calculations for location of
+// the box.
+message NormalizedBoundingPoly {
+  // Normalized vertices of the bounding polygon.
+  repeated NormalizedVertex vertices = 1;
+}
+
+// Video segment level annotation results for text detection.
+message TextSegment {
+  // Video segment where a text snippet was detected.
+  VideoSegment segment = 1;
+
+  // Confidence for the track of detected text. It is calculated as the highest
+  // value over all frames where OCR detected text appears.
+  float confidence = 2;
+
+  // Information related to the frames where OCR detected text appears.
+  repeated TextFrame frames = 3;
+}
+
+// Video frame level annotation results for text annotation (OCR).
+// Contains information regarding timestamp and bounding box locations for the
+// frames containing detected OCR text snippets.
+message TextFrame {
+  // Bounding polygon of the detected text for this frame.
+  NormalizedBoundingPoly rotated_bounding_box = 1;
+
+  // Timestamp of this frame.
+  google.protobuf.Duration time_offset = 2;
+}
+
+// Annotations related to one detected OCR text snippet. This will contain the
+// corresponding text, confidence value, and frame level information for each
+// detection.
+message TextAnnotation {
+  // The detected text.
+  string text = 1;
+
+  // All video segments where OCR detected text appears.
+  repeated TextSegment segments = 2;
+}
+
+// Video frame level annotations for object detection and tracking. This
+// message stores the per-frame location and time offset.
+message ObjectTrackingFrame {
+  // The normalized bounding box location of this object track for the frame.
+  NormalizedBoundingBox normalized_bounding_box = 1;
+
+  // The timestamp of the frame, expressed as a Duration time offset.
+  google.protobuf.Duration time_offset = 2;
+}
+
+// Annotations corresponding to one tracked object.
+message ObjectTrackingAnnotation {
+  // Different representation of tracking info in non-streaming batch
+  // and streaming modes.
+  oneof track_info {
+    // Non-streaming batch mode ONLY.
+    // Each object track corresponds to one video segment where it appears.
+    VideoSegment segment = 3;
+
+    // Streaming mode ONLY.
+    // In streaming mode, we do not know the end time of a tracked object
+    // before it is completed. Hence, there is no VideoSegment info returned.
+    // Instead, we provide a uniquely identifiable integer track_id so that
+    // the customers can correlate the results of the ongoing
+    // ObjectTrackingAnnotation of the same track_id over time.
+    int64 track_id = 5;
+  }
+
+  // Entity to specify the object category that this track is labeled as.
+  Entity entity = 1;
+
+  // Object category's labeling confidence of this track.
+  float confidence = 4;
+
+  // Information corresponding to all frames where this object track appears.
+  // Non-streaming batch mode: frames may contain one or multiple
+  // ObjectTrackingFrame messages.
+  // Streaming mode: frames can contain only one ObjectTrackingFrame message.
+  repeated ObjectTrackingFrame frames = 2;
+}
+
 // Video annotation feature.
 enum Feature {
   // Unspecified.
@@ -498,6 +629,12 @@ enum Feature {
 
   // Speech transcription.
   SPEECH_TRANSCRIPTION = 6;
+
+  // OCR text detection and tracking.
+  TEXT_DETECTION = 7;
+
+  // Object detection and tracking.
+  OBJECT_TRACKING = 9;
 }
 
 // Label detection mode.