Skip to content

Commit

Permalink
feat: add text detection and object mapping support (#197)
Browse files Browse the repository at this point in the history
  • Loading branch information
yoshi-automation authored and JustinBeckwith committed Feb 21, 2019
1 parent e33937e commit b4d993c
Show file tree
Hide file tree
Showing 3 changed files with 372 additions and 7 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2017 Google Inc.
// Copyright 2018 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -11,6 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

Expand Down Expand Up @@ -101,6 +102,9 @@ message VideoContext {

// Config for SPEECH_TRANSCRIPTION.
SpeechTranscriptionConfig speech_transcription_config = 6;

// Config for TEXT_DETECTION.
TextDetectionConfig text_detection_config = 8;
}

// Config for LABEL_DETECTION.
Expand Down Expand Up @@ -148,6 +152,16 @@ message FaceDetectionConfig {
bool include_bounding_boxes = 2;
}

// Configuration options for the TEXT_DETECTION feature.
message TextDetectionConfig {
  // Optional hints naming the language(s) expected in the detected text.
  // Supply them when the language is known in advance; doing so can improve
  // detection accuracy. Each hint must be a language code in BCP-47 format.
  //
  // When no hint is provided, the language is detected automatically.
  repeated string language_hints = 1;
}

// Video segment.
message VideoSegment {
// Time-offset, relative to the beginning of the video,
Expand Down Expand Up @@ -305,6 +319,14 @@ message VideoAnnotationResults {
// Speech transcription.
repeated SpeechTranscription speech_transcriptions = 11;

// OCR text detection and tracking.
// Annotations for the list of detected text snippets. Each snippet has a
// list of frame information associated with it.
repeated TextAnnotation text_annotations = 12;

// Annotations for the list of objects detected and tracked in the video.
repeated ObjectTrackingAnnotation object_annotations = 14;

// If set, indicates an error. Note that for a single `AnnotateVideoRequest`
// some videos may succeed and some may fail.
google.rpc.Status error = 9;
Expand Down Expand Up @@ -479,6 +501,115 @@ message WordInfo {
int32 speaker_tag = 5;
}

// A single 2D point (vertex) in the image.
// NOTE: vertex coordinates are normalized; they are relative to the original
// image and range from 0 to 1.
message NormalizedVertex {
  // Normalized X coordinate.
  float x = 1;

  // Normalized Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text, which need not be axis-aligned.
// Holds the corner points in clockwise order, beginning at the top-left
// corner. For example, for a rectangular bounding box around horizontal text
// the vertices might look like:
//         0----1
//         |    |
//         3----2
//
// After a 180-degree clockwise rotation around the top-left corner the box
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order is still (0, 1, 2, 3). Note that values can be less
// than 0 or greater than 1 because of the trigonometric calculations used to
// locate the box.
message NormalizedBoundingPoly {
  // The normalized vertices that make up the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Segment-level annotation results for text detection.
message TextSegment {
  // The video segment in which a text snippet was detected.
  VideoSegment segment = 1;

  // Confidence for the track of detected text: the highest value over all
  // frames in which the OCR-detected text appears.
  float confidence = 2;

  // Per-frame information for the frames in which the OCR-detected text
  // appears.
  repeated TextFrame frames = 3;
}

// Frame-level annotation results for text annotation (OCR). Carries the
// timestamp and bounding-box location for a frame that contains a detected
// OCR text snippet.
message TextFrame {
  // Bounding polygon enclosing the detected text in this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;

  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations for a single detected OCR text snippet: the text itself, plus
// the confidence value and frame-level information for each detection
// (carried by the segments).
message TextAnnotation {
  // The text that was detected.
  string text = 1;

  // Every video segment in which the OCR-detected text appears.
  repeated TextSegment segments = 2;
}

// Frame-level annotation for object detection and tracking. This message
// stores the per-frame location and time offset of a tracked object.
// NOTE(review): the previous comment also listed a per-frame confidence, but
// no such field is declared here; the track-level confidence is declared on
// ObjectTrackingAnnotation.
message ObjectTrackingFrame {
  // Normalized bounding box locating this object track within the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;

  // Timestamp of the frame.
  // NOTE(review): previously documented as "in microseconds"; the value is a
  // google.protobuf.Duration, so confirm the intended resolution.
  google.protobuf.Duration time_offset = 2;
}

// Annotations for a single tracked object.
message ObjectTrackingAnnotation {
  // Tracking info is represented differently in non-streaming batch mode and
  // in streaming mode; at most one of the members below is set.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // The video segment in which this object track appears; each object
    // track corresponds to one segment.
    VideoSegment segment = 3;

    // Streaming mode ONLY.
    // In streaming mode the end time of a tracked object is not known until
    // the track is completed, so no VideoSegment info is returned. A unique
    // identifiable integer track_id is provided instead, so that customers
    // can correlate the results of the ongoing ObjectTrackingAnnotation with
    // the same track_id over time.
    int64 track_id = 5;
  }

  // Entity specifying the object category that this track is labeled as.
  Entity entity = 1;

  // Confidence of this track's object category labeling.
  float confidence = 4;

  // Frame information for every frame in which this object track appears.
  // Non-streaming batch mode: may contain one or multiple
  // ObjectTrackingFrame messages.
  // Streaming mode: can contain only one ObjectTrackingFrame message.
  repeated ObjectTrackingFrame frames = 2;
}

// Video annotation feature.
enum Feature {
// Unspecified.
Expand All @@ -498,6 +629,12 @@ enum Feature {

// Speech transcription.
SPEECH_TRANSCRIPTION = 6;

// OCR text detection and tracking.
TEXT_DETECTION = 7;

// Object detection and tracking.
OBJECT_TRACKING = 9;
}

// Label detection mode.
Expand Down
Loading

0 comments on commit b4d993c

Please sign in to comment.