From b4d993cfdbff67ae9f4f7a8fd10fd89f89ffe401 Mon Sep 17 00:00:00 2001 From: Yoshi Automation Bot <44816363+yoshi-automation@users.noreply.github.com> Date: Thu, 21 Feb 2019 07:39:44 -0800 Subject: [PATCH] feat: add text detection and object mapping support (#197) --- .../v1/video_intelligence.proto | 139 ++++++++++- .../v1/doc_video_intelligence.js | 230 +++++++++++++++++- .../synth.metadata | 10 +- 3 files changed, 372 insertions(+), 7 deletions(-) diff --git a/packages/google-cloud-videointelligence/protos/google/cloud/videointelligence/v1/video_intelligence.proto b/packages/google-cloud-videointelligence/protos/google/cloud/videointelligence/v1/video_intelligence.proto index 223e866e53a..04875086510 100644 --- a/packages/google-cloud-videointelligence/protos/google/cloud/videointelligence/v1/video_intelligence.proto +++ b/packages/google-cloud-videointelligence/protos/google/cloud/videointelligence/v1/video_intelligence.proto @@ -1,4 +1,4 @@ -// Copyright 2017 Google Inc. +// Copyright 2018 Google LLC. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// syntax = "proto3"; @@ -101,6 +102,9 @@ message VideoContext { // Config for SPEECH_TRANSCRIPTION. SpeechTranscriptionConfig speech_transcription_config = 6; + + // Config for TEXT_DETECTION. + TextDetectionConfig text_detection_config = 8; } // Config for LABEL_DETECTION. @@ -148,6 +152,16 @@ message FaceDetectionConfig { bool include_bounding_boxes = 2; } +// Config for TEXT_DETECTION. +message TextDetectionConfig { + // Language hint can be specified if the language to be detected is known a + // priori. It can increase the accuracy of the detection. Language hint must + // be language code in BCP-47 format. + // + // Automatic language detection is performed if no hint is provided. + repeated string language_hints = 1; +} + // Video segment. message VideoSegment { // Time-offset, relative to the beginning of the video, @@ -305,6 +319,14 @@ message VideoAnnotationResults { // Speech transcription. repeated SpeechTranscription speech_transcriptions = 11; + // OCR text detection and tracking. + // Annotations for list of detected text snippets. Each will have list of + // frame information associated with it. + repeated TextAnnotation text_annotations = 12; + + // Annotations for list of objects detected and tracked in video. + repeated ObjectTrackingAnnotation object_annotations = 14; + // If set, indicates an error. Note that for a single `AnnotateVideoRequest` // some videos may succeed and some may fail. google.rpc.Status error = 9; @@ -479,6 +501,115 @@ message WordInfo { int32 speaker_tag = 5; } +// A vertex represents a 2D point in the image. +// NOTE: the normalized vertex coordinates are relative to the original image +// and range from 0 to 1. +message NormalizedVertex { + // X coordinate. + float x = 1; + + // Y coordinate. + float y = 2; +} + +// Normalized bounding polygon for text (that might not be aligned with axis). +// Contains list of the corner points in clockwise order starting from +// top-left corner. 
For example, for a rectangular bounding box:
+// When the text is horizontal it might look like:
+// 0----1
+// |    |
+// 3----2
+//
+// When it's clockwise rotated 180 degrees around the top-left corner it
+// becomes:
+// 2----3
+// |    |
+// 1----0
+//
+// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
+// than 0, or greater than 1 due to trigonometric calculations for location of
+// the box.
+message NormalizedBoundingPoly {
+  // Normalized vertices of the bounding polygon.
+  repeated NormalizedVertex vertices = 1;
+}
+
+// Video segment level annotation results for text detection.
+message TextSegment {
+  // Video segment where a text snippet was detected.
+  VideoSegment segment = 1;
+
+  // Confidence for the track of detected text. It is calculated as the highest
+  // over all frames where OCR detected text appears.
+  float confidence = 2;
+
+  // Information related to the frames where OCR detected text appears.
+  repeated TextFrame frames = 3;
+}
+
+// Video frame level annotation results for text annotation (OCR).
+// Contains information regarding timestamp and bounding box locations for the
+// frames containing detected OCR text snippets.
+message TextFrame {
+  // Bounding polygon of the detected text for this frame.
+  NormalizedBoundingPoly rotated_bounding_box = 1;
+
+  // Timestamp of this frame.
+  google.protobuf.Duration time_offset = 2;
+}
+
+// Annotations related to one detected OCR text snippet. This will contain the
+// corresponding text, confidence value, and frame level information for each
+// detection.
+message TextAnnotation {
+  // The detected text.
+  string text = 1;
+
+  // All video segments where OCR detected text appears.
+  repeated TextSegment segments = 2;
+}
+
+// Video frame level annotations for object detection and tracking. This field
+// stores per frame location, time offset, and confidence.
+message ObjectTrackingFrame {
+  // The normalized bounding box location of this object track for the frame.
+  NormalizedBoundingBox normalized_bounding_box = 1;
+
+  // The timestamp of the frame in microseconds.
+  google.protobuf.Duration time_offset = 2;
+}
+
+// Annotations corresponding to one tracked object.
+message ObjectTrackingAnnotation {
+  // Different representation of tracking info in non-streaming batch
+  // and streaming modes.
+  oneof track_info {
+    // Non-streaming batch mode ONLY.
+    // Each object track corresponds to one video segment where it appears.
+    VideoSegment segment = 3;
+
+    // Streaming mode ONLY.
+    // In streaming mode, we do not know the end time of a tracked object
+    // before it is completed. Hence, there is no VideoSegment info returned.
+    // Instead, we provide a unique identifiable integer track_id so that
+    // the customers can correlate the results of the ongoing
+    // ObjectTrackAnnotation of the same track_id over time.
+    int64 track_id = 5;
+  }
+
+  // Entity to specify the object category that this track is labeled as.
+  Entity entity = 1;
+
+  // Object category's labeling confidence of this track.
+  float confidence = 4;
+
+  // Information corresponding to all frames where this object track appears.
+  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
+  // messages in frames.
+  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
+  repeated ObjectTrackingFrame frames = 2;
+}
+
 // Video annotation feature.
 enum Feature {
   // Unspecified.
@@ -498,6 +629,12 @@ enum Feature {
   // Speech transcription.
SPEECH_TRANSCRIPTION = 6; + + // OCR text detection and tracking. + TEXT_DETECTION = 7; + + // Object detection and tracking. + OBJECT_TRACKING = 9; } // Label detection mode. diff --git a/packages/google-cloud-videointelligence/src/v1/doc/google/cloud/videointelligence/v1/doc_video_intelligence.js b/packages/google-cloud-videointelligence/src/v1/doc/google/cloud/videointelligence/v1/doc_video_intelligence.js index 2d4edfbcded..128738b27ac 100644 --- a/packages/google-cloud-videointelligence/src/v1/doc/google/cloud/videointelligence/v1/doc_video_intelligence.js +++ b/packages/google-cloud-videointelligence/src/v1/doc/google/cloud/videointelligence/v1/doc_video_intelligence.js @@ -101,6 +101,11 @@ const AnnotateVideoRequest = { * * This object should have the same structure as [SpeechTranscriptionConfig]{@link google.cloud.videointelligence.v1.SpeechTranscriptionConfig} * + * @property {Object} textDetectionConfig + * Config for TEXT_DETECTION. + * + * This object should have the same structure as [TextDetectionConfig]{@link google.cloud.videointelligence.v1.TextDetectionConfig} + * * @typedef VideoContext * @memberof google.cloud.videointelligence.v1 * @see [google.cloud.videointelligence.v1.VideoContext definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto} @@ -188,6 +193,24 @@ const FaceDetectionConfig = { // This is for documentation. Actual contents will be loaded by gRPC. }; +/** + * Config for TEXT_DETECTION. + * + * @property {string[]} languageHints + * Language hint can be specified if the language to be detected is known a + * priori. It can increase the accuracy of the detection. Language hint must + * be language code in BCP-47 format. + * + * Automatic language detection is performed if no hint is provided. + * + * @typedef TextDetectionConfig + * @memberof google.cloud.videointelligence.v1 + * @see [google.cloud.videointelligence.v1.TextDetectionConfig definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto} + */ +const TextDetectionConfig = { + // This is for documentation. Actual contents will be loaded by gRPC. +}; + /** * Video segment. * @@ -480,6 +503,18 @@ const FaceAnnotation = { * * This object should have the same structure as [SpeechTranscription]{@link google.cloud.videointelligence.v1.SpeechTranscription} * + * @property {Object[]} textAnnotations + * OCR text detection and tracking. + * Annotations for list of detected text snippets. Each will have list of + * frame information associated with it. + * + * This object should have the same structure as [TextAnnotation]{@link google.cloud.videointelligence.v1.TextAnnotation} + * + * @property {Object[]} objectAnnotations + * Annotations for list of objects detected and tracked in video. + * + * This object should have the same structure as [ObjectTrackingAnnotation]{@link google.cloud.videointelligence.v1.ObjectTrackingAnnotation} + * * @property {Object} error * If set, indicates an error. Note that for a single `AnnotateVideoRequest` * some videos may succeed and some may fail. @@ -745,6 +780,189 @@ const WordInfo = { // This is for documentation. Actual contents will be loaded by gRPC. }; +/** + * A vertex represents a 2D point in the image. + * NOTE: the normalized vertex coordinates are relative to the original image + * and range from 0 to 1. + * + * @property {number} x + * X coordinate. 
+ *
+ * @property {number} y
+ *   Y coordinate.
+ *
+ * @typedef NormalizedVertex
+ * @memberof google.cloud.videointelligence.v1
+ * @see [google.cloud.videointelligence.v1.NormalizedVertex definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto}
+ */
+const NormalizedVertex = {
+  // This is for documentation. Actual contents will be loaded by gRPC.
+};
+
+/**
+ * Normalized bounding polygon for text (that might not be aligned with axis).
+ * Contains list of the corner points in clockwise order starting from
+ * top-left corner. For example, for a rectangular bounding box:
+ * When the text is horizontal it might look like:
+ * 0----1
+ * |    |
+ * 3----2
+ *
+ * When it's clockwise rotated 180 degrees around the top-left corner it
+ * becomes:
+ * 2----3
+ * |    |
+ * 1----0
+ *
+ * and the vertex order will still be (0, 1, 2, 3). Note that values can be less
+ * than 0, or greater than 1 due to trigonometric calculations for location of
+ * the box.
+ *
+ * @property {Object[]} vertices
+ *   Normalized vertices of the bounding polygon.
+ *
+ *   This object should have the same structure as [NormalizedVertex]{@link google.cloud.videointelligence.v1.NormalizedVertex}
+ *
+ * @typedef NormalizedBoundingPoly
+ * @memberof google.cloud.videointelligence.v1
+ * @see [google.cloud.videointelligence.v1.NormalizedBoundingPoly definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto}
+ */
+const NormalizedBoundingPoly = {
+  // This is for documentation. Actual contents will be loaded by gRPC.
+};
+
+/**
+ * Video segment level annotation results for text detection.
+ *
+ * @property {Object} segment
+ *   Video segment where a text snippet was detected.
+ *
+ *   This object should have the same structure as [VideoSegment]{@link google.cloud.videointelligence.v1.VideoSegment}
+ *
+ * @property {number} confidence
+ *   Confidence for the track of detected text. It is calculated as the highest
+ *   over all frames where OCR detected text appears.
+ *
+ * @property {Object[]} frames
+ *   Information related to the frames where OCR detected text appears.
+ *
+ *   This object should have the same structure as [TextFrame]{@link google.cloud.videointelligence.v1.TextFrame}
+ *
+ * @typedef TextSegment
+ * @memberof google.cloud.videointelligence.v1
+ * @see [google.cloud.videointelligence.v1.TextSegment definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto}
+ */
+const TextSegment = {
+  // This is for documentation. Actual contents will be loaded by gRPC.
+};
+
+/**
+ * Video frame level annotation results for text annotation (OCR).
+ * Contains information regarding timestamp and bounding box locations for the
+ * frames containing detected OCR text snippets.
+ *
+ * @property {Object} rotatedBoundingBox
+ *   Bounding polygon of the detected text for this frame.
+ *
+ *   This object should have the same structure as [NormalizedBoundingPoly]{@link google.cloud.videointelligence.v1.NormalizedBoundingPoly}
+ *
+ * @property {Object} timeOffset
+ *   Timestamp of this frame.
+ * + * This object should have the same structure as [Duration]{@link google.protobuf.Duration} + * + * @typedef TextFrame + * @memberof google.cloud.videointelligence.v1 + * @see [google.cloud.videointelligence.v1.TextFrame definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto} + */ +const TextFrame = { + // This is for documentation. Actual contents will be loaded by gRPC. +}; + +/** + * Annotations related to one detected OCR text snippet. This will contain the + * corresponding text, confidence value, and frame level information for each + * detection. + * + * @property {string} text + * The detected text. + * + * @property {Object[]} segments + * All video segments where OCR detected text appears. + * + * This object should have the same structure as [TextSegment]{@link google.cloud.videointelligence.v1.TextSegment} + * + * @typedef TextAnnotation + * @memberof google.cloud.videointelligence.v1 + * @see [google.cloud.videointelligence.v1.TextAnnotation definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto} + */ +const TextAnnotation = { + // This is for documentation. Actual contents will be loaded by gRPC. +}; + +/** + * Video frame level annotations for object detection and tracking. This field + * stores per frame location, time offset, and confidence. + * + * @property {Object} normalizedBoundingBox + * The normalized bounding box location of this object track for the frame. + * + * This object should have the same structure as [NormalizedBoundingBox]{@link google.cloud.videointelligence.v1.NormalizedBoundingBox} + * + * @property {Object} timeOffset + * The timestamp of the frame in microseconds. + * + * This object should have the same structure as [Duration]{@link google.protobuf.Duration} + * + * @typedef ObjectTrackingFrame + * @memberof google.cloud.videointelligence.v1 + * @see [google.cloud.videointelligence.v1.ObjectTrackingFrame definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto} + */ +const ObjectTrackingFrame = { + // This is for documentation. Actual contents will be loaded by gRPC. +}; + +/** + * Annotations corresponding to one tracked object. + * + * @property {Object} segment + * Non-streaming batch mode ONLY. + * Each object track corresponds to one video segment where it appears. + * + * This object should have the same structure as [VideoSegment]{@link google.cloud.videointelligence.v1.VideoSegment} + * + * @property {number} trackId + * Streaming mode ONLY. + * In streaming mode, we do not know the end time of a tracked object + * before it is completed. Hence, there is no VideoSegment info returned. + * Instead, we provide a unique identifiable integer track_id so that + * the customers can correlate the results of the ongoing + * ObjectTrackAnnotation of the same track_id over time. + * + * @property {Object} entity + * Entity to specify the object category that this track is labeled as. + * + * This object should have the same structure as [Entity]{@link google.cloud.videointelligence.v1.Entity} + * + * @property {number} confidence + * Object category's labeling confidence of this track. + * + * @property {Object[]} frames + * Information corresponding to all frames where this object track appears. 
+ * Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame + * messages in frames. + * Streaming mode: it can only be one ObjectTrackingFrame message in frames. + * + * This object should have the same structure as [ObjectTrackingFrame]{@link google.cloud.videointelligence.v1.ObjectTrackingFrame} + * + * @typedef ObjectTrackingAnnotation + * @memberof google.cloud.videointelligence.v1 + * @see [google.cloud.videointelligence.v1.ObjectTrackingAnnotation definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/videointelligence/v1/video_intelligence.proto} + */ +const ObjectTrackingAnnotation = { + // This is for documentation. Actual contents will be loaded by gRPC. +}; + /** * Video annotation feature. * @@ -781,7 +999,17 @@ const Feature = { /** * Speech transcription. */ - SPEECH_TRANSCRIPTION: 6 + SPEECH_TRANSCRIPTION: 6, + + /** + * OCR text detection and tracking. + */ + TEXT_DETECTION: 7, + + /** + * Object detection and tracking. + */ + OBJECT_TRACKING: 9 }; /** diff --git a/packages/google-cloud-videointelligence/synth.metadata b/packages/google-cloud-videointelligence/synth.metadata index c2c8bd9e255..3962c6f4efd 100644 --- a/packages/google-cloud-videointelligence/synth.metadata +++ b/packages/google-cloud-videointelligence/synth.metadata @@ -1,19 +1,19 @@ { - "updateTime": "2019-02-13T12:29:47.057383Z", + "updateTime": "2019-02-21T12:20:49.083829Z", "sources": [ { "generator": { "name": "artman", - "version": "0.16.13", - "dockerImage": "googleapis/artman@sha256:5fd9aee1d82a00cebf425c8fa431f5457539562f5867ad9c54370f0ec9a7ccaa" + "version": "0.16.14", + "dockerImage": "googleapis/artman@sha256:f3d61ae45abaeefb6be5f228cda22732c2f1b00fb687c79c4bd4f2c42bb1e1a7" } }, { "git": { "name": "googleapis", "remote": "https://github.com/googleapis/googleapis.git", - "sha": "ca61898878f0926dd9dcc68ba90764f17133efe4", - "internalRef": "233680013" + "sha": "9cf63704bd272a40b79dde5a2b33f61104ee4f7f", + "internalRef": "234935970" } }, {
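Usage note (not part of the patch): a minimal sketch of how the two new features could be requested through the Node.js client once this change lands. The request and response shapes follow the fields added above (textDetectionConfig, textAnnotations, objectAnnotations); the gs:// input URI and the language hint are hypothetical placeholders, and the client surface (VideoIntelligenceServiceClient.annotateVideo returning a long-running operation) is assumed from the existing v1 library.

// Sketch only. Assumes the @google-cloud/video-intelligence v1 client; the
// gs:// input URI and language hint below are hypothetical placeholders.
const videoIntelligence = require('@google-cloud/video-intelligence');

async function main() {
  const client = new videoIntelligence.v1.VideoIntelligenceServiceClient();

  // Request both newly added features in a single call.
  const [operation] = await client.annotateVideo({
    inputUri: 'gs://YOUR_BUCKET/YOUR_VIDEO.mp4', // placeholder
    features: ['TEXT_DETECTION', 'OBJECT_TRACKING'],
    videoContext: {
      // TextDetectionConfig added by this change; hints are BCP-47 codes.
      textDetectionConfig: {languageHints: ['en-US']},
    },
  });

  // annotateVideo is a long-running operation; wait for it to complete.
  const [response] = await operation.promise();
  const results = response.annotationResults[0];

  // New textAnnotations field: one entry per detected text snippet.
  for (const annotation of results.textAnnotations || []) {
    console.log(`text: ${annotation.text}`);
  }

  // New objectAnnotations field: one entry per tracked object.
  for (const annotation of results.objectAnnotations || []) {
    console.log(`object: ${annotation.entity.description}`,
                `confidence: ${annotation.confidence}`);
  }
}

main().catch(console.error);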