[DO NOT MERGE] Vision API OCR PDF/TIFF sample [(#1420)](GoogleCloudPl…

…atform/python-docs-samples#1420) * add docpdf sample * import order * list blobs * filename change * add the renamed files * parse json string to AnnotateFileResponse message * show more of the response * simplify response processing to better focus on how to make the request * fix typo * linter * linter * linter
busunkim96 · Apr 4, 2018 · 5040962 · 5040962
1 parent 1eeeb8b
commit 5040962
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 0 deletions.
diff --git a/samples/snippets/detect/detect_pdf.py b/samples/snippets/detect/detect_pdf.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""OCR with PDF/TIFF as source files on GCS
+
+Example:
+    python detect_pdf.py \
+    --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \
+    --gcs-destination-uri gs://BUCKET_NAME/PREFIX/
+"""
+
+import argparse
+import re
+
+from google.cloud import storage
+from google.cloud import vision_v1p2beta1 as vision
+from google.protobuf import json_format
+
+
+# [START vision_async_detect_document_ocr]
+def async_detect_document(gcs_source_uri, gcs_destination_uri):
+    """Run OCR on a PDF/TIFF stored on GCS and print the first page's text.
+
+    The annotation results are written as JSON files under
+    ``gcs_destination_uri``; the first of those files is downloaded,
+    parsed, and the full text of its first page is printed.
+
+    Args:
+        gcs_source_uri: ``gs://BUCKET_NAME/OBJECT`` URI of the input file.
+        gcs_destination_uri: ``gs://BUCKET_NAME/PREFIX/`` URI under which
+            the JSON output files are written.
+
+    Raises:
+        ValueError: if ``gcs_destination_uri`` is not of the form
+            ``gs://BUCKET_NAME/PREFIX``.
+        RuntimeError: if no output file is found under the destination.
+    """
+    # Validate the destination up front so that a malformed URI fails
+    # before the long-running operation is started.
+    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
+    if match is None:
+        raise ValueError(
+            'gcs_destination_uri must look like gs://BUCKET_NAME/PREFIX/, '
+            'got: {!r}'.format(gcs_destination_uri))
+    bucket_name = match.group(1)
+    prefix = match.group(2)
+
+    # Supported mime_types are: 'application/pdf' and 'image/tiff'
+    mime_type = 'application/pdf'
+
+    # How many pages should be grouped into each json output file.
+    # For example, a 5-page file with batch_size = 2 yields 3 output
+    # files (2 + 2 + 1 pages).
+    batch_size = 2
+
+    client = vision.ImageAnnotatorClient()
+
+    feature = vision.types.Feature(
+        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
+
+    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
+    input_config = vision.types.InputConfig(
+        gcs_source=gcs_source, mime_type=mime_type)
+
+    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
+    output_config = vision.types.OutputConfig(
+        gcs_destination=gcs_destination, batch_size=batch_size)
+
+    async_request = vision.types.AsyncAnnotateFileRequest(
+        features=[feature], input_config=input_config,
+        output_config=output_config)
+
+    operation = client.async_batch_annotate_files(
+        requests=[async_request])
+
+    print('Waiting for the operation to finish.')
+    operation.result(timeout=90)
+
+    # Once the request has completed and the output has been
+    # written to GCS, we can list all the output files.
+    storage_client = storage.Client()
+
+    # Passed positionally: the name of this parameter differs between
+    # google-cloud-storage releases.
+    bucket = storage_client.get_bucket(bucket_name)
+
+    # List objects with the given prefix, skipping "folder" placeholder
+    # objects (names ending in '/'), which are not JSON output files.
+    blob_list = [
+        blob for blob in bucket.list_blobs(prefix=prefix)
+        if not blob.name.endswith('/')]
+    print('Output files:')
+    for blob in blob_list:
+        print(blob.name)
+
+    if not blob_list:
+        raise RuntimeError(
+            'No output files found under {}'.format(gcs_destination_uri))
+
+    # Process the first output file from GCS.
+    # Since we specified batch_size=2, the first response contains
+    # the first two pages of the input file.
+    output = blob_list[0]
+
+    json_string = output.download_as_string()
+    response = json_format.Parse(
+        json_string, vision.types.AnnotateFileResponse())
+
+    # The actual response for the first page of the input file.
+    first_page_response = response.responses[0]
+    annotation = first_page_response.full_text_annotation
+
+    # Here we print the full text from the first page.
+    # The response contains more information:
+    # annotation/pages/blocks/paragraphs/words/symbols
+    # including confidence scores and bounding boxes
+    print(u'Full text:\n{}'.format(
+        annotation.text))
+# [END vision_async_detect_document_ocr]
+
+
+if __name__ == '__main__':
+    # Command-line entry point: both GCS URIs are mandatory flags.
+    cli = argparse.ArgumentParser()
+    for flag in ('--gcs-source-uri', '--gcs-destination-uri'):
+        cli.add_argument(flag, required=True)
+
+    options = cli.parse_args()
+    async_detect_document(
+        gcs_source_uri=options.gcs_source_uri,
+        gcs_destination_uri=options.gcs_destination_uri)
diff --git a/samples/snippets/detect/detect_pdf_test.py b/samples/snippets/detect/detect_pdf_test.py
@@ -0,0 +1,38 @@
+# Copyright 2018 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from google.cloud import storage
+
+from detect_pdf import async_detect_document
+
+# Name of the GCS bucket used by the test; must be set in the environment
+# (a missing CLOUD_STORAGE_BUCKET raises KeyError at import time).
+BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
+# Object-name prefix under which the OCR output is written and later deleted.
+OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT'
+# Input PDF, expected at the root of BUCKET.
+GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET)
+# Destination "folder" URI (note the trailing slash) for the JSON results.
+GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
+
+
+def test_async_detect_document(capsys):
+    """Run the OCR sample end to end and check the printed text.
+
+    The output blobs written under OUTPUT_PREFIX are deleted in a
+    ``finally`` clause so that a failed assertion (or an exception in the
+    sample) does not leave stale results behind for the next run.
+    """
+    try:
+        async_detect_document(
+            gcs_source_uri=GCS_SOURCE_URI,
+            gcs_destination_uri=GCS_DESTINATION_URI)
+        out, _ = capsys.readouterr()
+
+        assert 'Hodge conjecture' in out
+    finally:
+        # Always clean up whatever the sample wrote to the bucket.
+        storage_client = storage.Client()
+        bucket = storage_client.get_bucket(BUCKET)
+        for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX):
+            blob.delete()
diff --git a/samples/snippets/detect/requirements.txt b/samples/snippets/detect/requirements.txt
@@ -1 +1,2 @@
 google-cloud-vision==0.30.1
+google-cloud-storage==1.6.0