google_pdf_to_txt.py
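"""Convert a scanned PDF to plain text with the Google Cloud Vision API.

The script uploads the local PDF named in the "book" environment variable to a
Cloud Storage bucket, runs asynchronous DOCUMENT_TEXT_DETECTION on it, then
downloads the resulting JSON output files and merges their text into a local
.txt file, optionally framing each page with a page-number/confidence marker.
"""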
import json
import os
import re

from dotenv import load_dotenv
from google.cloud import storage, vision

## Configuration is read from a .env file (an example layout is sketched at the bottom of this file).
load_dotenv()
credential_path = os.environ["credential_path"]  ## path to the service-account key file
project_id = os.environ["project_id"]
location = os.environ["location"]
bktnm = os.environ["bktnm"]  ## Cloud Storage bucket name
book = os.environ["book"]  ## local PDF file to convert
langhint = os.environ["langhint"]  ## language hint for Vision, e.g. "en"
pagenumbers = os.environ.get("pagenumbers") or "YES"  ## write page markers unless explicitly disabled

## Point the Google client libraries at the service-account key.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_path

batch_size = 100  ## pages per JSON output file written by Vision
def write_to_text(gcs_destination_uri, output_file, output_file_in_cloud):
    """Merge the Vision JSON output files under gcs_destination_uri into one local text file."""
    storage_client = storage.Client()
    match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)
    bucket = storage_client.get_bucket(bucket_name)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files saving locally..")

    ## Remove the output file if it exists, so it can be rebuilt in append mode
    ## from the pages of one or more JSON files.
    if os.path.exists(output_file):
        os.remove(output_file)

    for blob in blob_list:
        if output_file not in blob.name:
            continue
        json_file = blob.name[len(prefix):]  ## strip the "txt/" prefix
        print(f"{blob.name} as {json_file}, it will be overwritten if it exists")
        if os.path.exists(json_file):
            os.remove(json_file)
        blob.download_to_filename(json_file)  ## keep a local copy of the raw JSON
        response = json.loads(blob.download_as_bytes().decode("utf-8"))
        for page_response in response["responses"]:
            try:
                annotation = page_response["fullTextAnnotation"]
                context = page_response["context"]
                confidence = annotation["pages"][0]["confidence"]
            except KeyError:
                print("No annotation for this page.")
                continue  ## skip pages without OCR output instead of reusing stale values
            with open(output_file, "a+", encoding="utf-8") as f:
                if pagenumbers == "YES":
                    ## Marker line, e.g.:
                    ## =!pgB!=Page: 001========== Confidence: 0.98================ Page: 001=!Epg!=
                    f.write(
                        f"\n=!pgB!=Page: {str(context['pageNumber']).zfill(3).ljust(13, '=')} "
                        f"Confidence: {str(confidence).ljust(20, '=')} "
                        f"Page: {str(context['pageNumber']).zfill(3)}=!Epg!=\n"
                    )
                f.write(annotation["text"])
    print(f"{output_file} saved.")
def pdf_to_text(input_file):
    """Upload input_file to GCS, run async OCR on it, and collect the text locally."""
    output_file = os.path.splitext(input_file)[0] + ".txt"
    client = vision.ImageAnnotatorClient()
    mime_type = "application/pdf"  ## supported mime_types are "application/pdf" and "image/tiff"
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION, model="builtin/latest")
    image_context = {"language_hints": [langhint]}

    storage_client = storage.Client()
    bucket = storage_client.bucket(bktnm)

    ## Delete a stale copy of the input PDF in the bucket, if any.
    input_file_in_cloud = "pdf/" + str(input_file)
    input_blob = bucket.blob(input_file_in_cloud)  ## destination object name in the bucket
    if input_blob.exists(storage_client):
        input_blob.delete()

    ## Delete stale OCR output from a previous run, if any (Vision writes one or
    ## more JSON files under this prefix).
    output_file_in_cloud = "txt/" + str(output_file)
    for stale_blob in list(bucket.list_blobs(prefix=output_file_in_cloud)):
        stale_blob.delete()

    ## if_generation_match=0 requires that the object does not already exist.
    input_blob.upload_from_filename(input_file, if_generation_match=0)
    print(f"File {input_file} uploaded as {input_file_in_cloud}.")

    ## Upload is over. Prepare source and destination for the pdf-to-text request.
    gcs_source_uri = "gs://" + bktnm + "/" + input_file_in_cloud
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination_uri = "gs://" + bktnm + "/" + "txt/" + output_file
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config,
        image_context=image_context,
    )
    operation = client.async_batch_annotate_files(requests=[async_request])
    print("Waiting for the pdf to txt to finish..")
    operation.result(timeout=420)
    print(f"pdf to txt finished: Saved as {output_file_in_cloud}.")

    write_to_text(
        f"gs://{bktnm}/txt/",
        output_file,
        output_file_in_cloud,
    )
if __name__ == "__main__":
    pdf_to_text(book)
    print("done.")