-
Notifications
You must be signed in to change notification settings - Fork 1
/
pdfparser.py
112 lines (75 loc) · 3.93 KB
/
pdfparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json
import os
import shutil
"""
Sample project for OCRWebService.com (REST API).
Extract text from scanned images and PDF documents and convert into editable formats.
Please create a new account at ocrwebservice.com via http://www.ocrwebservice.com/account/signup to get a license code.
"""
# Provide your username and license code.
# The OCR_LICENSE_CODE / OCR_USERNAME environment variables take precedence so
# that real credentials do not have to be stored in this file; the literals
# below are only the fallback used when a variable is not set.
# NOTE(review): the fallback license code looks like a real key committed to
# source control - consider rotating it and replacing it with a placeholder.
LicenseCode = os.environ.get('OCR_LICENSE_CODE', '2D7C49E9-9910-4618-BDC4-3DF71CCA28F0')
UserName = os.environ.get('OCR_USERNAME', 'FIRSTTLASTT')
# "requests" is the only third-party dependency of this sample; when it is
# missing, print install instructions instead of a raw ImportError traceback.
try:
    import requests
except ImportError:
    print("You need the requests library to be installed in order to use this sample.")
    print("Run 'pip install requests' to fix it.")
    # exit() is a helper injected by the site module for interactive use and,
    # called without arguments, ends the process with status 0 (success).
    # Raise SystemExit with a non-zero status so callers can detect the failure.
    raise SystemExit(1)
"""
You should specify OCR settings. See full description http://www.ocrwebservice.com/service/restguide
Input parameters:
[language] - Specifies the recognition language.
This parameter can contain several language names separated with commas.
For example "language=english,german,spanish".
Optional parameter. By default:english
[pagerange] - Enter page numbers and/or page ranges separated by commas.
For example "pagerange=1,3,5-12" or "pagerange=allpages".
Optional parameter. By default:allpages
[tobw] - Convert image to black and white (recommended for color images and photos).
For example "tobw=false"
Optional parameter. By default:false
[zone] - Specifies the region on the image for zonal OCR.
The coordinates are in pixels relative to the top-left corner, in the following format: top:left:height:width.
This parameter can contain several zones separated with commas.
For example "zone=0:0:100:100,50:50:50:50"
Optional parameter.
[outputformat] - Specifies the output file format.
Up to two output formats can be specified, separated with commas.
For example "outputformat=pdf,txt"
Optional parameter. By default:doc
[gettext] - Specifies that extracted text will be returned.
For example "gettext=true"
Optional parameter. By default:false
[description] - Specifies your task description. Will be returned in response.
Optional parameter.
!!!! To get a result you must specify "gettext" or "outputformat" !!!!
"""
# Request configuration: which document to upload and how it is processed.

# Path of the document that is uploaded for recognition.
FilePath = "ICICI_Credit.pdf"

# Alternative request URLs, kept for reference:
#   - extract text, English language by default:
#       "http://www.ocrwebservice.com/restservices/processDocument?gettext=true"
#   - English and German with zonal OCR:
#       'http://www.ocrwebservice.com/restservices/processDocument?language=english,german&zone=0:0:600:400,500:1000:150:400'
#
# Active request: recognize English text and produce an xlsx output file.
RequestUrl = (
    'http://www.ocrwebservice.com/restservices/processDocument'
    '?language=english'
    '&outputformat=xlsx'
)
# Upload the document, validate the service's JSON answer and download the
# converted output file to "outputDoc.xlsx".

# requests waits indefinitely by default; bound both calls with a
# (connect, read) timeout in seconds. The read timeout is generous on the
# assumption that recognition of a multi-page document can take a while.
_REQUEST_TIMEOUT = (10, 300)

with open(FilePath, 'rb') as image_file:
    image_data = image_file.read()

# The raw document bytes are the request body; credentials go via HTTP basic auth.
r = requests.post(RequestUrl, data=image_data, auth=(UserName, LicenseCode),
                  timeout=_REQUEST_TIMEOUT)

if r.status_code == 401:
    # Please provide valid username and license code
    print("Unauthorized request")
    raise SystemExit(1)

# Decode Output response
try:
    jobj = json.loads(r.content)
except ValueError:
    # The body is not JSON (for example an HTML error page), so there is no
    # "ErrorMessage" field to report - show the HTTP status instead.
    print("Unexpected response from OCR service (HTTP status " + str(r.status_code) + ")")
    raise SystemExit(1)

# A missing or null "ErrorMessage" means "no error"; str(None) would otherwise
# yield the non-empty string 'None' and be reported as a recognition error.
ocrError = str(jobj.get("ErrorMessage") or '')
if ocrError != '':
    # Error occurs during recognition
    print("Recognition Error: " + ocrError)
    raise SystemExit(1)

# Get extracted text from Second zone for each page
#print("Zone 2 Page 1 Text:" + str(jobj["OCRText"][1][0]))
#print("Zone 2 Page 2 Text:" + str(jobj["OCRText"][1][1]))

# Download output file (if outputformat was specified)
output_url = jobj.get("OutputFileUrl")
if not output_url:
    print("No output file URL in response; specify \"outputformat\" in the request URL.")
    raise SystemExit(1)

with requests.get(output_url, stream=True, timeout=_REQUEST_TIMEOUT) as file_response:
    # Do not save an HTTP error page under the output file name.
    file_response.raise_for_status()
    # .raw bypasses requests' content decoding; ask urllib3 to undo any
    # gzip/deflate transfer encoding so the saved file is the real document.
    file_response.raw.decode_content = True
    with open("outputDoc.xlsx", 'wb') as output_file:
        shutil.copyfileobj(file_response.raw, output_file)