# fetch_text.py (forked from raphaeltony/CertHQ)
import json
import os

import openai
from dateutil import parser
from PIL import Image
from pytesseract import pytesseract
# Location of the Tesseract executable. get_text() assigns this value to
# pytesseract.tesseract_cmd before running OCR; edit it to match your install.
TESSERACTPATH = "D:/tesseract/tesseract.exe"
def get_text(FILEPATH):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        FILEPATH: Image location, passed straight to ``Image.open``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the image's file handle as soon as OCR is
    # finished instead of leaving it open until garbage collection.
    with Image.open(FILEPATH) as img:
        # Point pytesseract at the configured Tesseract binary.
        pytesseract.tesseract_cmd = TESSERACTPATH
        return pytesseract.image_to_string(img)
# The OpenAI API key is read from the OPENAI_API_KEY environment variable
# instead of being hard-coded, so the secret never lands in version control.
# NOTE: any key that was previously committed in this file must be treated as
# leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# System prompt sent to the chat model by get_response(). It asks for the
# certificate fields as a JSON object whose keys match the example below.
# NOTE(review): "Particpant" in the prize list looks like a typo for
# "Participant"; left untouched because the model may echo it and other code
# could match on that spelling - confirm before changing.
prompt = '''You are supposed to identify name without honorifics, Event,
Institution Name,Start Date,
End Date, Prize, Level, Cash Prize from a text which is extracted from a certificate.
Give the response in a JSON format like
{
"name": "RAPHAEL TONY",
"event": "Webspace",
"instname": "Rajagiri School of Engineering",
"start_date": "15th February 2022",
"end_date": "17th February 2022",
"prize": null,
"level":null,
"cash_prize": null
}
The prize values can only be First, Second, Third or Particpant.
If a valid enddate is not found use same value as startdate.
The levels can be State, National, International or Collegiate
'''
def _extract_json_object(text):
    """Return the outermost ``{...}`` span of *text*, or *text* if none exists."""
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return text
    return text[start:end + 1]


def _to_date(value):
    """Parse *value* into a ``date``; return it unchanged when it cannot be parsed."""
    try:
        return parser.parse(value).date()
    except (TypeError, ValueError, OverflowError):
        # TypeError covers non-string input such as None (JSON null);
        # ValueError/OverflowError cover text dateutil cannot interpret.
        return value


def get_response(FILEPATH):
    """OCR a certificate image and have the chat model extract its fields.

    The OCR text from ``get_text`` is sent to ``gpt-3.5-turbo`` together with
    the module-level ``prompt``; the reply is decoded as JSON.

    Args:
        FILEPATH: Image location, forwarded to ``get_text``.

    Returns:
        The decoded JSON reply. When it is a dict, ``start_date`` and
        ``end_date`` are converted to ``datetime.date`` objects where they can
        be parsed; values that cannot be parsed are left as returned.

    Raises:
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    certificate = get_text(FILEPATH)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": certificate},
        ],
    )
    raw = "".join(choice.message.content for choice in response.choices)

    # The model sometimes surrounds the JSON with prose or code fences, so
    # only the outermost brace-delimited span is handed to the decoder.
    d = json.loads(_extract_json_object(raw))

    # Each date is converted independently so that one bad or missing value
    # does not prevent the other from being normalised.
    if isinstance(d, dict):
        for key in ("start_date", "end_date"):
            if key in d:
                d[key] = _to_date(d[key])

    print(d)
    return d