-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
75 lines (64 loc) · 2.58 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import tempfile
from flask import Flask, request, jsonify, send_file
from google.cloud import vision
from google.cloud import texttospeech
from google.cloud import translate_v2 as translate
# Set up Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./truvision-416017-6ac5fc6efb38.json"
app = Flask(__name__)
@app.route('/detect_text', methods=['POST'])
def detect_text():
if 'file' not in request.files:
return jsonify({'error': 'No file part'}), 400
file = request.files['file']
language = request.args.get('language', 'en-US') # Default to 'en-US' if language param is not provided
if file.filename == '':
return jsonify({'error': 'No selected file'}), 400
if file:
client = vision.ImageAnnotatorClient()
content = file.read()
image = vision.Image(content=content)
response = client.text_detection(image=image)
texts = response.text_annotations
if texts:
detected_text = texts[0].description
audio_content = generate_audio(detected_text, language)
with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
temp_file.write(audio_content)
temp_file.flush()
return send_file(temp_file.name, mimetype='audio/mpeg', as_attachment=True)
else:
return jsonify({'error': 'No text detected'}), 400
else:
return jsonify({'error': 'File upload failed'}), 400
def generate_audio(text, language):
translate_client = translate.Client()
if (language == "en-US"):
language_code = "en"
elif (language == 'es-ES'):
language_code = "es"
elif (language == 'fr-FR'):
language_code = 'fr'
elif (language == 'it-IT'):
language_code = 'it'
elif (language == 'cmn-CN'):
language_code = 'zh'
else:
language_code = 'en'
translated_text = translate_client.translate(text, target_language=language_code)
text = translated_text['translatedText']
client = texttospeech.TextToSpeechClient()
synthesis_input = texttospeech.SynthesisInput(text=text)
voice = texttospeech.VoiceSelectionParams(
language_code=language, ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
)
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3
)
response = client.synthesize_speech(
input=synthesis_input, voice=voice, audio_config=audio_config
)
return response.audio_content
if __name__ == '__main__':
app.run(debug=True)