
Commit

Merge branch 'voice-mode'
xinydev committed Aug 19, 2023
2 parents 150b25d + 60c8144 commit 22ff73d
Showing 3 changed files with 122 additions and 2 deletions.
20 changes: 19 additions & 1 deletion gptw/gptw.py
@@ -8,6 +8,7 @@
import openai

import gptw
from gptw.voice import chat_in_audio


def args_init():
@@ -47,6 +48,14 @@ def args_init():
        help="list all available sub cmds",
    )

    parser.add_argument(
        "--voice",
        dest="voice",
        action="store_true",
        default=False,
        help="chat with gpt in voice mode",
    )

    parser.add_argument(
        "-d",
        "--debug",
@@ -112,7 +121,7 @@ def get_config(key, default_value=None):
            return default_value
        return CFG[key]
    except Exception:
-        print("config not found, run `ww --config` to set it")
+        print(f"config {key} not found, run `ww --config` to set it")
        sys.exit(1)


@@ -198,6 +207,15 @@ def main():
        set_config(k, v)
        exit(0)

    if args.voice:
        token = get_config("azure-token")
        endpoint = get_config("azure-endpoint")
        depname = get_config("azure-depname")
        tts_key = get_config("tts-key")
        tts_region = get_config("tts-region")
        chat_in_audio(token, endpoint, depname, tts_key, tts_region)
        exit(0)

    prompts = get_prompts()
    logging.debug(f"configs:{prompts}")

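Note: a minimal sketch of how the new voice path could be exercised directly, outside the `ww` CLI. The five config keys mirror the ones read in main() above; all values shown are placeholders, not part of this commit.

# Hypothetical direct call to the new voice loop; every value below is a placeholder.
# In normal use, `ww --voice` reads these entries from the gptw config (`ww --config`).
from gptw.voice import chat_in_audio

chat_in_audio(
    token="<azure-openai-api-key>",                    # azure-token
    endpoint="https://<resource>.openai.azure.com/",   # azure-endpoint
    depname="<chat-deployment-name>",                  # azure-depname
    tts_key="<azure-speech-key>",                      # tts-key
    tts_region="<azure-speech-region>",                # tts-region
)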
102 changes: 102 additions & 0 deletions gptw/voice.py
@@ -0,0 +1,102 @@
import time

import azure.cognitiveservices.speech as speechsdk
import openai

ssml = """<speak xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:mstts="http://www.w3.org/2001/mstts"
xmlns:emo="http://www.w3.org/2009/10/emotionml"
version="1.0" xml:lang="en-US">
<voice name="en-US-JennyNeural"><s/>
<mstts:express-as style="friendly">
<prosody rate="+8.00%" pitch="+3.00%">
{}
</prosody>
</mstts:express-as><s/>
</voice></speak>"""


prompts = """
As an English language learning assistant,
your first task is to offer users a selection of engaging topics to choose from.
Next, you engage in conversation with the user.
For each user input, your primary task is to assess whether their grammar is correct and
whether it aligns with the conventions used by native speakers, and to provide feedback to the user.
Make sure to keep every response short and concise.
After providing the feedback, pose the next question related to the user's input.
"""

recognizer_results = []


def chat_in_audio(token, endpoint, depname, tts_key, tts_region):

    global recognizer_results

    # Set up OpenAI API credentials
    openai.api_key = token
    openai.api_base = endpoint
    openai.api_type = "azure"
    openai.api_version = "2023-05-15"

    # Set up Azure TTS API credentials
    speech_config = speechsdk.SpeechConfig(subscription=tts_key, region=tts_region)
    speech_config.output_format = speechsdk.OutputFormat.Detailed

    # Set up speech recognizer
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

    # https://speech.microsoft.com/portal/voicegallery
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    # Set up conversation loop
    messages = [
        {
            "role": "system",
            "content": prompts,
        },
    ]

    speech_recognizer.recognized.connect(recognized_callback)
    while True:
        # Get user input from microphone
        print("Speak then press Enter to continue, or type 'exit' to exit")

        speech_recognizer.start_continuous_recognition()

        x = input("...: ")
        if x == "exit":
            break

        speech_recognizer.stop_continuous_recognition_async().get()
        time.sleep(1)
        # Use speech recognizer to convert speech to text
        user_input = " ".join(recognizer_results)
        recognizer_results = []
        print("You said: " + user_input)
        messages.append({"role": "user", "content": user_input})
        completion = openai.ChatCompletion.create(
            engine=depname,
            messages=messages,
            temperature=0.5,
        )

        # Get response text
        response_text = str(completion.choices[0].message.content).strip()
        print("AI said: " + response_text)
        messages.append({"role": "assistant", "content": response_text})

        play_result = speech_synthesizer.speak_ssml_async(
            ssml.format(response_text)
        ).get()

        if play_result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Error synthesizing audio: {play_result.error_details}")

        time.sleep(0.5)


def recognized_callback(evt):
    global recognizer_results
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        recognizer_results.append(evt.result.text)
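For reference: the recognition side of the loop uses continuous recognition plus a recognized callback that accumulates finalized utterances until the user presses Enter. A standalone sketch of that pattern with the same Speech SDK calls used above (the key and region are placeholders):

import time

import azure.cognitiveservices.speech as speechsdk

segments = []


def on_recognized(evt):
    # Collect each finalized utterance as it is recognized.
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        segments.append(evt.result.text)


config = speechsdk.SpeechConfig(subscription="<speech-key>", region="<region>")
recognizer = speechsdk.SpeechRecognizer(speech_config=config)
recognizer.recognized.connect(on_recognized)

recognizer.start_continuous_recognition()
time.sleep(5)  # speak into the microphone during this window
recognizer.stop_continuous_recognition_async().get()

print(" ".join(segments))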
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -18,7 +18,7 @@ keywords = [
]
license = "Apache-2.0"
requires-python = ">=3.7.1"
dependencies = ["openai>=0.27.0"]
dependencies = ["openai>=0.27.0", "azure-cognitiveservices-speech"]
optional-dependencies.testing = ["coverage==5.5", "pytest==7.1.3"]
classifiers = [
"Development Status :: 3 - Alpha",
