
Refactor gradio tools to isolate non-gradio functions. Fix audio streaming for TTS through OpenAI. WIP for direct OpenAI nochat call without gradio. #1543

Merged (50 commits, May 18, 2024)

Changes from all commits:
- `d0a7fe3` openai for submit_nochat_plain_api (pseudotensor, Apr 10, 2024)
- `b26a2f0` more testing of openai client (pseudotensor, Apr 10, 2024)
- `1443fcb` Fix key handling, see uvicorn workers help speed by more than 2x alre… (pseudotensor, Apr 10, 2024)
- `b7c0a0e` WIP separate non-gradio functions (pseudotensor, Apr 10, 2024)
- `0c4a2ce` Cosmetics (pseudotensor, Apr 10, 2024)
- `c7dad98` Resolve (pseudotensor, Apr 10, 2024)
- `66b089c` Resolve (pseudotensor, Apr 10, 2024)
- `a63fb53` Pass args through -- needs checking (pseudotensor, Apr 11, 2024)
- `6d4126c` 0.1.0 -> 0.2.0 for docs (pseudotensor, Apr 11, 2024)
- `978fe2d` Clean-up vllm install (pseudotensor, Apr 11, 2024)
- `a3ba938` Resolve (pseudotensor, Apr 11, 2024)
- `0e559c7` Update TGI client (pseudotensor, Apr 11, 2024)
- `e634d2f` Resolve merge conflicts (pseudotensor, May 16, 2024)
- `b27193c` Simplify imports (pseudotensor, May 16, 2024)
- `2100807` Simplify imports (pseudotensor, May 16, 2024)
- `15af4a4` Move bot (pseudotensor, May 17, 2024)
- `1fe2a29` More args for bot and fix get (pseudotensor, May 17, 2024)
- `28087d4` more and less prints (pseudotensor, May 17, 2024)
- `c1b7a04` more and less prints (pseudotensor, May 17, 2024)
- `5892ecd` fewer prints (pseudotensor, May 17, 2024)
- `5610326` Ensure use most up to date states if going through gradio vs. non-gradio (pseudotensor, May 17, 2024)
- `48696c3` Pass in verbose (pseudotensor, May 17, 2024)
- `86192c2` Protect encoding from oddities (pseudotensor, May 17, 2024)
- `4ae3f40` extra_body allowed for speech, so document (pseudotensor, May 17, 2024)
- `2dd5291` Stream for real without header, when streaming (pseudotensor, May 17, 2024)
- `79a0b37` Fix wave truncation by player issue (pseudotensor, May 17, 2024)
- `3545e8d` Comment out debug (pseudotensor, May 17, 2024)
- `1bac21a` Fix test (pseudotensor, May 17, 2024)
- `06cac12` Update test, but probably will have to relax as varies (pseudotensor, May 17, 2024)
- `df2d6eb` Update test (pseudotensor, May 17, 2024)
- `b1c2005` Update test and pass my_db_state0 through local_kwargs (pseudotensor, May 17, 2024)
- `c46d5b1` Update test (pseudotensor, May 17, 2024)
- `49dd3d2` Update test (pseudotensor, May 17, 2024)
- `75e203d` Update test (pseudotensor, May 17, 2024)
- `272f732` Update test (pseudotensor, May 17, 2024)
- `be652b7` Update test (pseudotensor, May 17, 2024)
- `d02ea0b` Update test (pseudotensor, May 17, 2024)
- `8a59b86` Update note (pseudotensor, May 17, 2024)
- `1ddd2ae` Update test (pseudotensor, May 17, 2024)
- `e668743` Don't change to json_object if was json_code (pseudotensor, May 18, 2024)
- `cc00f1c` Don't force space as guided_whitespace_pattern, does worse (pseudotensor, May 18, 2024)
- `acc1fe2` rephrase (pseudotensor, May 18, 2024)
- `dcc0462` Fix example (pseudotensor, May 18, 2024)
- `7ef7535` Update doc string (pseudotensor, May 18, 2024)
- `8d51d48` Update test (pseudotensor, May 18, 2024)
- `a722725` Update test (pseudotensor, May 18, 2024)
- `59091ec` Fix and simplify youtube pattern (pseudotensor, May 18, 2024)
- `e4179ea` Update test (pseudotensor, May 18, 2024)
- `ec50f71` Update test (pseudotensor, May 18, 2024)
- `f9d4712` More keys when failure (pseudotensor, May 18, 2024)
140 changes: 130 additions & 10 deletions docs/README_InferenceServers.md
@@ -276,22 +276,128 @@
where `<key>` should be replaced by your OpenAI key that probably starts with `sk-`

### Text to Speech

-h2oGPT can do text-to-speech and speech-to-text if `--enable_tts=True` and `--enable_stt=True` as well as `--pre_load_image_audio_models=True`, respectively. h2oGPT's OpenAI Proxy server follows OpenAI API for [Text to Speech](https://platform.openai.com/docs/guides/text-to-speech), e.g.:
+h2oGPT can do text-to-speech and speech-to-text if `--enable_tts=True` and `--enable_stt=True` as well
+as `--pre_load_image_audio_models=True`, respectively. h2oGPT's OpenAI Proxy server follows OpenAI API
+for [Text to Speech](https://platform.openai.com/docs/guides/text-to-speech), e.g.:

```python
from openai import OpenAI
from pathlib import Path
client = OpenAI(base_url='http://0.0.0.0:5000/v1')

-speech_file_path = Path(__file__).parent / "speech.mp3"
-response = client.audio.speech.create(
-    model="tts-1",
-    voice="SLT (female)",  # if server has XTT with Microsoft package
-    input="Today is a wonderful day to build something people love!"
-)
-response.stream_to_file(speech_file_path)
+with client.audio.speech.with_streaming_response.create(
+        model="tts-1",
+        voice="",
+        extra_body=dict(stream=True,
+                        chatbot_role="Female AI Assistant",
+                        speaker="SLT (female)",
+                        stream_strip=True,
+                        ),
+        response_format='wav',
+        input="Good morning! The sun is shining brilliantly today, casting a warm, golden glow that promises a day full of possibility and joy. It’s the perfect moment to embrace new opportunities and make the most of every cheerful, sunlit hour. What can I do to help you make today absolutely wonderful?",
+) as response:
+    response.stream_to_file("speech_local.wav")
```

Set `stream=False` to avoid streaming, e.g.:
```python
from openai import OpenAI

client = OpenAI(base_url='http://0.0.0.0:5000/v1')

response = client.audio.speech.create(
    model="tts-1",
    voice="",
    extra_body=dict(stream=False,
                    chatbot_role="Female AI Assistant",
                    speaker="SLT (female)",
                    format='wav',
                    ),
    input="Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! Today is a wonderful day to build something people love! ",
)
response.stream_to_file("speech_local2.wav")
```

To stream the audio and play it back while it streams, one can use httpx and pygame:
```python
import openai
import httpx
import pygame

import pygame.mixer

pygame.mixer.init(frequency=16000, size=-16, channels=1)

sound_queue = []


def play_audio(audio):
    import io
    from pydub import AudioSegment

    sr = 16000
    s = io.BytesIO(audio)
    channels = 1
    sample_width = 2

    audio = AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels)
    sound = pygame.mixer.Sound(io.BytesIO(audio.raw_data))
    sound_queue.append(sound)
    sound.play()

    # Wait for the audio to finish playing
    duration_ms = sound.get_length() * 1000  # Convert seconds to milliseconds
    pygame.time.wait(int(duration_ms))


# Ensure to clear the queue when done to free memory and resources
def clear_queue(sound_queue):
    for sound in sound_queue:
        sound.stop()


api_key = 'EMPTY'

# Initialize OpenAI and Pygame
client = openai.OpenAI(api_key=api_key)

# Set up the request headers and parameters
headers = {
    "Authorization": f"Bearer {client.api_key}",
    "Content-Type": "application/json",
}
data = {
    "model": "tts-1",
    "voice": "SLT (female)",
    "input": "Good morning! The sun is shining brilliantly today, casting a warm, golden glow that promises a day full of possibility and joy. It’s the perfect moment to embrace new opportunities and make the most of every cheerful, sunlit hour. What can I do to help you make today absolutely wonderful?",
    "stream": "true",
    "stream_strip": "false",
}

# base_url = "https://api.openai.com/v1"
base_url = "http://localhost:5000/v1/audio/speech"

# Start the HTTP session and stream the audio
with httpx.Client(timeout=None) as http_client:
    # Initiate a POST request and stream the response
    with http_client.stream("POST", base_url, headers=headers, json=data) as response:
        chunk_riff = b''
        for chunk in response.iter_bytes():
            if chunk.startswith(b'RIFF'):
                if chunk_riff:
                    play_audio(chunk_riff)
                chunk_riff = chunk
            else:
                chunk_riff += chunk
        # Play the last accumulated chunk
        if chunk_riff:
            play_audio(chunk_riff)
# done
clear_queue(sound_queue)
pygame.quit()
```

The streaming case writes the file (which could be some buffer) one chunk (sentence) at a time, while the non-streaming case produces the entire file at once, so the client waits until the end to write the file. In the streaming case, if it is a wave file, as with OpenAI, the server artificially inflates the estimated duration of the audio in the WAV header so the player will play through to the end of the audio.
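As a concrete sketch of that header trick, here is a minimal reader, assuming the canonical RIFF layout that the server's `modify_wav_header` helper (added in `openai_server/server.py` below) produces; the function name here is illustrative:

```python
import struct

def riff_sizes(first_chunk: bytes):
    """Read the (inflated) RIFF and data-chunk sizes from the first streamed WAV chunk."""
    assert first_chunk[:4] == b'RIFF', "expected a WAV stream"
    riff_size = struct.unpack('<I', first_chunk[4:8])[0]  # bytes 4-7 hold the RIFF chunk size
    data_pos = first_chunk.find(b'data')
    data_size = struct.unpack('<I', first_chunk[data_pos + 4:data_pos + 8])[0]
    return riff_size, data_size  # both near 2**30 once the server has inflated them
```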

### Speech to Text

Requires h2oGPT loaded with `--enable_stt=True --pre_load_image_audio_models=True`.
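The rest of this section is truncated in the diff view; a minimal client sketch, assuming the proxy mirrors OpenAI's transcription API (the file name and model name here are illustrative, not from this PR):

```python
from openai import OpenAI

client = OpenAI(base_url='http://0.0.0.0:5000/v1')

# Transcribe a local audio file via the OpenAI-compatible endpoint
with open("speech_local.wav", "rb") as f:
    transcription = client.audio.transcriptions.create(
        model="whisper-1",  # assumed name; h2oGPT may map or ignore it
        file=f,
    )
print(transcription.text)
```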
@@ -369,11 +475,25 @@
```bash
conda create -n vllm -y
conda activate vllm
conda install python=3.10 -y
```
Install the required NCCL libraries:
```bash
sudo apt update
sudo apt install libnccl2 libnccl-dev
```
Ensure CUDA 12.1 is installed; one can choose to avoid overwriting the original symlink if desired. E.g. for Ubuntu:
```bash
# https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=runfile_local
wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
sudo sh cuda_12.1.0_530.30.02_linux.run
sudo chmod -R a+rwx /usr/local/
```
Assuming torch was installed with CUDA 12.1, and you have installed cuda locally in `/usr/local/cuda-12.1`:
```bash
export CUDA_HOME=/usr/local/cuda-12.1
export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu121"
pip install flash-attn==2.5.4
export HF_HUB_ENABLE_HF_TRANSFER=1
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/lib64:$HOME/extras/CUPTI/lib64
export PATH=$PATH:$CUDA_HOME/bin
pip install vllm
```
Then one can start vLLM in OpenAI-compliant mode, e.g. for LLaMa 65B on 2*A100 GPUs:
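The exact command is truncated in this diff view; a representative invocation (an assumption, not taken from this PR) would be:

```bash
# Model name and port are illustrative; tensor-parallel-size 2 matches 2*A100
python -m vllm.entrypoints.openai.api_server \
    --model huggyllama/llama-65b \
    --tensor-parallel-size 2 \
    --host 0.0.0.0 --port 5000
```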
2 changes: 1 addition & 1 deletion gradio_utils/grclient.py
@@ -727,7 +727,7 @@ def query_or_summarize_or_extract(
        guided_regex: str = "",
        guided_choice: str = "",
        guided_grammar: str = "",
-        guided_whitespace_pattern: str = ' ',
+        guided_whitespace_pattern: str = None,
        prompt_type: Union[int, str] = None,
        prompt_dict: Dict = None,
        jq_schema=".[]",
15 changes: 9 additions & 6 deletions openai_server/backend.py
Expand Up @@ -199,6 +199,9 @@ def get_response(instruction, gen_kwargs, verbose=False, chunk_response=True, st

    kwargs.update(**gen_kwargs)

    if gen_kwargs.get('skip_gradio'):
        fun_with_dict_str_plain
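        # note: the bare name above is a no-op; per the PR title, this direct (non-gradio) path is WIP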

    # concurrent gradio client
    client = get_client(user=gen_kwargs.get('user'))

@@ -552,7 +555,7 @@ def _audio_to_text(model, audio_file, stream, response_format, chunk, **kwargs):
    yield dict(text=text.strip())


-def text_to_audio(model, voice, input, stream, format, **kwargs):
+def text_to_audio(model, voice, input, stream, response_format, **kwargs):
    # tts_model = 'microsoft/speecht5_tts'
    # tts_model = 'tts_models/multilingual/multi-dataset/xtts_v2'
    # assumes enable_tts=True set for h2oGPT
@@ -588,20 +591,20 @@

        n = 0
        for audio_str in job:
-            yield audio_str_to_bytes(audio_str, format=format)
+            yield audio_str_to_bytes(audio_str, response_format=response_format)
            n += 1

        # get rest after job done
        outputs = job.outputs().copy()
        for audio_str in outputs[n:]:
-            yield audio_str_to_bytes(audio_str, format=format)
+            yield audio_str_to_bytes(audio_str, response_format=response_format)
            n += 1
    else:
        audio_str = client.predict(*tuple(list(inputs.values())), api_name='/speak_text_api')
-        yield audio_str_to_bytes(audio_str, format=format)
+        yield audio_str_to_bytes(audio_str, response_format=response_format)


-def audio_str_to_bytes(audio_str1, format='wav'):
+def audio_str_to_bytes(audio_str1, response_format='wav'):
    # Parse the input string to a dictionary
    audio_dict = ast.literal_eval(audio_str1)

@@ -627,7 +630,7 @@ def audio_str_to_bytes(audio_str1, format='wav'):

    # Export the AudioSegment to a BytesIO object as WAV
    output_stream = io.BytesIO()
-    audio_segment.export(output_stream, format=format)
+    audio_segment.export(output_stream, format=response_format)
    output_bytes = output_stream.getvalue()

    return output_bytes
114 changes: 50 additions & 64 deletions openai_server/server.py
@@ -1,25 +1,20 @@
import contextlib
import logging
import io
import os
import sys
import ast
import json
from threading import Thread
import time
from traceback import print_exception
from typing import List, Dict, Optional, Literal, Union
from pydantic import BaseModel, Field

import uvicorn
-from fastapi import Depends, FastAPI, Header, HTTPException
+from fastapi import FastAPI, Header, HTTPException
from fastapi.middleware.cors import CORSMiddleware
-from fastapi import Request, Form, UploadFile, File, Depends
+from fastapi import Request, Depends
from fastapi.responses import JSONResponse, Response, StreamingResponse
from sse_starlette import EventSourceResponse
from starlette.responses import PlainTextResponse

from openai_server.log import logger

sys.path.append('openai_server')


@@ -389,10 +384,42 @@ class AudioTextRequest(BaseModel):
    model: str = ''
    voice: str = ''  # overrides both chatbot_role and speaker if set
    input: str
+    response_format: str = 'wav'  # "mp3", "opus", "aac", "flac", "wav", "pcm"
    stream: bool = True
+    stream_strip: bool = True
    chatbot_role: str = "Female AI Assistant"  # Coqui TTS
    speaker: str = "SLT (female)"  # Microsoft TTS
-    format: str = 'wav'


def modify_wav_header(wav_bytes):
    # Ensure the bytes start with the 'RIFF' identifier
    if wav_bytes[:4] != b'RIFF':
        raise ValueError("This is not a valid WAV file.")

    # Get current size (which we will fake)
    original_size = int.from_bytes(wav_bytes[4:8], byteorder='little')
    # print("Original size:", original_size)

    # Calculate a large fake size (2**30 - 1, minus 8 for the RIFF header)
    fake_size = (2**30 - 1) - 8
    modified_size_bytes = fake_size.to_bytes(4, byteorder='little')

    # Replace the original size with the fake size in the RIFF header
    modified_wav_bytes = wav_bytes[:4] + modified_size_bytes + wav_bytes[8:]

    # Find the 'data' chunk and modify its size too
    data_chunk_pos = modified_wav_bytes.find(b'data')
    if data_chunk_pos == -1:
        raise ValueError("Data chunk not found in WAV file.")

    # Set a large fake size for the data chunk as well
    modified_wav_bytes = (
        modified_wav_bytes[:data_chunk_pos + 4] +  # 'data' text
        modified_size_bytes +  # fake size for data chunk
        modified_wav_bytes[data_chunk_pos + 8:]  # rest of data
    )

    return modified_wav_bytes


@app.post('/v1/audio/speech', dependencies=check_key)
@@ -406,26 +433,29 @@ async def handle_audio_to_speech(
        from openai_server.backend import text_to_audio

        async def generator():
-            response = text_to_audio(**dict(audio_request))
-            for chunk in response:
+            chunki = 0
+            for chunk in text_to_audio(**dict(audio_request)):
                disconnected = await request.is_disconnected()
                if disconnected:
                    break
-                yield chunk

-        if audio_request.format == 'wav':
-            return StreamingResponse(generator(), media_type="audio/wav")
-        else:
-            return StreamingResponse(generator(), media_type="audio/%s" % audio_request.format)
+                if chunki == 0 and audio_request.response_format == 'wav':
+                    # pretend longer than is, like OpenAI does
+                    chunk = modify_wav_header(chunk)
+                # h2oGPT sends each chunk as full object, we need rest to be raw data without header for real streaming
+                if chunki > 0 and audio_request.stream_strip:
+                    from pydub import AudioSegment
+                    chunk = AudioSegment.from_file(io.BytesIO(chunk), format=audio_request.response_format).raw_data
+
+                yield chunk
+                chunki += 1
+        return StreamingResponse(generator(), media_type="audio/%s" % audio_request.response_format)
    else:
        from openai_server.backend import text_to_audio
        response = ''
        for response1 in text_to_audio(**dict(audio_request)):
            response = response1
-        if audio_request.format == 'wav':
-            return Response(content=response, media_type="audio/wav")
-        else:
-            return Response(content=response, media_type="audio/%s" % audio_request.format)
+        return Response(content=response, media_type="audio/%s" % audio_request.response_format)


class ImageGenerationRequest(BaseModel):
@@ -501,47 +531,3 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
    from openai_server.backend import text_to_embedding
    response = text_to_embedding(model, text, encoding_format)
    return JSONResponse(response)


-def run_server(host='0.0.0.0',
-               port=5000,
-               ssl_certfile=None,
-               ssl_keyfile=None,
-               gradio_prefix=None,
-               gradio_host=None,
-               gradio_port=None,
-               h2ogpt_key=None,
-               auth=None,
-               auth_access='open',
-               guest_name='',
-               ):
-    os.environ['GRADIO_PREFIX'] = gradio_prefix or 'http'
-    os.environ['GRADIO_SERVER_HOST'] = gradio_host or '127.0.0.1'
-    os.environ['GRADIO_SERVER_PORT'] = gradio_port or '7860'
-    os.environ['GRADIO_H2OGPT_H2OGPT_KEY'] = h2ogpt_key or ''  # don't use H2OGPT_H2OGPT_KEY, mixes things up
-    # use h2ogpt_key if no server api key, so OpenAI inherits key by default if any keys set and enforced via API for h2oGPT
-    # but OpenAI key cannot be '', so dummy value is EMPTY and if EMPTY we ignore the key in authorization
-    server_api_key = os.getenv('H2OGPT_OPENAI_API_KEY', os.environ['GRADIO_H2OGPT_H2OGPT_KEY']) or 'EMPTY'
-    os.environ['H2OGPT_OPENAI_API_KEY'] = server_api_key
-
-    os.environ['GRADIO_AUTH'] = str(auth)
-    os.environ['GRADIO_AUTH_ACCESS'] = auth_access
-    os.environ['GRADIO_GUEST_NAME'] = guest_name
-
-    port = int(os.getenv('H2OGPT_OPENAI_PORT', port))
-    ssl_certfile = os.getenv('H2OGPT_OPENAI_CERT_PATH', ssl_certfile)
-    ssl_keyfile = os.getenv('H2OGPT_OPENAI_KEY_PATH', ssl_keyfile)
-
-    prefix = 'https' if ssl_keyfile and ssl_certfile else 'http'
-    logger.info(f'OpenAI API URL: {prefix}://{host}:{port}')
-    logger.info(f'OpenAI API key: {server_api_key}')
-
-    logging.getLogger("uvicorn.error").propagate = False
-    uvicorn.run(app, host=host, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
-
-
-def run(wait=True, **kwargs):
-    if wait:
-        run_server(**kwargs)
-    else:
-        Thread(target=run_server, kwargs=kwargs, daemon=True).start()