fixie-ai · zqhuang211 · Aug 16, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -125,6 +125,7 @@ ipython_config.py
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
+poetry.toml
 
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,13 +9,13 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.11"
-torch = "2.2.2"
+torch = "2.4"
 transformers = {version = ">=4.43.1", extras = ["torch"]}
 bitsandbytes = "~0.42.0"
 peft = "~0.11.1"
 simple-parsing = "~0.1.5"
 librosa = "~0.10.2.post1"
-requests = "~2.26.0"
+requests = "~2.31.0"
 datasets = "~2.19.1"
 mosaicml-streaming = "~0.7.6"
 nltk = "~3.8.1"
@@ -39,8 +39,8 @@ fsspec = "~2024.3.1"
 gcsfs = "~2024.3.1"
 sounddevice = "~0.4.7"
 mosaicml-cli = "~0.6.31"
-gradio-client = "~1.0.1"
-gradio = "~3.40.1"
+gradio-client = "~0.16.1"
+gradio = "~4.29.0"
 gpustat = "~1.1.1"
 types-requests = "~2.26.0"
 types-pyyaml = "^6.0.12.20240724"

diff --git a/ultravox/data/datasets.py b/ultravox/data/datasets.py
@@ -191,6 +191,9 @@ def __post_init__(self):
             ), f"Unexpected audio dtype: {self.audio.dtype}"
             assert self.audio.ndim == 1, f"Unexpected audio shape: {self.audio.shape}"
 
+    def add_past_messages(self, past_messages: List[Dict[str, str]]):
+        self.messages = past_messages + self.messages
+
     messages: List[Dict[str, str]]
     """List of messages, each with a "role" and "content" field."""
     audio: Optional[np.typing.NDArray[np.float32]] = None

diff --git a/ultravox/inference/base.py b/ultravox/inference/base.py
@@ -10,6 +10,7 @@ class VoiceOutput:
     text: str
     input_tokens: int
     output_tokens: int
+    audio_token_len: int = 0
 
 
 class InferenceMessage:

diff --git a/ultravox/inference/infer.py b/ultravox/inference/infer.py
@@ -1,5 +1,6 @@
+import copy
 import threading
-from typing import Optional
+from typing import Dict, List, Optional, Tuple, Union
 
 import librosa
 import numpy as np
@@ -11,7 +12,7 @@
 from ultravox.model import ultravox_processing
 
 SAMPLE_RATE = 16000
-MAX_TOKENS = 1024
+MAX_NEW_TOKENS = 1024
 # Without this penalty, the model tends to repeat itself.
 REPETITION_PENALTY = 1.1
 
@@ -29,20 +30,61 @@ def __init__(
         self.tokenizer = tokenizer
         self.processor = processor
         self.dtype = dtype
+        self.past_messages: List[Dict[str, str]] = []
+        self.past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = (
+            None
+        )
+
+    def reset_history(self):
+        self.past_messages = []
+        self.past_key_values = None
+
+    def _add_past_message(self, message: Dict[str, str], audio_token_len: int):
+        message = copy.copy(message)
+        content = message["content"]
+        if audio_token_len > 0:
+            if content.count("<|audio|>") != 1:
+                raise ValueError(
+                    f"Expected 1 audio placeholder, found {content.count('<|audio|>')}"
+                )
+            message["content"] = content.replace(
+                "<|audio|>", self.tokenizer.eos_token * audio_token_len
+            )
+
+        self.past_messages.append(message)
+
+    def _get_sample_with_past(
+        self, sample: datasets.VoiceSample
+    ) -> datasets.VoiceSample:
+        sample = copy.copy(sample)
+        sample.add_past_messages(self.past_messages)
+        return sample
 
     def infer(
         self,
         sample: datasets.VoiceSample,
         max_tokens: Optional[int] = None,
         temperature: Optional[float] = None,
     ) -> base.VoiceOutput:
-        inputs = self._dataproc(sample)
+        extended_sample = self._get_sample_with_past(sample)
+        inputs = self._dataproc(extended_sample)
         input_len = inputs["input_ids"].shape[1]
-        output = self._generate(inputs, max_tokens, temperature)
-        output_tokens = output[0][input_len:]
+        output = self._generate(
+            inputs, max_tokens, temperature, self.past_key_values
+        )
+        output_tokens = output.sequences[0][input_len:]
         output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True)
         output_len = len(output_tokens)
-        return base.VoiceOutput(output_text, input_len, output_len)
+
+        # Record this turn and the KV cache so the next call can build on them.
+        audio_token_len = (
+            int(inputs["audio_token_len"][0]) if "audio_token_len" in inputs else 0
+        )
+        self._add_past_message(extended_sample.messages[-1], audio_token_len)
+        self._add_past_message({"role": "assistant", "content": output_text}, 0)
+        self.past_key_values = output.past_key_values
+
+        return base.VoiceOutput(output_text, input_len, output_len, audio_token_len)
 
     def infer_stream(
         self,
@@ -57,7 +99,12 @@ def infer_stream(
             self.tokenizer, skip_prompt=True, decode_kwargs=decode_kwargs
         )
 
-        thread_args = (inputs, max_tokens, temperature, streamer)
+        # `_generate` takes `past_key_values` as its 4th positional parameter
+        # and `streamer` as its 5th. A 4-tuple ending in the streamer would bind
+        # the streamer to `past_key_values` and leave `streamer=None`, so None
+        # is passed explicitly for the cache slot (as before this change, the
+        # streaming path starts generation without a cached prefix).
+        thread_args = (inputs, max_tokens, temperature, None, streamer)
         thread = threading.Thread(target=self._generate, args=thread_args)
         thread.start()
         output_tokens = 0
@@ -108,8 +155,9 @@ def _dataproc(self, sample: datasets.VoiceSample):
     def _generate(
         self,
         inputs: torch.Tensor,
-        max_tokens: Optional[int] = None,
+        max_new_tokens: Optional[int] = None,
         temperature: Optional[float] = None,
+        past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         streamer: Optional[transformers.TextStreamer] = None,
     ):
         temperature = temperature or None
@@ -122,10 +170,12 @@ def _generate(
         return self.model.generate(
             **inputs,
             do_sample=do_sample,
-            max_new_tokens=max_tokens or MAX_TOKENS,
+            max_new_tokens=max_new_tokens or MAX_NEW_TOKENS,
             temperature=temperature,
             repetition_penalty=REPETITION_PENALTY,
             pad_token_id=self.tokenizer.eos_token_id,
             eos_token_id=terminators,
             streamer=streamer,
+            past_key_values=past_key_values,
+            return_dict_in_generate=True,
         )
diff --git a/ultravox/model/ultravox_model.py b/ultravox/model/ultravox_model.py
@@ -247,10 +247,16 @@ def prepare_inputs_for_generation(
             **kwargs,
         )
 
-        if is_cache_empty(past_key_values) and audio_values is not None:
-            # We only want to use audio features in the 1st generation step
+        prefill_start_idx = kwargs["cache_position"][0]
+        if (
+            audio_values is not None
+            and audio_token_start_idx is not None
+            and prefill_start_idx <= torch.max(audio_token_start_idx)
+        ):
             model_input["audio_values"] = audio_values
-            model_input["audio_token_start_idx"] = audio_token_start_idx
+            model_input["audio_token_start_idx"] = (
+                audio_token_start_idx - prefill_start_idx
+            )
             model_input["audio_token_len"] = audio_token_len
 
         return model_input

diff --git a/ultravox/tools/gradio_demo.py b/ultravox/tools/gradio_demo.py
@@ -1,13 +1,18 @@
 from dataclasses import dataclass
-from typing import Tuple
+from typing import Optional
 
 import gradio as gr
-import numpy as np
 import simple_parsing
 
 from ultravox.data import datasets
 from ultravox.inference import ultravox_infer
 
+demo_instruction: str = """Enter your prompt here (audio will be inserted at the end or at <|audio|>).
+
+Text mode: Shift+Enter to submit.
+Voice mode: Click the recording button to start, then click again to stop and submit.
+"""
+
 
 @dataclass
 class DemoConfig:
@@ -16,27 +21,110 @@ class DemoConfig:
     #    runs/llama2_asr_gigaspeech/checkpoint-1000/
     #    wandb://fixie/ultravox/model-llama2_asr_gigaspeech:v0
     model_path: str = "fixie-ai/ultravox"
-    default_prompt: str = "Transcribe\n<|audio|>"
+    # Use <|audio|> to mark where audio is inserted; otherwise, in voice mode, audio is appended at the end.
+    default_prompt: str = ""
+    max_new_tokens: int = 256
+    device: str = "mps"
+    data_type: str = "float16"
 
 
 def main():
     args = simple_parsing.parse(config_class=DemoConfig)
-    inference = ultravox_infer.UltravoxInference(args.model_path)
+    inference = ultravox_infer.UltravoxInference(
+        args.model_path, device=args.device, data_type=args.data_type
+    )
 
-    def wrapper(text: str, audio: Tuple[int, np.ndarray]) -> str:
-        sample = datasets.VoiceSample.from_prompt_and_raw(text, audio[1], audio[0])
-        return inference.infer(sample, max_tokens=64).text
+    def add_text(chatbot: gr.Chatbot, text: str) -> gr.Chatbot:
+        return chatbot + [(text, None)]
 
-    inputs = [
-        gr.Textbox(label="Prompt", value=args.default_prompt),
-        gr.Audio(label="Audio", show_download_button=True),
-    ]
-    outputs = [gr.Textbox(label="Output")]
-    examples = [["Transcribe\n<|audio|>", "examples/test16.wav"]]
+    def add_audio(chatbot: gr.Chatbot, audio: str) -> gr.Chatbot:
+        return chatbot + [((audio,), None)]
 
-    gr.Interface(fn=wrapper, inputs=inputs, outputs=outputs, examples=examples).launch(
-        share=True
-    )
+    def process_turn(
+        chatbot: gr.Chatbot,
+        prompt: str,
+        audio: Optional[str] = None,
+        temperature: float = 0,
+    ):
+        # In voice mode the prompt (a mixed audio/text instruction) is kept as is; in text mode it is cleared to "" in anticipation of a new prompt.
+        prompt_to_return = prompt
+        if audio:
+            if "<|audio|>" not in prompt:
+                prompt += "<|audio|>"
+            sample = datasets.VoiceSample.from_prompt_and_file(prompt, audio)
+        else:
+            sample = datasets.VoiceSample.from_prompt(prompt)
+            prompt_to_return = ""
+
+        if len(sample.messages) != 1:
+            raise ValueError(
+                f"Expected exactly 1 message in sample but got {len(sample.messages)}"
+            )
+
+        output = inference.infer(
+            sample,
+            max_tokens=args.max_new_tokens,
+            temperature=temperature,
+        )
+
+        chatbot = chatbot + [(None, output.text)]
+        return chatbot, gr.update(value=prompt_to_return)
+
+    def process_text(chatbot, prompt, temperature):
+        return process_turn(chatbot, prompt, None, temperature)
+
+    def process_audio(chatbot, prompt, audio, temperature):
+        return process_turn(chatbot, prompt, audio, temperature)
+
+    def gradio_reset():
+        inference.reset_history()
+        return [], "", None
+
+    with gr.Blocks() as demo:
+        chatbot = gr.Chatbot(scale=10, height=1000)
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                reset = gr.Button("Reset")
+                audio = gr.Audio(
+                    label="🎤",
+                    sources=["microphone"],
+                    type="filepath",
+                    visible=True,
+                )
+            with gr.Column(scale=8):
+                prompt = gr.Textbox(
+                    show_label=False,
+                    lines=5,
+                    placeholder=demo_instruction,
+                    value=args.default_prompt,
+                    container=True,
+                )
+            with gr.Column(scale=1):
+                temperature = gr.Slider(
+                    minimum=0,
+                    maximum=5.0,
+                    value=0,
+                    step=0.1,
+                    interactive=True,
+                    label="temperature",
+                )
+
+        prompt.submit(add_text, [chatbot, prompt], [chatbot], queue=False).then(
+            process_text,
+            [chatbot, prompt, temperature],
+            [chatbot, prompt],
+            queue=False,
+        )
+        audio.stop_recording(add_audio, [chatbot, audio], [chatbot], queue=False).then(
+            process_audio,
+            [chatbot, prompt, audio, temperature],
+            [chatbot, prompt],
+            queue=False,
+        )
+        reset.click(gradio_reset, [], [chatbot, prompt, audio], queue=False)
+
+    demo.launch(share=True)
 
 
 if __name__ == "__main__":

diff --git a/ultravox/tools/infer_api.py b/ultravox/tools/infer_api.py
@@ -2,11 +2,12 @@
 import json
 import os
 import tempfile
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Tuple, Union
 
 import gradio_client
 import numpy as np
 import requests
+import transformers
 
 from ultravox.data import datasets
 from ultravox.inference import base
@@ -23,6 +24,7 @@ def infer(
         sample: datasets.VoiceSample,
         max_tokens: Optional[int] = None,
         temperature: Optional[float] = None,
+        past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
     ) -> base.VoiceOutput:
         text = ""
         stats = None
@@ -41,6 +43,7 @@ def infer_stream(
         sample: datasets.VoiceSample,
         max_tokens: Optional[int] = None,
         temperature: Optional[float] = None,
+        past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
     ) -> base.InferenceGenerator:
         url = f"{self._base_url}/chat/completions"
         headers = {"Content-Type": "application/json"}
@@ -104,6 +107,7 @@ def infer(
         sample: datasets.VoiceSample,
         max_tokens: Optional[int] = None,
         temperature: Optional[float] = None,
+        past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
     ) -> base.VoiceOutput:
         headers = {"Content-Type": "application/json"}
         response = requests.post(
@@ -127,6 +131,7 @@ def infer(
         sample: datasets.VoiceSample,
         max_tokens: Optional[int] = None,
         temperature: Optional[float] = None,
+        past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
     ) -> base.VoiceOutput:
         # For some reason the most recent Gradio endpoint only accepts
         # audio as a file, not as a base64-encoded string. There's probably