Skip to content

Commit

Permalink
Make v0.4 release (#99)
Browse files Browse the repository at this point in the history
Two changes for training the ultravox v0.4 model:
- Update the speech transcription instructions
- Update the training config file to include more training data

Co-authored-by: Zhongqiang Huang <zhuang@fixie.ai>
  • Loading branch information
zqhuang211 and Zhongqiang Huang authored Aug 27, 2024
1 parent 052bfef commit b649b9f
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 9 deletions.
10 changes: 5 additions & 5 deletions ultravox/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
# from https://arxiv.org/pdf/2402.08846
"Transcribe speech to text: <|audio|>",
# from GPT-4
"Capture every word from <|audio|> verbatim",
"Convert speech to text from <|audio|>",
"Listen and transcribe the complete text from <|audio|>",
"Record in writing what is spoken in <|audio|>",
"Transcribe the spoken words from <|audio|> with exact wording and punctuation",
"Capture every word from the audio verbatim\n<|audio|>",
"Convert speech to text from audio\n<|audio|>",
"Listen and transcribe the complete text from audio\n<|audio|>",
"Record in writing what is spoken in audio\n<|audio|>",
"Transcribe the spoken words from audio with exact wording and punctuation\n<|audio|>",
]
ANSWER_PROMPTS = [
# from Gazelle
Expand Down
179 changes: 175 additions & 4 deletions ultravox/training/configs/release_config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# SLM with ultravox & llama3.1, trained with knowledge distillation.
exp_name: "ultravox-v0_3"
exp_name: "ultravox-v0_4"

# Make sure to accept the license agreement on huggingface hub
text_model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
audio_model: "openai/whisper-small"
audio_model: "openai/whisper-medium"


loss_config:
Expand All @@ -14,10 +14,11 @@ loss_config:
val_sets: ["anyinstruct", "soda", "peoplespeech"]

batch_size: 24
max_steps: 7200 # x8x24 = 1,382,400 samples
max_steps: 14400 # x8x24 = 2,764,800 samples

data_sets: []
data_sets: ["anyinstruct"]
data_dicts:
# continuation
- path: "fixie-ai/librispeech_asr"
name: "clean"
splits:
Expand All @@ -35,6 +36,14 @@ data_dicts:
assistant_template: "{{ continuation }}"
transcript_template: "{{ text }}"
weight: 1
- path: "fixie-ai/peoples_speech"
name: "clean"
splits:
- "train" # 1_501_271 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ text_proc.format_asr_text(text) }}"
weight: 8
- path: "fixie-ai/common_voice_17_0"
name: "en"
splits:
Expand All @@ -43,3 +52,165 @@ data_dicts:
assistant_template: "{{ continuation }}"
transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
weight: 8
- path: "fixie-ai/common_voice_17_0"
name: "ar"
splits:
- "train" # 28_369 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 0.2
- path: "fixie-ai/common_voice_17_0"
name: "de"
splits:
- "train" # 589_100 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 4
- path: "fixie-ai/common_voice_17_0"
name: "es"
splits:
- "train" # 336_846 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 3
- path: "fixie-ai/common_voice_17_0"
name: "fr"
splits:
- "train" # 558_054 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 4
- path: "fixie-ai/common_voice_17_0"
name: "it"
splits:
- "train" # 169_771 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 1.2
- path: "fixie-ai/common_voice_17_0"
name: "ja"
splits:
- "train" # 10_039 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 0.1
- path: "fixie-ai/common_voice_17_0"
name: "pt"
splits:
- "train" # 21_968 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 0.2
- path: "fixie-ai/common_voice_17_0"
name: "ru"
splits:
- "train" # 26_377 samples
user_template: "Continue the following text using less than 50 words:\n\n<|audio|>"
assistant_template: "{{ continuation }}"
transcript_template: "{{ sentence }}"
weight: 0.2
# ASR task
- path: "fixie-ai/librispeech_asr"
name: "clean"
splits:
- "train.100" # 28_539 samples
- "train.360" # 104_014 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text }}"
transcript_template: "{{ text }}"
weight: 0.1
- path: "fixie-ai/librispeech_asr"
name: "other"
splits:
- "train.500" # 148_688 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text }}"
transcript_template: "{{ text }}"
weight: 0.1
- path: "fixie-ai/peoples_speech"
name: "clean"
splits:
- "train" # 1_501_271 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(text) }}"
transcript_template: "{{ text_proc.format_asr_text(text) }}"
weight: 0.8
- path: "fixie-ai/common_voice_17_0"
name: "en"
splits:
- "train" # 1_101_170 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ text_proc.format_asr_text(sentence) }}"
weight: 0.8
- path: "fixie-ai/common_voice_17_0"
name: "ar"
splits:
- "train" # 28_369 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.02
- path: "fixie-ai/common_voice_17_0"
name: "de"
splits:
- "train" # 589_100 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.4
- path: "fixie-ai/common_voice_17_0"
name: "es"
splits:
- "train" # 336_846 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.3
- path: "fixie-ai/common_voice_17_0"
name: "fr"
splits:
- "train" # 558_054 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.4
- path: "fixie-ai/common_voice_17_0"
name: "it"
splits:
- "train" # 169_771 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.12
- path: "fixie-ai/common_voice_17_0"
name: "ja"
splits:
- "train" # 10_039 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.01
- path: "fixie-ai/common_voice_17_0"
name: "pt"
splits:
- "train" # 21_968 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.02
- path: "fixie-ai/common_voice_17_0"
name: "ru"
splits:
- "train" # 26_377 samples
user_template: "{{ dataset._get_transcribe_prompt() }}"
assistant_template: "{{ text_proc.format_asr_text(sentence) }}"
transcript_template: "{{ sentence }}"
weight: 0.02

0 comments on commit b649b9f

Please sign in to comment.