Skip to content

Commit

Permalink
[azopenai] Fixing issue where you can't use whisper with m4a files. (#22210)
Browse files Browse the repository at this point in the history

Fixing issue where you can't use whisper with m4a files.

* It's one of the formats that doesn't seem to be recognized without an explicit file extension, which you can pass via Filename
* My tests were too heavily dependent on implementation details of the models. Changing this out to check that things are working correctly without checking the exact contents of the response.
* Also, rerecorded tests since we're doing multiple audio tests as well.

Fixes #22195
  • Loading branch information
richardpark-msft authored Jan 9, 2024
1 parent 9f9219c commit ecdb3a6
Show file tree
Hide file tree
Showing 8 changed files with 227 additions and 173 deletions.
3 changes: 3 additions & 0 deletions sdk/ai/azopenai/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

### Bugs Fixed

- `AudioTranscriptionOptions.Filename` and `AudioTranslationOptions.Filename` fields are now properly propagated, allowing
for disambiguating the format of an audio file when OpenAI can't detect it. (PR#22210)

### Other Changes

## 0.4.0 (2023-12-11)
Expand Down
2 changes: 1 addition & 1 deletion sdk/ai/azopenai/assets.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "go",
"TagPrefix": "go/ai/azopenai",
"Tag": "go/ai/azopenai_b42da78821"
"Tag": "go/ai/azopenai_85a01b7ac6"
}
331 changes: 189 additions & 142 deletions sdk/ai/azopenai/client_audio_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ package azopenai_test

import (
"context"
"fmt"
"os"
"path/filepath"
"testing"

"github.com/Azure/azure-sdk-for-go/sdk/ai/azopenai"
Expand All @@ -24,20 +26,27 @@ func TestClient_GetAudioTranscription_AzureOpenAI(t *testing.T) {
// TestClient_GetAudioTranscription_OpenAI runs GetAudioTranscription against the
// public OpenAI endpoint once per sample audio file (m4a and mp3), asserting a
// populated verbose-JSON transcription without pinning exact model output.
// NOTE(review): this span is a commit-diff rendering — the removed (single mp3,
// byte-slice based) lines are interleaved with the added (per-file loop, path
// based) lines of the same test.
func TestClient_GetAudioTranscription_OpenAI(t *testing.T) {
client := newOpenAIClientForTest(t)

mp3Bytes, err := os.ReadFile(`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`)
require.NoError(t, err)

args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatVerboseJSON, openAI.Whisper.Model, mp3Bytes)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)
// The .m4a entry covers the fix in this commit: formats OpenAI can't sniff
// from bytes alone need Filename propagated (see #22195).
testFiles := []string{
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.m4a`,
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`,
}

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
// One subtest per file extension; assertions check non-emptiness/shape rather
// than exact transcript contents, to avoid coupling to model behavior.
for _, audioFile := range testFiles {
t.Run(fmt.Sprintf("verbose (%s)", filepath.Ext(audioFile)), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatVerboseJSON, openAI.Whisper.Model, audioFile)

transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
})
}
}

func TestClient_GetAudioTranslation_AzureOpenAI(t *testing.T) {
Expand All @@ -48,154 +57,192 @@ func TestClient_GetAudioTranslation_AzureOpenAI(t *testing.T) {
// TestClient_GetAudioTranslation_OpenAI runs GetAudioTranslation against the
// public OpenAI endpoint once per sample audio file (m4a and mp3), asserting a
// populated verbose-JSON translation without pinning exact model output.
// NOTE(review): this span is a commit-diff rendering — the removed (single mp3,
// byte-slice based) lines are interleaved with the added (per-file loop, path
// based) lines of the same test.
func TestClient_GetAudioTranslation_OpenAI(t *testing.T) {
client := newOpenAIClientForTest(t)

mp3Bytes, err := os.ReadFile(`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`)
require.NoError(t, err)

args := newTranslationOptions(azopenai.AudioTranslationFormatVerboseJSON, openAI.Whisper.Model, mp3Bytes)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)
// The .m4a entry covers the fix in this commit: formats OpenAI can't sniff
// from bytes alone need Filename propagated (see #22195).
testFiles := []string{
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.m4a`,
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`,
}

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
// One subtest per file extension; assertions check non-emptiness/shape only.
for _, audioFile := range testFiles {
t.Run(fmt.Sprintf("verbose (%s)", filepath.Ext(audioFile)), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatVerboseJSON, openAI.Whisper.Model, audioFile)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
})
}
}

// runTranscriptionTests exercises GetAudioTranscription across every response
// format (text, srt, vtt, verbose_json, json) for each sample audio file,
// verifying that verbose_json returns the full transcription object and that
// the other formats return only Text.
// NOTE(review): this span is a commit-diff rendering — the removed (single mp3,
// byte-slice based) subtests are interleaved with the added (per-file loop,
// path based) subtests.
func runTranscriptionTests(t *testing.T, client *azopenai.Client, model string) {
mp3Bytes, err := os.ReadFile(`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`)
require.NoError(t, err)

t.Run(string(azopenai.AudioTranscriptionFormatText), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatText, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})

t.Run(string(azopenai.AudioTranscriptionFormatSrt), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatSrt, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})

t.Run(string(azopenai.AudioTranscriptionFormatVtt), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatVtt, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})

t.Run(string(azopenai.AudioTranscriptionFormatVerboseJSON), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatVerboseJSON, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
})

t.Run(string(azopenai.AudioTranscriptionFormatJSON), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatJSON, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})
// The .m4a entry covers the fix in this commit: formats OpenAI can't sniff
// from bytes alone need Filename propagated (see #22195).
testFiles := []string{
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.m4a`,
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`,
}

for _, audioFile := range testFiles {
ext := filepath.Ext(audioFile)

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatText, ext), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatText, model, audioFile)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatSrt, ext), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatSrt, model, audioFile)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatVtt, ext), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatVtt, model, audioFile)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})

// verbose_json is the only format where the full AudioTranscription object
// (duration, language, segments, task) is expected to be populated.
t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatVerboseJSON, ext), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatVerboseJSON, model, audioFile)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
})

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatJSON, ext), func(t *testing.T) {
args := newTranscriptionOptions(azopenai.AudioTranscriptionFormatJSON, model, audioFile)
transcriptResp, err := client.GetAudioTranscription(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranscription(t, transcriptResp.AudioTranscription)
})
}
}

// runTranslationTests exercises GetAudioTranslation across every response
// format (text, srt, vtt, verbose_json, json) for each sample audio file,
// verifying that verbose_json returns the full translation object and that
// the other formats return only Text.
// NOTE(review): this span is a commit-diff rendering — the removed (single mp3,
// byte-slice based) subtests are interleaved with the added (per-file loop,
// path based) subtests.
// NOTE(review): subtest names are built from AudioTranscriptionFormat*
// constants while the options are built from AudioTranslationFormat* values —
// presumably the underlying strings match; confirm against the SDK constants.
func runTranslationTests(t *testing.T, client *azopenai.Client, model string) {
mp3Bytes, err := os.ReadFile(`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`)
require.NoError(t, err)

t.Run(string(azopenai.AudioTranscriptionFormatText), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatText, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})

t.Run(string(azopenai.AudioTranscriptionFormatSrt), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatSrt, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})

t.Run(string(azopenai.AudioTranscriptionFormatVtt), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatVtt, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})

t.Run(string(azopenai.AudioTranscriptionFormatVerboseJSON), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatVerboseJSON, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
})

t.Run(string(azopenai.AudioTranscriptionFormatJSON), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatJSON, model, mp3Bytes)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})
// The .m4a entry covers the fix in this commit: formats OpenAI can't sniff
// from bytes alone need Filename propagated (see #22195).
testFiles := []string{
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.m4a`,
`testdata/sampledata_audiofiles_myVoiceIsMyPassportVerifyMe01.mp3`,
}

for _, audioFile := range testFiles {
ext := filepath.Ext(audioFile)

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatText, ext), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatText, model, audioFile)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatSrt, ext), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatSrt, model, audioFile)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatVtt, ext), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatVtt, model, audioFile)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})

// verbose_json is the only format where the full AudioTranslation object
// (duration, language, segments, task) is expected to be populated.
t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatVerboseJSON, ext), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatVerboseJSON, model, audioFile)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
require.Greater(t, *transcriptResp.Duration, float32(0.0))
require.NotEmpty(t, *transcriptResp.Language)
require.NotEmpty(t, transcriptResp.Segments)
require.NotEmpty(t, transcriptResp.Segments[0])
require.NotEmpty(t, transcriptResp.Task)
})

t.Run(fmt.Sprintf("%s (%s)", azopenai.AudioTranscriptionFormatJSON, ext), func(t *testing.T) {
args := newTranslationOptions(azopenai.AudioTranslationFormatJSON, model, audioFile)
transcriptResp, err := client.GetAudioTranslation(context.Background(), args, nil)
require.NoError(t, err)
require.NotEmpty(t, transcriptResp)

require.NotEmpty(t, *transcriptResp.Text)
requireEmptyAudioTranslation(t, transcriptResp.AudioTranslation)
})
}
}

func newTranscriptionOptions(format azopenai.AudioTranscriptionFormat, model string, mp3Bytes []byte) azopenai.AudioTranscriptionOptions {
func newTranscriptionOptions(format azopenai.AudioTranscriptionFormat, model string, path string) azopenai.AudioTranscriptionOptions {
audioBytes, err := os.ReadFile(path)

if err != nil {
panic(err)
}

return azopenai.AudioTranscriptionOptions{
DeploymentName: to.Ptr(model),
File: mp3Bytes,
File: audioBytes,
Filename: &path,
ResponseFormat: &format,
Language: to.Ptr("en"),
Temperature: to.Ptr[float32](0.0),
}
}

func newTranslationOptions(format azopenai.AudioTranslationFormat, model string, mp3Bytes []byte) azopenai.AudioTranslationOptions {
func newTranslationOptions(format azopenai.AudioTranslationFormat, model string, path string) azopenai.AudioTranslationOptions {
audioBytes, err := os.ReadFile(path)

if err != nil {
panic(err)
}

var filename *string

if filepath.Ext(path) != ".mp3" {
filename = &path
}

return azopenai.AudioTranslationOptions{
DeploymentName: to.Ptr(model),
File: mp3Bytes,
File: audioBytes,
Filename: filename,
ResponseFormat: &format,
Temperature: to.Ptr[float32](0.0),
}
Expand Down
Loading

0 comments on commit ecdb3a6

Please sign in to comment.