Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(audio): add speech #263

Merged
merged 2 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- **Chat**: add tool calls (#256)
- **Chat**: add vision feature (#258)
- **Config**: adding ktor engine config to support Kotlin Scripting (#261)
- **Audio**: add text-to-speech (#263)

#### Beta
- **Assistants**: api implementation (#259)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package com.aallam.openai.client

import com.aallam.openai.api.audio.Transcription
import com.aallam.openai.api.audio.TranscriptionRequest
import com.aallam.openai.api.audio.Translation
import com.aallam.openai.api.audio.TranslationRequest
import com.aallam.openai.api.audio.*

/**
* Learn how to turn audio into text.
Expand All @@ -19,4 +16,9 @@ public interface Audio {
* Translates audio into English.
*/
public suspend fun translation(request: TranslationRequest): Translation

/**
 * Generates audio from the input text.
 *
 * @param request the speech request describing the model, input text and optional settings.
 * @return the generated audio as a raw [ByteArray]. NOTE(review): the byte encoding presumably
 * follows the request's `responseFormat` — confirm against the API documentation.
 */
public suspend fun speech(request: SpeechRequest): ByteArray
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package com.aallam.openai.client.internal.api
internal object ApiPath {
const val Translation = "audio/translations"
const val Transcription = "audio/transcriptions"
const val Speech = "audio/speech"
const val ChatCompletions = "chat/completions"
const val Completions = "completions"
const val Edits = "edits"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ import com.aallam.openai.client.Audio
import com.aallam.openai.client.internal.extension.appendFileSource
import com.aallam.openai.client.internal.http.HttpRequester
import com.aallam.openai.client.internal.http.perform
import io.ktor.client.request.*
import io.ktor.client.request.forms.*
import io.ktor.http.*

/**
* Implementation of [Audio].
Expand Down Expand Up @@ -76,4 +78,14 @@ internal class AudioApi(val requester: HttpRequester) : Audio {
request.responseFormat?.let { append(key = "response_format", value = it) }
request.temperature?.let { append(key = "temperature", value = it) }
}

/**
 * Posts [request] as a JSON body to [ApiPath.Speech] and returns the result of the
 * call as a [ByteArray].
 */
override suspend fun speech(request: SpeechRequest): ByteArray =
    requester.perform { client ->
        client.post {
            url(ApiPath.Speech)
            setBody(request)
            contentType(ContentType.Application.Json)
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package com.aallam.openai.client

import com.aallam.openai.api.audio.AudioResponseFormat
import com.aallam.openai.api.audio.transcriptionRequest
import com.aallam.openai.api.audio.translationRequest
import com.aallam.openai.api.audio.*
import com.aallam.openai.api.file.FileSource
import com.aallam.openai.api.model.ModelId
import com.aallam.openai.client.internal.TestFileSystem
import com.aallam.openai.client.internal.testFilePath
import okio.FileSystem
import okio.Path.Companion.toPath
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertTrue
Expand Down Expand Up @@ -95,4 +95,15 @@ class TestAudio : TestOpenAI() {
assertEquals(translation.duration!!, 42.06, absoluteTolerance = 0.1)
assertTrue { translation.segments?.isNotEmpty() ?: false }
}

@Test
fun speech() = test {
    // Request speech for a short sentence and check that some bytes come back.
    val speechBytes = openAI.speech(
        speechRequest {
            model = ModelId("tts-1")
            input = "The quick brown fox jumped over the lazy dog."
            voice = Voice.Alloy
        }
    )
    assertTrue(speechBytes.isNotEmpty())
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ class TestChatVisionJVM : TestOpenAI() {
val content = response.choices.first().message.content.orEmpty()
assertNotNull(content)
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package com.aallam.openai.api.audio

import com.aallam.openai.api.OpenAIDsl
import com.aallam.openai.api.model.ModelId
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable

/**
 * Request payload for generating audio from input text.
 *
 * Each property is serialized under the name given by its [SerialName] annotation.
 * Only [model] and [input] are mandatory constructor arguments; the remaining
 * properties default to `null`.
 */
@Serializable
public data class SpeechRequest(

/**
 * One of the available TTS models: tts-1 or tts-1-hd.
 */
@SerialName("model") public val model: ModelId,

/**
 * The text to generate audio for. The maximum length is 4096 characters.
 */
@SerialName("input") public val input: String,

/**
 * The voice to use when generating the audio.
 */
@SerialName("voice") public val voice: Voice? = null,

/**
 * The audio format requested for the response (serialized as `response_format`).
 */
@SerialName("response_format") public val responseFormat: SpeechResponseFormat? = null,

/**
 * The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
 */
@SerialName("speed") public val speed: Double? = null,
)

/**
 * DSL entry point: runs [block] against a fresh [SpeechRequestBuilder] and returns
 * the [SpeechRequest] it builds.
 */
public fun speechRequest(block: SpeechRequestBuilder.() -> Unit): SpeechRequest {
    val builder = SpeechRequestBuilder()
    builder.block()
    return builder.build()
}

/**
 * Mutable builder for [SpeechRequest].
 *
 * Set the properties, then call [build]. [model] and [input] must be assigned
 * before building; every other property may stay `null`.
 */
@OpenAIDsl
public class SpeechRequestBuilder {

    /** TTS model to use: tts-1 or tts-1-hd. Required. */
    public var model: ModelId? = null

    /** Text to generate audio for (maximum length 4096 characters). Required. */
    public var input: String? = null

    /** Voice to use when generating the audio. */
    public var voice: Voice? = null

    /** Audio format requested for the response. */
    public var responseFormat: SpeechResponseFormat? = null

    /** Speed of the generated audio, from 0.25 to 4.0; 1.0 is the default. */
    public var speed: Double? = null

    /**
     * Creates the [SpeechRequest] from the current builder state.
     *
     * @throws IllegalArgumentException if [model] or [input] has not been set.
     */
    public fun build(): SpeechRequest {
        // Validate in the same order as the constructor arguments: model first, then input.
        val requiredModel = requireNotNull(model) { "model is required" }
        val requiredInput = requireNotNull(input) { "input is required" }
        return SpeechRequest(
            model = requiredModel,
            input = requiredInput,
            voice = voice,
            responseFormat = responseFormat,
            speed = speed,
        )
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.aallam.openai.api.audio

import kotlinx.serialization.Serializable
import kotlin.jvm.JvmInline

/**
 * Audio format identifier for a speech request's `responseFormat`.
 *
 * Wraps the raw string [value]. The companion object lists predefined formats, but
 * the public constructor accepts any string.
 */
@Serializable
@JvmInline
public value class SpeechResponseFormat(public val value: String) {
public companion object {
/** The "mp3" format. */
public val Mp3: SpeechResponseFormat = SpeechResponseFormat("mp3")
/** The "opus" format. */
public val Opus: SpeechResponseFormat = SpeechResponseFormat("opus")
/** The "aac" format. */
public val Aac: SpeechResponseFormat = SpeechResponseFormat("aac")
/** The "flac" format. */
public val Flac: SpeechResponseFormat = SpeechResponseFormat("flac")
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package com.aallam.openai.api.audio

import kotlinx.serialization.Serializable
import kotlin.jvm.JvmInline

/**
 * The voice to use when generating the audio.
 *
 * Wraps the raw string [value]. The companion object lists predefined voices, but
 * the public constructor accepts any string.
 */
@Serializable
@JvmInline
public value class Voice(public val value: String) {
public companion object {
/** The "alloy" voice. */
public val Alloy: Voice = Voice("alloy")
/** The "echo" voice. */
public val Echo: Voice = Voice("echo")
/** The "fable" voice. */
public val Fable: Voice = Voice("fable")
/** The "onyx" voice. */
public val Onyx: Voice = Voice("onyx")
/** The "nova" voice. */
public val Nova: Voice = Voice("nova")
/** The "shimmer" voice. */
public val Shimmer: Voice = Voice("shimmer")
}
}