Skip to content

Commit

Permalink
feat(audio): add speech (#263)
Browse files Browse the repository at this point in the history
  • Loading branch information
aallam authored Nov 23, 2023
1 parent 2196f92 commit 7dbcffd
Show file tree
Hide file tree
Showing 9 changed files with 157 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- **Chat**: add tool calls (#256)
- **Chat**: add vision feature (#258)
- **Config**: adding ktor engine config to support Kotlin Scripting (#261)
- **Audio**: add text-to-speech (#263)

#### Beta
- **Assistants**: api implementation (#259)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package com.aallam.openai.client

import com.aallam.openai.api.audio.Transcription
import com.aallam.openai.api.audio.TranscriptionRequest
import com.aallam.openai.api.audio.Translation
import com.aallam.openai.api.audio.TranslationRequest
import com.aallam.openai.api.audio.*

/**
* Learn how to turn audio into text.
Expand All @@ -19,4 +16,9 @@ public interface Audio {
* Translates audio into English.
*/
public suspend fun translation(request: TranslationRequest): Translation

/**
* Generates audio from the input text.
*
* @param request the speech request describing the text to turn into audio.
* @return the generated audio as a [ByteArray].
*/
public suspend fun speech(request: SpeechRequest): ByteArray
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package com.aallam.openai.client.internal.api
internal object ApiPath {
const val Translation = "audio/translations"
const val Transcription = "audio/transcriptions"
const val Speech = "audio/speech"
const val ChatCompletions = "chat/completions"
const val Completions = "completions"
const val Edits = "edits"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ import com.aallam.openai.client.Audio
import com.aallam.openai.client.internal.extension.appendFileSource
import com.aallam.openai.client.internal.http.HttpRequester
import com.aallam.openai.client.internal.http.perform
import io.ktor.client.request.*
import io.ktor.client.request.forms.*
import io.ktor.http.*

/**
* Implementation of [Audio].
Expand Down Expand Up @@ -76,4 +78,14 @@ internal class AudioApi(val requester: HttpRequester) : Audio {
request.responseFormat?.let { append(key = "response_format", value = it) }
request.temperature?.let { append(key = "temperature", value = it) }
}

/**
 * Sends [request] as a JSON `POST` to [ApiPath.Speech] through [requester]
 * and returns the resulting [ByteArray].
 */
override suspend fun speech(request: SpeechRequest): ByteArray = requester.perform { client ->
    client.post {
        url(ApiPath.Speech)
        setBody(request)
        contentType(ContentType.Application.Json)
    }
}
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package com.aallam.openai.client

import com.aallam.openai.api.audio.AudioResponseFormat
import com.aallam.openai.api.audio.transcriptionRequest
import com.aallam.openai.api.audio.translationRequest
import com.aallam.openai.api.audio.*
import com.aallam.openai.api.file.FileSource
import com.aallam.openai.api.model.ModelId
import com.aallam.openai.client.internal.TestFileSystem
import com.aallam.openai.client.internal.testFilePath
import okio.FileSystem
import okio.Path.Companion.toPath
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertTrue
Expand Down Expand Up @@ -95,4 +95,15 @@ class TestAudio : TestOpenAI() {
assertEquals(translation.duration!!, 42.06, absoluteTolerance = 0.1)
assertTrue { translation.segments?.isNotEmpty() ?: false }
}

@Test
fun speech() = test {
    // Generate speech with the "tts-1" model and only check that some audio bytes came back.
    val audio = openAI.speech(
        speechRequest {
            model = ModelId("tts-1")
            input = "The quick brown fox jumped over the lazy dog."
            voice = Voice.Alloy
        }
    )
    assertTrue(audio.isNotEmpty())
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ class TestChatVisionJVM : TestOpenAI() {
val content = response.choices.first().message.content.orEmpty()
assertNotNull(content)
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package com.aallam.openai.api.audio

import com.aallam.openai.api.OpenAIDsl
import com.aallam.openai.api.model.ModelId
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable

/**
* Request payload for generating audio from the input text.
*
* Only [model] and [input] are mandatory; every other property defaults to `null`.
*/
@Serializable
public data class SpeechRequest(

/**
* One of the available TTS models: tts-1 or tts-1-hd.
*/
@SerialName("model") public val model: ModelId,

/**
* The text to generate audio for. The maximum length is 4096 characters.
*/
@SerialName("input") public val input: String,

/**
* The voice to use when generating the audio. See [Voice] for the predefined values.
*
* NOTE(review): this defaults to `null`; confirm whether the API accepts a request without a voice.
*/
@SerialName("voice") public val voice: Voice? = null,

/**
* The format of the generated audio. See [SpeechResponseFormat] for the predefined values.
*/
@SerialName("response_format") public val responseFormat: SpeechResponseFormat? = null,

/**
* The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.
*/
@SerialName("speed") public val speed: Double? = null,
)

/**
 * Builds a [SpeechRequest] by running [block] against a fresh [SpeechRequestBuilder].
 */
public fun speechRequest(block: SpeechRequestBuilder.() -> Unit): SpeechRequest {
    val builder = SpeechRequestBuilder()
    builder.block()
    return builder.build()
}

/**
 * Mutable builder used by the [speechRequest] DSL to assemble a [SpeechRequest].
 */
@OpenAIDsl
public class SpeechRequestBuilder {

    /**
     * TTS model to use: tts-1 or tts-1-hd. Must be set before calling [build].
     */
    public var model: ModelId? = null

    /**
     * Text to generate audio for (maximum length is 4096 characters). Must be set before calling [build].
     */
    public var input: String? = null

    /**
     * Voice used for the generated audio.
     */
    public var voice: Voice? = null

    /**
     * Format of the generated audio.
     */
    public var responseFormat: SpeechResponseFormat? = null

    /**
     * Speed of the generated audio, from 0.25 to 4.0. 1.0 is the default.
     */
    public var speed: Double? = null

    /**
     * Creates the [SpeechRequest] from the current builder state.
     *
     * @throws IllegalArgumentException if [model] or [input] has not been set.
     */
    public fun build(): SpeechRequest {
        // Validate in the same order as the constructor parameters: model first, then input.
        val requiredModel = requireNotNull(model) { "model is required" }
        val requiredInput = requireNotNull(input) { "input is required" }
        return SpeechRequest(
            model = requiredModel,
            input = requiredInput,
            voice = voice,
            responseFormat = responseFormat,
            speed = speed,
        )
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.aallam.openai.api.audio

import kotlinx.serialization.Serializable
import kotlin.jvm.JvmInline

/**
* The audio format requested for generated speech.
*
* Wraps the raw format name in [value]; predefined formats are exposed on the companion object.
*/
@Serializable
@JvmInline
public value class SpeechResponseFormat(public val value: String) {
public companion object {
/** The `mp3` format. */
public val Mp3: SpeechResponseFormat = SpeechResponseFormat("mp3")
/** The `opus` format. */
public val Opus: SpeechResponseFormat = SpeechResponseFormat("opus")
/** The `aac` format. */
public val Aac: SpeechResponseFormat = SpeechResponseFormat("aac")
/** The `flac` format. */
public val Flac: SpeechResponseFormat = SpeechResponseFormat("flac")
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package com.aallam.openai.api.audio

import kotlinx.serialization.Serializable
import kotlin.jvm.JvmInline

/**
* The voice to use when generating the audio.
*
* Wraps the raw voice identifier in [value]; predefined voices are exposed on the companion object.
*/
@Serializable
@JvmInline
public value class Voice(public val value: String) {
public companion object {
/** The `alloy` voice. */
public val Alloy: Voice = Voice("alloy")
/** The `echo` voice. */
public val Echo: Voice = Voice("echo")
/** The `fable` voice. */
public val Fable: Voice = Voice("fable")
/** The `onyx` voice. */
public val Onyx: Voice = Voice("onyx")
/** The `nova` voice. */
public val Nova: Voice = Voice("nova")
/** The `shimmer` voice. */
public val Shimmer: Voice = Voice("shimmer")
}
}

0 comments on commit 7dbcffd

Please sign in to comment.