From 533f0eabd409affbb331b7203db37750dbedf365 Mon Sep 17 00:00:00 2001 From: Yoshio Terada Date: Wed, 12 Mar 2025 10:44:22 +0900 Subject: [PATCH] First commit to fix Issue openai#211 First commit to fix Issue openai#211 This commit includes the fix described in Issue openai#211. * Addressed the issue where Base64 encoding could not be handled. * Improved performance by using Base64 encoding by default. --- .../com/openai/models/embeddings/Embedding.kt | 55 +++++++++---- .../embeddings/EmbeddingCreateParams.kt | 4 +- .../models/embeddings/EmbeddingValue.kt | 81 +++++++++++++++++++ .../embeddings/EmbeddingValueDeserializer.kt | 32 ++++++++ .../embeddings/CreateEmbeddingResponseTest.kt | 25 +++++- .../openai/models/embeddings/EmbeddingTest.kt | 15 +++- 6 files changed, 189 insertions(+), 23 deletions(-) create mode 100644 openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValue.kt create mode 100644 openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValueDeserializer.kt diff --git a/openai-java-core/src/main/kotlin/com/openai/models/embeddings/Embedding.kt b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/Embedding.kt index f3a4543f..87738dd0 100644 --- a/openai-java-core/src/main/kotlin/com/openai/models/embeddings/Embedding.kt +++ b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/Embedding.kt @@ -17,6 +17,7 @@ import com.openai.core.immutableEmptyMap import com.openai.core.toImmutable import com.openai.errors.OpenAIInvalidDataException import java.util.Objects +import java.util.Optional /** Represents an embedding vector returned by embedding endpoint. 
*/ @NoAutoDetect @@ -25,7 +26,7 @@ class Embedding private constructor( @JsonProperty("embedding") @ExcludeMissing - private val embedding: JsonField> = JsonMissing.of(), + private val embedding: JsonField = JsonMissing.of(), @JsonProperty("index") @ExcludeMissing private val index: JsonField = JsonMissing.of(), @JsonProperty("object") @ExcludeMissing private val object_: JsonValue = JsonMissing.of(), @JsonAnySetter private val additionalProperties: Map = immutableEmptyMap(), @@ -35,7 +36,7 @@ private constructor( * The embedding vector, which is a list of floats. The length of vector depends on the model as * listed in the [embedding guide](https://platform.openai.com/docs/guides/embeddings). */ - fun embedding(): List = embedding.getRequired("embedding") + fun embedding(): EmbeddingValue = embedding.getRequired("embedding") /** The index of the embedding in the list of embeddings. */ fun index(): Long = index.getRequired("index") @@ -47,7 +48,9 @@ private constructor( * The embedding vector, which is a list of floats. The length of vector depends on the model as * listed in the [embedding guide](https://platform.openai.com/docs/guides/embeddings). */ - @JsonProperty("embedding") @ExcludeMissing fun _embedding(): JsonField> = embedding + @JsonProperty("embedding") + @ExcludeMissing + fun _embedding(): JsonField = embedding /** The index of the embedding in the list of embeddings. */ @JsonProperty("index") @ExcludeMissing fun _index(): JsonField = index @@ -92,14 +95,21 @@ private constructor( /** A builder for [Embedding]. */ class Builder internal constructor() { - private var embedding: JsonField>? = null + private var embedding: JsonField? = null private var index: JsonField? 
= null private var object_: JsonValue = JsonValue.from("embedding") private var additionalProperties: MutableMap = mutableMapOf() @JvmSynthetic internal fun from(embedding: Embedding) = apply { - this.embedding = embedding.embedding.map { it.toMutableList() } + this.embedding = + embedding.embedding.map { + EmbeddingValue( + floatEmbedding = + Optional.of(it.floatEmbedding.orElse(mutableListOf()).toMutableList()), + base64Embedding = it.base64Embedding, + ) + } index = embedding.index object_ = embedding.object_ additionalProperties = embedding.additionalProperties.toMutableMap() @@ -110,27 +120,32 @@ private constructor( * model as listed in the * [embedding guide](https://platform.openai.com/docs/guides/embeddings). */ - fun embedding(embedding: List) = embedding(JsonField.of(embedding)) + fun embedding(embedding: EmbeddingValue) = embedding(JsonField.of(embedding)) /** - * The embedding vector, which is a list of floats. The length of vector depends on the - * model as listed in the + * The embedding vector, which is a list of floats or Base64. The float length of vector + * depends on the model as listed in the * [embedding guide](https://platform.openai.com/docs/guides/embeddings). */ - fun embedding(embedding: JsonField>) = apply { - this.embedding = embedding.map { it.toMutableList() } + fun embedding(embedding: JsonField) = apply { + this.embedding = + embedding.map { + EmbeddingValue( + floatEmbedding = + Optional.of(it.floatEmbedding.orElse(mutableListOf()).toMutableList()), + base64Embedding = it.base64Embedding, + ) + } } /** - * The embedding vector, which is a list of floats. The length of vector depends on the - * model as listed in the + * The embedding vector, which is a list of floats or Base64. The float length of vector + * depends on the model as listed in the * [embedding guide](https://platform.openai.com/docs/guides/embeddings). 
*/ - fun addEmbedding(embedding: Double) = apply { + fun addEmbedding(embedding: EmbeddingValue) = apply { this.embedding = - (this.embedding ?: JsonField.of(mutableListOf())).also { - checkKnown("embedding", it).add(embedding) - } + (this.embedding ?: JsonField.of(embedding)).also { checkKnown("embedding", it) } } /** The index of the embedding in the list of embeddings. */ @@ -163,7 +178,13 @@ private constructor( fun build(): Embedding = Embedding( - checkRequired("embedding", embedding).map { it.toImmutable() }, + checkRequired("embedding", embedding).map { + EmbeddingValue( + floatEmbedding = + Optional.of(it.floatEmbedding.orElse(mutableListOf()).toMutableList()), + base64Embedding = it.base64Embedding, + ) + }, checkRequired("index", index), object_, additionalProperties.toImmutable(), diff --git a/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingCreateParams.kt b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingCreateParams.kt index 7d1e87d8..8550e1c9 100644 --- a/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingCreateParams.kt +++ b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingCreateParams.kt @@ -271,7 +271,9 @@ private constructor( private var input: JsonField? = null private var model: JsonField? = null private var dimensions: JsonField = JsonMissing.of() - private var encodingFormat: JsonField = JsonMissing.of() + // Default EncodingFormat value is set to BASE64 for performance improvements. 
+            private var encodingFormat: JsonField =
+                JsonField.of(EncodingFormat.BASE64)
             private var user: JsonField = JsonMissing.of()
             private var additionalProperties: MutableMap = mutableMapOf()
 
diff --git a/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValue.kt b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValue.kt
new file mode 100644
index 00000000..ab20d507
--- /dev/null
+++ b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValue.kt
@@ -0,0 +1,89 @@
+package com.openai.models.embeddings
+
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
+import java.util.Base64
+import java.util.Objects
+import java.util.Optional
+import kotlin.collections.MutableList
+
+/**
+ * An embedding vector held as a Base64 string, a list of floats, or both.
+ *
+ * [base64Embedding] is treated as the Base64 encoding of little-endian float32 values;
+ * [floatEmbedding] is decoded from it on first read when no float list has been set.
+ *
+ * Instances are mutable, so avoid changing one while it is used as a key in a hash-based
+ * collection.
+ */
+@JsonDeserialize(using = EmbeddingValueDeserializer::class)
+class EmbeddingValue(
+    var base64Embedding: Optional<String> = Optional.empty(),
+    floatEmbedding: Optional<MutableList<Double>> = Optional.empty(),
+) {
+
+    /**
+     * The embedding vector as a list of floats.
+     *
+     * Starts out as the constructor argument. When it is empty and [base64Embedding] is present,
+     * the first read decodes the Base64 data and caches the result.
+     *
+     * @throws IllegalArgumentException on read, if [base64Embedding] cannot be decoded.
+     */
+    var floatEmbedding: Optional<MutableList<Double>> = floatEmbedding
+        get() {
+            if (!field.isPresent && base64Embedding.isPresent) {
+                field = convertBase64ToFloat(base64Embedding)
+            }
+            return field
+        }
+
+    /**
+     * Decodes a Base64 string of little-endian float32 values into a list of doubles.
+     *
+     * @param base64Embedding the Base64 text to decode, if any.
+     * @return the decoded values, or an empty [Optional] when [base64Embedding] is empty.
+     * @throws IllegalArgumentException if the text is not valid Base64 or its decoded length is
+     *   not a multiple of four bytes.
+     */
+    private fun convertBase64ToFloat(
+        base64Embedding: Optional<String>
+    ): Optional<MutableList<Double>> =
+        base64Embedding.map { base64String ->
+            val decoded = Base64.getDecoder().decode(base64String)
+            require(decoded.size % Float.SIZE_BYTES == 0) {
+                "Base64 embedding must decode to whole float32 values, got ${decoded.size} bytes"
+            }
+            val floats = ByteBuffer.wrap(decoded).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
+            // Widening a Float straight to Double exposes float32 rounding noise (0.1f becomes
+            // 0.10000000149011612). Parsing the float's own decimal string avoids that ("0.1"
+            // parses to 0.1) and also accepts "NaN" and "Infinity" unchanged.
+            MutableList(floats.remaining()) { floats.get().toString().toDouble() }
+        }
+
+    /** Two values are equal when their Base64 text and their float lists are both equal. */
+    override fun equals(other: Any?): Boolean {
+        if (this === other) {
+            return true
+        }
+        return other is EmbeddingValue &&
+            base64Embedding == other.base64Embedding &&
+            floatEmbedding == other.floatEmbedding
+    }
+
+    override fun hashCode(): Int = Objects.hash(base64Embedding, floatEmbedding)
+
+    /**
+     * Renders the float list, preceded by the Base64 text when that is present. A value with no
+     * embedding data renders as an empty list.
+     */
+    override fun toString(): String {
+        val floats = floatEmbedding.map { it.joinToString(", ") }.orElse("")
+        return if (base64Embedding.isPresent) {
+            "base64: $base64Embedding, float: [$floats]"
+        } else {
+            "float: [$floats]"
+        }
+    }
+}
diff --git a/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValueDeserializer.kt b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValueDeserializer.kt
new file mode 100644
index 00000000..f20c4309
--- /dev/null
+++ b/openai-java-core/src/main/kotlin/com/openai/models/embeddings/EmbeddingValueDeserializer.kt
@@ -0,0 +1,32 @@
+package com.openai.models.embeddings
+
+import com.fasterxml.jackson.core.JsonParser
+import com.fasterxml.jackson.databind.DeserializationContext
+import com.fasterxml.jackson.databind.JsonDeserializer
+import com.fasterxml.jackson.databind.JsonNode
+import com.fasterxml.jackson.databind.node.ArrayNode
+import java.io.IOException
+import java.util.Optional
+
+/** JsonDeserializer for EmbeddingValue */
+class EmbeddingValueDeserializer : JsonDeserializer<EmbeddingValue>() {
+    @Throws(IOException::class)
+
+    /*
+     * Deserialize the JSON representation of an EmbeddingValue.
+     * The JSON can either be an array of floats or a base64 string.
+     * Any other JSON shape is reported to the context as an input mismatch.
+     */
+    override fun deserialize(jp: JsonParser, ctxt: DeserializationContext): EmbeddingValue {
+        val node = jp.codec.readTree<JsonNode>(jp)
+        return when {
+            node.isTextual -> EmbeddingValue(base64Embedding = Optional.of(node.asText()))
+            node.isArray -> {
+                val floats = (node as ArrayNode).mapTo(mutableListOf<Double>()) { it.asDouble() }
+                EmbeddingValue().apply { floatEmbedding = Optional.of(floats) }
+            }
+            // Neither encoding: fail loudly rather than return a value holding no embedding.
+            else -> ctxt.reportInputMismatch(EmbeddingValue::class.java, "Expected array or string")
+        }
+    }
+}
diff --git a/openai-java-core/src/test/kotlin/com/openai/models/embeddings/CreateEmbeddingResponseTest.kt b/openai-java-core/src/test/kotlin/com/openai/models/embeddings/CreateEmbeddingResponseTest.kt
index 25eef802..926a18d5 100644
--- a/openai-java-core/src/test/kotlin/com/openai/models/embeddings/CreateEmbeddingResponseTest.kt
+++ b/openai-java-core/src/test/kotlin/com/openai/models/embeddings/CreateEmbeddingResponseTest.kt
@@ -2,6 +2,7 @@
 
 package com.openai.models.embeddings
 
+import java.util.Optional
 import org.assertj.core.api.Assertions.assertThat
 import org.junit.jupiter.api.Test
 
@@ -11,7 +12,17 @@ class CreateEmbeddingResponseTest {
     fun createCreateEmbeddingResponse() {
         val createEmbeddingResponse =
             CreateEmbeddingResponse.builder()
-                .addData(Embedding.builder().addEmbedding(0.0).index(0L).build())
+                .addData(
+                    Embedding.builder()
+                        .addEmbedding(
+                            EmbeddingValue(
+                                floatEmbedding = Optional.of(mutableListOf(0.0)),
+                                base64Embedding = Optional.empty(),
+                            )
+                        )
+                        .index(0L)
+                        .build()
+                )
                 .model("model")
                 .usage(
                     CreateEmbeddingResponse.Usage.builder().promptTokens(0L).totalTokens(0L).build()
@@ -19,7 +30,17 @@ class CreateEmbeddingResponseTest {
                 .build()
         assertThat(createEmbeddingResponse).isNotNull
         assertThat(createEmbeddingResponse.data())
-            .containsExactly(Embedding.builder().addEmbedding(0.0).index(0L).build())
+            .containsExactly(
+                Embedding.builder()
+                    .addEmbedding(
+                        EmbeddingValue(
+                            floatEmbedding = Optional.of(mutableListOf(0.0)),
+                            base64Embedding = Optional.empty(),
+                        )
+                    )
+                    .index(0L)
+                    .build()
+            )
         assertThat(createEmbeddingResponse.model()).isEqualTo("model")
         assertThat(createEmbeddingResponse.usage())
             .isEqualTo(
diff --git a/openai-java-core/src/test/kotlin/com/openai/models/embeddings/EmbeddingTest.kt b/openai-java-core/src/test/kotlin/com/openai/models/embeddings/EmbeddingTest.kt
index 41286a5a..bf8207f9 100644
--- a/openai-java-core/src/test/kotlin/com/openai/models/embeddings/EmbeddingTest.kt
+++ b/openai-java-core/src/test/kotlin/com/openai/models/embeddings/EmbeddingTest.kt
@@ -1,7 +1,8 @@
 // File generated from our OpenAPI spec by Stainless.
 
 package com.openai.models.embeddings
 
+import java.util.Optional
 import org.assertj.core.api.Assertions.assertThat
 import org.junit.jupiter.api.Test
 
@@ -9,9 +10,20 @@ class EmbeddingTest {
 
     @Test
     fun createEmbedding() {
-        val embedding = Embedding.builder().addEmbedding(0.0).index(0L).build()
+        val embedding =
+            Embedding.builder()
+                .addEmbedding(
+                    EmbeddingValue(
+                        floatEmbedding = Optional.of(mutableListOf(0.0)),
+                        base64Embedding = Optional.empty(),
+                    )
+                )
+                // index is required by build() and is asserted on below.
+                .index(0L)
+                .build()
         assertThat(embedding).isNotNull
-        assertThat(embedding.embedding()).containsExactly(0.0)
+        // Compare by value: containsSame would demand the very same list instance.
+        assertThat(embedding.embedding().floatEmbedding).contains(mutableListOf(0.0))
         assertThat(embedding.index()).isEqualTo(0L)
     }
 }