From 071ad1b12d311e4b51377906efba984318fa40dd Mon Sep 17 00:00:00 2001 From: Darin Kotter Date: Thu, 6 Jun 2024 16:59:23 -0600 Subject: [PATCH 1/4] Add methods to generate embeddings for an array of text, allowing you to generate embeddings using fewer requests --- .../Classifai/Providers/Azure/Embeddings.php | 74 +++++++++++++++++++ .../Classifai/Providers/OpenAI/Embeddings.php | 72 ++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/includes/Classifai/Providers/Azure/Embeddings.php b/includes/Classifai/Providers/Azure/Embeddings.php index 5c9b43c66..d7eb37774 100644 --- a/includes/Classifai/Providers/Azure/Embeddings.php +++ b/includes/Classifai/Providers/Azure/Embeddings.php @@ -863,6 +863,80 @@ public function generate_embedding( string $text = '', Feature $feature = null ) return $return; } + /** + * Generate embeddings for an array of text. + * + * @param array $strings Array of text to generate embeddings for. + * @param Feature|null $feature Feature instance. + * @return array|boolean|WP_Error + */ + public function generate_embeddings( array $strings = [], $feature = null ) { + if ( ! $feature ) { + $feature = new Classification(); + } + + $settings = $feature->get_settings(); + + // Ensure the feature is enabled. + if ( ! $feature->is_feature_enabled() ) { + return new WP_Error( 'not_enabled', esc_html__( 'Classification is disabled or OpenAI authentication failed. Please check your settings.', 'classifai' ) ); + } + + /** + * Filter the request body before sending to OpenAI. + * + * @since 3.1.0 + * @hook classifai_azure_openai_embeddings_request_body + * + * @param {array} $body Request body that will be sent to OpenAI. + * @param {array} $strings Array of text we are getting embeddings for. + * + * @return {array} Request body. + */ + $body = apply_filters( + 'classifai_azure_openai_embeddings_request_body', + [ + 'input' => $strings, + 'dimensions' => $this->get_dimensions(), + ], + $strings + ); + + // Make our API request. 
+ $response = wp_remote_post( + $this->prep_api_url( $feature ), + [ + 'headers' => [ + 'api-key' => $settings[ static::ID ]['api_key'], + 'Content-Type' => 'application/json', + ], + 'body' => wp_json_encode( $body ), + ] + ); + $response = $this->get_result( $response ); + + if ( is_wp_error( $response ) ) { + return $response; + } + + if ( empty( $response['data'] ) ) { + return new WP_Error( 'no_data', esc_html__( 'No data returned from OpenAI.', 'classifai' ) ); + } + + $return = []; + + // Parse out the embeddings response. + foreach ( $response['data'] as $data ) { + if ( ! isset( $data['embedding'] ) || ! is_array( $data['embedding'] ) ) { + continue; + } + + $return[] = $data['embedding']; + } + + return $return; + } + /** * Chunk content into smaller pieces with an overlap. * diff --git a/includes/Classifai/Providers/OpenAI/Embeddings.php b/includes/Classifai/Providers/OpenAI/Embeddings.php index 76643972e..9f324315c 100644 --- a/includes/Classifai/Providers/OpenAI/Embeddings.php +++ b/includes/Classifai/Providers/OpenAI/Embeddings.php @@ -975,6 +975,78 @@ public function generate_embedding( string $text = '', $feature = null ) { return $return; } + /** + * Generate embeddings for an array of text. + * + * @param array $strings Array of text to generate embeddings for. + * @param Feature|null $feature Feature instance. + * @return array|boolean|WP_Error + */ + public function generate_embeddings( array $strings = [], $feature = null ) { + if ( ! $feature ) { + $feature = new Classification(); + } + + $settings = $feature->get_settings(); + + // Ensure the feature is enabled. + if ( ! $feature->is_feature_enabled() ) { + return new WP_Error( 'not_enabled', esc_html__( 'Classification is disabled or OpenAI authentication failed. Please check your settings.', 'classifai' ) ); + } + + $request = new APIRequest( $settings[ static::ID ]['api_key'] ?? '', $feature->get_option_name() ); + + /** + * Filter the request body before sending to OpenAI. 
+ * + * @since 2.2.0 + * @hook classifai_openai_embeddings_request_body + * + * @param {array} $body Request body that will be sent to OpenAI. + * @param {array} $strings Array of text we are getting embeddings for. + * + * @return {array} Request body. + */ + $body = apply_filters( + 'classifai_openai_embeddings_request_body', + [ + 'model' => $this->get_model(), + 'input' => $strings, + 'dimensions' => $this->get_dimensions(), + ], + $strings + ); + + // Make our API request. + $response = $request->post( + $this->get_api_url(), + [ + 'body' => wp_json_encode( $body ), + ] + ); + + if ( is_wp_error( $response ) ) { + return $response; + } + + if ( empty( $response['data'] ) ) { + return new WP_Error( 'no_data', esc_html__( 'No data returned from OpenAI.', 'classifai' ) ); + } + + $return = []; + + // Parse out the embeddings response. + foreach ( $response['data'] as $data ) { + if ( ! isset( $data['embedding'] ) || ! is_array( $data['embedding'] ) ) { + continue; + } + + $return[] = $data['embedding']; + } + + return $return; + } + /** * Chunk content into smaller pieces with an overlap. * From 6d3eb4350e837d5629182ade8ad59dc35857d78d Mon Sep 17 00:00:00 2001 From: Darin Kotter Date: Fri, 7 Jun 2024 13:15:58 -0600 Subject: [PATCH 2/4] Fix the chunking of content. 
Set a higher timeout for Azure OpenAI requests --- includes/Classifai/Providers/Azure/Embeddings.php | 4 +++- includes/Classifai/Providers/OpenAI/Embeddings.php | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/includes/Classifai/Providers/Azure/Embeddings.php b/includes/Classifai/Providers/Azure/Embeddings.php index d7eb37774..1b85bc1dc 100644 --- a/includes/Classifai/Providers/Azure/Embeddings.php +++ b/includes/Classifai/Providers/Azure/Embeddings.php @@ -833,6 +833,7 @@ public function generate_embedding( string $text = '', Feature $feature = null ) 'Content-Type' => 'application/json', ], 'body' => wp_json_encode( $body ), + 'timeout' => 60, // phpcs:ignore WordPressVIPMinimum.Performance.RemoteRequestTimeout.timeout_timeout ] ); $response = $this->get_result( $response ); @@ -911,6 +912,7 @@ public function generate_embeddings( array $strings = [], $feature = null ) { 'Content-Type' => 'application/json', ], 'body' => wp_json_encode( $body ), + 'timeout' => 60, // phpcs:ignore WordPressVIPMinimum.Performance.RemoteRequestTimeout.timeout_timeout ] ); $response = $this->get_result( $response ); @@ -963,7 +965,7 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove array_slice( $words, max( $i - $overlap_size, 0 ), - $i + $chunk_size + $chunk_size + $overlap_size ) ); diff --git a/includes/Classifai/Providers/OpenAI/Embeddings.php b/includes/Classifai/Providers/OpenAI/Embeddings.php index 9f324315c..793eac59e 100644 --- a/includes/Classifai/Providers/OpenAI/Embeddings.php +++ b/includes/Classifai/Providers/OpenAI/Embeddings.php @@ -1073,7 +1073,7 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove array_slice( $words, max( $i - $overlap_size, 0 ), - $i + $chunk_size + $chunk_size + $overlap_size ) ); From cf8ae9b88c7425ef17abb39bfe31206af81b5ef5 Mon Sep 17 00:00:00 2001 From: Darin Kotter Date: Fri, 7 Jun 2024 13:29:21 -0600 Subject: [PATCH 3/4] Send our chunked content in a single 
request if we can --- .../Classifai/Providers/Azure/Embeddings.php | 27 ++++++++++++++++--- .../Classifai/Providers/OpenAI/Embeddings.php | 26 +++++++++++++++--- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/includes/Classifai/Providers/Azure/Embeddings.php b/includes/Classifai/Providers/Azure/Embeddings.php index 1b85bc1dc..35587e601 100644 --- a/includes/Classifai/Providers/Azure/Embeddings.php +++ b/includes/Classifai/Providers/Azure/Embeddings.php @@ -6,6 +6,7 @@ namespace Classifai\Providers\Azure; use Classifai\Providers\OpenAI\EmbeddingCalculations; +use Classifai\Providers\OpenAI\Tokenizer; use Classifai\Normalizer; use Classifai\Features\Classification; use Classifai\Features\Feature; @@ -400,11 +401,29 @@ public function generate_embeddings_for_post( int $post_id, bool $force = false // Get the embeddings for each chunk. if ( ! empty( $content_chunks ) ) { - foreach ( $content_chunks as $chunk ) { - $embedding = $this->generate_embedding( $chunk ); + $tokenizer = new Tokenizer( $this->get_max_tokens() ); + $total_tokens = $tokenizer->tokens_in_content( $content ); - if ( $embedding && ! is_wp_error( $embedding ) ) { - $embeddings[] = array_map( 'floatval', $embedding ); + // If we have a lot of tokens, we need to get embeddings for each chunk individually. + if ( $this->max_tokens < $total_tokens ) { + foreach ( $content_chunks as $chunk ) { + $embedding = $this->generate_embedding( $chunk ); + + if ( $embedding && ! is_wp_error( $embedding ) ) { + $embeddings[] = array_map( 'floatval', $embedding ); + } + } + } else { + // Otherwise let's get all embeddings in a single request. + $all_embeddings = $this->generate_embeddings( $content_chunks ); + + if ( $all_embeddings && ! 
is_wp_error( $all_embeddings ) ) { + $embeddings = array_map( + function ( $embedding ) { + return floatval( $embedding ); + }, + $all_embeddings + ); } } } diff --git a/includes/Classifai/Providers/OpenAI/Embeddings.php b/includes/Classifai/Providers/OpenAI/Embeddings.php index 793eac59e..3ba9e9482 100644 --- a/includes/Classifai/Providers/OpenAI/Embeddings.php +++ b/includes/Classifai/Providers/OpenAI/Embeddings.php @@ -514,11 +514,29 @@ public function generate_embeddings_for_post( int $post_id, bool $force = false // Get the embeddings for each chunk. if ( ! empty( $content_chunks ) ) { - foreach ( $content_chunks as $chunk ) { - $embedding = $this->generate_embedding( $chunk ); + $tokenizer = new Tokenizer( $this->get_max_tokens() ); + $total_tokens = $tokenizer->tokens_in_content( $content ); - if ( $embedding && ! is_wp_error( $embedding ) ) { - $embeddings[] = array_map( 'floatval', $embedding ); + // If we have a lot of tokens, we need to get embeddings for each chunk individually. + if ( $this->max_tokens < $total_tokens ) { + foreach ( $content_chunks as $chunk ) { + $embedding = $this->generate_embedding( $chunk ); + + if ( $embedding && ! is_wp_error( $embedding ) ) { + $embeddings[] = array_map( 'floatval', $embedding ); + } + } + } else { + // Otherwise let's get all embeddings in a single request. + $all_embeddings = $this->generate_embeddings( $content_chunks ); + + if ( $all_embeddings && ! 
is_wp_error( $all_embeddings ) ) { + $embeddings = array_map( + function ( $embedding ) { + return floatval( $embedding ); + }, + $all_embeddings + ); } } } From b260d6f288b37382dc3c3f0f10bd2c2eabf43b1b Mon Sep 17 00:00:00 2001 From: Darin Kotter Date: Fri, 7 Jun 2024 13:40:40 -0600 Subject: [PATCH 4/4] Fix sanitizing the embedding values --- includes/Classifai/Providers/Azure/Embeddings.php | 2 +- includes/Classifai/Providers/OpenAI/Embeddings.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/includes/Classifai/Providers/Azure/Embeddings.php b/includes/Classifai/Providers/Azure/Embeddings.php index 35587e601..affbc24ac 100644 --- a/includes/Classifai/Providers/Azure/Embeddings.php +++ b/includes/Classifai/Providers/Azure/Embeddings.php @@ -420,7 +420,7 @@ public function generate_embeddings_for_post( int $post_id, bool $force = false if ( $all_embeddings && ! is_wp_error( $all_embeddings ) ) { $embeddings = array_map( function ( $embedding ) { - return floatval( $embedding ); + return array_map( 'floatval', $embedding ); }, $all_embeddings ); diff --git a/includes/Classifai/Providers/OpenAI/Embeddings.php b/includes/Classifai/Providers/OpenAI/Embeddings.php index 3ba9e9482..581eb53f8 100644 --- a/includes/Classifai/Providers/OpenAI/Embeddings.php +++ b/includes/Classifai/Providers/OpenAI/Embeddings.php @@ -533,7 +533,7 @@ public function generate_embeddings_for_post( int $post_id, bool $force = false if ( $all_embeddings && ! is_wp_error( $all_embeddings ) ) { $embeddings = array_map( function ( $embedding ) { - return floatval( $embedding ); + return array_map( 'floatval', $embedding ); }, $all_embeddings );