Merge pull request #514 from 10up/feature/498

Add `WP-CLI` command to bulk process audio transcriptions
10up · Jul 12, 2023 · 3d7cd23 · 3d7cd23
2 parents 31555fd + dfe443b
commit 3d7cd23
Show file tree

Hide file tree

Showing 3 changed files with 229 additions and 2 deletions.
diff --git a/hookdocs/wp-cli.md b/hookdocs/wp-cli.md
@@ -109,6 +109,40 @@ The following WP-CLI commands are supported by ClassifAI:
     * `true` to run in dry-run mode
     * `false` to run in normal mode
 
+* `wp classifai transcribe_audio [<attachment_ids>] [--per_page=<per_page>] [--force=<bool>] [--dry-run=<bool>]`
+
+  Batch generation of audio transcriptions using OpenAI's Whisper API.
+
+  * `[<attachment_ids>]`: A comma-delimited list of attachment IDs to generate transcriptions for. Optional; if not set, the command will instead run a query to get all audio attachments to process.
+
+    default: `null`
+
+  * `[--per_page=<int>]`: How many items should be processed at a time. Will still process all items but will do it in batches matching this number. Defaults to 100.
+
+    default: `100`
+
+    options:
+
+    * N, max number of items to process at a time
+
+  * `[--force=<bool>]`: Whether to process audio files that already have a transcription set. Defaults to `false`.
+
+    default: `false`
+
+    options:
+
+    * `true` to process all items
+    * `false` to only process items that don't have transcriptions set
+
+  * `[--dry-run=<bool>]`: Whether to run as a dry-run. Defaults to `true`, so will run in dry-run mode unless this is set to `false`.
+
+    default: `true`
+
+    options:
+
+    * `true` to run in dry-run mode
+    * `false` to run in normal mode
+
 * `wp classifai generate_excerpt <post_ids> [--post_type=<post_type>] [--post_status=<post_status>] [--per_page=<per_page>] [--force=<bool>] [--dry-run=<bool>]`
 
   Batch generation of excerpts using OpenAI's ChatGPT API.

diff --git a/includes/Classifai/Command/ClassifaiCommand.php b/includes/Classifai/Command/ClassifaiCommand.php
@@ -10,6 +10,8 @@
 use Classifai\Providers\Azure\ComputerVision;
 use Classifai\Providers\Azure\SmartCropping;
 use Classifai\Providers\Azure\TextToSpeech;
+use Classifai\Providers\OpenAI\Whisper;
+use Classifai\Providers\OpenAI\Whisper\Transcribe;
 use Classifai\Providers\OpenAI\ChatGPT;
 use Classifai\Providers\OpenAI\Embeddings;
 
@@ -367,6 +369,160 @@ public function text_to_speech( $args = [], $opts = [] ) {
 		\WP_CLI::log( sprintf( '%d items had errors', $errors ) );
 	}
 
+	/**
+	 * Batch trigger generation of audio transcriptions depending on passed-in settings.
+	 *
+	 * ## Options
+	 *
+	 * [<attachment_ids>]
+	 * : Comma-delimited list of attachment IDs to generate transcriptions for
+	 *
+	 * [--per_page=<int>]
+	 * : How many items should be processed at a time. Default 100
+	 *
+	 * [--force=<bool>]
+	 * : Whether to process audio files that already have a transcription set. Default false
+	 *
+	 * [--dry-run=<bool>]
+	 * : Whether to run as a dry-run. Default true
+	 *
+	 * @param array $args Arguments.
+	 * @param array $opts Options.
+	 */
+	public function transcribe_audio( $args = [], $opts = [] ) {
+		$defaults = [
+			'per_page' => 100,
+			'force'    => false,
+		];
+
+		$opts = wp_parse_args( $opts, $defaults );
+
+		// Store the cast value (not the raw CLI string) so the query always receives a positive integer.
+		$per_page         = (int) $opts['per_page'];
+		$opts['per_page'] = $per_page > 0 ? $per_page : 100;
+
+		// Option values arrive as strings; (bool) 'false' is true, so map the literal explicitly.
+		$force = 'false' === $opts['force'] ? false : (bool) $opts['force'];
+
+		$count  = 0;
+		$errors = 0;
+
+		$whisper  = new Whisper( false );
+		$settings = $whisper->get_settings();
+
+		// Determine if this is a dry run or not. Dry-run is the default unless explicitly disabled.
+		if ( isset( $opts['dry-run'] ) ) {
+			$dry_run = 'false' === $opts['dry-run'] ? false : (bool) $opts['dry-run'];
+		} else {
+			$dry_run = true;
+		}
+
+		if ( $dry_run ) {
+			\WP_CLI::line( '--- Running command in dry-run mode ---' );
+		}
+
+		/*
+		 * Shared per-item handler used by both the explicit-ID and the query code paths.
+		 * Returns true when the item was (or, in dry-run mode, would have been) transcribed,
+		 * and false when it was skipped or processing returned an error.
+		 */
+		$process_item = function ( int $attachment_id ) use ( $settings, $force, $dry_run ) {
+			$attachment = get_post( $attachment_id );
+			$transcribe = new Transcribe( $attachment_id, $settings );
+
+			if ( ! $this->should_transcribe_attachment( $attachment, $attachment_id, $transcribe, $force ) ) {
+				return false;
+			}
+
+			if ( $dry_run ) {
+				return true;
+			}
+
+			$result = $transcribe->process();
+
+			if ( is_wp_error( $result ) ) {
+				\WP_CLI::error( sprintf( 'Error while processing item ID %s: %s', $attachment_id, $result->get_error_message() ), false );
+				return false;
+			}
+
+			return true;
+		};
+
+		// Process the passed in attachment IDs.
+		if ( ! empty( $args[0] ) ) {
+			$attachment_ids = array_map( 'absint', explode( ',', $args[0] ) );
+
+			\WP_CLI::log( sprintf( 'Starting processing of %s items', count( $attachment_ids ) ) );
+
+			$progress_bar = \WP_CLI\Utils\make_progress_bar( 'Processing ...', count( $attachment_ids ) );
+
+			foreach ( $attachment_ids as $attachment_id ) {
+				if ( $process_item( $attachment_id ) ) {
+					$count ++;
+				} else {
+					$errors ++;
+				}
+
+				// Tick for every item, including skipped/failed ones, so the bar reaches 100%.
+				$progress_bar->tick();
+			}
+
+			$progress_bar->finish();
+		} else {
+			\WP_CLI::log( sprintf( 'Starting processing of attachment items in batches of %d', $opts['per_page'] ) );
+
+			$paged      = 1;
+			$mime_types = [];
+
+			// Throwaway instance, only used to read the list of supported file formats.
+			$transcribe = new Transcribe( 1, [] );
+
+			// Get all the mime types for the file formats we support.
+			foreach ( wp_get_mime_types() as $extensions => $mime ) {
+				foreach ( explode( '|', $extensions ) as $ext ) {
+					if ( in_array( $ext, $transcribe->file_formats, true ) ) {
+						$mime_types[] = $mime;
+					}
+				}
+			}
+
+			$mime_types = array_unique( $mime_types );
+
+			do {
+				$attachments = get_posts(
+					array(
+						'post_type'        => 'attachment',
+						'posts_per_page'   => $opts['per_page'],
+						'post_mime_type'   => $mime_types,
+						'paged'            => $paged,
+						// Must be a real boolean: the string 'false' is truthy and would suppress filters.
+						'suppress_filters' => false,
+						'fields'           => 'ids',
+					)
+				);
+				$total       = count( $attachments );
+
+				foreach ( $attachments as $attachment_id ) {
+					if ( $process_item( (int) $attachment_id ) ) {
+						$count ++;
+					} else {
+						$errors ++;
+					}
+				}
+
+				$this->inmemory_cleanup();
+
+				if ( $total ) {
+					\WP_CLI::log( sprintf( 'Batch %d is done, proceeding to next batch', $paged ) );
+				}
+
+				$paged ++;
+			} while ( $total );
+		}
+
+		\WP_CLI::log( '-------- Finished! --------' );
+
+		if ( ! $dry_run ) {
+			\WP_CLI::log( sprintf( '%d items had transcriptions added', $count ) );
+		} else {
+			\WP_CLI::log( sprintf( '%d items would have had transcriptions added', $count ) );
+		}
+
+		if ( $errors > 0 ) {
+			// Second argument false: report the error count without halting with a fatal exit.
+			\WP_CLI::error( sprintf( '%d items had errors', $errors ), false );
+		}
+	}
+
 	/**
 	 * Batch trigger generation of excerpts depending on passed-in settings.
 	 *
@@ -551,6 +707,43 @@ public function generate_excerpt( $args = [], $opts = [] ) {
 		\WP_CLI::log( sprintf( '%d items had errors', $errors ) );
 	}
 
+	/**
+	 * Check whether a given attachment is eligible for transcription.
+	 *
+	 * The checks run in order (post exists, post type is `attachment`, the
+	 * Transcribe instance accepts the item, and the post content is empty unless
+	 * forced). The first failing check is reported through a WP-CLI error message.
+	 *
+	 * @param \WP_Post|null $attachment    Attachment we are processing, or null if it could not be loaded.
+	 * @param int           $attachment_id Attachment ID.
+	 * @param Transcribe    $transcribe    Transcribe instance.
+	 * @param boolean       $force         Whether to process even when the attachment already has content.
+	 * @return boolean True when the attachment should be processed, false otherwise.
+	 */
+	private function should_transcribe_attachment( $attachment, int $attachment_id, Transcribe $transcribe, bool $force = false ) {
+		$failure = null;
+
+		if ( ! $attachment ) {
+			// No post was found for this ID.
+			$failure = sprintf( 'Item ID %d does not exist', $attachment_id );
+		} elseif ( 'attachment' !== $attachment->post_type ) {
+			// Only attachment posts can be transcribed.
+			$failure = sprintf( 'The "%s" post type is not supported for audio transcription processing', $attachment->post_type );
+		} elseif ( ! $transcribe->should_process( $attachment_id ) ) {
+			// The Transcribe instance rejected the item.
+			$failure = sprintf( 'Item ID %d does not meet processing requirements. Ensure the file type is one of %s and file size is under %d bytes.', $attachment_id, implode( ', ', $transcribe->file_formats ), $transcribe->max_file_size );
+		} elseif ( ! $force && '' !== trim( $attachment->post_content ) ) {
+			// Existing post content is treated as an existing transcription.
+			$failure = sprintf( 'Item ID %d already has a transcription and the force option hasn\'t been set. Skipping...', $attachment_id );
+		}
+
+		if ( null === $failure ) {
+			return true;
+		}
+
+		\WP_CLI::error( $failure, false );
+
+		return false;
+	}
+
 	/**
 	 * Batch classifies attachments(s) using the current ClassifAI configuration.
 	 *

diff --git a/includes/Classifai/Providers/OpenAI/Whisper/Whisper.php b/includes/Classifai/Providers/OpenAI/Whisper/Whisper.php
@@ -26,7 +26,7 @@ trait Whisper {
 	 *
 	 * @var array
 	 */
-	protected $file_formats = [
+	public $file_formats = [
 		'mp3',
 		'mp4',
 		'mpeg',
@@ -41,7 +41,7 @@ trait Whisper {
 	 *
 	 * @var int
 	 */
-	protected $max_file_size = 25 * MB_IN_BYTES;
+	public $max_file_size = 25 * MB_IN_BYTES;
 
 	/**
 	 * Builds the API url.