Skip to content

Commit

Permalink
Merge pull request #514 from 10up/feature/498
Browse files Browse the repository at this point in the history
Add `WP-CLI` command to bulk process audio transcriptions
  • Loading branch information
dkotter authored Jul 12, 2023
2 parents 31555fd + dfe443b commit 3d7cd23
Show file tree
Hide file tree
Showing 3 changed files with 229 additions and 2 deletions.
34 changes: 34 additions & 0 deletions hookdocs/wp-cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,40 @@ The following WP-CLI commands are supported by ClassifAI:
* `true` to run in dry-run mode
* `false` to run in normal mode

* `wp classifai transcribe_audio [<attachment_ids>] [--per_page=<int>] [--force=<bool>] [--dry-run=<bool>]`

Batch generation of audio transcriptions using OpenAI's Whisper API.

* `<attachment_ids>`: A comma-delimited list of attachment IDs to generate transcriptions for. If not set, the command instead queries all audio attachments and processes those.

default: `null`

* `[--per_page=<int>]`: How many items should be processed at a time. Will still process all items but will do it in batches matching this number. Defaults to 100.

default: `100`

options:

* N, max number of items to process at a time

* `[--force=<bool>]`: Whether to process audio files that already have a transcription set. Defaults to `false`.

default: `false`

options:

* `true` to process all items
* `false` to only process items that don't have transcriptions set

* `[--dry-run=<bool>]`: Whether to run as a dry-run. Defaults to `true`, so will run in dry-run mode unless this is set to `false`.

default: `true`

options:

* `true` to run in dry-run mode
* `false` to run in normal mode

* `wp classifai generate_excerpt <post_ids> [--post_type=<post_type>] [--post_status=<post_status>] [--per_page=<per_page>] [--force=<bool>] [--dry-run=<bool>]`

Batch generation of excerpts using OpenAI's ChatGPT API.
Expand Down
193 changes: 193 additions & 0 deletions includes/Classifai/Command/ClassifaiCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
use Classifai\Providers\Azure\ComputerVision;
use Classifai\Providers\Azure\SmartCropping;
use Classifai\Providers\Azure\TextToSpeech;
use Classifai\Providers\OpenAI\Whisper;
use Classifai\Providers\OpenAI\Whisper\Transcribe;
use Classifai\Providers\OpenAI\ChatGPT;
use Classifai\Providers\OpenAI\Embeddings;

Expand Down Expand Up @@ -367,6 +369,160 @@ public function text_to_speech( $args = [], $opts = [] ) {
\WP_CLI::log( sprintf( '%d items had errors', $errors ) );
}

/**
 * Batch trigger generation of audio transcriptions depending on passed-in settings.
 *
 * ## Options
 *
 * [<attachment_ids>]
 * : Comma-delimited list of attachment IDs to generate transcriptions for
 *
 * [--per_page=<int>]
 * : How many items should be processed at a time. Default 100
 *
 * [--force=<bool>]
 * : Whether to process audio files that already have a transcription set. Default false
 *
 * [--dry-run=<bool>]
 * : Whether to run as a dry-run. Default true
 *
 * @param array $args Arguments.
 * @param array $opts Options.
 */
public function transcribe_audio( $args = [], $opts = [] ) {
	$defaults = [
		'per_page' => 100,
		'force'    => false,
	];

	$opts = wp_parse_args( $opts, $defaults );

	// Store the batch size as a real integer; fall back to 100 for zero, negative or non-numeric input.
	$per_page         = (int) $opts['per_page'];
	$opts['per_page'] = $per_page > 0 ? $per_page : 100;

	// Option values arrive from the command line as strings, so `--force=false` has to be
	// parsed as a boolean; a plain (bool) cast would treat the string "false" as true.
	$force = filter_var( $opts['force'], FILTER_VALIDATE_BOOLEAN );

	$count  = 0;
	$errors = 0;

	$whisper  = new Whisper( false );
	$settings = $whisper->get_settings();

	// Determine if this is a dry run or not. Dry-run is the default when the flag is absent.
	if ( isset( $opts['dry-run'] ) ) {
		if ( 'false' === $opts['dry-run'] ) {
			$dry_run = false;
		} else {
			$dry_run = (bool) $opts['dry-run'];
		}
	} else {
		$dry_run = true;
	}

	if ( $dry_run ) {
		\WP_CLI::line( '--- Running command in dry-run mode ---' );
	}

	// Process the passed in attachment IDs.
	if ( ! empty( $args[0] ) ) {
		$attachment_ids = array_map( 'absint', explode( ',', $args[0] ) );

		\WP_CLI::log( sprintf( 'Starting processing of %s items', count( $attachment_ids ) ) );

		$progress_bar = \WP_CLI\Utils\make_progress_bar( 'Processing ...', count( $attachment_ids ) );

		foreach ( $attachment_ids as $attachment_id ) {
			$attachment = get_post( $attachment_id );
			$transcribe = new Transcribe( $attachment_id, $settings );

			if ( ! $this->should_transcribe_attachment( $attachment, $attachment_id, $transcribe, $force ) ) {
				$errors ++;
				// Skipped items still count towards progress, otherwise the bar never completes.
				$progress_bar->tick();
				continue;
			}

			if ( ! $dry_run ) {
				$result = $transcribe->process();

				if ( is_wp_error( $result ) ) {
					\WP_CLI::error( sprintf( 'Error while processing item ID %s: %s', $attachment_id, $result->get_error_message() ), false );
					$errors ++;
					$progress_bar->tick();
					// A failed item must not be reported as having a transcription added.
					continue;
				}
			}

			$progress_bar->tick();
			$count ++;
		}

		$progress_bar->finish();
	} else {
		\WP_CLI::log( sprintf( 'Starting processing of attachment items in batches of %d', $opts['per_page'] ) );

		$paged      = 1;
		$mime_types = [];
		// Throwaway instance used only to read the list of supported file formats.
		$transcribe = new Transcribe( 1, [] );

		// Get all the mime types for the file formats we support.
		foreach ( wp_get_mime_types() as $extensions => $mime ) {
			foreach ( explode( '|', $extensions ) as $ext ) {
				if ( in_array( $ext, $transcribe->file_formats, true ) ) {
					$mime_types[] = $mime;
				}
			}
		}

		// The list does not change between batches, so de-duplicate it once.
		$mime_types = array_unique( $mime_types );

		do {
			$attachments = get_posts(
				[
					'post_type'        => 'attachment',
					'posts_per_page'   => $opts['per_page'],
					'post_mime_type'   => $mime_types,
					'paged'            => $paged,
					// Must be a real boolean: the string 'false' is truthy and would suppress filters.
					'suppress_filters' => false,
					'fields'           => 'ids',
				]
			);
			$total       = count( $attachments );

			foreach ( $attachments as $attachment_id ) {
				$attachment = get_post( $attachment_id );
				$transcribe = new Transcribe( $attachment_id, $settings );

				if ( ! $this->should_transcribe_attachment( $attachment, (int) $attachment_id, $transcribe, $force ) ) {
					$errors ++;
					continue;
				}

				if ( ! $dry_run ) {
					$result = $transcribe->process();

					if ( is_wp_error( $result ) ) {
						\WP_CLI::error( sprintf( 'Error while processing item ID %s: %s', $attachment_id, $result->get_error_message() ), false );
						$errors ++;
						// A failed item must not be reported as having a transcription added.
						continue;
					}
				}

				$count ++;
			}

			$this->inmemory_cleanup();

			if ( $total ) {
				\WP_CLI::log( sprintf( 'Batch %d is done, proceeding to next batch', $paged ) );
			}

			$paged ++;
		} while ( $total );
	}

	\WP_CLI::log( '-------- Finished! --------' );

	if ( ! $dry_run ) {
		\WP_CLI::log( sprintf( '%d items had transcriptions added', $count ) );
	} else {
		\WP_CLI::log( sprintf( '%d items would have had transcriptions added', $count ) );
	}

	if ( $errors > 0 ) {
		// Passing false keeps WP-CLI from exiting, so this is reported as a summary only.
		\WP_CLI::error( sprintf( '%d items had errors', $errors ), false );
	}
}

/**
* Batch trigger generation of excerpts depending on passed-in settings.
*
Expand Down Expand Up @@ -551,6 +707,43 @@ public function generate_excerpt( $args = [], $opts = [] ) {
\WP_CLI::log( sprintf( '%d items had errors', $errors ) );
}

/**
 * Decide whether a given attachment is eligible for transcription.
 *
 * Runs the checks in order (item exists, is an attachment, passes the
 * Transcribe requirements, has no existing transcription unless forced)
 * and reports the first failing check through WP-CLI without exiting.
 *
 * @param \WP_Post|null $attachment    Attachment we are processing.
 * @param int           $attachment_id Attachment ID.
 * @param Transcribe    $transcribe    Transcribe instance.
 * @param boolean       $force         Whether to force processing.
 * @return boolean True when the attachment should be processed, false otherwise.
 */
private function should_transcribe_attachment( $attachment, int $attachment_id, Transcribe $transcribe, bool $force = false ) {
	$rejection = null;

	if ( ! $attachment ) {
		// No post was found for this ID.
		$rejection = sprintf( 'Item ID %d does not exist', $attachment_id );
	} elseif ( 'attachment' !== $attachment->post_type ) {
		// Only attachment posts can be transcribed.
		$rejection = sprintf( 'The "%s" post type is not supported for audio transcription processing', $attachment->post_type );
	} elseif ( ! $transcribe->should_process( $attachment_id ) ) {
		// The Transcribe instance rejected the item; list its supported formats and size limit.
		$rejection = sprintf(
			'Item ID %d does not meet processing requirements. Ensure the file type is one of %s and file size is under %d bytes.',
			$attachment_id,
			implode( ', ', $transcribe->file_formats ),
			$transcribe->max_file_size
		);
	} elseif ( '' !== trim( $attachment->post_content ) && ! $force ) {
		// Existing post content is treated as an existing transcription; keep it unless forced.
		$rejection = sprintf( 'Item ID %d already has a transcription and the force option hasn\'t been set. Skipping...', $attachment_id );
	}

	if ( null === $rejection ) {
		return true;
	}

	// Second argument false: log the error without terminating the command.
	\WP_CLI::error( $rejection, false );

	return false;
}

/**
* Batch classifies attachments(s) using the current ClassifAI configuration.
*
Expand Down
4 changes: 2 additions & 2 deletions includes/Classifai/Providers/OpenAI/Whisper/Whisper.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ trait Whisper {
*
* @var array
*/
protected $file_formats = [
public $file_formats = [
'mp3',
'mp4',
'mpeg',
Expand All @@ -41,7 +41,7 @@ trait Whisper {
*
* @var int
*/
protected $max_file_size = 25 * MB_IN_BYTES;
public $max_file_size = 25 * MB_IN_BYTES;

/**
* Builds the API url.
Expand Down

0 comments on commit 3d7cd23

Please sign in to comment.