Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add WP-CLI command to bulk process audio transcriptions #514

Merged
merged 7 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions hookdocs/wp-cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,40 @@ The following WP-CLI commands are supported by ClassifAI:
* `true` to run in dry-run mode
* `false` to run in normal mode

* `wp classifai transcribe_audio [<attachment_ids>] [--per_page=<per_page>] [--force=<bool>] [--dry-run=<bool>]`

Batch generation of audio transcriptions using OpenAI's Whisper API.

* `[<attachment_ids>]`: A comma-delimited list of attachment IDs to generate transcriptions for. If not set, will instead run a query to get all audio attachments to process.

default: `null`

* `[--per_page=<int>]`: How many items should be processed at a time. Will still process all items but will do it in batches matching this number. Defaults to 100.

default: `100`

options:

* N, max number of items to process at a time

* `[--force=<bool>]`: Whether to process audio files that already have a transcription set. Defaults to `false`.

default: `false`

options:

* `true` to process all items
* `false` to only process items that don't have transcriptions set

* `[--dry-run=<bool>]`: Whether to run as a dry-run. Defaults to `true`, so will run in dry-run mode unless this is set to `false`.

default: `true`

options:

* `true` to run in dry-run mode
* `false` to run in normal mode

* `wp classifai generate_excerpt <post_ids> [--post_type=<post_type>] [--post_status=<post_status>] [--per_page=<per_page>] [--force=<bool>] [--dry-run=<bool>]`

Batch generation of excerpts using OpenAI's ChatGPT API.
Expand Down
193 changes: 193 additions & 0 deletions includes/Classifai/Command/ClassifaiCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
use Classifai\Providers\Azure\ComputerVision;
use Classifai\Providers\Azure\SmartCropping;
use Classifai\Providers\Azure\TextToSpeech;
use Classifai\Providers\OpenAI\Whisper;
use Classifai\Providers\OpenAI\Whisper\Transcribe;
use Classifai\Providers\OpenAI\ChatGPT;
use Classifai\Providers\OpenAI\Embeddings;

Expand Down Expand Up @@ -367,6 +369,160 @@ public function text_to_speech( $args = [], $opts = [] ) {
\WP_CLI::log( sprintf( '%d items had errors', $errors ) );
}

/**
* Batch trigger generation of audio transcriptions depending on passed-in settings.
*
* ## Options
*
* [<attachment_ids>]
* : Comma-delimited list of attachment IDs to generate transcriptions for
*
* [--per_page=<int>]
* : How many items should be processed at a time. Default 100
*
* [--force=<bool>]
* : Whether to process audio files that already have a transcription set. Default false
*
* [--dry-run=<bool>]
* : Whether to run as a dry-run. Default true
*
* @param array $args Arguments.
* @param array $opts Options.
*/
public function transcribe_audio( $args = [], $opts = [] ) {
	$defaults = [
		'per_page' => 100,
		'force'    => false,
	];

	$opts = wp_parse_args( $opts, $defaults );

	// Normalise the batch size to a positive integer, falling back to the default.
	$per_page         = (int) $opts['per_page'];
	$opts['per_page'] = $per_page > 0 ? $per_page : 100;

	// `--force=false` arrives as the string 'false', which a plain (bool)
	// cast would turn into true. Parse it the same way --dry-run is parsed.
	$force = 'false' !== $opts['force'] && (bool) $opts['force'];

	// Dry-run is the default; only an explicit falsy value disables it.
	$dry_run = true;
	if ( isset( $opts['dry-run'] ) ) {
		$dry_run = 'false' !== $opts['dry-run'] && (bool) $opts['dry-run'];
	}

	$count  = 0;
	$errors = 0;

	$whisper  = new Whisper( false );
	$settings = $whisper->get_settings();

	if ( $dry_run ) {
		\WP_CLI::line( '--- Running command in dry-run mode ---' );
	}

	/*
	 * Validate and (unless in dry-run mode) transcribe a single attachment.
	 * Returns true when the item was processed (or would have been in a
	 * dry run) and false when it was skipped or processing failed.
	 */
	$process_item = function ( int $attachment_id ) use ( $settings, $force, $dry_run ) {
		$attachment = get_post( $attachment_id );
		$transcribe = new Transcribe( $attachment_id, $settings );

		if ( ! $this->should_transcribe_attachment( $attachment, $attachment_id, $transcribe, $force ) ) {
			return false;
		}

		if ( $dry_run ) {
			return true;
		}

		$result = $transcribe->process();

		if ( is_wp_error( $result ) ) {
			\WP_CLI::error( sprintf( 'Error while processing item ID %s: %s', $attachment_id, $result->get_error_message() ), false );
			return false;
		}

		return true;
	};

	if ( ! empty( $args[0] ) ) {
		// Process the passed in attachment IDs.
		$attachment_ids = array_map( 'absint', explode( ',', $args[0] ) );
		$item_total     = count( $attachment_ids );

		\WP_CLI::log( sprintf( 'Starting processing of %s items', $item_total ) );

		$progress_bar = \WP_CLI\Utils\make_progress_bar( 'Processing ...', $item_total );

		foreach ( $attachment_ids as $attachment_id ) {
			if ( $process_item( $attachment_id ) ) {
				$count ++;
			} else {
				$errors ++;
			}

			// Tick for every item, including skipped ones, so the bar reaches its total.
			$progress_bar->tick();
		}

		$progress_bar->finish();
	} else {
		\WP_CLI::log( sprintf( 'Starting processing of attachment items in batches of %d', $opts['per_page'] ) );

		$paged      = 1;
		$mime_types = [];

		// Throwaway instance used only to read the list of supported file formats.
		$transcribe = new Transcribe( 1, [] );

		// Get all the mime types for the file formats we support.
		foreach ( wp_get_mime_types() as $extensions => $mime ) {
			foreach ( explode( '|', $extensions ) as $ext ) {
				if ( in_array( $ext, $transcribe->file_formats, true ) ) {
					$mime_types[] = $mime;
				}
			}
		}

		// The list does not change between batches, so de-duplicate it once.
		$mime_types = array_unique( $mime_types );

		do {
			$attachments = get_posts(
				array(
					'post_type'        => 'attachment',
					'posts_per_page'   => $opts['per_page'],
					'post_mime_type'   => $mime_types,
					'paged'            => $paged,
					// Must be a real boolean: the string 'false' is truthy and would suppress filters.
					'suppress_filters' => false,
					'fields'           => 'ids',
				)
			);
			$total       = count( $attachments );

			foreach ( $attachments as $attachment_id ) {
				if ( $process_item( (int) $attachment_id ) ) {
					$count ++;
				} else {
					$errors ++;
				}
			}

			$this->inmemory_cleanup();

			if ( $total ) {
				\WP_CLI::log( sprintf( 'Batch %d is done, proceeding to next batch', $paged ) );
			}

			$paged ++;
		} while ( $total );
	}

	\WP_CLI::log( '-------- Finished! --------' );

	if ( $dry_run ) {
		\WP_CLI::log( sprintf( '%d items would have had transcriptions added', $count ) );
	} else {
		\WP_CLI::log( sprintf( '%d items had transcriptions added', $count ) );
	}

	if ( $errors > 0 ) {
		// Second argument false: report the error without exiting the command.
		\WP_CLI::error( sprintf( '%d items had errors', $errors ), false );
	}
}

/**
* Batch trigger generation of excerpts depending on passed-in settings.
*
Expand Down Expand Up @@ -551,6 +707,43 @@ public function generate_excerpt( $args = [], $opts = [] ) {
\WP_CLI::log( sprintf( '%d items had errors', $errors ) );
}

/**
 * Decide whether a given attachment is eligible for transcription.
 *
 * Runs the checks in order (item exists, is an attachment, passes the
 * Transcribe requirements, has no existing content unless forced) and
 * reports the first failing check through WP-CLI without exiting.
 *
 * @param \WP_Post|null $attachment    Attachment we are processing.
 * @param int           $attachment_id Attachment ID.
 * @param Transcribe    $transcribe    Transcribe instance.
 * @param boolean       $force         Whether to force processing.
 * @return boolean True when the attachment should be processed, false otherwise.
 */
private function should_transcribe_attachment( $attachment, int $attachment_id, Transcribe $transcribe, bool $force = false ) {
	$problem = null;

	if ( ! $attachment ) {
		// No post object was supplied for this ID.
		$problem = sprintf( 'Item ID %d does not exist', $attachment_id );
	} elseif ( 'attachment' !== $attachment->post_type ) {
		// Only the attachment post type is supported.
		$problem = sprintf( 'The "%s" post type is not supported for audio transcription processing', $attachment->post_type );
	} elseif ( ! $transcribe->should_process( $attachment_id ) ) {
		// The Transcribe instance rejected the item.
		$problem = sprintf(
			'Item ID %d does not meet processing requirements. Ensure the file type is one of %s and file size is under %d bytes.',
			$attachment_id,
			implode( ', ', $transcribe->file_formats ),
			$transcribe->max_file_size
		);
	} elseif ( ! $force && '' !== trim( $attachment->post_content ) ) {
		// Existing content is treated as an existing transcription; skip unless forced.
		$problem = sprintf( 'Item ID %d already has a transcription and the force option hasn\'t been set. Skipping...', $attachment_id );
	}

	if ( null === $problem ) {
		return true;
	}

	// Passing false keeps the command running after reporting the problem.
	\WP_CLI::error( $problem, false );

	return false;
}

/**
* Batch classifies attachments(s) using the current ClassifAI configuration.
*
Expand Down
4 changes: 2 additions & 2 deletions includes/Classifai/Providers/OpenAI/Whisper/Whisper.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ trait Whisper {
*
* @var array
*/
protected $file_formats = [
public $file_formats = [
'mp3',
'mp4',
'mpeg',
Expand All @@ -41,7 +41,7 @@ trait Whisper {
*
* @var int
*/
protected $max_file_size = 25 * MB_IN_BYTES;
public $max_file_size = 25 * MB_IN_BYTES;

/**
* Builds the API url.
Expand Down