Merge pull request #37 from ajcwebdev/bench
Benchmark for different sized models with `whisper.cpp`, `openai-whisper`, and `whisper-diarization`
ajcwebdev authored Oct 29, 2024
2 parents f8ceb73 + 1dd8a18 commit d73573c
Showing 20 changed files with 1,706 additions and 668 deletions.
470 changes: 312 additions & 158 deletions docs/examples.md

Large diffs are not rendered by default.

29 changes: 16 additions & 13 deletions package.json
@@ -26,16 +26,19 @@
"docker-up": "docker compose up --build -d --remove-orphans --no-start",
"ds": "docker compose images && docker compose ls",
"prune": "docker system prune -af --volumes && docker image prune -af && docker container prune -f && docker volume prune -af",
"v": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --video",
"u": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --urls",
"p": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --playlist",
"f": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --file",
"r": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --rss",
"last3": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --last 3 --rss",
"v": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v3-turbo --video",
"u": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v3-turbo --urls",
"p": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v3-turbo --playlist",
"f": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v3-turbo --file",
"r": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v3-turbo --rss",
"last2": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v3-turbo --last 2 --rss",
"last3": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v3-turbo --last 3 --rss",
"serve": "tsx --env-file=.env --no-warnings --watch packages/server/index.ts",
"fetch-local": "tsx --env-file=.env --no-warnings packages/server/tests/fetch-local.ts",
"fetch-all": "tsx --env-file=.env --no-warnings packages/server/tests/fetch-all.ts",
"t": "npm run test-local",
"bench": "tsx --test test/bench.test.ts",
"test-bench": "tsx --test test/bench.test.ts",
"test-local": "tsx --test test/local.test.ts",
"test-docker": "tsx --test test/docker.test.ts",
"test-integrations": "tsx --test test/integrations.test.ts",
@@ -44,8 +47,8 @@
"deno-as": "deno run --allow-sys --allow-read --allow-run --allow-write --allow-env src/autoshow.ts"
},
"dependencies": {
"@anthropic-ai/sdk": "0.29.0",
"@deepgram/sdk": "3.8.1",
"@anthropic-ai/sdk": "0.30.1",
"@deepgram/sdk": "3.9.0",
"@fastify/cors": "10.0.1",
"@google/generative-ai": "0.21.0",
"@mistralai/mistralai": "1.1.0",
@@ -56,17 +59,17 @@
"commander": "12.1.0",
"fast-xml-parser": "4.5.0",
"fastify": "5.0.0",
"file-type": "19.5.0",
"inquirer": "12.0.0",
"file-type": "19.6.0",
"inquirer": "12.0.1",
"node-llama-cpp": "3.1.1",
"ollama": "0.5.9",
"openai": "4.67.3"
"openai": "4.68.4"
},
"devDependencies": {
"@types/inquirer": "9.0.7",
"@types/node": "22.7.5",
"@types/node": "22.8.1",
"tsx": "4.19.1",
"typedoc": "^0.26.10",
"typedoc": "0.26.10",
"typescript": "5.6.3"
}
}
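For reference, the renamed benchmark script and the bumped default model in the scripts above can be exercised as follows. This is a minimal sketch, assuming dependencies are installed and a .env file is present; the audio file path is illustrative and not part of this diff:

# Run the whisper benchmark suite (renamed from "bench" to "test-bench")
npm run test-bench

# Process a local file; the "f" script now defaults to the large-v3-turbo model
npm run f -- "content/audio.mp3"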
45 changes: 32 additions & 13 deletions src/autoshow.ts
@@ -22,35 +22,39 @@ import { argv, exit } from 'node:process'
import { log, opts, final, ACTION_OPTIONS, LLM_OPTIONS, TRANSCRIPT_OPTIONS } from './models.js'
import type { ProcessingOptions, HandlerFunction, LLMServices, TranscriptServices } from './types.js'

// Initialize the command-line interface
// Initialize the command-line interface using Commander.js
const program = new Command()

/**
* Defines the command-line interface options and descriptions.
* Sets up all available commands and their respective flags
*/
program
.name('autoshow')
.version('0.0.1')
.description('Automate processing of audio and video content from various sources.')
.usage('[options]')
.option('--prompt <sections...>', 'Specify prompt sections to include')
// Input source options
.option('-v, --video <url>', 'Process a single YouTube video')
.option('-p, --playlist <playlistUrl>', 'Process all videos in a YouTube playlist')
.option('-u, --urls <filePath>', 'Process YouTube videos from a list of URLs in a file')
.option('-f, --file <filePath>', 'Process a local audio or video file')
.option('-r, --rss <rssURL>', 'Process a podcast RSS feed')
// RSS feed specific options
.option('--item <itemUrls...>', 'Process specific items in the RSS feed by providing their audio URLs')
.option('--order <order>', 'Specify the order for RSS feed processing (newest or oldest)')
.option('--skip <number>', 'Number of items to skip when processing RSS feed', parseInt)
.option('--last <number>', 'Number of most recent items to process (overrides --order and --skip)', parseInt)
.option('--info', 'Generate JSON file with RSS feed information instead of processing items')
// Transcription service options
.option('--whisper [model]', 'Use Whisper.cpp for transcription with optional model specification')
.option('--whisperDocker [model]', 'Use Whisper.cpp in Docker for transcription with optional model specification')
.option('--whisperPython [model]', 'Use openai-whisper for transcription with optional model specification')
.option('--whisperDiarization [model]', 'Use whisper-diarization for transcription with optional model specification')
.option('--deepgram', 'Use Deepgram for transcription')
.option('--assembly', 'Use AssemblyAI for transcription')
.option('--speakerLabels', 'Use speaker labels for AssemblyAI transcription')
// LLM service options
.option('--chatgpt [model]', 'Use ChatGPT for processing with optional model specification')
.option('--claude [model]', 'Use Claude for processing with optional model specification')
.option('--cohere [model]', 'Use Cohere for processing with optional model specification')
@@ -62,6 +66,8 @@ program
.option('--llama [model]', 'Use Node Llama for processing with optional model specification')
.option('--ollama [model]', 'Use Ollama for processing with optional model specification')
.option('--gemini [model]', 'Use Gemini for processing with optional model specification')
// Utility options
.option('--prompt <sections...>', 'Specify prompt sections to include')
.option('--noCleanUp', 'Do not delete intermediary files after processing')
.option('-i, --interactive', 'Run in interactive mode')
.addHelpText(
@@ -80,6 +86,8 @@ Report Issues: https://github.com/ajcwebdev/autoshow/issues

/**
* Helper function to validate that only one option from a list is provided.
* Prevents users from specifying multiple conflicting options simultaneously.
*
* @param optionKeys - The list of option keys to check.
* @param options - The options object.
* @param errorMessage - The prefix of the error message.
@@ -90,7 +98,10 @@ function getSingleOption(
options: ProcessingOptions,
errorMessage: string
): string | undefined {
// Filter out which options from the provided list are actually set
const selectedOptions = optionKeys.filter((opt) => options[opt as keyof ProcessingOptions])

// If more than one option is selected, throw an error
if (selectedOptions.length > 1) {
console.error(`Error: Multiple ${errorMessage} provided (${selectedOptions.join(', ')}). Please specify only one.`)
exit(1)
@@ -100,13 +111,17 @@

/**
* Main action for the program.
* Handles the processing of options and executes the appropriate command handler.
*
* @param options - The command-line options provided by the user.
*/
program.action(async (options: ProcessingOptions) => {
// Log received options for debugging purposes
log(opts(`Options received at beginning of command:\n`))
log(options)
log(``)

// Define mapping of action types to their handler functions
const PROCESS_HANDLERS: Record<string, HandlerFunction> = {
video: processVideo,
playlist: processPlaylist,
@@ -115,61 +130,65 @@ program.action(async (options: ProcessingOptions) => {
rss: processRSS,
}

// Extract interactive mode flag
const { interactive } = options

// Check if no action option was provided
const noActionProvided = ACTION_OPTIONS.every((opt) => !options[opt as keyof ProcessingOptions])

// If in interactive mode or no action provided, prompt user for input
if (interactive || noActionProvided) {
options = await handleInteractivePrompt(options)
}

// Ensure options.item is an array if provided via command line
// Ensure options.item is always an array if provided via command line
if (options.item && !Array.isArray(options.item)) {
options.item = [options.item]
}

// Validate and retrieve single action option
// Validate and get single options for action, LLM, and transcription
const action = getSingleOption(ACTION_OPTIONS, options, 'input option')

// Validate and retrieve single LLM option
const llmKey = getSingleOption(LLM_OPTIONS, options, 'LLM option')
const llmServices = llmKey as LLMServices | undefined

// Validate and retrieve single transcription option
const transcriptKey = getSingleOption(TRANSCRIPT_OPTIONS, options, 'transcription option')
const transcriptServices: TranscriptServices | undefined = transcriptKey as TranscriptServices | undefined

// Set default transcription service if not provided
// Set default transcription service to whisper if none provided
const finalTranscriptServices: TranscriptServices = transcriptServices || 'whisper'

// Set default Whisper model if not provided
// Set default Whisper model to 'large-v3-turbo' if whisper is selected but no model specified
if (finalTranscriptServices === 'whisper' && !options.whisper) {
options.whisper = 'base'
options.whisper = 'large-v3-turbo'
}

// Execute the appropriate handler if an action was specified
if (action) {
try {
// Process the content using the selected handler
await PROCESS_HANDLERS[action](
options,
options[action as keyof ProcessingOptions] as string,
llmServices,
finalTranscriptServices
)
// Log success message
log(final(`\n================================================================================================`))
log(final(` ${action} Processing Completed Successfully.`))
log(final(`================================================================================================\n`))
exit(0)
} catch (error) {
// Log error and exit if processing fails
console.error(`Error processing ${action}:`, (error as Error).message)
exit(1)
}
}
})

// Handle unknown commands
// Set up error handling for unknown commands
program.on('command:*', function () {
console.error(`Error: Invalid command '${program.args.join(' ')}'. Use --help to see available commands.`)
exit(1)
})

// Parse the command-line arguments
// Parse the command-line arguments and execute the program
program.parse(argv)
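For context on the transcription backends wired up above, two hedged invocation sketches follow. The video URL and file path are placeholders, and the model names are only examples of the optional [model] argument described in the flag help:

# openai-whisper backend with an explicit model
tsx --env-file=.env --no-warnings src/autoshow.ts --video "https://www.youtube.com/watch?v=VIDEO_ID" --whisperPython medium

# whisper-diarization backend on a local file
tsx --env-file=.env --no-warnings src/autoshow.ts --file "content/audio.mp3" --whisperDiarization base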
51 changes: 39 additions & 12 deletions src/commands/processFile.ts
@@ -1,5 +1,10 @@
// src/commands/processFile.ts

/**
* @file Process a local audio or video file for transcription and analysis.
* @packageDocumentation
*/

import { generateMarkdown } from '../utils/generateMarkdown.js'
import { downloadAudio } from '../utils/downloadAudio.js'
import { runTranscription } from '../utils/runTranscription.js'
@@ -9,31 +14,53 @@ import { log, opts, wait } from '../models.js'
import type { LLMServices, TranscriptServices, ProcessingOptions } from '../types.js'

/**
* Main function to process a local audio or video file.
* @param {string} filePath - The path to the local file to process.
* @param {LLMServices} [llmServices] - The selected Language Model option.
* @param {TranscriptServices} [transcriptServices] - The transcription service to use.
* @param {ProcessingOptions} options - Additional options for processing.
* @returns {Promise<void>}
* Processes a local audio or video file through a series of operations:
* 1. Generates markdown with file metadata
* 2. Converts the file to the required audio format
* 3. Transcribes the audio content
* 4. Processes the transcript with a language model (if specified)
* 5. Cleans up temporary files (unless disabled)
*
* Unlike processVideo, this function handles local files and doesn't need
* to check for external dependencies like yt-dlp.
*
* @param options - Configuration options for processing
* @param filePath - Path to the local audio or video file to process
* @param llmServices - Optional language model service to use for processing the transcript
* @param transcriptServices - Optional transcription service to use for converting audio to text
* @throws Will terminate the process with exit code 1 if any processing step fails
* @returns Promise that resolves when all processing is complete
*/
export async function processFile(
options: ProcessingOptions,
filePath: string,
llmServices?: LLMServices,
transcriptServices?: TranscriptServices
): Promise<void> {
// Log the processing parameters for debugging purposes
log(opts('Parameters passed to processFile:\n'))
log(wait(` - llmServices: ${llmServices}\n - transcriptServices: ${transcriptServices}\n`))

try {
const { frontMatter, finalPath, filename } = await generateMarkdown(options, filePath) // Generate markdown for the file
await downloadAudio(options, filePath, filename) // Convert the audio or video file to the required format
await runTranscription(options, finalPath, frontMatter, transcriptServices) // Run transcription on the file
await runLLM(options, finalPath, frontMatter, llmServices) // Process the transcript with the selected Language Model
if (!options.noCleanUp) { // Clean up temporary files if the noCleanUp option is not set
// Generate markdown file with file metadata and get file paths
const { frontMatter, finalPath, filename } = await generateMarkdown(options, filePath)

// Convert the input file to the required audio format for processing
await downloadAudio(options, filePath, filename)

// Convert the audio to text using the specified transcription service
await runTranscription(options, finalPath, frontMatter, transcriptServices)

// Process the transcript with a language model if one was specified
await runLLM(options, finalPath, frontMatter, llmServices)

// Remove temporary files unless the noCleanUp option is set
if (!options.noCleanUp) {
await cleanUpFiles(finalPath)
}
} catch (error) {
// Log the error and terminate the process with error code
console.error(`Error processing file: ${(error as Error).message}`)
process.exit(1) // Exit with an error code
process.exit(1)
}
}
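As a usage sketch for the pipeline the expanded doc comment describes (markdown generation, audio conversion, transcription, optional LLM pass, cleanup), a local-file run that touches each step might look like the following. The file path and the choice of Ollama are assumptions for illustration only:

# Transcribe a local file with whisper.cpp, process the transcript with Ollama, and keep intermediary files
tsx --env-file=.env --no-warnings src/autoshow.ts --file "content/audio.mp3" --whisper large-v3-turbo --ollama --noCleanUp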