Skip to content

Commit

Permalink
feat(whisper): Whisper speech to text (#100)
Browse files Browse the repository at this point in the history
  • Loading branch information
briansunter authored Mar 20, 2023
1 parent 84f938f commit 563b3b4
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 8 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,21 @@ Type `/gpt-page` in a block or select `gpt-page` from the block menu.

![logseq gpt-page](docs/gpt-page.gif)

#### Whisper speech to text transcription

Transcribe audio files to text using the Whisper API.

Type `/whisper` in a block or select `whisper` from the block menu.

Supported formats are:
```m4a, mp3, mp4, mpeg, mpga, wav, webm```

Note that the Logseq mobile app audio recorder uses `.aac`, which is not supported by Whisper. I recommend using a separate audio recorder app to record audio files and then uploading them to Logseq. For example, I use the Voice Memos app on iOS and share that file to the Logseq mobile app.

### `dalle`

Generate images with DALL-E

Type `/dalle` in a block or select `dalle` from the block menu.

This will generate an image using the DALL-E model, save the image to the `assets` folder, and insert the image into the block.
Expand Down
5 changes: 2 additions & 3 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions src/lib/logseq.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,29 @@ export async function saveDalleImage(imageURL: string): Promise<string> {
const imageFileName = `![](assets/storages/logseq-plugin-gpt3-openai/${imageName})`;
return imageFileName;
}

/**
 * Extract the first supported audio attachment from a block's markdown content
 * and load it as a `File` suitable for upload to the Whisper API.
 *
 * Looks for a markdown image/asset link like `![my file](assets/2023-03-17.m4a)`
 * whose target ends in a Whisper-supported extension
 * (mp3, mp4, mpeg, mpga, m4a, wav, webm).
 *
 * @param content - Raw markdown content of a Logseq block.
 * @returns The audio as a `File` (named after the asset, with an `audio/<ext>`
 *          MIME type), or `null` when no supported audio link is found or the
 *          file cannot be read from the current graph.
 */
export async function getAudioFile(content: string): Promise<File | null> {
  // Require a literal "." before the extension and anchor the extension at the
  // end of the link target, so e.g. "assets/foomp3" does not match. Negated
  // character classes keep the match within a single markdown link even when
  // the block contains several links.
  const regex = /!\[[^\]]*\]\(([^)]+\.(mp3|mp4|mpeg|mpga|m4a|wav|webm))\)/;
  const match = regex.exec(content);
  if (!match || !match[1]) {
    return null;
  }
  // Group 2 is the extension itself — no need to re-split the path.
  const extension = match[2];

  const path = (await logseq.App.getCurrentGraph())?.path;
  if (!path) {
    // No open graph: we cannot resolve the asset on disk.
    return null;
  }

  // Strip every "../" segment (the original only stripped the first one).
  const filepath = match[1].replace(/\.\.\//g, "");
  // Filename shown to the API: drop the leading "assets/" folder prefix.
  const filename = filepath.replace("assets/", "");
  const fullFilename = "file://" + path + "/" + filepath;

  const response = await fetch(fullFilename);
  if (!response.ok) {
    // Asset link present but file unreadable — treat as "no audio found".
    return null;
  }
  const audioBlob = await response.blob();
  return new File([audioBlob], filename, { type: `audio/${extension}` });
}

42 changes: 41 additions & 1 deletion src/lib/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,14 @@ const OpenAIDefaults = (apiKey: string): OpenAIOptions => ({
});

const retryOptions = {
numOfAttempts: 3,
numOfAttempts: 7,
retry: (err: any) => {
if (err instanceof TypeError && err.message === 'Failed to fetch') {
// Handle the TypeError: Failed to fetch error
console.warn('retrying due to network error', err);
return true;
}

if (!err.response || !err.response.data || !err.response.data.error) {
return false;
}
Expand All @@ -34,10 +40,44 @@ const retryOptions = {
console.warn("Rate limit exceeded. Retrying...");
return true;
}
if (err.response.status >= 500){
return true;
}

return false;
},
};

/**
 * Transcribe an audio file to text with the OpenAI Whisper API.
 *
 * Sends the file as a multipart form POST to the `audio/transcriptions`
 * endpoint, retrying transient failures via the shared `retryOptions` policy.
 *
 * @param file - Audio file in a Whisper-supported format.
 * @param openAiOptions - Options carrying the API key used for auth.
 * @returns The transcribed text.
 * @throws Error when the API responds with a non-OK status.
 */
export async function whisper(file: File,openAiOptions:OpenAIOptions): Promise<string> {
  // Whisper expects a multipart body with the model id and the audio payload.
  const body = new FormData();
  body.append('model', 'whisper-1');
  body.append('file', file);

  const doRequest = () =>
    fetch('https://api.openai.com/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${openAiOptions.apiKey}`,
      },
      body,
    });

  // backOff re-invokes the request according to the module-wide retry policy.
  const response = await backOff(doRequest, retryOptions);

  if (!response.ok) {
    throw new Error(`Error transcribing audio: ${response.statusText}`);
  }

  // The API returns JSON of the shape { text: string }.
  const jsonResponse = await response.json();
  return jsonResponse.text;
}

export async function dallE(
prompt: string,
openAiOptions: OpenAIOptions
Expand Down
24 changes: 22 additions & 2 deletions src/lib/rawCommands.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { IHookEvent } from "@logseq/libs/dist/LSPlugin.user";
import { getPageContentFromBlock, saveDalleImage } from "./logseq";
import { OpenAIOptions, openAI, dallE } from "./openai";
import { getAudioFile, getPageContentFromBlock, saveDalleImage } from "./logseq";
import { OpenAIOptions, openAI, dallE, whisper } from "./openai";
import { getOpenaiSettings } from "./settings";

function handleOpenAIError(e: any) {
Expand Down Expand Up @@ -165,3 +165,23 @@ export async function runDalleBlock(b: IHookEvent) {
handleOpenAIError(e);
}
}

export async function runWhisper(b: IHookEvent) {
const currentBlock = await logseq.Editor.getBlock(b.uuid);
if (currentBlock) {
const audioFile = await getAudioFile(currentBlock.content);
if (!audioFile) {
logseq.App.showMsg("No supported audio file found in block.", "warning");
return;
}
const openAISettings = getOpenaiSettings();
try {
const transcribe = await whisper(audioFile, openAISettings);
if (transcribe) {
await logseq.Editor.insertBlock(currentBlock.uuid, transcribe);
}
} catch (e: any) {
handleOpenAIError(e);
}
}
}
7 changes: 5 additions & 2 deletions src/main.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import ReactDOM from "react-dom/client";
import { Command, LogseqAI } from "./ui/LogseqAI";
import { loadUserCommands, loadBuiltInCommands } from "./lib/prompts";
import { getOpenaiSettings, settingsSchema } from "./lib/settings";
import { runDalleBlock, runGptBlock, runGptPage } from "./lib/rawCommands";
import { runDalleBlock, runGptBlock, runGptPage, runWhisper } from "./lib/rawCommands";
import { BlockEntity } from "@logseq/libs/dist/LSPlugin.user";
import { useImmer } from 'use-immer';

Expand Down Expand Up @@ -61,6 +61,7 @@ const defaultAppState: AppState = {
};

const LogseqApp = () => {

const [builtInCommands, setBuiltInCommands] = useState<Command[]>([]);
const [userCommands, setUserCommands] = useState<Command[]>([]);
const [appState, updateAppState] = useImmer<AppState>(defaultAppState);
Expand Down Expand Up @@ -158,13 +159,14 @@ const LogseqApp = () => {
openUI();
}
});

logseq.Editor.registerSlashCommand("gpt-page", runGptPage);
logseq.Editor.registerBlockContextMenuItem("gpt-page", runGptPage);
logseq.Editor.registerSlashCommand("gpt-block", runGptBlock);
logseq.Editor.registerBlockContextMenuItem("gpt-block", runGptBlock);
logseq.Editor.registerSlashCommand("dalle", runDalleBlock);
logseq.Editor.registerBlockContextMenuItem("dalle", runDalleBlock);
logseq.Editor.registerSlashCommand("whisper", runWhisper);
logseq.Editor.registerBlockContextMenuItem("whisper", runWhisper);

if (logseq.settings!["shortcutBlock"]) {
logseq.App.registerCommandShortcut(
Expand All @@ -174,6 +176,7 @@ const LogseqApp = () => {
}
}, []);


const allCommands = [...builtInCommands, ...userCommands];

const handleCommand = async (command: Command): Promise<string> => {
Expand Down

0 comments on commit 563b3b4

Please sign in to comment.