Skip to content

Commit

Permalink
feat(whisper): Whisper speech to text (#100)
Browse files Browse the repository at this point in the history
  • Loading branch information
briansunter authored Mar 20, 2023
1 parent 84f938f commit 563b3b4
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 8 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,21 @@ Type `/gpt-page` in a block or select `gpt-page` from the block menu.

![logseq gpt-page](docs/gpt-page.gif)

#### Whisper speech to text transcription

Transcribe audio files to text using the Whisper API.

Type `/whisper` in a block or select `whisper` from the block menu.

Supported formats are:
```m4a, mp3, mp4, mpeg, mpga, wav, webm```

Note that the Logseq mobile app audio recorder uses `.aac`, which is not supported by Whisper. I recommend using a separate audio recorder app to record audio files and then uploading them to Logseq. For example, I use the Voice Memos app on iOS and share that file to the Logseq mobile app.

### `dalle`

Generate images with DALL-E

Type `/dalle` in a block or select `dalle` from the block menu.

This will generate an image using the DALL-E model, save the image to the `assets` folder, and insert the image into the block.
Expand Down
5 changes: 2 additions & 3 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions src/lib/logseq.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,29 @@ export async function saveDalleImage(imageURL: string): Promise<string> {
const imageFileName = `![](assets/storages/logseq-plugin-gpt3-openai/${imageName})`;
return imageFileName;
}

/**
 * Extract the first supported audio attachment from a block's markdown content
 * and load it as a `File` suitable for upload to the Whisper API.
 *
 * Looks for a markdown image/asset link like `![my file](assets/2023-03-17.m4a)`
 * whose target ends in a Whisper-supported extension
 * (mp3, mp4, mpeg, mpga, m4a, wav, webm).
 *
 * @param content - Raw markdown content of a Logseq block.
 * @returns The audio as a `File` (named after the asset, with an `audio/<ext>`
 *          MIME type), or `null` when no supported audio link is found or the
 *          file cannot be read from the current graph.
 */
export async function getAudioFile(content: string): Promise<File | null> {
  // Require a literal "." before the extension and anchor the extension at the
  // end of the link target, so e.g. "assets/foomp3" does not match. Negated
  // character classes keep the match within a single markdown link even when
  // the block contains several links.
  const regex = /!\[[^\]]*\]\(([^)]+\.(mp3|mp4|mpeg|mpga|m4a|wav|webm))\)/;
  const match = regex.exec(content);
  if (!match || !match[1]) {
    return null;
  }
  // Group 2 is the extension itself — no need to re-split the path.
  const extension = match[2];

  const path = (await logseq.App.getCurrentGraph())?.path;
  if (!path) {
    // No open graph: we cannot resolve the asset on disk.
    return null;
  }

  // Strip every "../" segment (the original only stripped the first one).
  const filepath = match[1].replace(/\.\.\//g, "");
  // Filename shown to the API: drop the leading "assets/" folder prefix.
  const filename = filepath.replace("assets/", "");
  const fullFilename = "file://" + path + "/" + filepath;

  const response = await fetch(fullFilename);
  if (!response.ok) {
    // Asset link present but file unreadable — treat as "no audio found".
    return null;
  }
  const audioBlob = await response.blob();
  return new File([audioBlob], filename, { type: `audio/${extension}` });
}

42 changes: 41 additions & 1 deletion src/lib/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,14 @@ const OpenAIDefaults = (apiKey: string): OpenAIOptions => ({
});

const retryOptions = {
numOfAttempts: 3,
numOfAttempts: 7,
retry: (err: any) => {
if (err instanceof TypeError && err.message === 'Failed to fetch') {
// Handle the TypeError: Failed to fetch error
console.warn('retrying due to network error', err);
return true;
}

if (!err.response || !err.response.data || !err.response.data.error) {
return false;
}
Expand All @@ -34,10 +40,44 @@ const retryOptions = {
console.warn("Rate limit exceeded. Retrying...");
return true;
}
if (err.response.status >= 500){
return true;
}

return false;
},
};

/**
 * Transcribe an audio file to text with the OpenAI Whisper API.
 *
 * Sends the file as a multipart form POST to the `audio/transcriptions`
 * endpoint, retrying transient failures via the shared `retryOptions` policy.
 *
 * @param file - Audio file in a Whisper-supported format.
 * @param openAiOptions - Options carrying the API key used for auth.
 * @returns The transcribed text.
 * @throws Error when the API responds with a non-OK status.
 */
export async function whisper(file: File,openAiOptions:OpenAIOptions): Promise<string> {
  // Whisper expects a multipart body with the model id and the audio payload.
  const body = new FormData();
  body.append('model', 'whisper-1');
  body.append('file', file);

  const doRequest = () =>
    fetch('https://api.openai.com/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${openAiOptions.apiKey}`,
      },
      body,
    });

  // backOff re-invokes the request according to the module-wide retry policy.
  const response = await backOff(doRequest, retryOptions);

  if (!response.ok) {
    throw new Error(`Error transcribing audio: ${response.statusText}`);
  }

  // The API returns JSON of the shape { text: string }.
  const jsonResponse = await response.json();
  return jsonResponse.text;
}

export async function dallE(
prompt: string,
openAiOptions: OpenAIOptions
Expand Down
24 changes: 22 additions & 2 deletions src/lib/rawCommands.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { IHookEvent } from "@logseq/libs/dist/LSPlugin.user";
import { getPageContentFromBlock, saveDalleImage } from "./logseq";
import { OpenAIOptions, openAI, dallE } from "./openai";
import { getAudioFile, getPageContentFromBlock, saveDalleImage } from "./logseq";
import { OpenAIOptions, openAI, dallE, whisper } from "./openai";
import { getOpenaiSettings } from "./settings";

function handleOpenAIError(e: any) {
Expand Down Expand Up @@ -165,3 +165,23 @@ export async function runDalleBlock(b: IHookEvent) {
handleOpenAIError(e);
}
}

export async function runWhisper(b: IHookEvent) {
const currentBlock = await logseq.Editor.getBlock(b.uuid);
if (currentBlock) {
const audioFile = await getAudioFile(currentBlock.content);
if (!audioFile) {
logseq.App.showMsg("No supported audio file found in block.", "warning");
return;
}
const openAISettings = getOpenaiSettings();
try {
const transcribe = await whisper(audioFile, openAISettings);
if (transcribe) {
await logseq.Editor.insertBlock(currentBlock.uuid, transcribe);
}
} catch (e: any) {
handleOpenAIError(e);
}
}
}
7 changes: 5 additions & 2 deletions src/main.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import ReactDOM from "react-dom/client";
import { Command, LogseqAI } from "./ui/LogseqAI";
import { loadUserCommands, loadBuiltInCommands } from "./lib/prompts";
import { getOpenaiSettings, settingsSchema } from "./lib/settings";
import { runDalleBlock, runGptBlock, runGptPage } from "./lib/rawCommands";
import { runDalleBlock, runGptBlock, runGptPage, runWhisper } from "./lib/rawCommands";
import { BlockEntity } from "@logseq/libs/dist/LSPlugin.user";
import { useImmer } from 'use-immer';

Expand Down Expand Up @@ -61,6 +61,7 @@ const defaultAppState: AppState = {
};

const LogseqApp = () => {

const [builtInCommands, setBuiltInCommands] = useState<Command[]>([]);
const [userCommands, setUserCommands] = useState<Command[]>([]);
const [appState, updateAppState] = useImmer<AppState>(defaultAppState);
Expand Down Expand Up @@ -158,13 +159,14 @@ const LogseqApp = () => {
openUI();
}
});

logseq.Editor.registerSlashCommand("gpt-page", runGptPage);
logseq.Editor.registerBlockContextMenuItem("gpt-page", runGptPage);
logseq.Editor.registerSlashCommand("gpt-block", runGptBlock);
logseq.Editor.registerBlockContextMenuItem("gpt-block", runGptBlock);
logseq.Editor.registerSlashCommand("dalle", runDalleBlock);
logseq.Editor.registerBlockContextMenuItem("dalle", runDalleBlock);
logseq.Editor.registerSlashCommand("whisper", runWhisper);
logseq.Editor.registerBlockContextMenuItem("whisper", runWhisper);

if (logseq.settings!["shortcutBlock"]) {
logseq.App.registerCommandShortcut(
Expand All @@ -174,6 +176,7 @@ const LogseqApp = () => {
}
}, []);


const allCommands = [...builtInCommands, ...userCommands];

const handleCommand = async (command: Command): Promise<string> => {
Expand Down

0 comments on commit 563b3b4

Please sign in to comment.