Skip to content

Commit

Permalink
Merge pull request #10 from JoinTheAlliance/moon/fix-process-docs
Browse files Browse the repository at this point in the history
Moon/fix process docs
  • Loading branch information
lalalune committed Mar 7, 2024
2 parents 1157354 + 5858bc4 commit 1bfc9e5
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 152 deletions.
211 changes: 126 additions & 85 deletions src/docs.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import { Octokit } from 'octokit';
import openai from 'openai';
import { SupabaseClient } from '@supabase/supabase-js';
import { generateEmbeddings } from './embeddingCreation/createSectionEmbeddings';
import { BgentRuntime, addLore, wait } from 'bgent';
import { Octokit } from 'octokit';

/**
 * Everything the documentation-processing functions in this module need in
 * order to read Markdown files from a GitHub repository and store them.
 */
export interface ProcessDocsParams {
  /** Supabase client used as the storage backend. */
  supabase: SupabaseClient;

  /** OpenAI client instance. */
  openai: openai;

  /** Octokit client through which every GitHub API request is issued. */
  octokit: Octokit;

  /** Sent as the `owner` of the GitHub repository being read. */
  repoOwner: string;

  /** Sent as the `repo` (name) of the GitHub repository being read. */
  repoName: string;

  /** Path inside the repository under which the documentation lives. */
  pathToRepoDocuments: string;

  /** File extension, without the leading dot, that marks a documentation file. */
  documentationFileExt: string;

  /** Delimiter handed to the sectionizer to split a document into sections. */
  sectionDelimiter: string;

  /** URL prefix joined with a document's path to form its source link. */
  sourceDocumentationUrl: string;

  /** String-valued settings; `OPENAI_API_KEY` is read from here. */
  env: Record<string, string>;
}

/**
Expand Down Expand Up @@ -43,32 +42,70 @@ function sectionizeDocument(documentContent: string, sectionDelimiter: string) {
* pgvector in Supabase. Currently only supports Markdown (.MD) files.
* @param {ProcessDocsParams} params - An object that conforms to the ProcessDocsParams interface.
*/
/**
 * Issues a GitHub API request through Octokit and, when GitHub answers that
 * the rate limit is exhausted, waits for the advertised delay and tries again.
 *
 * A failure counts as rate limiting when its status is 403 or 429 and either
 * `x-ratelimit-remaining` is `'0'` or a `retry-after` header is present.
 * Any other failure is rethrown untouched.
 *
 * @param octokit - Client whose `request` method performs the call.
 * @param requestOptions - Route (`method` + `url`) and its parameters, passed
 *   to `octokit.request` as-is on every attempt.
 * @param maxRetries - How many times a rate-limited request is retried before
 *   the last rate-limit error is rethrown. Defaults to 5.
 * @returns The response returned by `octokit.request`.
 * @throws The original error for non-rate-limit failures, or the last
 *   rate-limit error once `maxRetries` is used up.
 */
async function makeRequest(
  octokit: Octokit,
  requestOptions: {
    method: string;
    url: string;
    owner: string;
    repo: string;
    path?: string;
    headers: { 'X-GitHub-Api-Version': string };
    pull_number?: number;
    per_page?: number;
    page?: number;
  },
  maxRetries = 5,
) {
  // Used only when GitHub signals rate limiting without a usable delay header.
  const fallbackWaitSeconds = 60;

  for (let attempt = 0; ; attempt++) {
    try {
      return await octokit.request(requestOptions);
    } catch (error: unknown) {
      const failure = (error ?? {}) as {
        status?: number;
        headers?: Record<string, string | number | undefined>;
        response?: { headers?: Record<string, string | number | undefined> };
      };
      // Accept the headers from either location so the check does not depend
      // on which shape of request error the installed Octokit throws.
      const headers = failure.response?.headers ?? failure.headers ?? {};

      const rateLimited =
        (failure.status === 403 || failure.status === 429) &&
        (String(headers['x-ratelimit-remaining']) === '0' ||
          headers['retry-after'] !== undefined);

      if (!rateLimited || attempt >= maxRetries) {
        throw error;
      }

      const retryAfterHeader = Number.parseInt(
        String(headers['retry-after'] ?? ''),
        10,
      );
      const resetEpochSeconds = Number.parseInt(
        String(headers['x-ratelimit-reset'] ?? ''),
        10,
      );

      let retryAfter = fallbackWaitSeconds;
      if (Number.isFinite(retryAfterHeader)) {
        retryAfter = retryAfterHeader;
      } else if (Number.isFinite(resetEpochSeconds)) {
        retryAfter = resetEpochSeconds - Math.floor(Date.now() / 1000);
      }
      // A reset time already in the past (or clock skew) must not produce a
      // negative delay.
      retryAfter = Math.max(0, retryAfter);

      console.log(`Rate limited. Retrying in ${retryAfter} seconds...`);
      await new Promise((resolve) => setTimeout(resolve, retryAfter * 1000));
    }
  }
}

export async function vectorizeDocuments(params: ProcessDocsParams) {
try {
const {
supabase,
openai,
octokit,
repoOwner,
repoName,
pathToRepoDocuments,
documentationFileExt,
sectionDelimiter,
sourceDocumentationUrl,
env,
} = params;

// Fetch the documentation directories or files.
let response = await octokit.request(
'GET /repos/{owner}/{repo}/contents/{path}',
{
owner: repoOwner,
repo: repoName,
path: pathToRepoDocuments,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
let response = await makeRequest(octokit, {
method: 'GET',
url: '/repos/{owner}/{repo}/contents/{path}',
owner: repoOwner,
repo: repoName,
path: pathToRepoDocuments,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
);
});

response.data = Array.isArray(response.data)
? response.data
Expand All @@ -78,25 +115,23 @@ export async function vectorizeDocuments(params: ProcessDocsParams) {
for (const resData of response.data) {
let dirDocuments = [];
if (resData.type == 'dir') {
console.log('requesting dir: ', resData.name);
// Fetch all files from the directory.
response = await octokit.request(
'GET /repos/{owner}/{repo}/contents/{path}',
{
owner: repoOwner,
repo: repoName,
path: pathToRepoDocuments + '/' + resData.name,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
response = await makeRequest(octokit, {
method: 'GET',
url: '/repos/{owner}/{repo}/contents/{path}',
owner: repoOwner,
repo: repoName,
path: pathToRepoDocuments + '/' + resData.name,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
);
});

// Type assertion for response.data
const documentsArray = response.data as {
name: string;
path: string;
}[];

dirDocuments = documentsArray.filter((document) =>
document.name.endsWith(`.${documentationFileExt}`),
);
Expand All @@ -106,49 +141,53 @@ export async function vectorizeDocuments(params: ProcessDocsParams) {
throw new Error('Repository URL does not exist!');
}

// Retrieve document data for all docs to process.
await Promise.all(
dirDocuments.map(async (document) => {
const contentResponse = await octokit.request(
'GET /repos/{owner}/{repo}/contents/{path}',
{
owner: repoOwner,
repo: repoName,
path: document.path,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
},
);

const decodedContent = Buffer.from(
(contentResponse.data as { content: string }).content,
'base64',
).toString('utf-8');
const { sections } = sectionizeDocument(
decodedContent,
sectionDelimiter,
);
const updatedPath = document.path.replace('docs/', '');
await generateEmbeddings(
sections,
sourceDocumentationUrl + updatedPath,
supabase,
openai,
);
}),
);
// Retrieve and process document data for each document sequentially.
for (const document of dirDocuments) {
console.log('requesting doc: ', document.path);
const contentResponse = await makeRequest(octokit, {
method: 'GET',
url: '/repos/{owner}/{repo}/contents/{path}',
owner: repoOwner,
repo: repoName,
path: document.path,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
});

const decodedContent = Buffer.from(
(contentResponse.data as { content: string }).content,
'base64',
).toString('utf-8');
const { sections } = sectionizeDocument(
decodedContent,
sectionDelimiter,
);
const updatedPath = document.path.replace('docs/', '');
const runtime = new BgentRuntime({
debugMode: true,
serverUrl: 'https://api.openai.com/v1',
supabase: supabase,
token: env.OPENAI_API_KEY,
evaluators: [],
actions: [wait],
});
for (const document of sections) {
await addLore({
runtime,
content: { content: document },
source: sourceDocumentationUrl + updatedPath,
});
}
}
// wait 200 ms
await new Promise((resolve) => setTimeout(resolve, 200));
}
} catch (error) {
console.error('Error fetching data from GitHub API:', error);
}
}

/**
* Retrieves and processes a list of all documentation documents modified from a pull request.
* @param {ProcessDocsParams} params - An object that conforms to the ProcessDocsParams interface.
* @param {string} pullRequestNum - The pull request number.
*/
export async function fetchLatestPullRequest(
params: ProcessDocsParams,
pullRequestNum: string,
Expand All @@ -158,28 +197,30 @@ export async function fetchLatestPullRequest(

const page = 1;

const response = await octokit.request(
'GET /repos/{owner}/{repo}/pulls/{pull_number}/files',
{
owner: repoOwner,
repo: repoName,
pull_number: parseInt(pullRequestNum),
per_page: 100,
page: page,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
const response = await makeRequest(octokit, {
method: 'GET',
url: '/repos/{owner}/{repo}/pulls/{pull_number}/files',
owner: repoOwner,
repo: repoName,
pull_number: parseInt(pullRequestNum),
per_page: 100,
page: page,
headers: {
'X-GitHub-Api-Version': '2022-11-28',
},
);

await Promise.all(
response.data.map(async (filePath) => {
if (filePath.filename.includes(`${pathToRepoDocuments}/`)) {
params.pathToRepoDocuments = filePath.filename;
await vectorizeDocuments(params);
}
}),
);
});

// Iterate over each file path sequentially
for (const filePath of response.data) {
if (filePath.filename.includes(`${pathToRepoDocuments}/`)) {
params.pathToRepoDocuments = filePath.filename as string;
await vectorizeDocuments(params); // Process each document one by one
// wait 200 ms
await new Promise((resolve) => setTimeout(resolve, 200));
}
}
// wait 200 ms
await new Promise((resolve) => setTimeout(resolve, 200));
} catch (error) {
console.error('Error fetching data from GitHub API:', error);
}
Expand Down
28 changes: 0 additions & 28 deletions src/embeddingCreation/createSectionEmbeddings.ts

This file was deleted.

Loading

0 comments on commit 1bfc9e5

Please sign in to comment.