Skip to content

Commit

Permalink
enhance: optimistic search page loading and improved content refinement
Browse files Browse the repository at this point in the history
Signed-off-by: Nick Hale <4175918+njhale@users.noreply.github.com>
  • Loading branch information
njhale committed Feb 6, 2025
1 parent 10c84a9 commit c58e7e3
Show file tree
Hide file tree
Showing 8 changed files with 395 additions and 175 deletions.
29 changes: 26 additions & 3 deletions google/search/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions google/search/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
"express": "^4.18.2",
"global-cache-dir": "^6.0.0",
"playwright": "^1.46.0",
"prettier": "^3.4.2",
"tiktoken": "^1.0.18",
"ts-node-dev": "^2.0.0",
"turndown": "^7.1.3"
}
Expand Down
38 changes: 38 additions & 0 deletions google/search/src/headers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { type IncomingHttpHeaders } from 'node:http'
import { createHash } from 'node:crypto'

/**
 * Connection details for an OpenAI-compatible model provider, as forwarded
 * by GPTScript through request headers (see getModelProviderCredentials).
 */
export interface ModelProviderCredentials {
  // Provider endpoint, read from OPENAI_BASE_URL in the forwarded env.
  baseUrl: string
  // API key, read from OPENAI_API_KEY in the forwarded env.
  apiKey: string
}

/**
 * Extracts OpenAI-compatible provider credentials from the forwarded
 * GPTScript environment in the request headers.
 *
 * @returns the credentials, or undefined unless BOTH OPENAI_BASE_URL and
 *   OPENAI_API_KEY are present and non-blank.
 */
export function getModelProviderCredentials(headers: IncomingHttpHeaders): ModelProviderCredentials | undefined {
  const baseUrl = getGPTScriptEnv(headers, 'OPENAI_BASE_URL')?.trim()
  const apiKey = getGPTScriptEnv(headers, 'OPENAI_API_KEY')?.trim()

  // Both values are required; a blank string is treated the same as missing.
  if (!baseUrl || !apiKey) return undefined

  return { baseUrl, apiKey }
}

/**
 * Derives a stable, opaque session identifier from the GPTScript workspace ID
 * carried in the request headers.
 *
 * @throws Error when the workspace ID is missing or blank.
 * @returns the first 16 hex characters of the SHA-256 digest of the raw
 *   (untrimmed) workspace ID.
 */
export function getSessionId(headers: IncomingHttpHeaders): string {
  const workspaceId = getGPTScriptEnv(headers, 'GPTSCRIPT_WORKSPACE_ID')
  if (workspaceId == null || workspaceId.trim() === '') {
    throw new Error('No GPTScript workspace ID provided')
  }

  // Hash the raw value (not the trimmed one) so the ID is stable with the
  // original behavior; 16 hex chars keeps it short but collision-resistant
  // enough for session bucketing.
  const digest = createHash('sha256').update(workspaceId).digest('hex')
  return digest.substring(0, 16)
}

/**
 * Looks up a single variable from the GPTScript environment forwarded in the
 * `x-gptscript-env` header. The header (possibly repeated) holds
 * comma-separated `KEY=VALUE` pairs.
 *
 * @param headers incoming HTTP request headers
 * @param envKey  exact environment variable name to look up
 * @returns the trimmed value of the first matching pair, or undefined
 */
export function getGPTScriptEnv(headers: IncomingHttpHeaders, envKey: string): string | undefined {
  const envHeader = headers?.['x-gptscript-env']
  const envArray = Array.isArray(envHeader) ? envHeader : [envHeader]

  for (const env of envArray) {
    if (env == null) continue
    for (const pair of env.split(',')) {
      // BUG FIX: split on the FIRST '=' only. The previous
      // `pair.split('=')` destructuring dropped everything after a second
      // '=', silently truncating values that legitimately contain '='
      // (URLs with query strings, base64-padded API keys).
      const separator = pair.indexOf('=')
      if (separator === -1) continue
      const key = pair.slice(0, separator).trim()
      if (key === envKey) return pair.slice(separator + 1).trim()
    }
  }
  return undefined
}
218 changes: 159 additions & 59 deletions google/search/src/refine.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,139 @@
import { encoding_for_model } from "tiktoken"
import {GPTScript, type ToolDef} from "@gptscript-ai/gptscript"
import {type SearchResult, type SearchResults} from "./search.ts"
import {type ModelProviderCredentials} from "./headers.ts"

// Max number of tokens in the search results
const MAX_RESULTS_TOKENS = 50000

const gptscript = new GPTScript()

export async function refine (unrefined: SearchResults): Promise<SearchResults> {
const now = new Date().toISOString()
const refined = await Promise.all(
export async function refine (creds: ModelProviderCredentials | undefined, unrefined: SearchResults): Promise<SearchResults> {
const totalUnrefinedTokens = tokenCount(unrefined.results.reduce((acc, result) => acc + result.content, ''))
if (totalUnrefinedTokens <= MAX_RESULTS_TOKENS) {
console.info(`Total tokens (${totalUnrefinedTokens}) are within the limit (${MAX_RESULTS_TOKENS}), skipping refinement`)
return unrefined
}

if (!creds) {
console.warn('No model provider credentials provided, skipping refinement')
return unrefined
}

console.info(`Total tokens (${totalUnrefinedTokens}) are above the limit (${MAX_RESULTS_TOKENS}), calling GPTScript to refine results`)

const now = userDateTime()
let refined = await Promise.all(
unrefined.results.map(async (result) => {
if (result.content?.length ?? 0 <= 10000) {
// Don't refine content that is 10k tokens or less
return result
const refinedContent = await refineContent(creds, now, unrefined.query, result)
const refinedTokens = tokenCount(refinedContent.content)
return {
...result,
...refinedContent,
refinedTokens
}

return await refineResult(now, unrefined.query, result)
})
)

return {
...unrefined,
results: refined.filter(result => hasContent(result.content))
const totalRefinedTokens = refined.reduce((sum, r) => sum + r.refinedTokens, 0)
if (totalRefinedTokens <= MAX_RESULTS_TOKENS) {
// If the refined tokens already fit the limit, return as is.
return { query: unrefined.query, results: refined }
}

// Filter zero score or zero tokens
refined = refined.filter(r => r.score > 0 && r.refinedTokens > 0)

// Sort by "value density" = score / tokens (descending)
refined.sort((a, b) => (b.score / b.refinedTokens) - (a.score / a.refinedTokens))

const pruned: SearchResult[] = []
let tokenBudget = MAX_RESULTS_TOKENS

for (const r of refined) {
if (tokenBudget < 1) break

if (r.refinedTokens >= tokenBudget) {
// If the result is too long, truncate it to fit the budget
const truncated = truncateContent(r.content, tokenBudget)
pruned.push({
...r,
content: truncated.content,
})

// Consume the tokens from the budget
tokenBudget -= truncated.tokenCount
continue
}

// The entire result fits in the budget, so add it to the pruned results
pruned.push(r)
tokenBudget -= r.refinedTokens
}

return { query: unrefined.query, results: pruned }
}

/**
 * Counts gpt-4o-mini tokens in a string.
 * Missing/empty content counts as 0; encoding failures are logged and
 * reported as 0 rather than thrown.
 */
function tokenCount (content?: string): number {
  if (!content) return 0

  const encoder = encoding_for_model('gpt-4o-mini')
  try {
    return encoder.encode(content).length
  } catch (err) {
    console.warn('Error encoding content', err)
    return 0
  } finally {
    // Free encoder resources regardless of outcome.
    encoder.free()
  }
}


/**
 * Truncates markdown content to at most maxTokens gpt-4o-mini tokens.
 *
 * @param content   text to truncate
 * @param maxTokens token budget to fit within
 * @returns the truncated text and the number of tokens actually kept
 */
function truncateContent (content: string, maxTokens: number): {
  content: string,
  tokenCount: number
} {
  const codec = encoding_for_model('gpt-4o-mini')
  try {
    const tokens = codec.encode(content)
    const truncated = tokens.slice(0, maxTokens)
    return {
      // BUG FIX: map the token ids back to UTF-8 bytes with codec.decode()
      // before text-decoding. The previous code passed the raw token-id
      // array straight to TextDecoder, which interprets the ids' underlying
      // bytes as UTF-8 and yields garbage instead of the original text.
      // (A multi-byte sequence split at the cut point becomes U+FFFD.)
      content: new TextDecoder().decode(codec.decode(truncated)),
      tokenCount: truncated.length
    }
  } finally {
    // Free encoder resources even if encode/decode throws.
    codec.free()
  }
}

function hasContent (content?: string | string[]): boolean {
return !(Array.isArray(content) ? content?.length === 0 : content?.trim() === '')

/**
 * Formats the current date/time, with short timezone name, in the zone named
 * by the TIMEZONE environment variable — falling back to UTC when TIMEZONE is
 * unset or is not a valid IANA zone.
 */
function userDateTime (): string {
  let timeZone = process.env.TIMEZONE || 'UTC'
  try {
    // Probe the zone name: Intl throws for unknown/invalid timezones.
    new Intl.DateTimeFormat('en-US', { timeZone })
  } catch {
    timeZone = 'UTC'
  }
  return new Date().toLocaleString('en-US', { timeZone, timeZoneName: 'short' })
}

async function refineResult (

async function refineContent (
creds: ModelProviderCredentials,
time: string,
query: string,
result: SearchResult): Promise<SearchResult> {
result: SearchResult): Promise<{
content: string,
score: number
}> {

const tool: ToolDef = {
chat: false,
jsonResponse: true,
jsonResponse: false,
modelName: process.env.OBOT_DEFAULT_LLM_MINI_MODEL ?? 'gpt-4o-mini',
temperature: 0.0,
arguments: {
Expand All @@ -43,38 +143,54 @@ async function refineResult (
type: 'string',
description: 'Current date and time that the search was requested at'
},
query: {
topic: {
type: 'string',
description: 'query or subject matter to generate citations for'
description: 'Topic to extract excerpts for'
},
url: {
type: 'string',
description: 'URL that the content was sourced from'
description: 'URL that the markdown content was sourced from'
},
content: {
type: 'string',
description: 'Markdown content to cite'
description: 'Markdown document created by exporting an HTML web page to markdown'
}
},
required: ['query', 'url', 'content']
required: ['time', 'topic', 'url', 'content']
},
instructions: refineInstructions
}

const run = await gptscript.evaluate(tool, {
input: JSON.stringify({
query,
...result,
time
BaseURL: creds.baseUrl,
APIKey: creds.apiKey,
input: minify({
time,
topic: query,
url: result.url,
content: result.content
})
})

return await run.json()
// Parse the output into a score and content
const output = await run.text()
const [firstLine, ...restLines] = output?.split('\n') ?? []
const score = Math.max(1, Math.min(10, parseInt(firstLine, 10))) || 0
const content = restLines.join('\n')

return { score, content }
}

// Note: Tools can't introspect their parameters schema, so we provide it in the instructions as well
const refineInstructions = `
Given an object with the following JSON schema:
Do not respond with any additional dialog or commentary.
You are a research assistant tasked with extracting excerpts from a markdown document that will
be used as notes to conduct detailed research about a given topic.
The document is the result of exporting an HTML webpage to markdown.
When given an object with the following JSON schema:
${minify({
type: 'object',
Expand All @@ -83,51 +199,35 @@ ${minify({
type: 'string',
description: 'Current date and time that the search was requested at'
},
query: {
topic: {
type: 'string',
description: 'Query or subject matter to generate citations for'
description: 'Topic to extract excerpts for'
},
url: {
type: 'string',
description: 'URL that the content was sourced from'
description: 'URL that the markdown content was sourced from'
},
content: {
type: 'string',
description: 'Markdown content to cite'
description: 'Markdown document created by exporting an HTML web page to markdown'
}
},
required: ['query', 'url', 'content', 'time']
required: ['time', 'topic', 'url', 'content', 'time']
})}
Select all markdown from \${CONTENT} containing information useful to cite when researching \${QUERY}.
Selected markdown should contain the most useful and relevant information to \${QUERY} available in \${CONTENT}.
Don't select markdown that is not helpful or related to \${QUERY}.
Respond with a single object containing all of the selected markdown that adheres to the following JSON schema:
Perform the following steps in order:
1. Refine the markdown content by removing all:
- boilerplate and unintelligable text
- unrelated advertisements, links, and web page structure
2. Select excerpts from the refined content that you think would make good notes for conducting detailed research about the topic
3. Compose a concise markdown document containing the excerpts organized in decending order of importance to understanding the topic. Do not paraphrase, summarize, or reword the excerpts. The goal is to preserve as much of the original content as possible.
4. Grade the corpus of excerpts as a whole based how well it covers the topic on a scale of 0-10, where high scores are good and low scores contain no relevant information
${minify({
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL that the content was sourced from'
},
title: {
type: 'string',
description: 'Main title of the source content'
},
content: {
type: 'array',
description: 'Cleaned up markdown from the original content that can be cited to research the query',
items: {
type: 'string'
}
}
},
required: ['url', 'title', 'content']
})}
Afterwards, respond with the grade followed by the markdown document on a new line.
Do not respond with any additional dialog or commentary.
EXAMPLE
5
<content of markdown document>
`

function minify (obj: object): string {
Expand Down
Loading

0 comments on commit c58e7e3

Please sign in to comment.