Skip to content

Commit

Permalink
enhance: optimistic search page loading and improved content refinement
Browse files Browse the repository at this point in the history
Signed-off-by: Nick Hale <4175918+njhale@users.noreply.github.com>
  • Loading branch information
njhale committed Feb 6, 2025
1 parent 10c84a9 commit c58e7e3
Show file tree
Hide file tree
Showing 8 changed files with 395 additions and 175 deletions.
29 changes: 26 additions & 3 deletions google/search/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions google/search/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
"express": "^4.18.2",
"global-cache-dir": "^6.0.0",
"playwright": "^1.46.0",
"prettier": "^3.4.2",
"tiktoken": "^1.0.18",
"ts-node-dev": "^2.0.0",
"turndown": "^7.1.3"
}
Expand Down
38 changes: 38 additions & 0 deletions google/search/src/headers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { type IncomingHttpHeaders } from 'node:http'
import { createHash } from 'node:crypto'

/**
 * Connection details for an OpenAI-compatible model provider, as forwarded
 * by GPTScript through request headers (see getModelProviderCredentials).
 */
export interface ModelProviderCredentials {
  // Provider endpoint, read from OPENAI_BASE_URL in the forwarded env.
  baseUrl: string
  // API key, read from OPENAI_API_KEY in the forwarded env.
  apiKey: string
}

/**
 * Extracts OpenAI-compatible provider credentials from the forwarded
 * GPTScript environment in the request headers.
 *
 * @returns the credentials, or undefined unless BOTH OPENAI_BASE_URL and
 *   OPENAI_API_KEY are present and non-blank.
 */
export function getModelProviderCredentials(headers: IncomingHttpHeaders): ModelProviderCredentials | undefined {
  const baseUrl = getGPTScriptEnv(headers, 'OPENAI_BASE_URL')?.trim()
  const apiKey = getGPTScriptEnv(headers, 'OPENAI_API_KEY')?.trim()

  // Both values are required; a blank string is treated the same as missing.
  if (!baseUrl || !apiKey) return undefined

  return { baseUrl, apiKey }
}

/**
 * Derives a stable, opaque session identifier from the GPTScript workspace ID
 * carried in the request headers.
 *
 * @throws Error when the workspace ID is missing or blank.
 * @returns the first 16 hex characters of the SHA-256 digest of the raw
 *   (untrimmed) workspace ID.
 */
export function getSessionId(headers: IncomingHttpHeaders): string {
  const workspaceId = getGPTScriptEnv(headers, 'GPTSCRIPT_WORKSPACE_ID')
  if (workspaceId == null || workspaceId.trim() === '') {
    throw new Error('No GPTScript workspace ID provided')
  }

  // Hash the raw value (not the trimmed one) so the ID is stable with the
  // original behavior; 16 hex chars keeps it short but collision-resistant
  // enough for session bucketing.
  const digest = createHash('sha256').update(workspaceId).digest('hex')
  return digest.substring(0, 16)
}

/**
 * Looks up a single variable from the GPTScript environment forwarded in the
 * `x-gptscript-env` header. The header (possibly repeated) holds
 * comma-separated `KEY=VALUE` pairs.
 *
 * @param headers incoming HTTP request headers
 * @param envKey  exact environment variable name to look up
 * @returns the trimmed value of the first matching pair, or undefined
 */
export function getGPTScriptEnv(headers: IncomingHttpHeaders, envKey: string): string | undefined {
  const envHeader = headers?.['x-gptscript-env']
  const envArray = Array.isArray(envHeader) ? envHeader : [envHeader]

  for (const env of envArray) {
    if (env == null) continue
    for (const pair of env.split(',')) {
      // BUG FIX: split on the FIRST '=' only. The previous
      // `pair.split('=')` destructuring dropped everything after a second
      // '=', silently truncating values that legitimately contain '='
      // (URLs with query strings, base64-padded API keys).
      const separator = pair.indexOf('=')
      if (separator === -1) continue
      const key = pair.slice(0, separator).trim()
      if (key === envKey) return pair.slice(separator + 1).trim()
    }
  }
  return undefined
}
218 changes: 159 additions & 59 deletions google/search/src/refine.ts
Original file line number Diff line number Diff line change
@@ -1,39 +1,139 @@
import { encoding_for_model } from "tiktoken"
import {GPTScript, type ToolDef} from "@gptscript-ai/gptscript"
import {type SearchResult, type SearchResults} from "./search.ts"
import {type ModelProviderCredentials} from "./headers.ts"

// Max number of tokens in the search results
const MAX_RESULTS_TOKENS = 50000

const gptscript = new GPTScript()

export async function refine (unrefined: SearchResults): Promise<SearchResults> {
const now = new Date().toISOString()
const refined = await Promise.all(
export async function refine (creds: ModelProviderCredentials | undefined, unrefined: SearchResults): Promise<SearchResults> {
const totalUnrefinedTokens = tokenCount(unrefined.results.reduce((acc, result) => acc + result.content, ''))
if (totalUnrefinedTokens <= MAX_RESULTS_TOKENS) {
console.info(`Total tokens (${totalUnrefinedTokens}) are within the limit (${MAX_RESULTS_TOKENS}), skipping refinement`)
return unrefined
}

if (!creds) {
console.warn('No model provider credentials provided, skipping refinement')
return unrefined
}

console.info(`Total tokens (${totalUnrefinedTokens}) are above the limit (${MAX_RESULTS_TOKENS}), calling GPTScript to refine results`)

const now = userDateTime()
let refined = await Promise.all(
unrefined.results.map(async (result) => {
if (result.content?.length ?? 0 <= 10000) {
// Don't refine content that is 10k tokens or less
return result
const refinedContent = await refineContent(creds, now, unrefined.query, result)
const refinedTokens = tokenCount(refinedContent.content)
return {
...result,
...refinedContent,
refinedTokens
}

return await refineResult(now, unrefined.query, result)
})
)

return {
...unrefined,
results: refined.filter(result => hasContent(result.content))
const totalRefinedTokens = refined.reduce((sum, r) => sum + r.refinedTokens, 0)
if (totalRefinedTokens <= MAX_RESULTS_TOKENS) {
// If the refined tokens already fit the limit, return as is.
return { query: unrefined.query, results: refined }
}

// Filter zero score or zero tokens
refined = refined.filter(r => r.score > 0 && r.refinedTokens > 0)

// Sort by "value density" = score / tokens (descending)
refined.sort((a, b) => (b.score / b.refinedTokens) - (a.score / a.refinedTokens))

const pruned: SearchResult[] = []
let tokenBudget = MAX_RESULTS_TOKENS

for (const r of refined) {
if (tokenBudget < 1) break

if (r.refinedTokens >= tokenBudget) {
// If the result is too long, truncate it to fit the budget
const truncated = truncateContent(r.content, tokenBudget)
pruned.push({
...r,
content: truncated.content,
})

// Consume the tokens from the budget
tokenBudget -= truncated.tokenCount
continue
}

// The entire result fits in the budget, so add it to the pruned results
pruned.push(r)
tokenBudget -= r.refinedTokens
}

return { query: unrefined.query, results: pruned }
}

/**
 * Counts gpt-4o-mini tokens in a string.
 * Missing/empty content counts as 0; encoding failures are logged and
 * reported as 0 rather than thrown.
 */
function tokenCount (content?: string): number {
  if (!content) return 0

  const encoder = encoding_for_model('gpt-4o-mini')
  try {
    return encoder.encode(content).length
  } catch (err) {
    console.warn('Error encoding content', err)
    return 0
  } finally {
    // Free encoder resources regardless of outcome.
    encoder.free()
  }
}


/**
 * Truncates markdown content to at most maxTokens gpt-4o-mini tokens.
 *
 * @param content   text to truncate
 * @param maxTokens token budget to fit within
 * @returns the truncated text and the number of tokens actually kept
 */
function truncateContent (content: string, maxTokens: number): {
  content: string,
  tokenCount: number
} {
  const codec = encoding_for_model('gpt-4o-mini')
  try {
    const tokens = codec.encode(content)
    const truncated = tokens.slice(0, maxTokens)
    return {
      // BUG FIX: map the token ids back to UTF-8 bytes with codec.decode()
      // before text-decoding. The previous code passed the raw token-id
      // array straight to TextDecoder, which interprets the ids' underlying
      // bytes as UTF-8 and yields garbage instead of the original text.
      // (A multi-byte sequence split at the cut point becomes U+FFFD.)
      content: new TextDecoder().decode(codec.decode(truncated)),
      tokenCount: truncated.length
    }
  } finally {
    // Free encoder resources even if encode/decode throws.
    codec.free()
  }
}

function hasContent (content?: string | string[]): boolean {
return !(Array.isArray(content) ? content?.length === 0 : content?.trim() === '')

/**
 * Formats the current date/time, with short timezone name, in the zone named
 * by the TIMEZONE environment variable — falling back to UTC when TIMEZONE is
 * unset or is not a valid IANA zone.
 */
function userDateTime (): string {
  let timeZone = process.env.TIMEZONE || 'UTC'
  try {
    // Probe the zone name: Intl throws for unknown/invalid timezones.
    new Intl.DateTimeFormat('en-US', { timeZone })
  } catch {
    timeZone = 'UTC'
  }
  return new Date().toLocaleString('en-US', { timeZone, timeZoneName: 'short' })
}

async function refineResult (

async function refineContent (
creds: ModelProviderCredentials,
time: string,
query: string,
result: SearchResult): Promise<SearchResult> {
result: SearchResult): Promise<{
content: string,
score: number
}> {

const tool: ToolDef = {
chat: false,
jsonResponse: true,
jsonResponse: false,
modelName: process.env.OBOT_DEFAULT_LLM_MINI_MODEL ?? 'gpt-4o-mini',
temperature: 0.0,
arguments: {
Expand All @@ -43,38 +143,54 @@ async function refineResult (
type: 'string',
description: 'Current date and time that the search was requested at'
},
query: {
topic: {
type: 'string',
description: 'query or subject matter to generate citations for'
description: 'Topic to extract excerpts for'
},
url: {
type: 'string',
description: 'URL that the content was sourced from'
description: 'URL that the markdown content was sourced from'
},
content: {
type: 'string',
description: 'Markdown content to cite'
description: 'Markdown document created by exporting an HTML web page to markdown'
}
},
required: ['query', 'url', 'content']
required: ['time', 'topic', 'url', 'content']
},
instructions: refineInstructions
}

const run = await gptscript.evaluate(tool, {
input: JSON.stringify({
query,
...result,
time
BaseURL: creds.baseUrl,
APIKey: creds.apiKey,
input: minify({
time,
topic: query,
url: result.url,
content: result.content
})
})

return await run.json()
// Parse the output into a score and content
const output = await run.text()
const [firstLine, ...restLines] = output?.split('\n') ?? []
const score = Math.max(1, Math.min(10, parseInt(firstLine, 10))) || 0
const content = restLines.join('\n')

return { score, content }
}

// Note: Tools can't introspect their parameters schema, so we provide it in the instructions as well
const refineInstructions = `
Given an object with the following JSON schema:
Do not respond with any additional dialog or commentary.
You are a research assistant tasked with extracting excerpts from a markdown document that will
be used as notes to conduct detailed research about a given topic.
The document is the result of exporting an HTML webpage to markdown.
When given an object with the following JSON schema:
${minify({
type: 'object',
Expand All @@ -83,51 +199,35 @@ ${minify({
type: 'string',
description: 'Current date and time that the search was requested at'
},
query: {
topic: {
type: 'string',
description: 'Query or subject matter to generate citations for'
description: 'Topic to extract excerpts for'
},
url: {
type: 'string',
description: 'URL that the content was sourced from'
description: 'URL that the markdown content was sourced from'
},
content: {
type: 'string',
description: 'Markdown content to cite'
description: 'Markdown document created by exporting an HTML web page to markdown'
}
},
required: ['query', 'url', 'content', 'time']
required: ['time', 'topic', 'url', 'content', 'time']
})}
Select all markdown from \${CONTENT} containing information useful to cite when researching \${QUERY}.
Selected markdown should contain the most useful and relevant information to \${QUERY} available in \${CONTENT}.
Don't select markdown that is not helpful or related to \${QUERY}.
Respond with a single object containing all of the selected markdown that adheres to the following JSON schema:
Perform the following steps in order:
1. Refine the markdown content by removing all:
- boilerplate and unintelligable text
- unrelated advertisements, links, and web page structure
2. Select excerpts from the refined content that you think would make good notes for conducting detailed research about the topic
3. Compose a concise markdown document containing the excerpts organized in decending order of importance to understanding the topic. Do not paraphrase, summarize, or reword the excerpts. The goal is to preserve as much of the original content as possible.
4. Grade the corpus of excerpts as a whole based how well it covers the topic on a scale of 0-10, where high scores are good and low scores contain no relevant information
${minify({
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL that the content was sourced from'
},
title: {
type: 'string',
description: 'Main title of the source content'
},
content: {
type: 'array',
description: 'Cleaned up markdown from the original content that can be cited to research the query',
items: {
type: 'string'
}
}
},
required: ['url', 'title', 'content']
})}
Afterwards, respond with the grade followed by the markdown document on a new line.
Do not respond with any additional dialog or commentary.
EXAMPLE
5
<content of markdown document>
`

function minify (obj: object): string {
Expand Down
Loading

0 comments on commit c58e7e3

Please sign in to comment.