From cbffb199088557c4e44fd746349aa5d886fc17a6 Mon Sep 17 00:00:00 2001
From: Camden Cheek <camden@ccheek.com>
Date: Fri, 17 Jan 2025 13:43:41 -0700
Subject: [PATCH] unify keyword extraction prompts

---
 .../command-bench/strategy-chat-context.ts    |  5 ++
 .../local-context/rewrite-keyword-query.ts    | 79 +------------------
 2 files changed, 9 insertions(+), 75 deletions(-)
diff --git a/agent/src/cli/command-bench/strategy-chat-context.ts b/agent/src/cli/command-bench/strategy-chat-context.ts
index d4ce473c1c8c..0f68c461b3e0 100644
--- a/agent/src/cli/command-bench/strategy-chat-context.ts
+++ b/agent/src/cli/command-bench/strategy-chat-context.ts
@@ -88,7 +88,9 @@ async function runContextCommand(
     const repoIDNamesCache = new Map<string, string>()
 
     for (const example of examples) {
+        const start = Date.now()
         const { targetRepoRevs, query: origQuery } = example
+        console.log({ query: example.query })
         const repoNames = targetRepoRevs.map(repoRev => repoRev.repoName)
 
         // Get repo IDs from cache or fetch them
@@ -108,6 +110,7 @@ async function runContextCommand(
                 uncachedRepoNames,
                 uncachedRepoNames.length + 10
             )
+            console.log('repo id names', Date.now() - start)
             if (isError(fetchedRepoIDNames)) {
                 throw new Error(
                     `getRepoIds failed for [${uncachedRepoNames.join(',')}]: ${fetchedRepoIDNames}`
@@ -135,6 +138,7 @@ async function runContextCommand(
                 completionsClient,
                 PromptString.unsafe_fromUserQuery(origQuery)
             )
+            console.log('rewrote keyword query', Date.now() - start)
         }
 
         const resultsResp = await graphqlClient.contextSearchEvalDebug({
@@ -144,6 +148,7 @@ async function runContextCommand(
             codeResultsCount: clientOpts.codeResultsCount,
             textResultsCount: clientOpts.textResultsCount,
         })
+        console.log('fetched context', Date.now() - start)
 
         if (isError(resultsResp)) {
             throw new Error(
diff --git a/vscode/src/local-context/rewrite-keyword-query.ts b/vscode/src/local-context/rewrite-keyword-query.ts
index 5e7ae2d71665..eb896c88ee82 100644
--- a/vscode/src/local-context/rewrite-keyword-query.ts
+++ b/vscode/src/local-context/rewrite-keyword-query.ts
@@ -8,10 +8,6 @@ import {
 } from '@sourcegraph/cody-shared'
 import { outputChannelLogger } from '../output-channel-logger'
 
-import { francAll } from 'franc-min'
-
-const containsMultipleSentences = /[.!?][\s\r\n]+\w/
-
 /**
  * Rewrite the query, using the fast completions model to pull out keywords.
  *
@@ -22,85 +18,18 @@ export async function rewriteKeywordQuery(
     query: PromptString,
     signal?: AbortSignal
 ): Promise<string> {
-    // In evals, we saw that rewriting tends to make performance worse for simple queries. So we only rewrite
-    // in cases where it clearly helps: when it's likely in a non-English language, or there are multiple
-    // sentences (so we really need to distill the question).
-    const queryString = query.toString()
-    if (!containsMultipleSentences.test(queryString)) {
-        const english = francAll(queryString).find(v => v[0] === 'eng')
-        if (english && english[1] > 0.9) {
-            return queryString
-        }
-    }
-
     try {
-        const rewritten = await doRewrite(completionsClient, query, signal)
+        const rewritten = await extractKeywords(completionsClient, query, signal!)
+        console.log({ rewritten })
         return rewritten.length !== 0 ? rewritten.sort().join(' ') : query.toString()
     } catch (err) {
+        console.log({ err })
         outputChannelLogger.logDebug('rewrite-keyword-query', 'failed', { verbose: err })
         // If we fail to rewrite, just return the original query.
         return query.toString()
     }
 }
 
-async function doRewrite(
-    completionsClient: SourcegraphCompletionsClient,
-    query: PromptString,
-    signal?: AbortSignal
-): Promise<string[]> {
-    const preamble = getSimplePreamble(undefined, 0, 'Default')
-    const stream = completionsClient.stream(
-        {
-            messages: [
-                ...preamble,
-                {
-                    speaker: 'human',
-                    text: ps`You are helping the user search over a codebase. List some filename fragments that would match files relevant to read to answer the user's query. Present your results in a *single* XML list in the following format: <keywords><keyword><value>a single keyword</value><variants>a space separated list of synonyms and variants of the keyword, including acronyms, abbreviations, and expansions</variants><weight>a numerical weight between 0.0 and 1.0 that indicates the importance of the keyword</weight></keyword></keywords>. Here is the user query: <userQuery>${query}</userQuery>`,
-                },
-                { speaker: 'assistant' },
-            ],
-            maxTokensToSample: 400,
-            temperature: 0,
-            topK: 1,
-            fast: true,
-        },
-        { apiVersion: 0 }, // Use legacy API version for now
-        signal
-    )
-
-    const streamingText: string[] = []
-    for await (const message of stream) {
-        signal?.throwIfAborted()
-        switch (message.type) {
-            case 'change': {
-                streamingText.push(message.text)
-                break
-            }
-            case 'error': {
-                throw message.error
-            }
-        }
-    }
-
-    const text = streamingText.at(-1) ?? ''
-    const parser = new XMLParser()
-    const document = parser.parse(text)
-
-    const keywords: { value?: string; variants?: string; weight?: number }[] =
-        // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
-        document?.keywords?.keyword ?? []
-    const result = new Set<string>()
-    for (const { value } of keywords) {
-        if (value) {
-            for (const v of value.split(' ')) {
-                result.add(v)
-            }
-        }
-    }
-
-    return [...result]
-}
-
 /**
  * Extracts keywords from a user query by using the completions model to identify relevant search terms.
  * The function processes the query and returns an array of individual keywords that could be found
@@ -118,7 +47,7 @@ export async function extractKeywords(
                 ...preamble,
                 {
                     speaker: 'human',
-                    text: ps`You are helping the user search over a codebase. List terms that could be found literally in code snippets or file names relevant to answering the user's query. Limit your results to terms that are in the user's query. Present your results in a *single* XML list in the following format: <keywords><keyword>a single keyword</keyword></keywords>. Here is the user query: <userQuery>${query}</userQuery>`,
+                    text: ps`You are helping the user search over a codebase. List terms that could be found literally in code snippets or file names relevant to answering the user's query. Limit your results to literal terms that are in the user's query. Present your results in a *single* XML list in the following format: <keywords><keyword>a single keyword</keyword></keywords>. Here is the user query: <userQuery>${query}</userQuery>`,
                 },
                 { speaker: 'assistant' },
             ],