Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unify keyword extraction prompts #6689

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions agent/src/cli/command-bench/strategy-chat-context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ async function runContextCommand(
const repoIDNamesCache = new Map<string, string>()

for (const example of examples) {
const start = Date.now()
const { targetRepoRevs, query: origQuery } = example
console.log({ query: example.query })
const repoNames = targetRepoRevs.map(repoRev => repoRev.repoName)

// Get repo IDs from cache or fetch them
Expand All @@ -108,6 +110,7 @@ async function runContextCommand(
uncachedRepoNames,
uncachedRepoNames.length + 10
)
console.log('repo id names', Date.now() - start)
if (isError(fetchedRepoIDNames)) {
throw new Error(
`getRepoIds failed for [${uncachedRepoNames.join(',')}]: ${fetchedRepoIDNames}`
Expand Down Expand Up @@ -135,6 +138,7 @@ async function runContextCommand(
completionsClient,
PromptString.unsafe_fromUserQuery(origQuery)
)
console.log('rewrote keyword query', Date.now() - start)
}

const resultsResp = await graphqlClient.contextSearchEvalDebug({
Expand All @@ -144,6 +148,7 @@ async function runContextCommand(
codeResultsCount: clientOpts.codeResultsCount,
textResultsCount: clientOpts.textResultsCount,
})
console.log('fetched context', Date.now() - start)

if (isError(resultsResp)) {
throw new Error(
Expand Down
79 changes: 4 additions & 75 deletions vscode/src/local-context/rewrite-keyword-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@ import {
} from '@sourcegraph/cody-shared'
import { outputChannelLogger } from '../output-channel-logger'

import { francAll } from 'franc-min'

const containsMultipleSentences = /[.!?][\s\r\n]+\w/

/**
* Rewrite the query, using the fast completions model to pull out keywords.
*
Expand All @@ -22,85 +18,18 @@ export async function rewriteKeywordQuery(
query: PromptString,
signal?: AbortSignal
): Promise<string> {
// In evals, we saw that rewriting tends to make performance worse for simple queries. So we only rewrite
// in cases where it clearly helps: when it's likely in a non-English language, or there are multiple
// sentences (so we really need to distill the question).
const queryString = query.toString()
if (!containsMultipleSentences.test(queryString)) {
const english = francAll(queryString).find(v => v[0] === 'eng')
if (english && english[1] > 0.9) {
return queryString
}
}

try {
const rewritten = await doRewrite(completionsClient, query, signal)
const rewritten = await extractKeywords(completionsClient, query, signal!)
console.log({ rewritten })
return rewritten.length !== 0 ? rewritten.sort().join(' ') : query.toString()
} catch (err) {
console.log({ err })
outputChannelLogger.logDebug('rewrite-keyword-query', 'failed', { verbose: err })
// If we fail to rewrite, just return the original query.
return query.toString()
}
}

/**
 * Asks the completions model (with `fast: true`) for keywords relevant to the user's query and
 * returns the de-duplicated, whitespace-separated terms found in each keyword's `<value>` element.
 *
 * Only the text of the last `change` message received from the stream is parsed.
 *
 * @param completionsClient client used to stream the completion
 * @param query the user's query, interpolated into the prompt
 * @param signal optional abort signal; checked on every streamed message
 * @returns the unique keyword terms, in first-seen order (empty if the response has no keywords)
 * @throws the error carried by an `error` stream message, or the abort reason if `signal` is aborted
 */
async function doRewrite(
    completionsClient: SourcegraphCompletionsClient,
    query: PromptString,
    signal?: AbortSignal
): Promise<string[]> {
    const preamble = getSimplePreamble(undefined, 0, 'Default')
    const stream = completionsClient.stream(
        {
            messages: [
                ...preamble,
                {
                    speaker: 'human',
                    text: ps`You are helping the user search over a codebase. List some filename fragments that would match files relevant to read to answer the user's query. Present your results in a *single* XML list in the following format: <keywords><keyword><value>a single keyword</value><variants>a space separated list of synonyms and variants of the keyword, including acronyms, abbreviations, and expansions</variants><weight>a numerical weight between 0.0 and 1.0 that indicates the importance of the keyword</weight></keyword></keywords>. Here is the user query: <userQuery>${query}</userQuery>`,
                },
                { speaker: 'assistant' },
            ],
            maxTokensToSample: 400,
            temperature: 0,
            topK: 1,
            fast: true,
        },
        { apiVersion: 0 }, // Use legacy API version for now
        signal
    )

    const streamingText: string[] = []
    for await (const message of stream) {
        signal?.throwIfAborted()
        switch (message.type) {
            case 'change': {
                streamingText.push(message.text)
                break
            }
            case 'error': {
                throw message.error
            }
        }
    }

    const text = streamingText.at(-1) ?? ''
    const parser = new XMLParser()
    const document = parser.parse(text)

    // NOTE(review): assuming XMLParser is fast-xml-parser's (import not visible here). With its
    // defaults, a lone <keyword> element is parsed as a single object rather than a one-element
    // array, and numeric-looking tag values are parsed as numbers. Normalize both cases so a
    // one-keyword or numeric response does not throw.
    // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
    const rawKeywords: unknown = document?.keywords?.keyword ?? []
    const keywords: unknown[] = Array.isArray(rawKeywords) ? rawKeywords : [rawKeywords]

    const result = new Set<string>()
    for (const keyword of keywords) {
        if (typeof keyword !== 'object' || keyword === null) {
            // A bare <keyword>text</keyword> without a <value> child carries no value to extract.
            continue
        }
        const value = (keyword as { value?: unknown }).value
        if (typeof value !== 'string' && typeof value !== 'number') {
            continue
        }
        // Split on any run of whitespace and drop empty fragments so we never emit '' as a keyword.
        for (const term of String(value).split(/\s+/)) {
            if (term.length > 0) {
                result.add(term)
            }
        }
    }

    return [...result]
}

/**
* Extracts keywords from a user query by using the completions model to identify relevant search terms.
* The function processes the query and returns an array of individual keywords that could be found
Expand All @@ -118,7 +47,7 @@ export async function extractKeywords(
...preamble,
{
speaker: 'human',
text: ps`You are helping the user search over a codebase. List terms that could be found literally in code snippets or file names relevant to answering the user's query. Limit your results to terms that are in the user's query. Present your results in a *single* XML list in the following format: <keywords><keyword>a single keyword</keyword></keywords>. Here is the user query: <userQuery>${query}</userQuery>`,
text: ps`You are helping the user search over a codebase. List terms that could be found literally in code snippets or file names relevant to answering the user's query. Limit your results to literal terms that are in the user's query. Present your results in a *single* XML list in the following format: <keywords><keyword>a single keyword</keyword></keywords>. Here is the user query: <userQuery>${query}</userQuery>`,
},
{ speaker: 'assistant' },
],
Expand Down
Loading