Skip to content

Commit

Permalink
Fixed and finished llm similar command, closes #190
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Sep 2, 2023
1 parent 3ee9215 commit 4be89fa
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 14 deletions.
39 changes: 38 additions & 1 deletion docs/embeddings/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,45 @@ llm embed -d my-embeddings.db -c 'my happy hound' phrases hound
```
This creates a database file called `my-embeddings.db` in the current directory.

(embeddings-cli-similar)=
## llm similar

The `llm similar` command searches a collection of embeddings for the items that are most similar to a given piece of content or item ID.

To search the `quotations` collection for items that are semantically similar to `'computer science'`:

```bash
llm similar quotations -c 'computer science'
```
This embeds the provided string and returns a newline-delimited list of JSON objects like this:
```json
{"id": "philkarlton-1", "score": 0.8323904531677017, "content": null, "metadata": null}
```
You can compare against text stored in a file using `-i filename`:
```bash
llm similar quotations -i one.txt
```
Or feed text to standard input using `-i -`:
```bash
cat one.txt | llm similar quotations -i -
```

(embeddings-cli-embed-models)=
## llm embed-models

To list all available embedding models, including those provided by plugins, run this command:

```bash
llm embed-models
```
The output should look something like this:
```
ada-002 (aliases: ada)
sentence-transformers/all-MiniLM-L6-v2 (aliases: all-MiniLM-L6-v2)
```

(embeddings-cli-embed-models-default)=
## llm embed-models default
### llm embed-models default

This command can be used to get and set the default embedding model.

Expand Down
12 changes: 10 additions & 2 deletions docs/help.md
Original file line number Diff line number Diff line change
Expand Up @@ -406,9 +406,17 @@ Usage: llm similar [OPTIONS] COLLECTION [ID]
Return top N similar IDs from a collection
Example usage:
llm similar my-collection -c "I like cats"
Or to find content similar to a specific stored ID:
llm similar my-collection 1234
Options:
-i, --input FILE Content to embed for comparison
-c, --content FILE
-i, --input FILENAME File to embed for comparison
-c, --content TEXT Content to embed for comparison
-n, --number INTEGER Number of results to return
-d, --database FILE
--help Show this message and exit.
Expand Down
29 changes: 19 additions & 10 deletions llm/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import click
from click_default_group import DefaultGroup
from dataclasses import asdict
import json
from llm import (
Collection,
Expand Down Expand Up @@ -998,14 +999,10 @@ def get_db():
@click.option(
"-i",
"--input",
type=click.Path(file_okay=True, allow_dash=True, dir_okay=False),
help="Content to embed for comparison",
)
@click.option(
"-c",
"--content",
type=click.Path(file_okay=True, allow_dash=False, dir_okay=False, writable=True),
type=click.File("r"),
help="File to embed for comparison",
)
@click.option("-c", "--content", help="Content to embed for comparison")
@click.option(
"-n", "--number", type=int, default=10, help="Number of results to return"
)
Expand All @@ -1016,7 +1013,19 @@ def get_db():
envvar="LLM_EMBEDDINGS_DB",
)
def similar(collection, id, input, content, number, database):
"""Return top N similar IDs from a collection"""
"""
Return top N similar IDs from a collection
Example usage:
\b
llm similar my-collection -c "I like cats"
Or to find content similar to a specific stored ID:
\b
llm similar my-collection 1234
"""
if not id and not content and not input:
raise click.ClickException("Must provide content or an ID for the comparison")

Expand Down Expand Up @@ -1050,10 +1059,10 @@ def similar(collection, id, input, content, number, database):
content = input.read()
if not content:
raise click.ClickException("No content provided")
results = collection_obj.similar_by_content(content, number)
results = collection_obj.similar(content, number)

for result in results:
click.echo(json.dumps(result))
click.echo(json.dumps(asdict(result)))


@cli.group(
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def user_path_with_embeddings(user_path):
db = sqlite_utils.Database(path)
collection = llm.Collection(db, "demo", model_id="embed-demo")
collection.embed("1", "hello world")
collection.embed("2", "goodbye world")


@pytest.fixture
Expand Down
46 changes: 45 additions & 1 deletion tests/test_embed_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,55 @@ def test_embed_store(user_path):
(
([], "Missing argument 'COLLECTION'"),
(["badcollection", "-c", "content"], "Collection does not exist"),
(["demo", "2"], "ID not found in collection"),
(["demo", "bad-id"], "ID not found in collection"),
),
)
def test_similar_errors(args, expected_error, user_path_with_embeddings):
runner = CliRunner()
result = runner.invoke(cli, ["similar"] + args, catch_exceptions=False)
assert result.exit_code != 0
assert expected_error in result.output


def test_similar_by_id_cli(user_path_with_embeddings):
    """Run `llm similar demo 1` and check the single JSON result it prints."""
    expected = {
        "id": "2",
        "score": pytest.approx(0.9863939238321437),
        "content": None,
        "metadata": None,
    }
    outcome = CliRunner().invoke(
        cli, ["similar", "demo", "1"], catch_exceptions=False
    )
    assert outcome.exit_code == 0
    # Parsing the whole output at once only succeeds when exactly one
    # JSON document was emitted.
    assert json.loads(outcome.output) == expected


@pytest.mark.parametrize("scenario", ("argument", "file", "stdin"))
def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario):
    """Check `llm similar demo` output when content comes via -c, -i FILE or -i -.

    Each scenario supplies the same text ("hello world") through a different
    channel and expects the same two JSON lines back, in the same order.
    """
    text = "hello world"
    cli_args = ["similar", "demo"]
    stdin_text = None  # only populated for the stdin scenario

    if scenario == "argument":
        cli_args += ["-c", text]
    elif scenario == "file":
        content_file = tmpdir / "content.txt"
        content_file.write_text(text, "utf-8")
        cli_args += ["-i", str(content_file)]
    elif scenario == "stdin":
        stdin_text = text
        cli_args += ["-i", "-"]

    outcome = CliRunner().invoke(
        cli, cli_args, input=stdin_text, catch_exceptions=False
    )
    assert outcome.exit_code == 0

    # Ignore blank lines so trailing newlines do not affect the count.
    printed = [row for row in outcome.output.splitlines() if row.strip()]
    assert len(printed) == 2

    expected_rows = (
        ("1", 0.9999999999999999),
        ("2", 0.9863939238321437),
    )
    for raw, (item_id, score) in zip(printed, expected_rows):
        assert json.loads(raw) == {
            "id": item_id,
            "score": pytest.approx(score),
            "content": None,
            "metadata": None,
        }

0 comments on commit 4be89fa

Please sign in to comment.