Fixed and finished llm similar command, closes #190

simonw · Sep 2, 2023 · 4be89fa · 4be89fa
1 parent 3ee9215
commit 4be89fa
Show file tree

Hide file tree

Showing 5 changed files with 113 additions and 14 deletions.
diff --git a/docs/embeddings/cli.md b/docs/embeddings/cli.md
@@ -73,8 +73,45 @@ llm embed -d my-embeddings.db -c 'my happy hound' phrases hound
 ```
 This creates a database file called `my-embeddings.db` in the current directory.
 
+(embeddings-cli-similar)=
+## llm similar
+
+The `llm similar` command searches a collection of embeddings for the items that are most similar to a given piece of content or to a stored item ID.
+
+To search the `quotations` collection for items that are semantically similar to `'computer science'`:
+
+```bash
+llm similar quotations -c 'computer science'
+```
+This embeds the provided string and returns a newline-delimited list of JSON objects like this:
+```json
+{"id": "philkarlton-1", "score": 0.8323904531677017, "content": null, "metadata": null}
+```
+You can compare against text stored in a file using `-i filename`:
+```bash
+llm similar quotations -i one.txt
+```
+Or feed text to standard input using `-i -`:
+```bash
+cat one.txt | llm similar quotations -i -
+```
+
+(embeddings-cli-embed-models)=
+## llm embed-models
+
+To list all available embedding models, including those provided by plugins, run this command:
+
+```bash
+llm embed-models
+```
+The output should look something like this:
+```
+ada-002 (aliases: ada)
+sentence-transformers/all-MiniLM-L6-v2 (aliases: all-MiniLM-L6-v2)
+```
+
 (embeddings-cli-embed-models-default)=
-## llm embed-models default
+### llm embed-models default
 
 This command can be used to get and set the default embedding model.
 

diff --git a/docs/help.md b/docs/help.md
@@ -406,9 +406,17 @@ Usage: llm similar [OPTIONS] COLLECTION [ID]
 
   Return top N similar IDs from a collection
 
+  Example usage:
+
+      llm similar my-collection -c "I like cats"
+
+  Or to find content similar to a specific stored ID:
+
+      llm similar my-collection 1234
+
 Options:
-  -i, --input FILE      Content to embed for comparison
-  -c, --content FILE
+  -i, --input FILENAME  File to embed for comparison
+  -c, --content TEXT    Content to embed for comparison
   -n, --number INTEGER  Number of results to return
   -d, --database FILE
   --help                Show this message and exit.

diff --git a/llm/cli.py b/llm/cli.py
@@ -1,5 +1,6 @@
 import click
 from click_default_group import DefaultGroup
+from dataclasses import asdict
 import json
 from llm import (
     Collection,
@@ -998,14 +999,10 @@ def get_db():
 @click.option(
     "-i",
     "--input",
-    type=click.Path(file_okay=True, allow_dash=True, dir_okay=False),
-    help="Content to embed for comparison",
-)
-@click.option(
-    "-c",
-    "--content",
-    type=click.Path(file_okay=True, allow_dash=False, dir_okay=False, writable=True),
+    type=click.File("r"),
+    help="File to embed for comparison",
 )
+@click.option("-c", "--content", help="Content to embed for comparison")
 @click.option(
     "-n", "--number", type=int, default=10, help="Number of results to return"
 )
@@ -1016,7 +1013,19 @@ def get_db():
     envvar="LLM_EMBEDDINGS_DB",
 )
 def similar(collection, id, input, content, number, database):
-    """Return top N similar IDs from a collection"""
+    """
+    Return top N similar IDs from a collection
+
+    Example usage:
+
+    \b
+        llm similar my-collection -c "I like cats"
+
+    Or to find content similar to a specific stored ID:
+
+    \b
+        llm similar my-collection 1234
+    """
     if not id and not content and not input:
         raise click.ClickException("Must provide content or an ID for the comparison")
 
@@ -1050,10 +1059,10 @@ def similar(collection, id, input, content, number, database):
             content = input.read()
         if not content:
             raise click.ClickException("No content provided")
-        results = collection_obj.similar_by_content(content, number)
+        results = collection_obj.similar(content, number)
 
     for result in results:
-        click.echo(json.dumps(result))
+        click.echo(json.dumps(asdict(result)))
 
 
 @cli.group(

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -23,6 +23,7 @@ def user_path_with_embeddings(user_path):
     db = sqlite_utils.Database(path)
     collection = llm.Collection(db, "demo", model_id="embed-demo")
     collection.embed("1", "hello world")
+    collection.embed("2", "goodbye world")
 
 
 @pytest.fixture

diff --git a/tests/test_embed_cli.py b/tests/test_embed_cli.py
@@ -110,11 +110,55 @@ def test_embed_store(user_path):
     (
         ([], "Missing argument 'COLLECTION'"),
         (["badcollection", "-c", "content"], "Collection does not exist"),
-        (["demo", "2"], "ID not found in collection"),
+        (["demo", "bad-id"], "ID not found in collection"),
     ),
 )
 def test_similar_errors(args, expected_error, user_path_with_embeddings):
     runner = CliRunner()
     result = runner.invoke(cli, ["similar"] + args, catch_exceptions=False)
     assert result.exit_code != 0
     assert expected_error in result.output
+
+
+def test_similar_by_id_cli(user_path_with_embeddings):
+    runner = CliRunner()
+    result = runner.invoke(cli, ["similar", "demo", "1"], catch_exceptions=False)
+    assert result.exit_code == 0
+    assert json.loads(result.output) == {
+        "id": "2",
+        "score": pytest.approx(0.9863939238321437),
+        "content": None,
+        "metadata": None,
+    }
+
+
+@pytest.mark.parametrize("scenario", ("argument", "file", "stdin"))
+def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario):
+    runner = CliRunner()
+    args = ["similar", "demo"]
+    input = None
+    if scenario == "argument":
+        args.extend(["-c", "hello world"])
+    elif scenario == "file":
+        path = tmpdir / "content.txt"
+        path.write_text("hello world", "utf-8")
+        args.extend(["-i", str(path)])
+    elif scenario == "stdin":
+        input = "hello world"
+        args.extend(["-i", "-"])
+    result = runner.invoke(cli, args, input=input, catch_exceptions=False)
+    assert result.exit_code == 0
+    lines = [line for line in result.output.splitlines() if line.strip()]
+    assert len(lines) == 2
+    assert json.loads(lines[0]) == {
+        "id": "1",
+        "score": pytest.approx(0.9999999999999999),
+        "content": None,
+        "metadata": None,
+    }
+    assert json.loads(lines[1]) == {
+        "id": "2",
+        "score": pytest.approx(0.9863939238321437),
+        "content": None,
+        "metadata": None,
+    }