diff --git a/docs/embeddings/cli.md b/docs/embeddings/cli.md index 7bcabc29..f4286ea7 100644 --- a/docs/embeddings/cli.md +++ b/docs/embeddings/cli.md @@ -73,8 +73,45 @@ llm embed -d my-embeddings.db -c 'my happy hound' phrases hound ``` This creates a database file called `my-embeddings.db` in the current directory. +(embeddings-cli-similar)= +## llm similar + +The `llm similar` command searches a collection of embeddings for the items that are most similar to a given string or item ID. + +To search the `quotations` collection for items that are semantically similar to `'computer science'`: + +```bash +llm similar quotations -c 'computer science' +``` +This embeds the provided string and returns a newline-delimited list of JSON objects like this: +```json +{"id": "philkarlton-1", "score": 0.8323904531677017, "content": null, "metadata": null} +``` +You can compare against text stored in a file using `-i filename`: +```bash +llm similar quotations -i one.txt +``` +Or feed text to standard input using `-i -`: +```bash +cat one.txt | llm similar quotations -i - +``` + +(embeddings-cli-embed-models)= +## llm embed-models + +To list all available embedding models, including those provided by plugins, run this command: + +```bash +llm embed-models +``` +The output should look something like this: +``` +ada-002 (aliases: ada) +sentence-transformers/all-MiniLM-L6-v2 (aliases: all-MiniLM-L6-v2) +``` + (embeddings-cli-embed-models-default)= -## llm embed-models default +### llm embed-models default This command can be used to get and set the default embedding model. 
diff --git a/docs/help.md b/docs/help.md index 28ed37e5..b6b6fafe 100644 --- a/docs/help.md +++ b/docs/help.md @@ -406,9 +406,17 @@ Usage: llm similar [OPTIONS] COLLECTION [ID] Return top N similar IDs from a collection + Example usage: + + llm similar my-collection -c "I like cats" + + Or to find content similar to a specific stored ID: + + llm similar my-collection 1234 + Options: - -i, --input FILE Content to embed for comparison - -c, --content FILE + -i, --input FILENAME File to embed for comparison + -c, --content TEXT Content to embed for comparison -n, --number INTEGER Number of results to return -d, --database FILE --help Show this message and exit. diff --git a/llm/cli.py b/llm/cli.py index 13bb6d65..214dec1b 100644 --- a/llm/cli.py +++ b/llm/cli.py @@ -1,5 +1,6 @@ import click from click_default_group import DefaultGroup +from dataclasses import asdict import json from llm import ( Collection, @@ -998,14 +999,10 @@ def get_db(): @click.option( "-i", "--input", - type=click.Path(file_okay=True, allow_dash=True, dir_okay=False), - help="Content to embed for comparison", -) -@click.option( - "-c", - "--content", - type=click.Path(file_okay=True, allow_dash=False, dir_okay=False, writable=True), + type=click.File("r"), + help="File to embed for comparison", ) +@click.option("-c", "--content", help="Content to embed for comparison") @click.option( "-n", "--number", type=int, default=10, help="Number of results to return" ) @@ -1016,7 +1013,19 @@ def get_db(): envvar="LLM_EMBEDDINGS_DB", ) def similar(collection, id, input, content, number, database): - """Return top N similar IDs from a collection""" + """ + Return top N similar IDs from a collection + + Example usage: + + \b + llm similar my-collection -c "I like cats" + + Or to find content similar to a specific stored ID: + + \b + llm similar my-collection 1234 + """ if not id and not content and not input: raise click.ClickException("Must provide content or an ID for the comparison") @@ -1050,10 +1059,10 
@@ def similar(collection, id, input, content, number, database): content = input.read() if not content: raise click.ClickException("No content provided") - results = collection_obj.similar_by_content(content, number) + results = collection_obj.similar(content, number) for result in results: - click.echo(json.dumps(result)) + click.echo(json.dumps(asdict(result))) @cli.group( diff --git a/tests/conftest.py b/tests/conftest.py index 5a8a5e45..0b7dc504 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,7 @@ def user_path_with_embeddings(user_path): db = sqlite_utils.Database(path) collection = llm.Collection(db, "demo", model_id="embed-demo") collection.embed("1", "hello world") + collection.embed("2", "goodbye world") @pytest.fixture diff --git a/tests/test_embed_cli.py b/tests/test_embed_cli.py index 48485566..a1acaf13 100644 --- a/tests/test_embed_cli.py +++ b/tests/test_embed_cli.py @@ -110,7 +110,7 @@ def test_embed_store(user_path): ( ([], "Missing argument 'COLLECTION'"), (["badcollection", "-c", "content"], "Collection does not exist"), - (["demo", "2"], "ID not found in collection"), + (["demo", "bad-id"], "ID not found in collection"), ), ) def test_similar_errors(args, expected_error, user_path_with_embeddings): @@ -118,3 +118,47 @@ def test_similar_errors(args, expected_error, user_path_with_embeddings): result = runner.invoke(cli, ["similar"] + args, catch_exceptions=False) assert result.exit_code != 0 assert expected_error in result.output + + +def test_similar_by_id_cli(user_path_with_embeddings): + runner = CliRunner() + result = runner.invoke(cli, ["similar", "demo", "1"], catch_exceptions=False) + assert result.exit_code == 0 + assert json.loads(result.output) == { + "id": "2", + "score": pytest.approx(0.9863939238321437), + "content": None, + "metadata": None, + } + + +@pytest.mark.parametrize("scenario", ("argument", "file", "stdin")) +def test_similar_by_content_cli(tmpdir, user_path_with_embeddings, scenario): + runner = 
CliRunner() + args = ["similar", "demo"] + input = None + if scenario == "argument": + args.extend(["-c", "hello world"]) + elif scenario == "file": + path = tmpdir / "content.txt" + path.write_text("hello world", "utf-8") + args.extend(["-i", str(path)]) + elif scenario == "stdin": + input = "hello world" + args.extend(["-i", "-"]) + result = runner.invoke(cli, args, input=input, catch_exceptions=False) + assert result.exit_code == 0 + lines = [line for line in result.output.splitlines() if line.strip()] + assert len(lines) == 2 + assert json.loads(lines[0]) == { + "id": "1", + "score": pytest.approx(0.9999999999999999), + "content": None, + "metadata": None, + } + assert json.loads(lines[1]) == { + "id": "2", + "score": pytest.approx(0.9863939238321437), + "content": None, + "metadata": None, + }