From ec5d17918e48fc9220f9b6d32c534670280c97f2 Mon Sep 17 00:00:00 2001 From: Karol Wilk <115570377+karol-t-wilk@users.noreply.github.com> Date: Thu, 7 Mar 2024 07:18:18 +0100 Subject: [PATCH] Create minimal prototype (#4) * Add pgvector to project and workflows * Run formatting * Add basic functionality for the embedding model * Use builtin text embedding for the serving * Run formatting * Create context and tests for documentation fragments * Add moduledocs for Embedding and Fragment * Enable async testing for fragments * Add parsing of ExDoc generated docs for raw documentation * Bump Bumblebee version * Use regex for search_data matching, update tests accordingly * Add Hex API client * Refactor ExDocParser to work better with Req * Prep HexClient for testing * Add tests for HexClient * Run formatter * Refactor map_json_to_releases to reduce nesting * Add mix task for indexing package * Add simple frontend for the app * Try to fix auth issue with postgres * Revert "Try to fix auth issue with postgres" This reverts commit 9a4e70d777f93eab79ce23bcf8a696701076e5ff. 
* Remove dummy test from Phoenix * Use Jason instead of JSON * Refactor ExDocParser.extract_search_data, clean up tests * Refactor parse_search_data * Refactor HexClient and HexClientTest * Refactor the search page and controller * Fix code style in tests * Run formatter * Refactor parse_search_data for readability, remove unnecessary error type * Clean up HexClient helper * Remove tgz helper * Clean up import/alias in EmbeddingTest * Use case instead of with * Add non null constraint to fragments * Inline unnecessary private function --- .github/workflows/test.yml | 2 +- config/config.exs | 6 + config/test.exs | 6 + lib/mix/tasks/search/index.ex | 53 ++++ lib/search/application.ex | 2 +- lib/search/embedding.ex | 23 ++ lib/search/ex_doc_parser.ex | 38 +++ lib/search/fragment.ex | 53 ++++ lib/search/hex_client.ex | 47 ++++ lib/search/hex_client/release.ex | 9 + lib/search/postgrex_types.ex | 5 + .../components/layouts/app.html.heex | 20 +- lib/search_web/controllers/page_controller.ex | 23 +- .../controllers/page_html/home.html.heex | 229 +----------------- .../controllers/page_html/search.html.heex | 3 + lib/search_web/router.ex | 1 + mix.exs | 8 +- mix.lock | 29 ++- ...20240223110820_create_vector_extension.exs | 11 + .../20240226120519_create_fragments.exs | 12 + priv/static/images/temp-logo.svg | 6 + test/search/embedding_test.exs | 22 ++ test/search/ex_doc_parser_test.exs | 47 ++++ test/search/fragment_test.exs | 63 +++++ test/search/hex_client_test.exs | 107 ++++++++ .../controllers/page_controller_test.exs | 8 - 26 files changed, 580 insertions(+), 253 deletions(-) create mode 100644 lib/mix/tasks/search/index.ex create mode 100644 lib/search/embedding.ex create mode 100644 lib/search/ex_doc_parser.ex create mode 100644 lib/search/fragment.ex create mode 100644 lib/search/hex_client.ex create mode 100644 lib/search/hex_client/release.ex create mode 100644 lib/search/postgrex_types.ex create mode 100644 lib/search_web/controllers/page_html/search.html.heex 
create mode 100644 priv/repo/migrations/20240223110820_create_vector_extension.exs create mode 100644 priv/repo/migrations/20240226120519_create_fragments.exs create mode 100644 priv/static/images/temp-logo.svg create mode 100644 test/search/embedding_test.exs create mode 100644 test/search/ex_doc_parser_test.exs create mode 100644 test/search/fragment_test.exs create mode 100644 test/search/hex_client_test.exs delete mode 100644 test/search_web/controllers/page_controller_test.exs diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 01f7cad..ca0507c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: services: db: - image: postgres:16.2 + image: pgvector/pgvector:pg16 ports: ["5432:5432"] env: POSTGRES_PASSWORD: postgres diff --git a/config/config.exs b/config/config.exs index 602c529..249f185 100644 --- a/config/config.exs +++ b/config/config.exs @@ -11,6 +11,9 @@ config :search, ecto_repos: [Search.Repo], generators: [timestamp_type: :utc_datetime] +# Add types added by the pgvector-elixir extension to Postgrex +config :search, Search.Repo, types: Search.PostgrexTypes + # Configures the endpoint config :search, SearchWeb.Endpoint, url: [host: "localhost"], @@ -52,6 +55,9 @@ config :logger, :console, # Use Jason for JSON parsing in Phoenix config :phoenix, :json_library, Jason +# Configure the EXLA backend for Nx +config :nx, :default_backend, EXLA.Backend + # Import environment specific config. This must remain at the bottom # of this file so it overrides the configuration defined above. 
import_config "#{config_env()}.exs" diff --git a/config/test.exs b/config/test.exs index 65d7210..9959654 100644 --- a/config/test.exs +++ b/config/test.exs @@ -20,6 +20,12 @@ config :search, SearchWeb.Endpoint, secret_key_base: "KoDCWrF9SEsnoM8svaiDh9g62hqg8cKGhafXsTKunfl/FVMTq1psZAyOoMp3eIO2", server: false +# Enable testing plug for Req to enable stubs in HexClientTest +config :search, + hex_client_req_options: [ + plug: {Req.Test, Search.HexClient} + ] + # Print only warnings and errors during test config :logger, level: :warning diff --git a/lib/mix/tasks/search/index.ex b/lib/mix/tasks/search/index.ex new file mode 100644 index 0000000..cfcb899 --- /dev/null +++ b/lib/mix/tasks/search/index.ex @@ -0,0 +1,53 @@ +defmodule Mix.Tasks.Search.Index do + @moduledoc """ + Usage: mix #{Mix.Task.task_name(__MODULE__)} PACKAGE [VERSION] + + Fetches the documentation for the given package from Hex and indexes it using the embedding model. + + If the version is omitted, it will choose the newest release. + """ + @shortdoc "Indexes a package's documentation" + + use Mix.Task + + @requirements ["app.start"] + + @impl Mix.Task + def run(args) do + [package | args_tail] = args + {:ok, releases} = Search.HexClient.get_releases(package) + + release = + case args_tail do + [version] -> + version = Version.parse!(version) + Enum.find(releases, &(&1.version == version)) + + [] -> + Enum.max_by(releases, & &1.version, Version, fn -> nil end) + end + + if release do + {:ok, tarball} = Search.HexClient.get_docs_tarball(release) + {:ok, docs} = Search.ExDocParser.extract_search_data(tarball) + docs = Enum.map(docs, & &1["doc"]) + docs_len = length(docs) + + ProgressBar.render(0, docs_len) + + docs + |> Stream.with_index(1) + |> Enum.each(fn {doc, i} -> + %{embedding: embedding} = Nx.Serving.batched_run(Search.Embedding, doc) + + ProgressBar.render(i, docs_len) + + fragment = %Search.Fragment{doc_text: doc, embedding: embedding} + + Search.Repo.insert!(fragment) + end) + else + 
Mix.shell().error("Release not found.") + end + end +end diff --git a/lib/search/application.ex b/lib/search/application.ex index 3242044..c4ef71b 100644 --- a/lib/search/application.ex +++ b/lib/search/application.ex @@ -13,7 +13,7 @@ defmodule Search.Application do {DNSCluster, query: Application.get_env(:search, :dns_cluster_query) || :ignore}, {Phoenix.PubSub, name: Search.PubSub}, # Start a worker by calling: Search.Worker.start_link(arg) - # {Search.Worker, arg}, + {Search.Embedding, name: Search.Embedding}, # Start to serve requests, typically the last entry SearchWeb.Endpoint ] diff --git a/lib/search/embedding.ex b/lib/search/embedding.ex new file mode 100644 index 0000000..9e92d33 --- /dev/null +++ b/lib/search/embedding.ex @@ -0,0 +1,23 @@ +defmodule Search.Embedding do + @moduledoc """ + Provides text embedding capabilities. Currently uses Bumblebee with Sentence Transformers paraphrase L3 model + """ + + @embedding_size 384 + @model_repo {:hf, "sentence-transformers/paraphrase-MiniLM-L3-v2"} + + def embedding_size, do: @embedding_size + + def child_spec(opts) do + opts + |> Keyword.merge(serving: load_model()) + |> Nx.Serving.child_spec() + end + + defp load_model() do + {:ok, model_info} = Bumblebee.load_model(@model_repo) + {:ok, tokenizer} = Bumblebee.load_tokenizer(@model_repo) + + Bumblebee.Text.text_embedding(model_info, tokenizer) + end +end diff --git a/lib/search/ex_doc_parser.ex b/lib/search/ex_doc_parser.ex new file mode 100644 index 0000000..b975923 --- /dev/null +++ b/lib/search/ex_doc_parser.ex @@ -0,0 +1,38 @@ +defmodule Search.ExDocParser do + @moduledoc """ + Contains functionality for extracting the raw documentation with metadata from a tarball of an + ExDoc-generated documentation page + """ + + def extract_search_data(untarred_docs) when is_list(untarred_docs) do + search_data = + Enum.find_value(untarred_docs, fn {path, contents} -> + if match?(~c"dist/search_data-" ++ _, path) do + contents + end + end) + + if search_data do + 
parse_search_data(search_data) + else + {:error, "Search data not found, package documentation is not in a supported format."} + end + end + + @search_data_prefix "searchData=" + defp parse_search_data(search_data) when is_binary(search_data) do + case search_data do + @search_data_prefix <> json -> + case Jason.decode(json) do + {:ok, %{"items" => items}} -> + {:ok, items} + + _ -> + {:error, "Search data content is invalid JSON"} + end + + _ -> + {:error, "Search data content does not start with \"#{@search_data_prefix}\"."} + end + end +end diff --git a/lib/search/fragment.ex b/lib/search/fragment.ex new file mode 100644 index 0000000..f3349a1 --- /dev/null +++ b/lib/search/fragment.ex @@ -0,0 +1,53 @@ +defmodule Search.Fragment do + @moduledoc """ + Context for indexed documentation fragments - each fragment has associated with it an embedding vector, upon which + kNN lookup can be performed. + """ + + alias Search.{Fragment, Repo} + use Ecto.Schema + import Ecto.{Changeset, Query} + import Pgvector.Ecto.Query + + # Pgvector cannot handle inner product heuristic in ascending order, making it useless here + @metrics [:cosine, :l2] + + schema "fragments" do + field :doc_text, :string + field :embedding, Pgvector.Ecto.Vector + + timestamps(type: :utc_datetime) + end + + def metrics, do: @metrics + + def knn_lookup(query_tensor, opts \\ []) do + opts = Keyword.validate!(opts, metric: :cosine, k: nil) + metric = opts[:metric] + k = opts[:k] + + query = + case metric do + :cosine -> + from f in Fragment, + order_by: cosine_distance(f.embedding, ^query_tensor), + limit: ^k, + select: f + + :l2 -> + from f in Fragment, + order_by: l2_distance(f.embedding, ^query_tensor), + limit: ^k, + select: f + end + + Repo.all(query) + end + + @doc false + def changeset(fragment, attrs) do + fragment + |> cast(attrs, [:doc_text, :embedding]) + |> validate_required([:doc_text, :embedding]) + end +end diff --git a/lib/search/hex_client.ex b/lib/search/hex_client.ex new file mode 
100644 index 0000000..febefd9 --- /dev/null +++ b/lib/search/hex_client.ex @@ -0,0 +1,47 @@ +defmodule Search.HexClient do + @api_url "https://hex.pm/api" + @repo_url "https://repo.hex.pm" + + alias Search.HexClient + + def get_releases(package_name) when is_binary(package_name) do + case get("#{@api_url}/packages/#{package_name}") do + {:ok, %{status: 200, body: %{"releases" => releases}}} -> + res = + for %{"has_docs" => has_docs, "version" => version} <- releases do + %HexClient.Release{ + package_name: package_name, + version: Version.parse!(version), + has_docs: has_docs + } + end + + {:ok, res} + + {:ok, %{status: status}} -> + {:error, "HTTP #{status}"} + + err -> + err + end + end + + def get_docs_tarball( + %HexClient.Release{has_docs: has_docs, package_name: package_name, version: version} = + _release + ) do + if has_docs do + case get("#{@repo_url}/docs/#{package_name}-#{version}.tar.gz") do + {:ok, %{status: 200, body: body}} -> {:ok, body} + {:ok, %{status: status}} -> {:error, "HTTP #{status}"} + err -> err + end + else + {:error, "Package release has no documentation."} + end + end + + defp get(url) do + Req.get(url, Application.get_env(:search, :hex_client_req_options, [])) + end +end diff --git a/lib/search/hex_client/release.ex b/lib/search/hex_client/release.ex new file mode 100644 index 0000000..b666a19 --- /dev/null +++ b/lib/search/hex_client/release.ex @@ -0,0 +1,9 @@ +defmodule Search.HexClient.Release do + defstruct [:package_name, :version, :has_docs] + + @type t :: %__MODULE__{ + package_name: String.t(), + version: Version.t(), + has_docs: boolean() + } +end diff --git a/lib/search/postgrex_types.ex b/lib/search/postgrex_types.ex new file mode 100644 index 0000000..585610d --- /dev/null +++ b/lib/search/postgrex_types.ex @@ -0,0 +1,5 @@ +Postgrex.Types.define( + Search.PostgrexTypes, + [Pgvector.Extensions.Vector] ++ Ecto.Adapters.Postgres.extensions(), + [] +) diff --git a/lib/search_web/components/layouts/app.html.heex 
b/lib/search_web/components/layouts/app.html.heex index e23bfc8..8c3a331 100644 --- a/lib/search_web/components/layouts/app.html.heex +++ b/lib/search_web/components/layouts/app.html.heex @@ -2,26 +2,12 @@
- + -

- v<%= Application.spec(:phoenix, :vsn) %> +

+ v<%= Application.spec(:search, :vsn) %>

-
diff --git a/lib/search_web/controllers/page_controller.ex b/lib/search_web/controllers/page_controller.ex index c315b25..d260396 100644 --- a/lib/search_web/controllers/page_controller.ex +++ b/lib/search_web/controllers/page_controller.ex @@ -4,6 +4,27 @@ defmodule SearchWeb.PageController do def home(conn, _params) do # The home page is often custom made, # so skip the default app layout. - render(conn, :home, layout: false) + render(conn, :home, form: Phoenix.Component.to_form(%{"search_text" => nil, "k" => nil})) + end + + def search(conn, %{"k" => k, "search_text" => search_text} = params) do + k = String.to_integer(k) + search_text = String.trim(search_text) + + errors = + if search_text == "" do + [search_text: {"Can't be blank", []}] + else + [] + end + + if errors == [] do + %{embedding: query_tensor} = Nx.Serving.batched_run(Search.Embedding, search_text) + fragments = Search.Fragment.knn_lookup(query_tensor, k: k) + + render(conn, :search, fragments: fragments) + else + render(conn, :home, form: Phoenix.Component.to_form(params, errors: errors)) + end end end diff --git a/lib/search_web/controllers/page_html/home.html.heex b/lib/search_web/controllers/page_html/home.html.heex index dc1820b..e4931eb 100644 --- a/lib/search_web/controllers/page_html/home.html.heex +++ b/lib/search_web/controllers/page_html/home.html.heex @@ -1,222 +1,7 @@ -<.flash_group flash={@flash} /> - -
-
- -

- Phoenix Framework - - v<%= Application.spec(:phoenix, :vsn) %> - -

-

- Peace of mind from prototype to production. -

-

- Build rich, interactive web applications quickly, with less code and fewer moving parts. Join our growing community of developers using Phoenix to craft APIs, HTML5 apps and more, for fun or at scale. -

- -
-
+<.simple_form for={@form} class="bg-transparent" action="/search" method="get"> + <.input field={@form[:search_text]} label="Search prompt" required /> + <.input field={@form[:k]} type="number" label="Number of entries" required min="1" value="10" /> + <:actions> + <.button>Search + + diff --git a/lib/search_web/controllers/page_html/search.html.heex b/lib/search_web/controllers/page_html/search.html.heex new file mode 100644 index 0000000..d2ecb4a --- /dev/null +++ b/lib/search_web/controllers/page_html/search.html.heex @@ -0,0 +1,3 @@ +
+ <%= raw(Earmark.as_html!(fragment.doc_text)) %> +
diff --git a/lib/search_web/router.ex b/lib/search_web/router.ex index 9d6cea9..98b8163 100644 --- a/lib/search_web/router.ex +++ b/lib/search_web/router.ex @@ -18,6 +18,7 @@ defmodule SearchWeb.Router do pipe_through :browser get "/", PageController, :home + get "/search", PageController, :search end # Other scopes may use custom stacks. diff --git a/mix.exs b/mix.exs index 8f9af9c..808b54a 100644 --- a/mix.exs +++ b/mix.exs @@ -54,7 +54,13 @@ defmodule Search.MixProject do {:telemetry_poller, "~> 1.0"}, {:jason, "~> 1.2"}, {:dns_cluster, "~> 0.1.1"}, - {:bandit, "~> 1.2"} + {:bandit, "~> 1.2"}, + {:pgvector, "~> 0.2.0"}, + {:bumblebee, "~> 0.5.3"}, + {:exla, ">= 0.0.0"}, + {:req, "~> 0.4.0"}, + {:progress_bar, "> 0.0.0"}, + {:earmark, "~> 1.4.46"} ] end diff --git a/mix.lock b/mix.lock index 28adb47..efc3bbd 100644 --- a/mix.lock +++ b/mix.lock @@ -1,18 +1,34 @@ %{ - "bandit": {:hex, :bandit, "1.2.2", "569fe5d0efb107c9af37a1e37e25ce2ceec293101a2d4bc512876fc3207192b5", [:mix], [{:hpax, "~> 0.1.1", [hex: :hpax, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:thousand_island, "~> 1.0", [hex: :thousand_island, repo: "hexpm", optional: false]}, {:websock, "~> 0.5", [hex: :websock, repo: "hexpm", optional: false]}], "hexpm", "2f89adb7281c78d4e75733e0a9e1b24f46f84d2993963d6fa57d0eafadec5f03"}, + "axon": {:hex, :axon, "0.6.1", "1d042fdba1c1b4413a3d65800524feebd1bc8ed218f8cdefe7a97510c3f427f3", [:mix], [{:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}, {:kino_vega_lite, "~> 0.1.7", [hex: :kino_vega_lite, repo: "hexpm", optional: true]}, {:nx, "~> 0.6.0 or ~> 0.7.0", [hex: :nx, repo: "hexpm", optional: false]}, {:polaris, "~> 0.1", [hex: :polaris, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1", [hex: :table_rex, repo: "hexpm", optional: true]}], "hexpm", 
"d6b0ae2f0dd284f6bf702edcab71e790d6c01ca502dd06c4070836554f5a48e1"}, + "bandit": {:hex, :bandit, "1.2.3", "a98d664a96fec23b68e776062296d76a94b4459795b38209f4ae89cb4225709c", [:mix], [{:hpax, "~> 0.1.1", [hex: :hpax, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:thousand_island, "~> 1.0", [hex: :thousand_island, repo: "hexpm", optional: false]}, {:websock, "~> 0.5", [hex: :websock, repo: "hexpm", optional: false]}], "hexpm", "3e29150245a9b5f56944434e5240966e75c917dad248f689ab589b32187a81af"}, + "bumblebee": {:hex, :bumblebee, "0.5.3", "151c215fd6014958dbfc322fe5f31b44d170293f69cfdca419936c81e39b1f64", [:mix], [{:axon, "~> 0.6.1", [hex: :axon, repo: "hexpm", optional: false]}, {:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.4.0", [hex: :jason, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.0", [hex: :nx, repo: "hexpm", optional: false]}, {:nx_image, "~> 0.1.0", [hex: :nx_image, repo: "hexpm", optional: false]}, {:nx_signal, "~> 0.2.0", [hex: :nx_signal, repo: "hexpm", optional: false]}, {:progress_bar, "~> 3.0", [hex: :progress_bar, repo: "hexpm", optional: false]}, {:safetensors, "~> 0.1.3", [hex: :safetensors, repo: "hexpm", optional: false]}, {:tokenizers, "~> 0.4", [hex: :tokenizers, repo: "hexpm", optional: false]}, {:unpickler, "~> 0.1.0", [hex: :unpickler, repo: "hexpm", optional: false]}, {:unzip, "~> 0.10.0", [hex: :unzip, repo: "hexpm", optional: false]}], "hexpm", "5518f11e424c431a9cbedc80e0d26525368f0b6e50572a674ff247ec3b26bdd7"}, "castore": {:hex, :castore, "1.0.5", "9eeebb394cc9a0f3ae56b813459f990abb0a3dedee1be6b27fdb50301930502f", [:mix], [], "hexpm", "8d7c597c3e4a64c395980882d4bca3cebb8d74197c590dc272cfd3b6a6310578"}, + "complex": {:hex, :complex, "0.5.0", "af2d2331ff6170b61bb738695e481b27a66780e18763e066ee2cd863d0b1dd92", [:mix], [], "hexpm", 
"2683bd3c184466cfb94fad74cbfddfaa94b860e27ad4ca1bffe3bff169d91ef1"}, "db_connection": {:hex, :db_connection, "2.6.0", "77d835c472b5b67fc4f29556dee74bf511bbafecdcaf98c27d27fa5918152086", [:mix], [{:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "c2f992d15725e721ec7fbc1189d4ecdb8afef76648c746a8e1cad35e3b8a35f3"}, "decimal": {:hex, :decimal, "2.1.1", "5611dca5d4b2c3dd497dec8f68751f1f1a54755e8ed2a966c2633cf885973ad6", [:mix], [], "hexpm", "53cfe5f497ed0e7771ae1a475575603d77425099ba5faef9394932b35020ffcc"}, "dns_cluster": {:hex, :dns_cluster, "0.1.3", "0bc20a2c88ed6cc494f2964075c359f8c2d00e1bf25518a6a6c7fd277c9b0c66", [:mix], [], "hexpm", "46cb7c4a1b3e52c7ad4cbe33ca5079fbde4840dedeafca2baf77996c2da1bc33"}, + "earmark": {:hex, :earmark, "1.4.46", "8c7287bd3137e99d26ae4643e5b7ef2129a260e3dcf41f251750cb4563c8fb81", [:mix], [], "hexpm", "798d86db3d79964e759ddc0c077d5eb254968ed426399fbf5a62de2b5ff8910a"}, "ecto": {:hex, :ecto, "3.11.1", "4b4972b717e7ca83d30121b12998f5fcdc62ba0ed4f20fd390f16f3270d85c3e", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "ebd3d3772cd0dfcd8d772659e41ed527c28b2a8bde4b00fe03e0463da0f1983b"}, "ecto_sql": {:hex, :ecto_sql, "3.11.1", "e9abf28ae27ef3916b43545f9578b4750956ccea444853606472089e7d169470", [:mix], [{:db_connection, "~> 2.4.1 or ~> 2.5", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.11.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:myxql, "~> 0.6.0", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.16.0 or ~> 0.17.0 or ~> 1.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:tds, "~> 2.1.1 or ~> 2.2", [hex: :tds, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", 
"ce14063ab3514424276e7e360108ad6c2308f6d88164a076aac8a387e1fea634"}, + "elixir_make": {:hex, :elixir_make, "0.7.8", "505026f266552ee5aabca0b9f9c229cbb496c689537c9f922f3eb5431157efc7", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.0", [hex: :certifi, repo: "hexpm", optional: true]}], "hexpm", "7a71945b913d37ea89b06966e1342c85cfe549b15e6d6d081e8081c493062c07"}, "esbuild": {:hex, :esbuild, "0.8.1", "0cbf919f0eccb136d2eeef0df49c4acf55336de864e63594adcea3814f3edf41", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "25fc876a67c13cb0a776e7b5d7974851556baeda2085296c14ab48555ea7560f"}, + "exla": {:hex, :exla, "0.7.0", "27fac40a580f0d3816fe3bf35c50dfc2f99597d26ac7e2aca4a3c62b89bb427f", [:make, :mix], [{:elixir_make, "~> 0.6", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:nx, "~> 0.7.0", [hex: :nx, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:xla, "~> 0.6.0", [hex: :xla, repo: "hexpm", optional: false]}], "hexpm", "d3bfc622deb52cec95efc9d76063891afc7cd33e38eddbb01f3385c53e043c40"}, "file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"}, + "finch": {:hex, :finch, "0.18.0", "944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 
1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"}, "floki": {:hex, :floki, "0.35.4", "cc947b446024732c07274ac656600c5c4dc014caa1f8fb2dfff93d275b83890d", [:mix], [], "hexpm", "27fa185d3469bd8fc5947ef0f8d5c4e47f0af02eb6b070b63c868f69e3af0204"}, "heroicons": {:git, "https://github.com/tailwindlabs/heroicons.git", "88ab3a0d790e6a47404cba02800a6b25d2afae50", [tag: "v2.1.1", sparse: "optimized"]}, "hpax": {:hex, :hpax, "0.1.2", "09a75600d9d8bbd064cdd741f21fc06fc1f4cf3d0fcc335e5aa19be1a7235c84", [:mix], [], "hexpm", "2c87843d5a23f5f16748ebe77969880e29809580efdaccd615cd3bed628a8c13"}, "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"}, + "json": {:hex, :json, "1.4.1", "8648f04a9439765ad449bc56a3ff7d8b11dd44ff08ffcdefc4329f7c93843dfa", [:mix], [], "hexpm", "9abf218dbe4ea4fcb875e087d5f904ef263d012ee5ed21d46e9dbca63f053d16"}, "mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"}, + "mint": {:hex, :mint, "1.5.2", "4805e059f96028948870d23d7783613b7e6b0e2fb4e98d720383852a760067fd", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "d77d9e9ce4eb35941907f1d3df38d8f750c357865353e21d335bdcdf6d892a02"}, + "nimble_options": {:hex, :nimble_options, "1.1.0", "3b31a57ede9cb1502071fade751ab0c7b8dbe75a9a4c2b5bbb0943a690b63172", [:mix], [], "hexpm", "8bbbb3941af3ca9acc7835f5655ea062111c9c27bcac53e004460dfd19008a99"}, + "nimble_ownership": {:hex, :nimble_ownership, "0.2.1", "3e44c72ebe8dd213db4e13aff4090aaa331d158e72ce1891d02e0ffb05a1eb2d", 
[:mix], [], "hexpm", "bf38d2ef4fb990521a4ecf112843063c1f58a5c602484af4c7977324042badee"}, + "nimble_pool": {:hex, :nimble_pool, "1.0.0", "5eb82705d138f4dd4423f69ceb19ac667b3b492ae570c9f5c900bb3d2f50a847", [:mix], [], "hexpm", "80be3b882d2d351882256087078e1b1952a28bf98d0a287be87e4a24a710b67a"}, + "nx": {:hex, :nx, "0.7.1", "5f6376e3d18408116e8a84b8f4ac851fb07dfe61764a5410ebf0b5dcb69c1b7e", [:mix], [{:complex, "~> 0.5", [hex: :complex, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "e3ddd6a3f2a9bac79c67b3933368c25bb5ec814a883fc68aba8fd8a236751777"}, + "nx_image": {:hex, :nx_image, "0.1.2", "0c6e3453c1dc30fc80c723a54861204304cebc8a89ed3b806b972c73ee5d119d", [:mix], [{:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "9161863c42405ddccb6dbbbeae078ad23e30201509cc804b3b3a7c9e98764b81"}, + "nx_signal": {:hex, :nx_signal, "0.2.0", "e1ca0318877b17c81ce8906329f5125f1e2361e4c4235a5baac8a95ee88ea98e", [:mix], [{:nx, "~> 0.6", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "7247e5e18a177a59c4cb5355952900c62fdeadeb2bad02a9a34237b68744e2bb"}, + "pgvector": {:hex, :pgvector, "0.2.1", "dc707ce6065ac0e82e5716bc17f9c6a97f92aca23994e5cceef7dfc48bb57eed", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: true]}, {:postgrex, ">= 0.0.0", [hex: :postgrex, repo: "hexpm", optional: false]}], "hexpm", "ed86c560af2f85b31d79f119192ce98f3342b4d06ceac63824a8686fe07e59b6"}, "phoenix": {:hex, :phoenix, "1.7.11", "1d88fc6b05ab0c735b250932c4e6e33bfa1c186f76dcf623d8dd52f07d6379c7", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:phoenix_template, "~> 1.0", [hex: :phoenix_template, repo: "hexpm", optional: false]}, {:phoenix_view, "~> 
2.0", [hex: :phoenix_view, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.7", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:websock_adapter, "~> 0.5.3", [hex: :websock_adapter, repo: "hexpm", optional: false]}], "hexpm", "b1ec57f2e40316b306708fe59b92a16b9f6f4bf50ccfa41aa8c7feb79e0ec02a"}, "phoenix_ecto": {:hex, :phoenix_ecto, "4.4.3", "86e9878f833829c3f66da03d75254c155d91d72a201eb56ae83482328dc7ca93", [:mix], [{:ecto, "~> 3.5", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix_html, "~> 2.14.2 or ~> 3.0 or ~> 4.0", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:plug, "~> 1.9", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "d36c401206f3011fefd63d04e8ef626ec8791975d9d107f9a0817d426f61ac07"}, "phoenix_html": {:hex, :phoenix_html, "4.0.0", "4857ec2edaccd0934a923c2b0ba526c44a173c86b847e8db725172e9e51d11d6", [:mix], [], "hexpm", "cee794a052f243291d92fa3ccabcb4c29bb8d236f655fb03bcbdc3a8214b8d13"}, @@ -23,12 +39,21 @@ "phoenix_template": {:hex, :phoenix_template, "1.0.4", "e2092c132f3b5e5b2d49c96695342eb36d0ed514c5b252a77048d5969330d639", [:mix], [{:phoenix_html, "~> 2.14.2 or ~> 3.0 or ~> 4.0", [hex: :phoenix_html, repo: "hexpm", optional: true]}], "hexpm", "2c0c81f0e5c6753faf5cca2f229c9709919aba34fab866d3bc05060c9c444206"}, "plug": {:hex, :plug, "1.15.3", "712976f504418f6dff0a3e554c40d705a9bcf89a7ccef92fc6a5ef8f16a30a97", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "cc4365a3c010a56af402e0809208873d113e9c38c401cabd88027ef4f5c01fd2"}, "plug_crypto": {:hex, 
:plug_crypto, "2.0.0", "77515cc10af06645abbfb5e6ad7a3e9714f805ae118fa1a70205f80d2d70fe73", [:mix], [], "hexpm", "53695bae57cc4e54566d993eb01074e4d894b65a3766f1c43e2c61a1b0f45ea9"}, + "polaris": {:hex, :polaris, "0.1.0", "dca61b18e3e801ecdae6ac9f0eca5f19792b44a5cb4b8d63db50fc40fc038d22", [:mix], [{:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "13ef2b166650e533cb24b10e2f3b8ab4f2f449ba4d63156e8c569527f206e2c2"}, "postgrex": {:hex, :postgrex, "0.17.4", "5777781f80f53b7c431a001c8dad83ee167bcebcf3a793e3906efff680ab62b3", [:mix], [{:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "6458f7d5b70652bc81c3ea759f91736c16a31be000f306d3c64bcdfe9a18b3cc"}, + "progress_bar": {:hex, :progress_bar, "3.0.0", "f54ff038c2ac540cfbb4c2bfe97c75e7116ead044f3c2b10c9f212452194b5cd", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6981c2b25ab24aecc91a2dc46623658e1399c21a2ae24db986b90d678530f2b7"}, + "req": {:hex, :req, "0.4.11", "cb19f87d5251e7de30cfc67d1899696b290711092207c6b2e8fc2294f237fcdc", [:mix], [{:aws_signature, "~> 0.3.2", [hex: :aws_signature, repo: "hexpm", optional: true]}, {:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 1.6 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:nimble_ownership, "~> 0.2.0", [hex: :nimble_ownership, repo: "hexpm", optional: false]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", 
"bbf4f2393c649fa4146a3b8470e2a7e8c9b23e4100a16c75f5e7d1d3d33144f3"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.7.1", "ecadf02cc59a0eccbaed6c1937303a5827fbcf60010c541595e6d3747d3d0f9f", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "b9e4657b99a1483ea31502e1d58c464bedebe9028808eda45c3a429af4550c66"}, + "safetensors": {:hex, :safetensors, "0.1.3", "7ff3c22391e213289c713898481d492c9c28a49ab1d0705b72630fb8360426b2", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "fe50b53ea59fde4e723dd1a2e31cfdc6013e69343afac84c6be86d6d7c562c14"}, "tailwind": {:hex, :tailwind, "0.2.2", "9e27288b568ede1d88517e8c61259bc214a12d7eed271e102db4c93fcca9b2cd", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}], "hexpm", "ccfb5025179ea307f7f899d1bb3905cd0ac9f687ed77feebc8f67bdca78565c4"}, "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"}, "telemetry_metrics": {:hex, :telemetry_metrics, "0.6.2", "2caabe9344ec17eafe5403304771c3539f3b6e2f7fb6a6f602558c825d0d0bfb", [:mix], [{:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "9b43db0dc33863930b9ef9d27137e78974756f5f198cae18409970ed6fa5b561"}, "telemetry_poller": {:hex, :telemetry_poller, "1.0.0", "db91bb424e07f2bb6e73926fcafbfcbcb295f0193e0a00e825e589a0a47e8453", [:rebar3], [{:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "b3a24eafd66c3f42da30fc3ca7dda1e9d546c12250a2d60d7b81d264fbec4f6e"}, - "thousand_island": {:hex, :thousand_island, "1.3.2", "bc27f9afba6e1a676dd36507d42e429935a142cf5ee69b8e3f90bff1383943cd", [:mix], [{:telemetry, "~> 0.4 or ~> 1.0", [hex: 
:telemetry, repo: "hexpm", optional: false]}], "hexpm", "0e085b93012cd1057b378fce40cbfbf381ff6d957a382bfdd5eca1a98eec2535"}, + "thousand_island": {:hex, :thousand_island, "1.3.5", "6022b6338f1635b3d32406ff98d68b843ba73b3aa95cfc27154223244f3a6ca5", [:mix], [{:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "2be6954916fdfe4756af3239fb6b6d75d0b8063b5df03ba76fd8a4c87849e180"}, + "tokenizers": {:hex, :tokenizers, "0.4.0", "140283ca74a971391ddbd83cd8cbdb9bd03736f37a1b6989b82d245a95e1eb97", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, ">= 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.6", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "ef1a9824f5a893cd3b831c0e5b3d72caa250d2ec462035cc6afef6933b13a82e"}, + "unpickler": {:hex, :unpickler, "0.1.0", "c2262c0819e6985b761e7107546cef96a485f401816be5304a65fdd200d5bd6a", [:mix], [], "hexpm", "e2b3f61e62406187ac52afead8a63bfb4e49394028993f3c4c42712743cab79e"}, + "unzip": {:hex, :unzip, "0.10.0", "374e0059e48e982076f3fd22cd4817ab11016c1bae3f09421511901ddda95c5c", [:mix], [], "hexpm", "101c06b0fa97a858a83beb618f4bc20370624f73ab3954f756d9b52194056de6"}, "websock": {:hex, :websock, "0.5.3", "2f69a6ebe810328555b6fe5c831a851f485e303a7c8ce6c5f675abeb20ebdadc", [:mix], [], "hexpm", "6105453d7fac22c712ad66fab1d45abdf049868f253cf719b625151460b8b453"}, "websock_adapter": {:hex, :websock_adapter, "0.5.5", "9dfeee8269b27e958a65b3e235b7e447769f66b5b5925385f5a569269164a210", [:mix], [{:bandit, ">= 0.6.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.6", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:websock, "~> 0.5", [hex: :websock, repo: "hexpm", optional: false]}], "hexpm", "4b977ba4a01918acbf77045ff88de7f6972c2a009213c515a445c48f224ffce9"}, + "xla": {:hex, :xla, "0.6.0", 
"67bb7695efa4a23b06211dc212de6a72af1ad5a9e17325e05e0a87e4c241feb8", [:make, :mix], [{:elixir_make, "~> 0.4", [hex: :elixir_make, repo: "hexpm", optional: false]}], "hexpm", "dd074daf942312c6da87c7ed61b62fb1a075bced157f1cc4d47af2d7c9f44fb7"}, } diff --git a/priv/repo/migrations/20240223110820_create_vector_extension.exs b/priv/repo/migrations/20240223110820_create_vector_extension.exs new file mode 100644 index 0000000..5b12f7a --- /dev/null +++ b/priv/repo/migrations/20240223110820_create_vector_extension.exs @@ -0,0 +1,11 @@ +defmodule Search.Repo.Migrations.CreateVectorExtension do + use Ecto.Migration + + def up do + execute "CREATE EXTENSION IF NOT EXISTS vector" + end + + def down do + execute "DROP EXTENSION vector" + end +end diff --git a/priv/repo/migrations/20240226120519_create_fragments.exs b/priv/repo/migrations/20240226120519_create_fragments.exs new file mode 100644 index 0000000..f5f1de6 --- /dev/null +++ b/priv/repo/migrations/20240226120519_create_fragments.exs @@ -0,0 +1,12 @@ +defmodule Search.Repo.Migrations.CreateFragments do + use Ecto.Migration + + def change do + create table(:fragments) do + add :doc_text, :text, null: false + add :embedding, :vector, size: Search.Embedding.embedding_size(), null: false + + timestamps(type: :utc_datetime) + end + end +end diff --git a/priv/static/images/temp-logo.svg b/priv/static/images/temp-logo.svg new file mode 100644 index 0000000..ca47fb7 --- /dev/null +++ b/priv/static/images/temp-logo.svg @@ -0,0 +1,6 @@ + + + + diff --git a/test/search/embedding_test.exs b/test/search/embedding_test.exs new file mode 100644 index 0000000..1f6e560 --- /dev/null +++ b/test/search/embedding_test.exs @@ -0,0 +1,22 @@ +defmodule Search.EmbeddingTest do + use ExUnit.Case, async: true + + alias Search.Embedding + + test "creates embedding tensor of correct shape for a single input" do + %{embedding: embedding} = Nx.Serving.batched_run(Embedding, "The cat chases the mouse") + + assert Nx.shape(embedding) == 
{Embedding.embedding_size()} + end + + test "creates embedding tensor of correct shape for batched inputs" do + [%{embedding: embedding1}, %{embedding: embedding2}] = + Nx.Serving.batched_run(Embedding, [ + "The cat chases the mouse", + "Lorem ipsum dolor sit amet" + ]) + + assert Nx.shape(embedding1) == {Embedding.embedding_size()} + assert Nx.shape(embedding2) == {Embedding.embedding_size()} + end +end diff --git a/test/search/ex_doc_parser_test.exs b/test/search/ex_doc_parser_test.exs new file mode 100644 index 0000000..35ef5a6 --- /dev/null +++ b/test/search/ex_doc_parser_test.exs @@ -0,0 +1,47 @@ +defmodule Search.ExDocParserTest do + alias Search.ExDocParser + use ExUnit.Case, async: true + + @dummy_items [%{"doc" => "dummy doc"}, %{"doc" => "another dummy"}] + @invalid_json ~c"{\"items\": oops this is invalid}" + + describe "extract_search_data/1" do + test "should extract search data for archives with the right format" do + untar = [ + {~c"dist/search_data-AF57AB42.js", + "searchData=#{Jason.encode!(%{"items" => @dummy_items})}"} + ] + + assert ExDocParser.extract_search_data(untar) == {:ok, @dummy_items} + end + + test "should fail for archives with no dist/search_data-XXXXXXXX.js files" do + untar = [ + {~c"search_data-ABCDEF12.js", "not this"}, + {~c"not_dist/search_data-12345678.js", "not this either"} + ] + + assert ExDocParser.extract_search_data(untar) == + {:error, + "Search data not found, package documentation is not in a supported format."} + end + + test "should fail for search data not starting with the \"searchData=\" prefix" do + untar = [ + {~c"dist/search_data-AF57AB42.js", Jason.encode!(%{"items" => @dummy_items})} + ] + + assert ExDocParser.extract_search_data(untar) == + {:error, "Search data content does not start with \"searchData=\"."} + end + + test "should fail for search data with invalid JSON" do + untar = [ + {~c"dist/search_data-AF57AB42.js", "searchData=#{@invalid_json}"} + ] + + assert ExDocParser.extract_search_data(untar) == 
+ {:error, "Search data content is invalid JSON"} + end + end +end diff --git a/test/search/fragment_test.exs b/test/search/fragment_test.exs new file mode 100644 index 0000000..3232a6d --- /dev/null +++ b/test/search/fragment_test.exs @@ -0,0 +1,63 @@ +defmodule Search.FragmentTest do + use Search.DataCase, async: true + + alias Search.{Fragment, Embedding} + + import Nx.Defn + + setup do + {embeddings, rng_key} = + Nx.Random.normal(Nx.Random.key(42), shape: {10, Embedding.embedding_size()}) + + for i <- 0..9 do + Repo.insert!(%Fragment{doc_text: "Text #{i}", embedding: embeddings[i]}) + end + + {query, _} = Nx.Random.normal(rng_key, shape: {Embedding.embedding_size()}) + + fragments = Repo.all(from f in Fragment, select: f) + + {:ok, %{fragments: fragments, query: query}} + end + + describe "knn_query/2" do + test "when given no options, performs the kNN lookup on the entire repo using cosine distance", + %{fragments: fragments, query: query} do + knn_result = Fragment.knn_lookup(query) + + sorted_fragments = sort_fragments(fragments, query, &manual_cosine_distance/2) + + assert Enum.map(knn_result, & &1.id) == Enum.map(sorted_fragments, & &1.id) + end + + test "when given [metric: :l2], performs the kNN lookup on the entire repo usin l2 distance", + %{fragments: fragments, query: query} do + knn_result = Fragment.knn_lookup(query, metric: :l2) + + sorted_fragments = sort_fragments(fragments, query, &manual_l2_distance/2) + + assert Enum.map(knn_result, & &1.id) == Enum.map(sorted_fragments, & &1.id) + end + + test "when given value for :k option, returns only the top k results", %{query: query} do + knn_result = Fragment.knn_lookup(query, k: 5) + + assert length(knn_result) == 5 + end + end + + defp sort_fragments(fragments, query, dist_fn) do + Enum.sort(fragments, fn a, b -> + Nx.to_number(dist_fn.(query, Pgvector.to_tensor(a.embedding))) <= + Nx.to_number(dist_fn.(query, Pgvector.to_tensor(b.embedding))) + end) + end + + defnp manual_cosine_distance(a, b) do 
+ 1 - Nx.dot(a, b) / Nx.sqrt(Nx.sum(Nx.pow(a, 2)) * Nx.sum(Nx.pow(b, 2))) + end + + defnp manual_l2_distance(a, b) do + Nx.sum(Nx.pow(a - b, 2)) + end +end diff --git a/test/search/hex_client_test.exs b/test/search/hex_client_test.exs new file mode 100644 index 0000000..34973b0 --- /dev/null +++ b/test/search/hex_client_test.exs @@ -0,0 +1,107 @@ +defmodule Search.HexClientTest do + use ExUnit.Case, async: true + + alias Search.HexClient + + setup ctx do + tmp_dir = ctx[:tmp_dir] + + if is_nil(tmp_dir) do + {:ok, []} + else + test_tar_contents = [{~c"README.md", "# I am a README!"}] + test_tar_path = Path.join(tmp_dir, "test_tar.tar.gz") + + :ok = + :erl_tar.create(test_tar_path, test_tar_contents, [ + :compressed + ]) + + {:ok, %{test_tar: test_tar_path, test_tar_contents: test_tar_contents}} + end + end + + describe "get_releases/1" do + test "when given a well-formed package JSON, successfuly parses the releases" do + Req.Test.stub(HexClient, fn conn -> + Req.Test.json(conn, %{ + "releases" => [ + %{"version" => "1.2.3", "has_docs" => true}, + %{"version" => "1.1.25", "has_docs" => false} + ] + }) + end) + + package_name = "test_package" + + assert HexClient.get_releases(package_name) == + {:ok, + [ + %HexClient.Release{ + package_name: package_name, + version: Version.parse!("1.2.3"), + has_docs: true + }, + %HexClient.Release{ + package_name: package_name, + version: Version.parse!("1.1.25"), + has_docs: false + } + ]} + end + + test "when getting a response other than 200 OK, should fail gracefully" do + Req.Test.stub(HexClient, fn conn -> + Plug.Conn.send_resp(conn, 403, "Forbidden") + end) + + assert HexClient.get_releases("test_package") == {:error, "HTTP 403"} + end + end + + describe "get_docs_tarball" do + @tag :tmp_dir + test "when given a release with documentation, should return contents of the archive", %{ + test_tar: test_tar, + test_tar_contents: test_tar_contents + } do + Req.Test.stub(HexClient, fn conn -> + conn + |> 
Plug.Conn.put_resp_content_type("application/octet-stream", nil) + |> Plug.Conn.send_file(200, test_tar) + end) + + rel = %HexClient.Release{ + package_name: "test_package", + version: Version.parse!("1.2.3"), + has_docs: true + } + + assert HexClient.get_docs_tarball(rel) == {:ok, test_tar_contents} + end + + test "when given a release with no documentation, should return error" do + rel = %HexClient.Release{ + package_name: "test_package", + version: Version.parse!("1.2.3"), + has_docs: false + } + + assert HexClient.get_docs_tarball(rel) == {:error, "Package release has no documentation."} + end + + test "when getting a response other than 200 OK, should fail gracefully" do + Req.Test.stub(HexClient, fn conn -> + Plug.Conn.send_resp(conn, 403, "Forbidden") + end) + + rel = %HexClient.Release{ + package_name: "test_package", + version: Version.parse!("1.2.3"), + has_docs: true + } + + assert HexClient.get_docs_tarball(rel) == {:error, "HTTP 403"} + end + end +end diff --git a/test/search_web/controllers/page_controller_test.exs b/test/search_web/controllers/page_controller_test.exs deleted file mode 100644 index a01020b..0000000 --- a/test/search_web/controllers/page_controller_test.exs +++ /dev/null @@ -1,8 +0,0 @@ -defmodule SearchWeb.PageControllerTest do - use SearchWeb.ConnCase - - test "GET /", %{conn: conn} do - conn = get(conn, ~p"/") - assert html_response(conn, 200) =~ "Peace of mind from prototype to production" - end -end