Skip to content
This repository has been archived by the owner on Sep 3, 2024. It is now read-only.

Commit

Permalink
Create minimal prototype (#4)
Browse files Browse the repository at this point in the history
* Add pgvector to project and workflows

* Run formatting

* Add basic functionality for the embedding model

* Use builtin text embedding for the serving

* Run formatting

* Create context and tests for documentation fragments

* Add moduledocs for Embedding and Fragment

* Enable async testing for fragments

* Add parsing of ExDoc generated docs for raw documentation

* Bump Bumblebee version

* Use regex for search_data matching, update tests accordingly

* Add Hex API client

* Refactor ExDocParser to work better with Req

* Prep HexClient for testing

* Add tests for HexClient

* Run formatter

* Refactor map_json_to_releases to reduce nesting

* Add mix task for indexing package

* Add simple frontend for the app

* Try to fix auth issue with postgres

* Revert "Try to fix auth issue with postgres"

This reverts commit 9a4e70d.

* Remove dummy test from Phoenix

* Use Jason instead of JSON

* Refactor ExDocParser.extract_search_data, clean up tests

* Refactor parse_search_data

* Refactor HexClient and HexClientTest

* Refactor the search page and controller

* Fix code style in tests

* Run formatter

* Refactor parse_search_data for readability, remove unnecessary error type

* Clean up HexClient helper

* Remove tgz helper

* Clean up import/alias in EmbeddingTest

* Use case instead of with

* Add non null constraint to fragments

* Inline unnecessary private function
  • Loading branch information
karol-t-wilk authored Mar 7, 2024
1 parent b3ec8b0 commit ec5d179
Show file tree
Hide file tree
Showing 26 changed files with 580 additions and 253 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:

services:
db:
image: postgres:16.2
image: pgvector/pgvector:pg16
ports: ["5432:5432"]
env:
POSTGRES_PASSWORD: postgres
Expand Down
6 changes: 6 additions & 0 deletions config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ config :search,
ecto_repos: [Search.Repo],
generators: [timestamp_type: :utc_datetime]

# Register the Postgrex types module that includes the pgvector-elixir vector extension
config :search, Search.Repo, types: Search.PostgrexTypes

# Configures the endpoint
config :search, SearchWeb.Endpoint,
url: [host: "localhost"],
Expand Down Expand Up @@ -52,6 +55,9 @@ config :logger, :console,
# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason

# Configure the EXLA backend for Nx
config :nx, :default_backend, EXLA.Backend

# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{config_env()}.exs"
6 changes: 6 additions & 0 deletions config/test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ config :search, SearchWeb.Endpoint,
secret_key_base: "KoDCWrF9SEsnoM8svaiDh9g62hqg8cKGhafXsTKunfl/FVMTq1psZAyOoMp3eIO2",
server: false

# Route HexClient requests through the Req.Test plug so HexClientTest can stub them
config :search,
hex_client_req_options: [
plug: {Req.Test, Search.HexClient}
]

# Print only warnings and errors during test
config :logger, level: :warning

Expand Down
53 changes: 53 additions & 0 deletions lib/mix/tasks/search/index.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
defmodule Mix.Tasks.Search.Index do
  @moduledoc """
  Usage: mix #{Mix.Task.task_name(__MODULE__)} <PACKAGE> [<VERSION>]

  Fetches the documentation for the given package from Hex and indexes it using the embedding model.
  If the version is omitted, it will choose the newest release.

  Any failure (bad arguments, unknown release, HTTP error, unsupported documentation format) is
  reported through `Mix.shell().error/1` instead of crashing the task.
  """
  @shortdoc "Indexes a package's documentation"

  use Mix.Task

  @requirements ["app.start"]

  @impl Mix.Task
  def run(args) do
    with {:ok, package, version} <- parse_args(args),
         {:ok, releases} <- Search.HexClient.get_releases(package),
         {:ok, release} <- pick_release(releases, version),
         {:ok, tarball} <- Search.HexClient.get_docs_tarball(release),
         {:ok, search_data} <- Search.ExDocParser.extract_search_data(tarball) do
      search_data
      |> Enum.map(& &1["doc"])
      |> index_docs()
    else
      {:error, reason} -> Mix.shell().error(format_error(reason))
    end
  end

  # Validates the command line: exactly one package name, optionally followed by a version.
  # The version is parsed up front so a typo is reported before any network request is made.
  defp parse_args([package]), do: {:ok, package, nil}

  defp parse_args([package, version]) do
    case Version.parse(version) do
      {:ok, parsed} -> {:ok, package, parsed}
      :error -> {:error, "Invalid version: #{inspect(version)}."}
    end
  end

  defp parse_args(_args) do
    {:error, "Usage: mix #{Mix.Task.task_name(__MODULE__)} <PACKAGE> [<VERSION>]"}
  end

  # Picks the newest release when no version was requested, otherwise the exact match.
  defp pick_release(releases, nil) do
    releases
    |> Enum.max_by(& &1.version, Version, fn -> nil end)
    |> wrap_release()
  end

  defp pick_release(releases, %Version{} = version) do
    releases
    |> Enum.find(&(&1.version == version))
    |> wrap_release()
  end

  defp wrap_release(nil), do: {:error, "Release not found."}
  defp wrap_release(release), do: {:ok, release}

  # Embeds every documentation text and stores it as a fragment, reporting progress as it goes.
  # An empty list is handled separately so the progress bar is never rendered with a total of 0.
  defp index_docs([]), do: Mix.shell().info("No documentation fragments to index.")

  defp index_docs(docs) do
    docs_len = length(docs)

    ProgressBar.render(0, docs_len)

    docs
    |> Stream.with_index(1)
    |> Enum.each(fn {doc, i} ->
      %{embedding: embedding} = Nx.Serving.batched_run(Search.Embedding, doc)

      Search.Repo.insert!(%Search.Fragment{doc_text: doc, embedding: embedding})

      ProgressBar.render(i, docs_len)
    end)
  end

  # Turns the different error reasons returned by the collaborators into printable text.
  defp format_error(reason) when is_binary(reason), do: reason
  defp format_error(reason) when is_exception(reason), do: Exception.message(reason)
  defp format_error(reason), do: inspect(reason)
end
2 changes: 1 addition & 1 deletion lib/search/application.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ defmodule Search.Application do
{DNSCluster, query: Application.get_env(:search, :dns_cluster_query) || :ignore},
{Phoenix.PubSub, name: Search.PubSub},
# Start a worker by calling: Search.Worker.start_link(arg)
# {Search.Worker, arg},
{Search.Embedding, name: Search.Embedding},
# Start to serve requests, typically the last entry
SearchWeb.Endpoint
]
Expand Down
23 changes: 23 additions & 0 deletions lib/search/embedding.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
defmodule Search.Embedding do
  @moduledoc """
  Text embedding serving for the application.

  The serving is built with Bumblebee from the Sentence Transformers
  `paraphrase-MiniLM-L3-v2` repository.
  """

  @model_repo {:hf, "sentence-transformers/paraphrase-MiniLM-L3-v2"}
  @embedding_size 384

  @doc "Returns the embedding size constant declared by this module."
  def embedding_size, do: @embedding_size

  @doc """
  Builds the child spec through `Nx.Serving.child_spec/1`.

  The given `opts` are merged with a `:serving` entry holding the text embedding serving.
  """
  def child_spec(opts) do
    serving_opts = Keyword.merge(opts, serving: build_serving())

    Nx.Serving.child_spec(serving_opts)
  end

  # Loads the model and its tokenizer from the configured repository and wraps them in a
  # Bumblebee text embedding serving.
  defp build_serving do
    {:ok, model_info} = Bumblebee.load_model(@model_repo)
    {:ok, tokenizer} = Bumblebee.load_tokenizer(@model_repo)

    Bumblebee.Text.text_embedding(model_info, tokenizer)
  end
end
38 changes: 38 additions & 0 deletions lib/search/ex_doc_parser.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
defmodule Search.ExDocParser do
  @moduledoc """
  Extracts the raw documentation items, with their metadata, from the unpacked files of an
  ExDoc-generated documentation tarball.
  """

  @search_data_prefix "searchData="

  @doc """
  Finds the `dist/search_data-*` file among `{path, contents}` entries and returns its items.

  Returns `{:ok, items}` on success or `{:error, message}` when the file is missing or its
  contents cannot be parsed.
  """
  def extract_search_data(untarred_docs) when is_list(untarred_docs) do
    case Enum.find_value(untarred_docs, &search_data_contents/1) do
      nil ->
        {:error, "Search data not found, package documentation is not in a supported format."}

      search_data ->
        parse_search_data(search_data)
    end
  end

  # Yields the file contents when the (charlist) path points at the search data file.
  defp search_data_contents({~c"dist/search_data-" ++ _, contents}), do: contents
  defp search_data_contents({_path, _contents}), do: nil

  # Strips the JavaScript assignment prefix, then decodes the remaining JSON payload.
  defp parse_search_data(@search_data_prefix <> json), do: decode_items(json)

  defp parse_search_data(search_data) when is_binary(search_data) do
    {:error, "Search data content does not start with \"#{@search_data_prefix}\"."}
  end

  # Any decoding result other than a map with an "items" key is reported as invalid JSON.
  defp decode_items(json) do
    with {:ok, %{"items" => items}} <- Jason.decode(json) do
      {:ok, items}
    else
      _ -> {:error, "Search data content is invalid JSON"}
    end
  end
end
53 changes: 53 additions & 0 deletions lib/search/fragment.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
defmodule Search.Fragment do
  @moduledoc """
  Schema and query helpers for indexed documentation fragments.

  Every fragment stores a piece of documentation text together with its embedding vector, which
  `knn_lookup/2` uses to order fragments by distance to a query vector.
  """

  use Ecto.Schema

  import Ecto.{Changeset, Query}
  import Pgvector.Ecto.Query

  alias Search.{Fragment, Repo}

  # Inner product is intentionally absent: pgvector cannot handle the inner product heuristic in
  # ascending order, so it is of no use here.
  @metrics [:cosine, :l2]
  @fields [:doc_text, :embedding]

  schema "fragments" do
    field :doc_text, :string
    field :embedding, Pgvector.Ecto.Vector

    timestamps(type: :utc_datetime)
  end

  @doc "Lists the distance metrics accepted by `knn_lookup/2`."
  def metrics, do: @metrics

  @doc """
  Returns fragments ordered by their distance to `query_tensor`.

  ## Options

    * `:metric` - `:cosine` (default) or `:l2`
    * `:k` - value used for the query limit, defaults to `nil`
  """
  def knn_lookup(query_tensor, opts \\ []) do
    opts = Keyword.validate!(opts, metric: :cosine, k: nil)
    k = Keyword.fetch!(opts, :k)

    ordered =
      case Keyword.fetch!(opts, :metric) do
        :cosine -> order_by(Fragment, [f], cosine_distance(f.embedding, ^query_tensor))
        :l2 -> order_by(Fragment, [f], l2_distance(f.embedding, ^query_tensor))
      end

    ordered
    |> limit(^k)
    |> select([f], f)
    |> Repo.all()
  end

  @doc false
  def changeset(fragment, attrs) do
    fragment
    |> cast(attrs, @fields)
    |> validate_required(@fields)
  end
end
47 changes: 47 additions & 0 deletions lib/search/hex_client.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
defmodule Search.HexClient do
  @moduledoc """
  Small HTTP client for the Hex API (`#{"https://hex.pm/api"}`) and the Hex repository
  (`#{"https://repo.hex.pm"}`).

  Requests are made with `Req`. Extra request options are read at runtime from the
  `:hex_client_req_options` key of the `:search` application environment.
  """

  alias Search.HexClient

  @api_url "https://hex.pm/api"
  @repo_url "https://repo.hex.pm"

  @doc """
  Fetches the releases of `package_name`.

  Returns `{:ok, releases}` for an HTTP 200 response whose body contains a `"releases"` list,
  `{:error, "HTTP <status>"}` for any other response, and the request error unchanged when the
  request itself fails. Release entries lacking `"has_docs"` or `"version"` are skipped.
  """
  @spec get_releases(String.t()) :: {:ok, [HexClient.Release.t()]} | {:error, term()}
  def get_releases(package_name) when is_binary(package_name) do
    case get("#{@api_url}/packages/#{encode_segment(package_name)}") do
      {:ok, %{status: 200, body: %{"releases" => releases}}} ->
        res =
          for %{"has_docs" => has_docs, "version" => version} <- releases do
            %HexClient.Release{
              package_name: package_name,
              version: Version.parse!(version),
              has_docs: has_docs
            }
          end

        {:ok, res}

      {:ok, %{status: status}} ->
        {:error, "HTTP #{status}"}

      err ->
        err
    end
  end

  @doc """
  Downloads the documentation tarball of `release`.

  Returns `{:ok, body}` for an HTTP 200 response, `{:error, "HTTP <status>"}` for any other
  response, and `{:error, message}` without making a request when the release has no docs.
  """
  @spec get_docs_tarball(HexClient.Release.t()) :: {:ok, term()} | {:error, term()}
  def get_docs_tarball(
        %HexClient.Release{has_docs: has_docs, package_name: package_name, version: version} =
          _release
      ) do
    if has_docs do
      case get("#{@repo_url}/docs/#{encode_segment(package_name)}-#{version}.tar.gz") do
        {:ok, %{status: 200, body: body}} -> {:ok, body}
        {:ok, %{status: status}} -> {:error, "HTTP #{status}"}
        err -> err
      end
    else
      {:error, "Package release has no documentation."}
    end
  end

  # Percent-encodes everything except unreserved characters, so a package name coming from the
  # command line cannot change the path or query of the requested URL.
  defp encode_segment(segment), do: URI.encode(segment, &URI.char_unreserved?/1)

  defp get(url) do
    Req.get(url, Application.get_env(:search, :hex_client_req_options, []))
  end
end
9 changes: 9 additions & 0 deletions lib/search/hex_client/release.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
defmodule Search.HexClient.Release do
  @moduledoc """
  A single release of a Hex package: the package name, the release version and whether
  documentation is available for it.
  """

  @type t :: %__MODULE__{
          package_name: String.t(),
          version: Version.t(),
          has_docs: boolean()
        }

  defstruct package_name: nil, version: nil, has_docs: nil
end
5 changes: 5 additions & 0 deletions lib/search/postgrex_types.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Defines the Search.PostgrexTypes module: the pgvector vector extension placed in front of
# the extensions supplied by the Ecto Postgres adapter.
Postgrex.Types.define(
  Search.PostgrexTypes,
  [Pgvector.Extensions.Vector | Ecto.Adapters.Postgres.extensions()],
  []
)
20 changes: 3 additions & 17 deletions lib/search_web/components/layouts/app.html.heex
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,12 @@
<div class="flex items-center justify-between border-b border-zinc-100 py-3 text-sm">
<div class="flex items-center gap-4">
<a href="/">
<img src={~p"/images/logo.svg"} width="36" />
<img src={~p"/images/temp-logo.svg"} width="36" />
</a>
<p class="bg-brand/5 text-brand rounded-full px-2 font-medium leading-6">
v<%= Application.spec(:phoenix, :vsn) %>
<p class="bg-gray-100 rounded-full px-2 font-medium leading-6">
v<%= Application.spec(:search, :vsn) %>
</p>
</div>
<div class="flex items-center gap-4 font-semibold leading-6 text-zinc-900">
<a href="https://twitter.com/elixirphoenix" class="hover:text-zinc-700">
@elixirphoenix
</a>
<a href="https://github.com/phoenixframework/phoenix" class="hover:text-zinc-700">
GitHub
</a>
<a
href="https://hexdocs.pm/phoenix/overview.html"
class="rounded-lg bg-zinc-100 px-2 py-1 hover:bg-zinc-200/80"
>
Get Started <span aria-hidden="true">&rarr;</span>
</a>
</div>
</div>
</header>
<main class="px-4 py-20 sm:px-6 lg:px-8">
Expand Down
23 changes: 22 additions & 1 deletion lib/search_web/controllers/page_controller.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,27 @@ defmodule SearchWeb.PageController do
# Renders the home page with an empty search form (no search text and no `k` yet).
def home(conn, _params) do
  render(conn, :home, form: Phoenix.Component.to_form(%{"search_text" => nil, "k" => nil}))
end

# Runs a kNN search for the submitted text and renders the matching fragments.
#
# Both parameters come straight from the request, so they are validated instead of trusted:
# a blank `search_text` or a `k` that is not a non-negative integer re-renders the home form
# with the corresponding errors rather than raising.
def search(conn, %{"k" => k, "search_text" => search_text} = params)
    when is_binary(k) and is_binary(search_text) do
  query_text = String.trim(search_text)
  parsed_k = parse_k(k)

  errors = search_text_errors(query_text) ++ k_errors(parsed_k)

  case {errors, parsed_k} do
    {[], {:ok, limit}} ->
      %{embedding: query_tensor} = Nx.Serving.batched_run(Search.Embedding, query_text)
      fragments = Search.Fragment.knn_lookup(query_tensor, k: limit)

      render(conn, :search, fragments: fragments)

    _ ->
      render(conn, :home, form: Phoenix.Component.to_form(params, errors: errors))
  end
end

# Accepts only a complete, non-negative integer; trailing garbage such as "5abc" is rejected.
defp parse_k(k) do
  case Integer.parse(String.trim(k)) do
    {value, ""} when value >= 0 -> {:ok, value}
    _ -> :error
  end
end

defp search_text_errors(""), do: [search_text: {"Can't be blank", []}]
defp search_text_errors(_text), do: []

defp k_errors({:ok, _value}), do: []
defp k_errors(:error), do: [k: {"Must be a non-negative integer", []}]
end
Loading

0 comments on commit ec5d179

Please sign in to comment.