Skip to content

Commit

Permalink
docs(algolia): also add functions to algolia index
Browse files Browse the repository at this point in the history
I wrote some hacky stuff a while back to add the various methods to the
Algolia index so that they would show up in search.

This is basically the same thing again, except it grabs the top-level
anchored functions that are generated by `quartodoc` (and which are not
included in the quarto-generated `search.json` file).

I've already run this manually to populate the index, so functions
like `ibis.get_backend` and `ibis.set_backend` now show up in search.
  • Loading branch information
gforsyth authored and jcrist committed Sep 24, 2024
1 parent 10f7f5b commit 5ae697e
Showing 1 changed file with 77 additions and 14 deletions.
91 changes: 77 additions & 14 deletions .github/workflows/algolia/upload-algolia-api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,25 @@


# These are QMD files generated with help from Quartodoc.
API_QMDS = [
"docs/reference/expression-collections.qmd",
"docs/reference/expression-generic.qmd",
"docs/reference/expression-geospatial.qmd",
"docs/reference/expression-numeric.qmd",
"docs/reference/expression-strings.qmd",
"docs/reference/expression-tables.qmd",
"docs/reference/expression-temporal.qmd",
"docs/reference/scalar-udfs.qmd",
"docs/reference/aggregate-udfs.qmd",
"docs/reference/schemas.qmd",
"docs/reference/datatypes.qmd",
# Reference pages that should never be scraped into the Algolia index.
EXCLUDE_API_QMDS = [
    "docs/reference/cursed_knowledge.qmd",  # not quartodoc generated
    "docs/reference/BackendTest.qmd",  # test infrastructure documentation
    "docs/reference/ServiceBackendTest.qmd",  # test infrastructure documentation
    "docs/reference/SQL.qmd",  # esoterica
    "docs/reference/index.qmd",  # just the index
]

# Grab all QMD files in `docs/reference` that aren't explicitly excluded.
# The result is sorted because iterating a set of strings has no stable order
# across interpreter runs; sorting keeps the scrape order (and the log output)
# reproducible.
API_QMDS = sorted(set(glob.glob("docs/reference/*.qmd")).difference(EXCLUDE_API_QMDS))

HORRID_REGEX = re.compile(r"\|\s*\[(\w+)\]\((#[\w.]+)\)\s*\|\s*(.*?)\s*\|")
# Matches one row of a markdown table whose first cell is a link to an anchor.
# Given | [method](#some-anchor) | some multiword description |
# this regex extracts ("method", "#some-anchor", "some multiword description").
# The anchor group must start with "#" and may only contain word characters
# and dots; the description is captured with surrounding whitespace trimmed.

FUNCTION_REGEX = re.compile(r"\s#{1} (\w+) \{ (#[^\}]+) \}")
# Looks for # func_name { #func-anchor }
# and extracts ("func_name", "#func-anchor").
# The header must be preceded by a whitespace character and have exactly one
# "#" before the space, so deeper headers ("## name { #anchor }") are not
# matched, and neither is a header sitting at the very first character of the
# searched text.


def _grab_qmd_methods(lines):
# All of the QMD files have a section that looks like:
Expand All @@ -58,7 +58,8 @@ def _create_api_record_from_method_line(base_url, method):
# for e.g. `reference/expression-collections.html` we want to grab "Collections"
section = (
base_url.removesuffix(".html")
.removeprefix("reference/expression-")
.removeprefix("reference/")
.removeprefix("expression-")
.capitalize()
)
name, anchor, desc = re.match(HORRID_REGEX, method).groups()
Expand All @@ -76,6 +77,59 @@ def _create_api_record_from_method_line(base_url, method):
return record


def _grab_qmd_functions(lines):
    """Yield the top-level functions documented in a quartodoc-generated QMD file.

    Quartodoc generated files have top-level function headers that look like

        # connect { #ibis.connect }

        `ibis.connect(resource, **kwargs)`

        Connect to `resource`, inferring the backend automatically.

    or similar. These don't get scraped by `quarto` (yet) because it doesn't
    know about `quartodoc`.

    Each header is located with `FUNCTION_REGEX`, and the text between the end
    of the header line and the next `#` character is taken as the "content" of
    the record. This is a rough heuristic, but it works better than the
    previous nothing.

    Parameters
    ----------
    lines
        The full text of a QMD file as a single string.

    Yields
    ------
    tuple[str, str, str]
        ``(func_name, anchor, text_blob)`` for every matched header, in the
        order the headers appear. ``text_blob`` starts at the newline that
        ends the header line and is empty when nothing follows the header.
    """
    # Strip out echo directive to avoid false early stop
    lines = lines.replace("#| echo: false", "")

    # Walk the matches by position instead of re-searching for a rebuilt header
    # string and slicing `lines` as we go; a failed search can then never
    # truncate the text that later headers still need.
    for match in FUNCTION_REGEX.finditer(lines):
        func_name, anchor = match.groups()

        # Find the first newline _after_ the matching function header.
        # This has to be a real newline character: a raw string r"\n" is a
        # backslash followed by "n", which never occurs in the text.
        start = lines.find("\n", match.end())
        if start == -1:
            # The header is the last line of the file, so there is no body.
            yield func_name, anchor, ""
            continue

        end = lines.find("#", start)
        if end == -1:
            # No further "#" in the file: the body runs to the end of the text.
            end = len(lines)

        yield func_name, anchor, lines[start:end]


def _create_api_record_from_scraped_function(base_url, func_name, anchor, text_blob):
    """Build the Algolia record for a single scraped top-level function.

    Parameters
    ----------
    base_url
        Page URL the anchor is appended to, e.g.
        ``reference/expression-collections.html``.
    func_name
        Function name; used as the record title.
    anchor
        Anchor fragment (including the leading ``#``) of the function header.
    text_blob
        Scraped body text; only stored (under ``"text"``) when non-empty.

    Returns
    -------
    dict
        The record, with ``objectID`` and ``href`` both set to
        ``base_url + anchor``.
    """
    # Reduce the URL to a section name,
    # e.g. `reference/expression-collections.html` -> "Collections"
    stem = base_url.removesuffix(".html")
    for prefix in ("reference/", "expression-"):
        stem = stem.removeprefix(prefix)
    section = stem.capitalize()

    target = f"{base_url}{anchor}"
    record = {
        "objectID": target,
        "href": target,
        "title": func_name,
        "backend": "core",
        "core": 1,
        "crumbs": ["Expression API", "API", f"{section} expression"],
    }

    # Leave the "text" key out entirely when nothing was scraped.
    if not text_blob:
        return record

    record["text"] = text_blob
    return record


def adjust_backend_custom_attributes(backend_records):
"""Adjusts attributes of the Algolia records.
Expand Down Expand Up @@ -109,7 +163,7 @@ def main():
# For each QMD file, get the table-section of the methods, anchors, and descriptions
print(f"Scraping {qmd} for API methods...") # noqa:T201
with open(qmd) as f:
methods = _grab_qmd_methods(f.read())
methods = _grab_qmd_methods(lines := f.read())

# Massage the QMD filename into the expected URL that prepends the anchor
# so we end up eventually with something like
Expand All @@ -120,6 +174,15 @@ def main():
_creator = partial(_create_api_record_from_method_line, base_url)
records += list(map(_creator, methods))

# Now that we've scraped out the methods, also scrape any top-level functions
print(f"Scraping {qmd} for API functions...") # noqa:T201
for func_name, anchor, textblob in _grab_qmd_functions(lines):
record = _create_api_record_from_scraped_function(
base_url, func_name, anchor, textblob
)
assert isinstance(record, dict)
records.append(record)

# This saves the list of records to Algolia
# If the object IDs are new (which typically should be) this adds a new
# record to the Algolia index. If the object ID already exists, it gets
Expand Down

0 comments on commit 5ae697e

Please sign in to comment.