From 9fbf2ed4899c819c5be6a08243664333db402e96 Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 12:47:04 -0500 Subject: [PATCH 1/8] Update tox to use allowlist_externals --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 07d8ce1..ec065f3 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ description = Run pytest against {envname}. deps = -r{toxinidir}/requirements/main.txt -r{toxinidir}/requirements/dev.txt -whitelist_externals = +allowlist_externals = docker-compose setenv = SAFIR_KAFKA_BROKER_URL=localhost:9092 @@ -43,7 +43,7 @@ commands = pre-commit run --all-files [testenv:run] description = Run the development server with auto-reload for code changes. usedevelop = true -whitelist_externals = +allowlist_externals = docker-compose setenv = SAFIR_KAFKA_BROKER_URL=localhost:9092 From 2e0908b73a2b79bb564f9131f9328dd597520c0f Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 13:29:07 -0500 Subject: [PATCH 2/8] Use Pydantic Fields for documentation We'll be able to generate better model documentation using Pydantic Fields and description fields. --- src/ook/ingest/algolia/records.py | 144 +++++++++++++++++------------- 1 file changed, 80 insertions(+), 64 deletions(-) diff --git a/src/ook/ingest/algolia/records.py b/src/ook/ingest/algolia/records.py index 6efbc95..a662011 100644 --- a/src/ook/ingest/algolia/records.py +++ b/src/ook/ingest/algolia/records.py @@ -7,7 +7,7 @@ from base64 import b64encode from typing import List, Optional -from pydantic import BaseModel, HttpUrl +from pydantic import BaseModel, Field, HttpUrl __all__ = [ "DocumentRecord", @@ -21,96 +21,112 @@ class DocumentRecord(BaseModel): """Model for an Algolia record of a document.""" - objectID: str - """The Algolia record object identifier.""" + objectID: str = Field(description="The Algolia record object identifier.") - surrogateKey: str - """A key that groups records from the same URL together for a given - ingest so that old records can be dropped from the Algolia index. - """ - - sourceUpdateTime: str - """An ISO 8601 date time for when the source was updated.""" + surrogateKey: str = Field( + description=( + "A key that groups records from the same URL together for a given " + "ingest so that old records can be dropped from the Algolia index." + ) + ) - sourceUpdateTimestamp: int - """A Unix timestamp for when the source was updated. + sourceUpdateTime: str = Field( + description="An ISO 8601 date time for when the source was updated." + ) - This is intended as a sortable version of `sourceUpdateTime`. - """ + sourceUpdateTimestamp: int = Field( + description=( + "A Unix timestamp for when the source was updated. This is " + "intended as a sortable version of `sourceUpdateTime`." + ) + ) - sourceCreationTimestamp: Optional[int] - """A unix timestamp for when the source document was created.""" + sourceCreationTimestamp: Optional[int] = Field( + None, + description=( + "A Unix timestamp for when the source document " "was created." + ), + ) - recordUpdateTime: str - """A ISO 8601 date time for when this record was created.""" + recordUpdateTime: str = Field( + description="A ISO 8601 date time for when this record was created." + ) # str, not HttpUrl because of # https://sqr-027.lsst.io/#What-is-observability? # Ideally we'd want to escape this properly - url: str - """The URL of the record.""" - - baseUrl: HttpUrl - """The base URL of the record (whereas ``url`` may refer to an anchor link. 
- """ - - content: str - """The full-text content of the record.""" + url: str = Field( + description=( + "The URL of the record. For subsection, this URL can end with an " + "anchor target." + ), + example="https://sqr-027.lsst.io/#What-is-observability?", + ) + + baseUrl: HttpUrl = Field( + description=( + "The base URL of the record (whereas ``url`` may refer to an " + "anchor link." + ) + ) - importance: int = 1 - """The importance of the record. + content: str = Field(description="The full-text content of the record.") - Generally importance should be set by the header level: 1 for h1, 2 for h2, - and so on. - """ + importance: int = Field( + 1, + description=( + "The importance of the record. Generally importance should be set " + "by the header level: 1 for h1, 2 for h2, and so on." + ), + ) - contentCategories_lvl0: str - """Content category.""" + contentCategories_lvl0: str = Field(description="Content category.") - contentCategories_lvl1: Optional[str] - """Content sub-category (level 1).""" + contentCategories_lvl1: Optional[str] = Field( + None, description="Content sub-category (level 1)." + ) - contentType: str - """Content type (ook classification).""" + contentType: str = Field(description="Content type (ook classification).") - description: str - """Description of the URL or short summary for the ``baseUrl``.""" + description: str = Field( + description=( + "Description of the URL or short summary for the ``baseUrl``." + ) + ) - handle: str - """Document handle.""" + handle: str = Field(description="Document handle.") - number: int - """Serial number component of the document handle.""" + number: int = Field( + description=( + "Serial number component of the document handle (``handle``)." + ) + ) - series: str - """Series component of the document handle.""" + series: str = Field( + description="Series component of the document handle (``handle``)." + ) - authorNames: List[str] - """Names of authors.""" + authorNames: List[str] = Field(description="Names of authors.") - h1: str - """The H1 headline (title).""" + h1: str = Field(description="The H1 headline (title).") - h2: Optional[str] - """The h2 headline.""" + h2: Optional[str] = Field(None, description="The h2 headline.") - h3: Optional[str] - """The h3 headline.""" + h3: Optional[str] = Field(None, description="The h3 headline.") - h4: Optional[str] - """The h4 headline.""" + h4: Optional[str] = Field(None, description="The h4 headline.") - h5: Optional[str] - """The h5 headline.""" + h5: Optional[str] = Field(None, description="The h5 headline.") - h6: Optional[str] - """The h6 headline.""" + h6: Optional[str] = Field(None, description="The h6 headline.") - pIndex: Optional[int] - """The paragraph index corresponding to a section.""" + pIndex: Optional[int] = Field( + None, description="The paragraph index corresponding to a section." + ) - githubRepoUrl: Optional[HttpUrl] - """URL of the source repository.""" + githubRepoUrl: Optional[HttpUrl] = Field( + None, description="URL of the source repository." + ) class Config: From 1939cbb497ca2b8293cc217cb593c9ae783a29a0 Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 14:17:13 -0500 Subject: [PATCH 3/8] Create upload_doc_stub command This uses the run_with_asyncio decorator from Safir 2; once Ook migrates to FastAPI we can use safir.asyncio.run_with_asyncio. 
--- src/ook/cli.py | 15 ++++++++++++ src/ook/utils.py | 60 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/src/ook/cli.py b/src/ook/cli.py index 5169524..ecce852 100644 --- a/src/ook/cli.py +++ b/src/ook/cli.py @@ -2,12 +2,15 @@ __all__ = ["main", "help", "run"] +from pathlib import Path from typing import Any, Union import click from aiohttp.web import run_app from ook.app import create_app +from ook.config import Configuration +from ook.utils import run_with_asyncio # Add -h as a help shortcut option CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @@ -52,3 +55,15 @@ def run(ctx: click.Context, port: int) -> None: """Run the application (for production).""" app = create_app() run_app(app, port=port) + + +@main.command() +@click.option( + "--dataset", required=True, type=click.Path(exists=True, path_type=Path) +) +@click.pass_context +@run_with_asyncio +async def upload_doc_stub(ctx: click.Context, dataset: Path) -> None: + """Upload a stub record for a document that can't be normally indexed.""" + config = Configuration() + print(config) diff --git a/src/ook/utils.py b/src/ook/utils.py index da96bb2..49f1c34 100644 --- a/src/ook/utils.py +++ b/src/ook/utils.py @@ -2,7 +2,19 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, Optional +import asyncio +from functools import wraps +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Coroutine, + Dict, + Optional, + TypeVar, +) + +T = TypeVar("T") if TYPE_CHECKING: from aiohttp import ClientSession @@ -55,3 +67,49 @@ def make_raw_github_url( return ( f"https://raw.githubusercontent.com/{repo_path}/{git_ref}/{file_path}" ) + + +# FIXME this is vendored from Safir 2; use that when Ook becomes a FastAPI app. +def run_with_asyncio( + f: Callable[..., Coroutine[Any, Any, T]] +) -> Callable[..., T]: + """Run the decorated function with `asyncio.run`. + Intended to be used as a decorator around an async function that needs to + be run in a sync context. The decorated function will be run with + `asyncio.run` when invoked. The caller must not already be inside an + asyncio task. + + Parameters + ---------- + f + The function to wrap. + + Examples + -------- + An application that uses Safir and `Click`_ may use the following Click + command function to initialize a database. + + .. code-block:: python + import structlog + from safir.asyncio import run_with_asyncio + from safir.database import initialize_database + from .config import config + from .schema import Base + @main.command() + @run_with_asyncio + async def init() -> None: + logger = structlog.get_logger(config.safir.logger_name) + engine = await initialize_database( + config.database_url, + config.database_password, + logger, + schema=Base.metadata, + ) + await engine.dispose() + """ + + @wraps(f) + def wrapper(*args: Any, **kwargs: Any) -> T: + return asyncio.run(f(*args, **kwargs)) + + return wrapper From 8bd268695471fcbee2d9e06271f0be5a07aa0d33 Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 18:05:11 -0500 Subject: [PATCH 4/8] Set allow_population_by_field_name This allows us to construct a Pydantic model using the field name; not just the alias. 
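For illustration, a standalone sketch of what the setting changes (this is not the real model; the alias string below is hypothetical, while Ook's actual aliases stay in the existing ``fields`` config of ``DocumentRecord``):

    from pydantic import BaseModel, Field

    class Sketch(BaseModel):
        contentCategories_lvl0: str = Field(alias="contentCategories.lvl0")

        class Config:
            allow_population_by_field_name = True

    # Accepted now; without the setting only the alias spelling works:
    Sketch(contentCategories_lvl0="Documents")
    # Serialization still uses the alias:
    Sketch(contentCategories_lvl0="Documents").dict(by_alias=True)
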
--- src/ook/ingest/algolia/records.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ook/ingest/algolia/records.py b/src/ook/ingest/algolia/records.py index a662011..35757bb 100644 --- a/src/ook/ingest/algolia/records.py +++ b/src/ook/ingest/algolia/records.py @@ -136,6 +136,9 @@ class Config: } """Alias for fields that aren't allowable Python names.""" + allow_population_by_field_name = True + """Enables use of Python name for constructing the record.""" + extra = "forbid" """Disable attributes that aren't part of the schema.""" From cfb168f95e4f7a945823567b3f580d97d8c82a97 Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 18:06:12 -0500 Subject: [PATCH 5/8] Create ook upload-doc-stub command This CLI command enables an admin to upload an Algolia record with stub metadata for a document. This is useful for documents that can't be indexed normally by Ook, but still need to appear in the www.lsst.io index. --- src/ook/cli.py | 13 +++- src/ook/ingest/workflows/manualstub.py | 89 ++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 src/ook/ingest/workflows/manualstub.py diff --git a/src/ook/cli.py b/src/ook/cli.py index ecce852..00d27ee 100644 --- a/src/ook/cli.py +++ b/src/ook/cli.py @@ -7,9 +7,11 @@ import click from aiohttp.web import run_app +from algoliasearch.search_client import SearchClient from ook.app import create_app from ook.config import Configuration +from ook.ingest.workflows.manualstub import add_manual_doc_stub from ook.utils import run_with_asyncio # Add -h as a help shortcut option @@ -66,4 +68,13 @@ def run(ctx: click.Context, port: int) -> None: async def upload_doc_stub(ctx: click.Context, dataset: Path) -> None: """Upload a stub record for a document that can't be normally indexed.""" config = Configuration() - print(config) + assert config.algolia_document_index_name is not None + assert config.algolia_app_id is not None + assert config.algolia_api_key is not None + + async with SearchClient.create( + config.algolia_app_id, + api_key=config.algolia_api_key.get_secret_value(), + ) as client: + index = client.init_index(config.algolia_document_index_name) + await add_manual_doc_stub(index, dataset.read_text()) diff --git a/src/ook/ingest/workflows/manualstub.py b/src/ook/ingest/workflows/manualstub.py new file mode 100644 index 0000000..b66b3e2 --- /dev/null +++ b/src/ook/ingest/workflows/manualstub.py @@ -0,0 +1,89 @@ +"""Workfor for manually adding a document stub record to Algolia, without +parsing a document. 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Optional + +import structlog +from algoliasearch.search_index_async import SearchIndexAsync +from pydantic import BaseModel, Field, HttpUrl + +from ook.classification import ContentType +from ook.ingest.algolia.expiration import delete_old_records +from ook.ingest.algolia.records import ( + DocumentRecord, + format_timestamp, + format_utc_datetime, + generate_object_id, + generate_surrogate_key, +) + + +class MinimalDocumentModel(BaseModel): + """Model for a manually-added record.""" + + title: str = Field(description="Document's title") + + handle: str = Field(description="Document handle.") + + url: HttpUrl = Field(description="The document's URL.") + + authorNames: list[str] = Field( + default_factory=list, description="Author names" + ) + + description: str = Field(description="Description of the document.") + + githubRepoUrl: Optional[HttpUrl] = Field( + None, description="URL of the source repository." + ) + + def make_algolia_record(self) -> DocumentRecord: + object_id = generate_object_id(url=str(self.url), headers=[self.title]) + surrogate_key = generate_surrogate_key() + now = datetime.utcnow().replace(tzinfo=timezone.utc) + series, _number = self.handle.split("-") + + return DocumentRecord( + objectID=object_id, + baseUrl=self.url, + url=self.url, + surrogateKey=surrogate_key, + sourceUpdateTime=format_utc_datetime(now), + sourceUpdateTimestamp=format_timestamp(now), + sourceCreationTimestamp=None, + recordUpdateTime=format_utc_datetime(now), + contentCategories_lvl0="Documents", + contentCategories_lvl1=f"Documents > {series.upper()}", + contentType=ContentType.UNKNOWN.value, + description=self.description, + content=self.description, + handle=self.handle, + number=int(_number), + series=series, + authorNames=self.authorNames, + pIndex=None, + h1=self.title, + githubRepoUrl=self.githubRepoUrl, + ) + + +async def add_manual_doc_stub( + algolia_index: SearchIndexAsync, dataset: str +) -> None: + logger = structlog.get_logger(__file__) + data = MinimalDocumentModel.parse_raw(dataset) + record = data.make_algolia_record() + record_dict = record.dict(by_alias=True, exclude_none=True) + print(record_dict) + result = await algolia_index.save_objects_async([record_dict]) + print(result) + await delete_old_records( + index=algolia_index, + base_url=record.baseUrl, + surrogate_key=record.surrogateKey, + logger=logger, + ) From 945353346e21f87eb56f54cf49830ceb2786dd1c Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 18:26:38 -0500 Subject: [PATCH 6/8] Drop uploading to DockerHub --- .github/workflows/ci.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9a2f462..85f1628 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -82,12 +82,6 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - - name: Log in to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - name: Log in to GitHub Container Registry uses: docker/login-action@v2 with: @@ -101,7 +95,6 @@ jobs: context: . 
push: true tags: | - lsstsqre/ook:${{ steps.vars.outputs.tag }} ghcr.io/lsst-sqre/ook:${{ steps.vars.outputs.tag }} cache-from: type=gha cache-to: type=gha,mode=max From e5bb0fb4c653d727d8fff061647b2950958a7e5c Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 18:32:58 -0500 Subject: [PATCH 7/8] Pre-commit update --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2fa2b21..68cae26 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,14 +11,14 @@ repos: args: [--autofix, --indent=2, '--top-keys=name,doc,type'] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.11.2 hooks: - id: isort additional_dependencies: - toml - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 22.12.0 hooks: - id: black From 67cd6f5828b43984156c32fb06b5b15e99db7c1c Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Tue, 13 Dec 2022 18:40:17 -0500 Subject: [PATCH 8/8] Add change log --- CHANGELOG.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3c681b4..47f52e8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,12 @@ Change log ########## +Unreleased +========== + +- Update to Python 3.10. +- Add ``ook upload-doc-stub`` CLI command to manually add a single record to Algolia to stub a document into the www.lsst.io search index. This is useful for cases where a document can't be normally indexed by Ook. + 0.5.0 (2021-12-01) ==================
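
For reference, a minimal sketch of the dataset that ``ook upload-doc-stub --dataset`` consumes and how it maps to an Algolia record. The field names come from ``MinimalDocumentModel``; every value below is hypothetical:

    from ook.ingest.workflows.manualstub import MinimalDocumentModel

    # Hypothetical stub dataset; "handle" must look like <series>-<number>
    # because make_algolia_record() splits it on "-".
    dataset = """
    {
      "title": "Example technote",
      "handle": "SQR-000",
      "url": "https://sqr-000.lsst.io/",
      "authorNames": ["A. Author"],
      "description": "A short summary for the www.lsst.io search index.",
      "githubRepoUrl": "https://github.com/lsst-sqre/sqr-000"
    }
    """

    doc = MinimalDocumentModel.parse_raw(dataset)
    record = doc.make_algolia_record()
    print(record.dict(by_alias=True, exclude_none=True))

The CLI command performs the same parse and then uploads the record with ``add_manual_doc_stub``, which also expires older records for the same ``baseUrl``.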