Skip to content

Commit

Permalink
Merge pull request #126 from lsst-sqre/tickets/DM-37331
Browse files Browse the repository at this point in the history
DM-37331: Add ook upload-doc-stub command
  • Loading branch information
jonathansick authored Dec 20, 2022
2 parents 15151b7 + 67cd6f5 commit 88c7fbb
Show file tree
Hide file tree
Showing 8 changed files with 267 additions and 76 deletions.
7 changes: 0 additions & 7 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,6 @@ jobs:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Log in to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}

- name: Log in to GitHub Container Registry
uses: docker/login-action@v2
with:
Expand All @@ -101,7 +95,6 @@ jobs:
context: .
push: true
tags: |
lsstsqre/ook:${{ steps.vars.outputs.tag }}
ghcr.io/lsst-sqre/ook:${{ steps.vars.outputs.tag }}
cache-from: type=gha
cache-to: type=gha,mode=max
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ repos:
args: [--autofix, --indent=2, '--top-keys=name,doc,type']

- repo: https://github.com/PyCQA/isort
rev: 5.10.1
rev: 5.11.2
hooks:
- id: isort
additional_dependencies:
- toml

- repo: https://github.com/psf/black
rev: 22.10.0
rev: 22.12.0
hooks:
- id: black

Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
Change log
##########

Unreleased
==========

- Update to Python 3.10.
- Add ``ook upload-doc-stub`` CLI command to manually add a single record to Algolia to stub a document into the www.lsst.io search index. This is useful for cases where a document can't be normally indexed by Ook.

0.5.0 (2021-12-01)
==================

Expand Down
26 changes: 26 additions & 0 deletions src/ook/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@

__all__ = ["main", "help", "run"]

from pathlib import Path
from typing import Any, Union

import click
from aiohttp.web import run_app
from algoliasearch.search_client import SearchClient

from ook.app import create_app
from ook.config import Configuration
from ook.ingest.workflows.manualstub import add_manual_doc_stub
from ook.utils import run_with_asyncio

# Add -h as a help shortcut option
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
Expand Down Expand Up @@ -52,3 +57,24 @@ def run(ctx: click.Context, port: int) -> None:
"""Run the application (for production)."""
app = create_app()
run_app(app, port=port)


@main.command()
@click.option(
    "--dataset",
    required=True,
    # dir_okay=False lets click reject a directory up front instead of
    # failing later inside read_text().
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.pass_context
@run_with_asyncio
async def upload_doc_stub(ctx: click.Context, dataset: Path) -> None:
    """Upload a stub record for a document that can't be normally indexed."""
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes are kept in comments instead.
    config = Configuration()

    # Validate the Algolia settings explicitly rather than with ``assert``:
    # assertions are stripped under ``python -O``, which would let a None
    # value reach the client and fail with an unhelpful AttributeError.
    if config.algolia_document_index_name is None:
        raise click.ClickException(
            "The Algolia document index name is not configured."
        )
    if config.algolia_app_id is None:
        raise click.ClickException("The Algolia app ID is not configured.")
    if config.algolia_api_key is None:
        raise click.ClickException("The Algolia API key is not configured.")

    async with SearchClient.create(
        config.algolia_app_id,
        api_key=config.algolia_api_key.get_secret_value(),
    ) as client:
        index = client.init_index(config.algolia_document_index_name)
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's locale encoding.
        await add_manual_doc_stub(index, dataset.read_text(encoding="utf-8"))
147 changes: 83 additions & 64 deletions src/ook/ingest/algolia/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from base64 import b64encode
from typing import List, Optional

from pydantic import BaseModel, HttpUrl
from pydantic import BaseModel, Field, HttpUrl

__all__ = [
"DocumentRecord",
Expand All @@ -21,96 +21,112 @@
class DocumentRecord(BaseModel):
"""Model for an Algolia record of a document."""

objectID: str
"""The Algolia record object identifier."""
objectID: str = Field(description="The Algolia record object identifier.")

surrogateKey: str
"""A key that groups records from the same URL together for a given
ingest so that old records can be dropped from the Algolia index.
"""

sourceUpdateTime: str
"""An ISO 8601 date time for when the source was updated."""
surrogateKey: str = Field(
description=(
"A key that groups records from the same URL together for a given "
"ingest so that old records can be dropped from the Algolia index."
)
)

sourceUpdateTimestamp: int
"""A Unix timestamp for when the source was updated.
sourceUpdateTime: str = Field(
description="An ISO 8601 date time for when the source was updated."
)

This is intended as a sortable version of `sourceUpdateTime`.
"""
sourceUpdateTimestamp: int = Field(
description=(
"A Unix timestamp for when the source was updated. This is "
"intended as a sortable version of `sourceUpdateTime`."
)
)

sourceCreationTimestamp: Optional[int]
"""A unix timestamp for when the source document was created."""
sourceCreationTimestamp: Optional[int] = Field(
None,
description=(
"A Unix timestamp for when the source document " "was created."
),
)

recordUpdateTime: str
"""A ISO 8601 date time for when this record was created."""
recordUpdateTime: str = Field(
description="A ISO 8601 date time for when this record was created."
)

# str, not HttpUrl because of
# https://sqr-027.lsst.io/#What-is-observability?
# Ideally we'd want to escape this properly
url: str
"""The URL of the record."""

baseUrl: HttpUrl
"""The base URL of the record (whereas ``url`` may refer to an anchor link.
"""

content: str
"""The full-text content of the record."""
url: str = Field(
description=(
"The URL of the record. For subsection, this URL can end with an "
"anchor target."
),
example="https://sqr-027.lsst.io/#What-is-observability?",
)

baseUrl: HttpUrl = Field(
description=(
"The base URL of the record (whereas ``url`` may refer to an "
"anchor link."
)
)

importance: int = 1
"""The importance of the record.
content: str = Field(description="The full-text content of the record.")

Generally importance should be set by the header level: 1 for h1, 2 for h2,
and so on.
"""
importance: int = Field(
1,
description=(
"The importance of the record. Generally importance should be set "
"by the header level: 1 for h1, 2 for h2, and so on."
),
)

contentCategories_lvl0: str
"""Content category."""
contentCategories_lvl0: str = Field(description="Content category.")

contentCategories_lvl1: Optional[str]
"""Content sub-category (level 1)."""
contentCategories_lvl1: Optional[str] = Field(
None, description="Content sub-category (level 1)."
)

contentType: str
"""Content type (ook classification)."""
contentType: str = Field(description="Content type (ook classification).")

description: str
"""Description of the URL or short summary for the ``baseUrl``."""
description: str = Field(
description=(
"Description of the URL or short summary for the ``baseUrl``."
)
)

handle: str
"""Document handle."""
handle: str = Field(description="Document handle.")

number: int
"""Serial number component of the document handle."""
number: int = Field(
description=(
"Serial number component of the document handle (``handle``)."
)
)

series: str
"""Series component of the document handle."""
series: str = Field(
description="Series component of the document handle (``handle``)."
)

authorNames: List[str]
"""Names of authors."""
authorNames: List[str] = Field(description="Names of authors.")

h1: str
"""The H1 headline (title)."""
h1: str = Field(description="The H1 headline (title).")

h2: Optional[str]
"""The h2 headline."""
h2: Optional[str] = Field(None, description="The h2 headline.")

h3: Optional[str]
"""The h3 headline."""
h3: Optional[str] = Field(None, description="The h3 headline.")

h4: Optional[str]
"""The h4 headline."""
h4: Optional[str] = Field(None, description="The h4 headline.")

h5: Optional[str]
"""The h5 headline."""
h5: Optional[str] = Field(None, description="The h5 headline.")

h6: Optional[str]
"""The h6 headline."""
h6: Optional[str] = Field(None, description="The h6 headline.")

pIndex: Optional[int]
"""The paragraph index corresponding to a section."""
pIndex: Optional[int] = Field(
None, description="The paragraph index corresponding to a section."
)

githubRepoUrl: Optional[HttpUrl]
"""URL of the source repository."""
githubRepoUrl: Optional[HttpUrl] = Field(
None, description="URL of the source repository."
)

class Config:

Expand All @@ -120,6 +136,9 @@ class Config:
}
"""Alias for fields that aren't allowable Python names."""

allow_population_by_field_name = True
"""Enables use of Python name for constructing the record."""

extra = "forbid"
"""Disable attributes that aren't part of the schema."""

Expand Down
89 changes: 89 additions & 0 deletions src/ook/ingest/workflows/manualstub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Workfor for manually adding a document stub record to Algolia, without
parsing a document.
"""

from __future__ import annotations

from datetime import datetime, timezone
from typing import Optional

import structlog
from algoliasearch.search_index_async import SearchIndexAsync
from pydantic import BaseModel, Field, HttpUrl

from ook.classification import ContentType
from ook.ingest.algolia.expiration import delete_old_records
from ook.ingest.algolia.records import (
DocumentRecord,
format_timestamp,
format_utc_datetime,
generate_object_id,
generate_surrogate_key,
)


class MinimalDocumentModel(BaseModel):
    """Model for a manually-added record."""

    title: str = Field(description="Document's title")

    handle: str = Field(description="Document handle.")

    url: HttpUrl = Field(description="The document's URL.")

    authorNames: list[str] = Field(
        default_factory=list, description="Author names"
    )

    description: str = Field(description="Description of the document.")

    githubRepoUrl: Optional[HttpUrl] = Field(
        None, description="URL of the source repository."
    )

    def make_algolia_record(self) -> DocumentRecord:
        """Build the Algolia ``DocumentRecord`` that stubs this document.

        The record uses the document URL as both ``url`` and ``baseUrl``,
        the description as both ``description`` and ``content``, and the
        current UTC time for the source-update and record-update times.

        Returns
        -------
        DocumentRecord
            A single record describing the whole document.

        Raises
        ------
        ValueError
            If ``handle`` is not of the form ``SERIES-NUMBER``, where
            ``NUMBER`` is an integer.
        """
        handle_parts = self.handle.split("-")
        if len(handle_parts) != 2:
            # Same exception type as the tuple-unpacking failure this
            # replaces, but with a message that names the bad handle.
            raise ValueError(
                f"Handle {self.handle!r} must have the form SERIES-NUMBER."
            )
        series, number_text = handle_parts
        try:
            number = int(number_text)
        except ValueError as exc:
            raise ValueError(
                f"Handle {self.handle!r} must end with an integer serial "
                "number."
            ) from exc

        # One timezone-aware timestamp shared by every time field;
        # datetime.now(tz=...) replaces utcnow().replace(tzinfo=...).
        now = datetime.now(tz=timezone.utc)

        return DocumentRecord(
            objectID=generate_object_id(
                url=str(self.url), headers=[self.title]
            ),
            baseUrl=self.url,
            url=self.url,
            surrogateKey=generate_surrogate_key(),
            sourceUpdateTime=format_utc_datetime(now),
            sourceUpdateTimestamp=format_timestamp(now),
            sourceCreationTimestamp=None,
            recordUpdateTime=format_utc_datetime(now),
            contentCategories_lvl0="Documents",
            contentCategories_lvl1=f"Documents > {series.upper()}",
            contentType=ContentType.UNKNOWN.value,
            description=self.description,
            content=self.description,
            handle=self.handle,
            number=number,
            series=series,
            authorNames=self.authorNames,
            pIndex=None,
            h1=self.title,
            githubRepoUrl=self.githubRepoUrl,
        )


async def add_manual_doc_stub(
    algolia_index: SearchIndexAsync, dataset: str
) -> None:
    """Save a manually-described document stub to an Algolia index.

    Parameters
    ----------
    algolia_index
        The Algolia index that receives the record.
    dataset
        Raw serialized content that is parsed into a
        `MinimalDocumentModel` with ``parse_raw``.

    Notes
    -----
    The record dictionary and the response from the save call are both
    printed. After saving, ``delete_old_records`` is called with the
    record's base URL and surrogate key.
    """
    stub_record = MinimalDocumentModel.parse_raw(
        dataset
    ).make_algolia_record()
    # Serialize with field aliases and without unset optional fields.
    payload = stub_record.dict(by_alias=True, exclude_none=True)
    print(payload)
    save_response = await algolia_index.save_objects_async([payload])
    print(save_response)
    await delete_old_records(
        index=algolia_index,
        base_url=stub_record.baseUrl,
        surrogate_key=stub_record.surrogateKey,
        logger=structlog.get_logger(__file__),
    )
Loading

0 comments on commit 88c7fbb

Please sign in to comment.