Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-37331: Add ook upload-doc-stub command #126

Merged
merged 8 commits into from
Dec 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,6 @@ jobs:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Log in to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}

- name: Log in to GitHub Container Registry
uses: docker/login-action@v2
with:
Expand All @@ -101,7 +95,6 @@ jobs:
context: .
push: true
tags: |
lsstsqre/ook:${{ steps.vars.outputs.tag }}
ghcr.io/lsst-sqre/ook:${{ steps.vars.outputs.tag }}
cache-from: type=gha
cache-to: type=gha,mode=max
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ repos:
args: [--autofix, --indent=2, '--top-keys=name,doc,type']

- repo: https://github.com/PyCQA/isort
rev: 5.10.1
rev: 5.11.2
hooks:
- id: isort
additional_dependencies:
- toml

- repo: https://github.com/psf/black
rev: 22.10.0
rev: 22.12.0
hooks:
- id: black

Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
Change log
##########

Unreleased
==========

- Update to Python 3.10.
- Add ``ook upload-doc-stub`` CLI command to manually add a single record to Algolia to stub a document into the www.lsst.io search index. This is useful for cases where a document can't be normally indexed by Ook.

0.5.0 (2021-12-01)
==================

Expand Down
26 changes: 26 additions & 0 deletions src/ook/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@

__all__ = ["main", "help", "run"]

from pathlib import Path
from typing import Any, Union

import click
from aiohttp.web import run_app
from algoliasearch.search_client import SearchClient

from ook.app import create_app
from ook.config import Configuration
from ook.ingest.workflows.manualstub import add_manual_doc_stub
from ook.utils import run_with_asyncio

# Add -h as a help shortcut option
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
Expand Down Expand Up @@ -52,3 +57,24 @@ def run(ctx: click.Context, port: int) -> None:
"""Run the application (for production)."""
app = create_app()
run_app(app, port=port)


@main.command()
@click.option(
    "--dataset", required=True, type=click.Path(exists=True, path_type=Path)
)
@click.pass_context
@run_with_asyncio
async def upload_doc_stub(ctx: click.Context, dataset: Path) -> None:
    """Upload a stub record for a document that can't be normally indexed.

    The --dataset option is the path to a file describing the document
    (title, handle, url, authorNames, description, and optionally
    githubRepoUrl). Its contents are passed to ``add_manual_doc_stub``.
    """
    config = Configuration()

    # Bind to locals so the None checks below narrow the types, and validate
    # explicitly rather than with ``assert`` (asserts are stripped under
    # ``python -O`` and produce an unhelpful traceback for a CLI user).
    index_name = config.algolia_document_index_name
    app_id = config.algolia_app_id
    api_key = config.algolia_api_key
    if index_name is None or app_id is None or api_key is None:
        raise click.ClickException(
            "Algolia is not fully configured: algolia_document_index_name, "
            "algolia_app_id, and algolia_api_key must all be set."
        )

    async with SearchClient.create(
        app_id,
        api_key=api_key.get_secret_value(),
    ) as client:
        index = client.init_index(index_name)
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        await add_manual_doc_stub(index, dataset.read_text(encoding="utf-8"))
147 changes: 83 additions & 64 deletions src/ook/ingest/algolia/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from base64 import b64encode
from typing import List, Optional

from pydantic import BaseModel, HttpUrl
from pydantic import BaseModel, Field, HttpUrl

__all__ = [
"DocumentRecord",
Expand All @@ -21,96 +21,112 @@
class DocumentRecord(BaseModel):
"""Model for an Algolia record of a document."""

objectID: str
"""The Algolia record object identifier."""
objectID: str = Field(description="The Algolia record object identifier.")

surrogateKey: str
"""A key that groups records from the same URL together for a given
ingest so that old records can be dropped from the Algolia index.
"""

sourceUpdateTime: str
"""An ISO 8601 date time for when the source was updated."""
surrogateKey: str = Field(
description=(
"A key that groups records from the same URL together for a given "
"ingest so that old records can be dropped from the Algolia index."
)
)

sourceUpdateTimestamp: int
"""A Unix timestamp for when the source was updated.
sourceUpdateTime: str = Field(
description="An ISO 8601 date time for when the source was updated."
)

This is intended as a sortable version of `sourceUpdateTime`.
"""
sourceUpdateTimestamp: int = Field(
description=(
"A Unix timestamp for when the source was updated. This is "
"intended as a sortable version of `sourceUpdateTime`."
)
)

sourceCreationTimestamp: Optional[int]
"""A unix timestamp for when the source document was created."""
sourceCreationTimestamp: Optional[int] = Field(
None,
description=(
"A Unix timestamp for when the source document " "was created."
),
)

recordUpdateTime: str
"""A ISO 8601 date time for when this record was created."""
recordUpdateTime: str = Field(
description="A ISO 8601 date time for when this record was created."
)

# str, not HttpUrl because of
# https://sqr-027.lsst.io/#What-is-observability?
# Ideally we'd want to escape this properly
url: str
"""The URL of the record."""

baseUrl: HttpUrl
"""The base URL of the record (whereas ``url`` may refer to an anchor link.
"""

content: str
"""The full-text content of the record."""
url: str = Field(
description=(
"The URL of the record. For subsection, this URL can end with an "
"anchor target."
),
example="https://sqr-027.lsst.io/#What-is-observability?",
)

baseUrl: HttpUrl = Field(
description=(
"The base URL of the record (whereas ``url`` may refer to an "
"anchor link."
)
)

importance: int = 1
"""The importance of the record.
content: str = Field(description="The full-text content of the record.")

Generally importance should be set by the header level: 1 for h1, 2 for h2,
and so on.
"""
importance: int = Field(
1,
description=(
"The importance of the record. Generally importance should be set "
"by the header level: 1 for h1, 2 for h2, and so on."
),
)

contentCategories_lvl0: str
"""Content category."""
contentCategories_lvl0: str = Field(description="Content category.")

contentCategories_lvl1: Optional[str]
"""Content sub-category (level 1)."""
contentCategories_lvl1: Optional[str] = Field(
None, description="Content sub-category (level 1)."
)

contentType: str
"""Content type (ook classification)."""
contentType: str = Field(description="Content type (ook classification).")

description: str
"""Description of the URL or short summary for the ``baseUrl``."""
description: str = Field(
description=(
"Description of the URL or short summary for the ``baseUrl``."
)
)

handle: str
"""Document handle."""
handle: str = Field(description="Document handle.")

number: int
"""Serial number component of the document handle."""
number: int = Field(
description=(
"Serial number component of the document handle (``handle``)."
)
)

series: str
"""Series component of the document handle."""
series: str = Field(
description="Series component of the document handle (``handle``)."
)

authorNames: List[str]
"""Names of authors."""
authorNames: List[str] = Field(description="Names of authors.")

h1: str
"""The H1 headline (title)."""
h1: str = Field(description="The H1 headline (title).")

h2: Optional[str]
"""The h2 headline."""
h2: Optional[str] = Field(None, description="The h2 headline.")

h3: Optional[str]
"""The h3 headline."""
h3: Optional[str] = Field(None, description="The h3 headline.")

h4: Optional[str]
"""The h4 headline."""
h4: Optional[str] = Field(None, description="The h4 headline.")

h5: Optional[str]
"""The h5 headline."""
h5: Optional[str] = Field(None, description="The h5 headline.")

h6: Optional[str]
"""The h6 headline."""
h6: Optional[str] = Field(None, description="The h6 headline.")

pIndex: Optional[int]
"""The paragraph index corresponding to a section."""
pIndex: Optional[int] = Field(
None, description="The paragraph index corresponding to a section."
)

githubRepoUrl: Optional[HttpUrl]
"""URL of the source repository."""
githubRepoUrl: Optional[HttpUrl] = Field(
None, description="URL of the source repository."
)

class Config:

Expand All @@ -120,6 +136,9 @@ class Config:
}
"""Alias for fields that aren't allowable Python names."""

allow_population_by_field_name = True
"""Enables use of Python name for constructing the record."""

extra = "forbid"
"""Disable attributes that aren't part of the schema."""

Expand Down
89 changes: 89 additions & 0 deletions src/ook/ingest/workflows/manualstub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Workflow for manually adding a document stub record to Algolia, without
parsing a document.
"""

from __future__ import annotations

from datetime import datetime, timezone
from typing import Optional

import structlog
from algoliasearch.search_index_async import SearchIndexAsync
from pydantic import BaseModel, Field, HttpUrl

from ook.classification import ContentType
from ook.ingest.algolia.expiration import delete_old_records
from ook.ingest.algolia.records import (
DocumentRecord,
format_timestamp,
format_utc_datetime,
generate_object_id,
generate_surrogate_key,
)


class MinimalDocumentModel(BaseModel):
    """Model for a manually-added record."""

    title: str = Field(description="Document's title")

    handle: str = Field(description="Document handle.")

    url: HttpUrl = Field(description="The document's URL.")

    authorNames: list[str] = Field(
        default_factory=list, description="Author names"
    )

    description: str = Field(description="Description of the document.")

    githubRepoUrl: Optional[HttpUrl] = Field(
        None, description="URL of the source repository."
    )

    def make_algolia_record(self) -> DocumentRecord:
        """Create the Algolia record that stubs this document.

        The record uses the document's URL as both ``url`` and ``baseUrl``,
        the title as ``h1``, and the description as both ``description`` and
        ``content``. The current UTC time is used for the source-update and
        record-update times.

        Returns
        -------
        DocumentRecord
            The record for this document.

        Raises
        ------
        ValueError
            Raised if ``handle`` is not of the form ``SERIES-NUMBER`` with an
            integer number component.
        """
        object_id = generate_object_id(url=str(self.url), headers=[self.title])
        surrogate_key = generate_surrogate_key()
        # Timezone-aware "now" in UTC; datetime.utcnow() is deprecated and
        # returns a naive datetime that then has to be patched with a tzinfo.
        now = datetime.now(tz=timezone.utc)

        # The handle must be exactly "<series>-<number>". Check the shape
        # up front so a malformed handle gives a clear message instead of a
        # tuple-unpacking error.
        handle_parts = self.handle.split("-")
        if len(handle_parts) != 2:
            raise ValueError(
                f"Handle {self.handle!r} must have the form 'SERIES-NUMBER'."
            )
        series, number = handle_parts

        return DocumentRecord(
            objectID=object_id,
            baseUrl=self.url,
            url=self.url,
            surrogateKey=surrogate_key,
            sourceUpdateTime=format_utc_datetime(now),
            sourceUpdateTimestamp=format_timestamp(now),
            sourceCreationTimestamp=None,
            recordUpdateTime=format_utc_datetime(now),
            contentCategories_lvl0="Documents",
            contentCategories_lvl1=f"Documents > {series.upper()}",
            contentType=ContentType.UNKNOWN.value,
            description=self.description,
            content=self.description,
            handle=self.handle,
            number=int(number),
            series=series,
            authorNames=self.authorNames,
            pIndex=None,
            h1=self.title,
            githubRepoUrl=self.githubRepoUrl,
        )


async def add_manual_doc_stub(
    algolia_index: SearchIndexAsync, dataset: str
) -> None:
    """Add a manually-described document stub record to an Algolia index.

    Parameters
    ----------
    algolia_index
        The Algolia index that the record is saved to.
    dataset
        The serialized document metadata, parsed with
        ``MinimalDocumentModel.parse_raw``.
    """
    logger = structlog.get_logger(__file__)
    data = MinimalDocumentModel.parse_raw(dataset)
    record = data.make_algolia_record()
    # Serialize with the Algolia field aliases and omit unset optional fields.
    record_dict = record.dict(by_alias=True, exclude_none=True)
    # Report progress through the logger rather than print() so the output
    # goes through the application's logging configuration.
    logger.info("Uploading document stub record", record=record_dict)
    result = await algolia_index.save_objects_async([record_dict])
    logger.info(
        "Saved document stub record",
        object_id=record.objectID,
        result=repr(result),
    )
    # Drop records for the same base URL that carry a different surrogate
    # key, i.e. those left over from an earlier ingest.
    await delete_old_records(
        index=algolia_index,
        base_url=record.baseUrl,
        surrogate_key=record.surrogateKey,
        logger=logger,
    )
Loading