Skip to content

Commit

Permalink
More Metadata Changes (#1180)
Browse files Browse the repository at this point in the history
* Add Typing for the Publishing Keyword Arguments

These arguments are being passed all over, and this enables some
type-safety. This is a potentially interesting use-case for
TypedDicts.

* Enable metadata only package+deployments.

As I go through LION+DCM+Zoning, I want to be able to assemble a package
of only metadata, and deploy only the metadata. This is for time
considerations more than anything else.

* Add remaining publication fields

* Use OrgMetadata in CLI tools

* Shuffle around the Org/Product/Dataset Overrides

Refactors the override classes, with the eventual goal of enabling
overrides/defaults from the org level. This is driven by the need to
set Org-level fields like `agency` at the top-level, and pass that along
to the Socrata connector.

* Enable Metadata at the Org Level

Allows us to set fields at the org-level, which can then be overridden at the
product-level, then the dataset-level. This is mostly for fields like
agency, which needs to be passed to the Socrata publish connector.

* Socrata Pub: pull new fields from metadata

Okay, now that we can set these fields at the org-level down through to
the dataset, we can remove the literals when we publish

* Add product metadata repo env var

It's really annoying to have to keep pasting the path to your metadata
repo. This adds the `PRODUCT_METADATA_REPO_PATH` env var, and makes that
path flag optional (it's now an override)

* Fix straggling kwarg errors

* Add test for md-only bytes pull, fix docs

* Throw an exception for missing socrata md fields

---------

Co-authored-by: Alex Richey <alexrichey@DCP-APPLE-2180.local>
  • Loading branch information
alexrichey and Alex Richey authored Oct 10, 2024
1 parent def79d7 commit 420a92a
Show file tree
Hide file tree
Showing 14 changed files with 235 additions and 114 deletions.
2 changes: 2 additions & 0 deletions dcpy/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@
"db-zoningtaxlots",
]
IGNORED_LOGGING_BUILDS = ["nightly_qa", "compile_python_reqs"]

PRODUCT_METADATA_REPO_PATH = env.get("PRODUCT_METADATA_REPO_PATH")
27 changes: 22 additions & 5 deletions dcpy/connectors/socrata/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,29 +114,46 @@ class Inputs:
class DatasetMetadata(BaseModel):
name: str
description: str
category: str
attribution: str
attributionLink: str
tags: list[str]
# licenseId: str
metadata: dict[str, Any]
privateMetadata: dict[str, Any]

@classmethod
def from_dataset_attributes(cls, attrs: md.DatasetAttributes):
if not (attrs.category and attrs.agency and attrs.publishing_frequency):
raise Exception(
f"Required metadata fields are missing. Found category: {attrs.category}, agency: {attrs.agency} or publishing_frequency: {attrs.publishing_frequency}"
)

return cls(
name=attrs.display_name,
description=attrs.description,
tags=attrs.tags,
category=attrs.category,
attribution=attrs.attribution or "",
attributionLink=attrs.attributionLink or "",
tags=attrs.tags or [],
metadata={
"rowLabel": attrs.each_row_is_a,
"custom_fields": {
"Dataset Information": {"Agency": attrs.agency},
"Update": {
"Data Change Frequency": attrs.publishing_frequency,
"Date Made Public": attrs.date_made_public,
"Update Frequency Details": attrs.publishing_frequency_details,
"Update Frequency": translate_legislative_freq_to_update_freq(
attrs.publishing_frequency or ""
attrs.publishing_frequency
)
or attrs.publishing_frequency,
"Automation": "Yes",
}
},
},
},
privateMetadata={
# "contactEmail": "", # Leaving this here in case we want to add it, so we don't have to remember what the field is called
"custom_fields": {
"Legislative Compliance": {
"Removed Records?": "Yes", # refers to row removal at time of push to Socrata. Always true since we overwrite the existing dataset.
Expand All @@ -153,8 +170,8 @@ def from_dataset_attributes(cls, attrs: md.DatasetAttributes):
if attrs.custom.get("dataset_from_open_data_plan")
else "No"
),
}
}
},
},
},
)

Expand Down
25 changes: 19 additions & 6 deletions dcpy/lifecycle/distribute/socrata.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
from pathlib import Path
import typer
from typing import TypedDict, Unpack, NotRequired, Required

import dcpy.models.product.dataset.metadata_v2 as m
from dcpy.utils.logging import logger
import dcpy.connectors.edm.packaging as packaging
import dcpy.connectors.socrata.publish as soc_pub


class PublishKwargs(TypedDict):
metadata_path: NotRequired[Path]
publish: Required[bool]
ignore_validation_errors: Required[bool]
skip_validation: Required[bool]
metadata_only: Required[bool]


def dist_from_local(
package_path: Path,
dataset_destination_id: str,
Expand Down Expand Up @@ -53,27 +62,31 @@ def dist_from_local(
metadata=md,
dataset_destination_id=dataset_destination_id,
dataset_package_path=package_path,
publish=publish,
metadata_only=metadata_only,
publish=bool(publish),
metadata_only=bool(metadata_only),
)
except Exception as e:
return f"Error pushing {md.attributes.display_name}, destination: {dest.id}: {str(e)}"


def dist_from_local_all_socrata(
package_path: Path,
**pub_kwargs,
**pub_kwargs: Unpack[PublishKwargs],
):
"""Distributes all Socrata destinations within a given metadata"""
md = m.Metadata.from_path(package_path / "metadata.yml")
local_pub_kwargs = pub_kwargs.copy()
local_pub_kwargs.pop(
"metadata_path"
) if "metadata_path" in local_pub_kwargs else None

socrata_dests = [d.id for d in md.destinations if d.type == "socrata"]
logger.info(f"Distributing {md.attributes.display_name}: {socrata_dests}")
results = [
dist_from_local(
package_path=package_path,
metadata_path=None,
dataset_destination_id=dataset_destination_id,
**pub_kwargs,
**local_pub_kwargs,
)
for dataset_destination_id in socrata_dests
]
Expand All @@ -82,7 +95,7 @@ def dist_from_local_all_socrata(

def dist_from_local_product_all_socrata(
product_path: Path,
**pub_kwargs,
**pub_kwargs: Unpack[PublishKwargs],
):
"""Distribute datasets for an entire product."""
results = []
Expand Down
91 changes: 70 additions & 21 deletions dcpy/lifecycle/package/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
import tempfile
import typer

from dcpy.configuration import PRODUCT_METADATA_REPO_PATH
from dcpy.lifecycle import WORKING_DIRECTORIES
from dcpy.lifecycle.package import oti_xlsx
from dcpy.lifecycle.package import assemble
import dcpy.models.product.dataset.metadata_v2 as md
import dcpy.models.product.metadata as prod_md
from dcpy.utils.logging import logger


Expand Down Expand Up @@ -125,6 +127,7 @@ def pull_destination_files(
destination_id: str,
*,
unpackage_zips: bool = False,
metadata_only: bool = False,
):
"""Pull all files for a given destination."""
dest = product_metadata.get_destination(destination_id)
Expand All @@ -141,6 +144,12 @@ def pull_destination_files(
package_ids = {p.id for p in product_metadata.assembly}
for f in dest.files:
paths_and_dests = ids_to_paths_and_dests[f.id]
f_is_metadata = (
f.id in product_metadata.get_file_ids()
and product_metadata.get_file_and_overrides(f.id).file.is_metadata
)
if metadata_only and not f_is_metadata:
continue
file_path = local_package_path / (paths_and_dests["path"])
logger.info(f"{local_package_path} - {paths_and_dests['path']} - {file_path}")

Expand Down Expand Up @@ -177,17 +186,22 @@ def pull_all_destination_files(local_package_path: Path, product_metadata: md.Me


def assemble_dataset_from_bytes(
dataset_metadata: md.Metadata,
*,
dataset_metadata: md.Metadata,
product: str,
version: str,
source_destination_id: str,
out_path: Path | None = None,
metadata_only: bool = False,
) -> Path:
out_path = out_path or ASSEMBLY_DIR / product / version / dataset_metadata.id
logger.info(f"Assembling dataset from BYTES. Writing to: {out_path}")
assemble.pull_destination_files(
out_path, dataset_metadata, source_destination_id, unpackage_zips=True
out_path,
dataset_metadata,
source_destination_id,
unpackage_zips=True,
metadata_only=metadata_only,
)

oti_data_dictionaries = [
Expand Down Expand Up @@ -219,44 +233,79 @@ def assemble_dataset_from_bytes(
app = typer.Typer()


@app.command("assemble_from_bytes")
def assemble_dataset_from_bytes_cli(
    product: str,
    version: str,
    org_metadata_path: Path = typer.Option(
        PRODUCT_METADATA_REPO_PATH,
        "-z",
        "--metadata-path",
        help="Path to metadata repo. Optionally, set in your env.",
    ),
    dataset: str = typer.Option(
        None,
        "--dataset",
        "-d",
        help="Dataset, if different from product",
    ),
    out_path: Path = typer.Option(
        None,
        "--output-path",
        "-o",
        help="Output Path. Defaults to ./data_dictionary.xlsx",
    ),
    metadata_only: bool = typer.Option(
        False,
        # BUG FIX: this option previously re-declared "--output-path",
        # colliding with the out_path option above. Use its own long flag,
        # matching the "-m"/"--metadata-only" convention used by the other
        # distribution CLIs.
        "--metadata-only",
        "-m",
        help="Only Assemble Metadata.",
    ),
    source_destination_id: str = typer.Option(
        BYTES_DEST_TYPE,
        "--source-destination-id",
        "-s",
        help="The Destination which acts as a source for this assembly",
    ),
):
    """Assemble a dataset package from BYTES.

    Resolves the dataset metadata from the org-level metadata repo
    (path taken from the flag or the PRODUCT_METADATA_REPO_PATH env var),
    then delegates to `assemble_dataset_from_bytes`.
    """
    dataset_name = dataset or product
    org_md = prod_md.OrgMetadata.from_path(
        org_metadata_path, template_vars={"version": version}
    )

    assemble_dataset_from_bytes(
        dataset_metadata=org_md.product(product).dataset(dataset_name),
        product=product,
        source_destination_id=source_destination_id,
        version=version,
        out_path=out_path,
        metadata_only=metadata_only,
    )


@app.command("pull_dataset")
def _dataset_from_bytes_cli(
    product: str,
    version: str,
    dataset: str = typer.Option(
        None,
        "--dataset",
        "-d",
        help="Dataset, if different from product",
    ),
    org_metadata_path: Path = typer.Option(
        PRODUCT_METADATA_REPO_PATH,
        "-z",
        "--metadata-path",
        help="Path to metadata repo. Optionally, set in your env.",
    ),
):
    """Pull all destination files for a single dataset into the assembly dir.

    The dataset metadata is resolved from the org-level metadata repo
    rather than a per-dataset metadata.yml path.
    """
    dataset = dataset or product
    org_md = prod_md.OrgMetadata.from_path(
        org_metadata_path, template_vars={"version": version}
    )
    out_dir = ASSEMBLY_DIR / product / version / dataset
    dataset_metadata = org_md.product(product).dataset(dataset)
    pull_all_destination_files(
        local_package_path=out_dir, product_metadata=dataset_metadata
    )


@app.command("pull_product")
def _product_from_bytes_cli(product_metadata_path: Path, product: str, version: str):
    """Pull files for every dataset folder under a product's metadata path."""
    # Skip hidden entries (e.g. .git) when scanning the product directory.
    for folder in [
        p for p in product_metadata_path.iterdir() if not p.name.startswith(".")
    ]:
        # assumes the folders == the name of the dataset, which is true. For now.
        # NOTE(review): this positional call (metadata.yml path first) matches the
        # old `_dataset_from_bytes_cli(metadata_path, product, version, dataset)`
        # signature — confirm it still matches the current CLI signature, which
        # appears to take (product, version, dataset=..., org_metadata_path=...).
        _dataset_from_bytes_cli(folder / "metadata.yml", product, version, folder.name)
40 changes: 25 additions & 15 deletions dcpy/lifecycle/scripts/package_and_distribute.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,38 @@
from pathlib import Path
import typer
from typing import Unpack

from dcpy.configuration import PRODUCT_METADATA_REPO_PATH
from dcpy.models.product import metadata as product_metadata
from dcpy.lifecycle.distribute import socrata as soc_dist
from dcpy.lifecycle.package import assemble
from dcpy.utils.logging import logger


def from_bytes_to_tagged_socrata(
product_metadata_path: Path,
org_metadata_path: Path,
product: str,
version: str,
destination_tag: str,
source_destination_id: str = "bytes",
**publish_kwargs,
**publish_kwargs: Unpack[soc_dist.PublishKwargs],
):
"""Package from bytes, and"""
product = product_metadata.ProductMetadata.from_path(
root_path=product_metadata_path,
"""Package tagged datsets from bytes, and distribute to Socrata."""
org_md = product_metadata.OrgMetadata.from_path(
path=org_metadata_path,
template_vars={"version": version},
)
dests = product.get_tagged_destinations(destination_tag)
product_md = org_md.product(product)
dests = product_md.get_tagged_destinations(destination_tag)

logger.info(f"Packaging {product.metadata.id}. Datasets: {list(dests.keys())}")
logger.info(f"Packaging {product_md.metadata.id}. Datasets: {list(dests.keys())}")
package_paths = {}
for ds_id, dests_to_mds in dests.items():
dataset_metadata = list(dests_to_mds.values())[0]
out_path = assemble.assemble_dataset_from_bytes(
dataset_metadata,
product=product.metadata.id,
dataset_metadata=product_md.dataset(ds_id),
product=product,
version=version,
source_destination_id=source_destination_id,
source_destination_id="bytes",
metadata_only=publish_kwargs["metadata_only"],
)
package_paths[ds_id] = out_path

Expand All @@ -50,8 +53,14 @@ def from_bytes_to_tagged_socrata(

@app.command("from_bytes_to_tagged_socrata")
def from_bytes_to_tagged_socrata_cli(
product_metadata_path: Path,
product: str,
version: str,
org_metadata_path: Path = typer.Option(
PRODUCT_METADATA_REPO_PATH,
"-o",
"--metadata-path",
help="Path to metadata repo. Optionally, set in your env.",
),
destination_tag: str = typer.Option(
None,
"-t",
Expand All @@ -78,13 +87,14 @@ def from_bytes_to_tagged_socrata_cli(
),
metadata_only: bool = typer.Option(
False,
"-z",
"-m",
"--metadata-only",
help="Only push metadata (including attachments).",
),
):
results = from_bytes_to_tagged_socrata(
product_metadata_path,
org_metadata_path,
product,
version,
destination_tag,
publish=publish,
Expand Down
Loading

0 comments on commit 420a92a

Please sign in to comment.