Skip to content

Commit

Permalink
More Metadata Changes (#1180)
Browse files Browse the repository at this point in the history
* Add Typing for the Publishing Keyword Arguments

These arguments are being passed all over, and this enables some
type-safety. This is a potentially interesting use-case for
TypedDicts.

* Enable metadata only package+deployments.

As I go through LION+DCM+Zoning, I want to be able to assemble a package
of only metadata, and deploy only the metadata. This is for time
considerations more than anything else.

* Add remaining publication fields

* Use OrgMetadata in CLI tools

* Shuffle around the Org/Product/Dataset Overrides

Refactors the override classes, with the eventual goal of enabling
overrides/defaults from the org level. This is driven by the need to
set Org-level fields like `agency` at the top-level, and pass that along
to the Socrata connector.

* Enable Metadata at the Org Level

Allows us to set fields at the org-level, which can then be overridden at the
product-level, then the dataset-level. This is mostly for fields like
agency, which needs to be passed to the Socrata publish connector.

* Socrata Pub: pull new fields from metadata

Okay, now that we can set these fields at the org-level down through to
the dataset, we can remove the literals when we publish

* Add product metadata repo env var

It's really annoying to have to keep pasting the path to your metadata
repo. This adds the `PRODUCT_METADATA_REPO_PATH` env var, and makes that
path flag optional (it's now an override)

* Fix straggling kwarg errors

* Add test for md-only bytes pull, fix docs

* Throw an exception for missing socrata md fields

---------

Co-authored-by: Alex Richey <alexrichey@DCP-APPLE-2180.local>
  • Loading branch information
alexrichey and Alex Richey authored Oct 10, 2024
1 parent def79d7 commit 420a92a
Show file tree
Hide file tree
Showing 14 changed files with 235 additions and 114 deletions.
2 changes: 2 additions & 0 deletions dcpy/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@
"db-zoningtaxlots",
]
IGNORED_LOGGING_BUILDS = ["nightly_qa", "compile_python_reqs"]

PRODUCT_METADATA_REPO_PATH = env.get("PRODUCT_METADATA_REPO_PATH")
27 changes: 22 additions & 5 deletions dcpy/connectors/socrata/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,29 +114,46 @@ class Inputs:
class DatasetMetadata(BaseModel):
name: str
description: str
category: str
attribution: str
attributionLink: str
tags: list[str]
# licenseId: str
metadata: dict[str, Any]
privateMetadata: dict[str, Any]

@classmethod
def from_dataset_attributes(cls, attrs: md.DatasetAttributes):
if not (attrs.category and attrs.agency and attrs.publishing_frequency):
raise Exception(
f"Required metadata fields are missing. Found category: {attrs.category}, agency: {attrs.agency} or publishing_frequency: {attrs.publishing_frequency}"
)

return cls(
name=attrs.display_name,
description=attrs.description,
tags=attrs.tags,
category=attrs.category,
attribution=attrs.attribution or "",
attributionLink=attrs.attributionLink or "",
tags=attrs.tags or [],
metadata={
"rowLabel": attrs.each_row_is_a,
"custom_fields": {
"Dataset Information": {"Agency": attrs.agency},
"Update": {
"Data Change Frequency": attrs.publishing_frequency,
"Date Made Public": attrs.date_made_public,
"Update Frequency Details": attrs.publishing_frequency_details,
"Update Frequency": translate_legislative_freq_to_update_freq(
attrs.publishing_frequency or ""
attrs.publishing_frequency
)
or attrs.publishing_frequency,
"Automation": "Yes",
}
},
},
},
privateMetadata={
# "contactEmail": "", # Leaving this here in case we want to add it, so we don't have to remember what the field is called
"custom_fields": {
"Legislative Compliance": {
"Removed Records?": "Yes", # refers to row removal at time of push to Socrata. Always true since we overwrite the existing dataset.
Expand All @@ -153,8 +170,8 @@ def from_dataset_attributes(cls, attrs: md.DatasetAttributes):
if attrs.custom.get("dataset_from_open_data_plan")
else "No"
),
}
}
},
},
},
)

Expand Down
25 changes: 19 additions & 6 deletions dcpy/lifecycle/distribute/socrata.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
from pathlib import Path
import typer
from typing import TypedDict, Unpack, NotRequired, Required

import dcpy.models.product.dataset.metadata_v2 as m
from dcpy.utils.logging import logger
import dcpy.connectors.edm.packaging as packaging
import dcpy.connectors.socrata.publish as soc_pub


class PublishKwargs(TypedDict):
metadata_path: NotRequired[Path]
publish: Required[bool]
ignore_validation_errors: Required[bool]
skip_validation: Required[bool]
metadata_only: Required[bool]


def dist_from_local(
package_path: Path,
dataset_destination_id: str,
Expand Down Expand Up @@ -53,27 +62,31 @@ def dist_from_local(
metadata=md,
dataset_destination_id=dataset_destination_id,
dataset_package_path=package_path,
publish=publish,
metadata_only=metadata_only,
publish=bool(publish),
metadata_only=bool(metadata_only),
)
except Exception as e:
return f"Error pushing {md.attributes.display_name}, destination: {dest.id}: {str(e)}"


def dist_from_local_all_socrata(
package_path: Path,
**pub_kwargs,
**pub_kwargs: Unpack[PublishKwargs],
):
"""Distributes all Socrata destinations within a given metadata"""
md = m.Metadata.from_path(package_path / "metadata.yml")
local_pub_kwargs = pub_kwargs.copy()
local_pub_kwargs.pop(
"metadata_path"
) if "metadata_path" in local_pub_kwargs else None

socrata_dests = [d.id for d in md.destinations if d.type == "socrata"]
logger.info(f"Distributing {md.attributes.display_name}: {socrata_dests}")
results = [
dist_from_local(
package_path=package_path,
metadata_path=None,
dataset_destination_id=dataset_destination_id,
**pub_kwargs,
**local_pub_kwargs,
)
for dataset_destination_id in socrata_dests
]
Expand All @@ -82,7 +95,7 @@ def dist_from_local_all_socrata(

def dist_from_local_product_all_socrata(
product_path: Path,
**pub_kwargs,
**pub_kwargs: Unpack[PublishKwargs],
):
"""Distribute datasets for an entire product."""
results = []
Expand Down
91 changes: 70 additions & 21 deletions dcpy/lifecycle/package/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
import tempfile
import typer

from dcpy.configuration import PRODUCT_METADATA_REPO_PATH
from dcpy.lifecycle import WORKING_DIRECTORIES
from dcpy.lifecycle.package import oti_xlsx
from dcpy.lifecycle.package import assemble
import dcpy.models.product.dataset.metadata_v2 as md
import dcpy.models.product.metadata as prod_md
from dcpy.utils.logging import logger


Expand Down Expand Up @@ -125,6 +127,7 @@ def pull_destination_files(
destination_id: str,
*,
unpackage_zips: bool = False,
metadata_only: bool = False,
):
"""Pull all files for a given destination."""
dest = product_metadata.get_destination(destination_id)
Expand All @@ -141,6 +144,12 @@ def pull_destination_files(
package_ids = {p.id for p in product_metadata.assembly}
for f in dest.files:
paths_and_dests = ids_to_paths_and_dests[f.id]
f_is_metadata = (
f.id in product_metadata.get_file_ids()
and product_metadata.get_file_and_overrides(f.id).file.is_metadata
)
if metadata_only and not f_is_metadata:
continue
file_path = local_package_path / (paths_and_dests["path"])
logger.info(f"{local_package_path} - {paths_and_dests['path']} - {file_path}")

Expand Down Expand Up @@ -177,17 +186,22 @@ def pull_all_destination_files(local_package_path: Path, product_metadata: md.Me


def assemble_dataset_from_bytes(
dataset_metadata: md.Metadata,
*,
dataset_metadata: md.Metadata,
product: str,
version: str,
source_destination_id: str,
out_path: Path | None = None,
metadata_only: bool = False,
) -> Path:
out_path = out_path or ASSEMBLY_DIR / product / version / dataset_metadata.id
logger.info(f"Assembling dataset from BYTES. Writing to: {out_path}")
assemble.pull_destination_files(
out_path, dataset_metadata, source_destination_id, unpackage_zips=True
out_path,
dataset_metadata,
source_destination_id,
unpackage_zips=True,
metadata_only=metadata_only,
)

oti_data_dictionaries = [
Expand Down Expand Up @@ -219,44 +233,79 @@ def assemble_dataset_from_bytes(
app = typer.Typer()


@app.command("assemble_from_bytes")
def assemble_dataset_from_bytes_cli(
    product: str,
    version: str,
    org_metadata_path: Path = typer.Option(
        PRODUCT_METADATA_REPO_PATH,
        "-z",
        "--metadata-path",
        help="Path to metadata repo. Optionally, set in your env.",
    ),
    dataset: str = typer.Option(
        None,
        "--dataset",
        "-d",
        help="Dataset, if different from product",
    ),
    out_path: Path = typer.Option(
        None,
        "--output-path",
        "-o",
        help="Output Path. Defaults to ./data_dictionary.xlsx",
    ),
    metadata_only: bool = typer.Option(
        False,
        # BUG FIX: this option previously re-declared "--output-path",
        # colliding with the out_path option above. Use its own long flag,
        # matching the "-m"/"--metadata-only" convention used by the other
        # distribution CLIs.
        "--metadata-only",
        "-m",
        help="Only Assemble Metadata.",
    ),
    source_destination_id: str = typer.Option(
        BYTES_DEST_TYPE,
        "--source-destination-id",
        "-s",
        help="The Destination which acts as a source for this assembly",
    ),
):
    """Assemble a dataset package from BYTES.

    Resolves the dataset metadata from the org-level metadata repo
    (path taken from the flag or the PRODUCT_METADATA_REPO_PATH env var),
    then delegates to `assemble_dataset_from_bytes`.
    """
    dataset_name = dataset or product
    org_md = prod_md.OrgMetadata.from_path(
        org_metadata_path, template_vars={"version": version}
    )

    assemble_dataset_from_bytes(
        dataset_metadata=org_md.product(product).dataset(dataset_name),
        product=product,
        source_destination_id=source_destination_id,
        version=version,
        out_path=out_path,
        metadata_only=metadata_only,
    )


@app.command("pull_dataset")
def _dataset_from_bytes_cli(
    product: str,
    version: str,
    dataset: str = typer.Option(
        None,
        "--dataset",
        "-d",
        help="Dataset, if different from product",
    ),
    org_metadata_path: Path = typer.Option(
        PRODUCT_METADATA_REPO_PATH,
        "-z",
        "--metadata-path",
        help="Path to metadata repo. Optionally, set in your env.",
    ),
):
    """Pull all destination files for a single dataset into the assembly dir.

    The dataset metadata is resolved from the org-level metadata repo
    rather than a per-dataset metadata.yml path.
    """
    dataset = dataset or product
    org_md = prod_md.OrgMetadata.from_path(
        org_metadata_path, template_vars={"version": version}
    )
    out_dir = ASSEMBLY_DIR / product / version / dataset
    dataset_metadata = org_md.product(product).dataset(dataset)
    pull_all_destination_files(
        local_package_path=out_dir, product_metadata=dataset_metadata
    )


@app.command("pull_product")
def _product_from_bytes_cli(product_metadata_path: Path, product: str, version: str):
    """Pull files for every dataset folder under a product's metadata path."""
    # Skip hidden entries (e.g. .git) when scanning the product directory.
    for folder in [
        p for p in product_metadata_path.iterdir() if not p.name.startswith(".")
    ]:
        # assumes the folders == the name of the dataset, which is true. For now.
        # NOTE(review): this positional call (metadata.yml path first) matches the
        # old `_dataset_from_bytes_cli(metadata_path, product, version, dataset)`
        # signature — confirm it still matches the current CLI signature, which
        # appears to take (product, version, dataset=..., org_metadata_path=...).
        _dataset_from_bytes_cli(folder / "metadata.yml", product, version, folder.name)
40 changes: 25 additions & 15 deletions dcpy/lifecycle/scripts/package_and_distribute.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,38 @@
from pathlib import Path
import typer
from typing import Unpack

from dcpy.configuration import PRODUCT_METADATA_REPO_PATH
from dcpy.models.product import metadata as product_metadata
from dcpy.lifecycle.distribute import socrata as soc_dist
from dcpy.lifecycle.package import assemble
from dcpy.utils.logging import logger


def from_bytes_to_tagged_socrata(
product_metadata_path: Path,
org_metadata_path: Path,
product: str,
version: str,
destination_tag: str,
source_destination_id: str = "bytes",
**publish_kwargs,
**publish_kwargs: Unpack[soc_dist.PublishKwargs],
):
"""Package from bytes, and"""
product = product_metadata.ProductMetadata.from_path(
root_path=product_metadata_path,
"""Package tagged datsets from bytes, and distribute to Socrata."""
org_md = product_metadata.OrgMetadata.from_path(
path=org_metadata_path,
template_vars={"version": version},
)
dests = product.get_tagged_destinations(destination_tag)
product_md = org_md.product(product)
dests = product_md.get_tagged_destinations(destination_tag)

logger.info(f"Packaging {product.metadata.id}. Datasets: {list(dests.keys())}")
logger.info(f"Packaging {product_md.metadata.id}. Datasets: {list(dests.keys())}")
package_paths = {}
for ds_id, dests_to_mds in dests.items():
dataset_metadata = list(dests_to_mds.values())[0]
out_path = assemble.assemble_dataset_from_bytes(
dataset_metadata,
product=product.metadata.id,
dataset_metadata=product_md.dataset(ds_id),
product=product,
version=version,
source_destination_id=source_destination_id,
source_destination_id="bytes",
metadata_only=publish_kwargs["metadata_only"],
)
package_paths[ds_id] = out_path

Expand All @@ -50,8 +53,14 @@ def from_bytes_to_tagged_socrata(

@app.command("from_bytes_to_tagged_socrata")
def from_bytes_to_tagged_socrata_cli(
product_metadata_path: Path,
product: str,
version: str,
org_metadata_path: Path = typer.Option(
PRODUCT_METADATA_REPO_PATH,
"-o",
"--metadata-path",
help="Path to metadata repo. Optionally, set in your env.",
),
destination_tag: str = typer.Option(
None,
"-t",
Expand All @@ -78,13 +87,14 @@ def from_bytes_to_tagged_socrata_cli(
),
metadata_only: bool = typer.Option(
False,
"-z",
"-m",
"--metadata-only",
help="Only push metadata (including attachments).",
),
):
results = from_bytes_to_tagged_socrata(
product_metadata_path,
org_metadata_path,
product,
version,
destination_tag,
publish=publish,
Expand Down
Loading

0 comments on commit 420a92a

Please sign in to comment.