diff --git a/space2stats_api/src/README.md b/space2stats_api/src/README.md index a67d7fe..0883793 100644 --- a/space2stats_api/src/README.md +++ b/space2stats_api/src/README.md @@ -1,16 +1,10 @@ ## space2stats -### Generating Preliminary CATALOG, COLLECTION, and ITEM files +### Generating STAC files - Navigate to the METADATA sub-directory and run the following commands in order: 1. get_types.py 2. create_stac.py - Note that the get types function is reading in a parquet file from the following directory: space2stats_api/src/local.parquet -- Here is a workflow diagram of the initial STAC metadata creation: +- Here is a workflow diagram of the STAC metadata creation: -![Create Stac](../../docs/images/create_stac_workflow.png) - -### Adding new ITEM files -- In link_new_item.py set "Paths and metadata setup" in the main function to point towards the corresponding locally saved parquet file -- Navigate to the METADATA sub-directory and run the following commands in order: - 1. get_types.py - 2. line_new_items.py \ No newline at end of file +![Create Stac](../../docs/images/create_stac_workflow.png) \ No newline at end of file diff --git a/space2stats_api/src/space2stats_ingest/METADATA/Space2Stats Metadata Content.xlsx b/space2stats_api/src/space2stats_ingest/METADATA/Space2Stats Metadata Content.xlsx index c6b34e6..c3df1c7 100644 Binary files a/space2stats_api/src/space2stats_ingest/METADATA/Space2Stats Metadata Content.xlsx and b/space2stats_api/src/space2stats_ingest/METADATA/Space2Stats Metadata Content.xlsx differ diff --git a/space2stats_api/src/space2stats_ingest/METADATA/create_stac.py b/space2stats_api/src/space2stats_ingest/METADATA/create_stac.py index c3ab6ad..c1f530e 100644 --- a/space2stats_api/src/space2stats_ingest/METADATA/create_stac.py +++ b/space2stats_api/src/space2stats_ingest/METADATA/create_stac.py @@ -50,7 +50,9 @@ def load_metadata(file: str) -> Dict[str, pd.DataFrame]: # Function to create STAC catalog -def create_stac_catalog(overview: pd.DataFrame, nada: pd.DataFrame) -> Catalog: +def create_stac_catalog( + overview: pd.DataFrame, nada: pd.DataFrame, catalog_dir: str +) -> Catalog: catalog = Catalog( id="space2stats-catalog", description=overview.loc["Description Resource"].values[0], @@ -64,6 +66,8 @@ def create_stac_catalog(overview: pd.DataFrame, nada: pd.DataFrame) -> Catalog: href="https://worldbank.github.io/DECAT_Space2Stats/stac/catalog.json", ) + # catalog.set_self_href(os.path.relpath("catalog.json", start=catalog_dir)) + return catalog @@ -108,11 +112,11 @@ def create_stac_collection(overview: pd.DataFrame) -> Collection: # Function to create STAC Item from GeoDataFrame -def create_stac_item(column_types: dict, metadata: pd.DataFrame) -> Item: +def create_stac_item( + column_types: dict, feature_catalog: pd.DataFrame, item_dir: str +) -> Item: data_dict = [] - feature_catalog = metadata["feature_catalog"] - for column, dtype in column_types.items(): description = feature_catalog.loc[ feature_catalog["variable"] == column, "description" @@ -150,28 +154,26 @@ def create_stac_item(column_types: dict, metadata: pd.DataFrame) -> Item: 89.98750455101016, ] - sources = metadata["sources"] - pop_metadata = sources[sources["Name"] == "Population"].iloc[0] item = Item( id="space2stats_population_2020", geometry=geom, bbox=bbox, datetime=datetime.now(), properties={ - "name": pop_metadata["Name"], - "description": pop_metadata["Description"], - "methodological_notes": pop_metadata["Methodological Notes"], - "source_data": pop_metadata["Source Data"], - "sci:citation": pop_metadata["Citation source"], - "organization": pop_metadata["Organization"], - "method": pop_metadata["Method"], - "resolution": pop_metadata["Resolution"], + "name": "Population Data", + "description": "Gridded population disaggregated by gender for the year 2020, with data available for different age groups.", + "methodological_notes": "Global raster files are processed for each hexagonal grid using zonal statistics.", + "source_data": "WorldPop gridded population, 2020, Unconstrained, UN-Adjusted", + "sci:citation": "Stevens FR, Gaughan AE, Linard C, Tatem AJ (2015) Disaggregating Census Data for Population Mapping Using Random Forests with Remotely-Sensed and Ancillary Data.", + "organization": "WorldPop, https://www.worldpop.org", + "method": "sum", + "resolution": "100 meters", "table:primary_geometry": "geometry", "table:columns": data_dict, "vector:layers": { "space2stats": column_types_with_geometry, }, - "themes": pop_metadata["Theme"], + "themes": ["Demographics", "Population"], }, stac_extensions=[ "https://stac-extensions.github.io/table/v1.2.0/schema.json", @@ -179,6 +181,7 @@ def create_stac_item(column_types: dict, metadata: pd.DataFrame) -> Item: ], ) + # item.set_self_href(os.path.join("items", f"{item.id}.json")) return item @@ -229,6 +232,7 @@ def main(): catalog = create_stac_catalog( metadata["overview"], metadata["nada"], + join(git_root, metadata_dir, "stac"), ) # Create STAC collection @@ -237,7 +241,8 @@ def main(): # Create STAC item item = create_stac_item( column_types, - metadata, + metadata["feature_catalog"], + join(git_root, metadata_dir, "stac"), ) # Add assets to item diff --git a/space2stats_api/src/space2stats_ingest/METADATA/get_types.py b/space2stats_api/src/space2stats_ingest/METADATA/get_types.py index 37b22f7..daf8eed 100644 --- a/space2stats_api/src/space2stats_ingest/METADATA/get_types.py +++ b/space2stats_api/src/space2stats_ingest/METADATA/get_types.py @@ -33,7 +33,7 @@ def save_parquet_types_to_json(parquet_file: str, json_file: str): if __name__ == "__main__": git_root = get_git_root() - parquet_file = join(git_root, "space2stats_api/src/ntl2012.parquet") + parquet_file = join(git_root, "space2stats_api/src/space2stats.parquet") json_file = join( git_root, "space2stats_api/src/space2stats_ingest/METADATA/types.json" ) diff --git a/space2stats_api/src/space2stats_ingest/METADATA/link_new_item.py b/space2stats_api/src/space2stats_ingest/METADATA/link_new_item.py deleted file mode 100644 index b087081..0000000 --- a/space2stats_api/src/space2stats_ingest/METADATA/link_new_item.py +++ /dev/null @@ -1,151 +0,0 @@ -import ast -import json -import os -from datetime import datetime -from os.path import join -from typing import Dict - -import git -import pandas as pd -from pystac import Asset, CatalogType, Collection, Item -from pystac.extensions.table import TableExtension - - -# Function to get the root of the git repository -def get_git_root() -> str: - git_repo = git.Repo(os.getcwd(), search_parent_directories=True) - return git_repo.git.rev_parse("--show-toplevel") - - -# Function to load metadata from the Excel file -def load_metadata(file: str) -> Dict[str, pd.DataFrame]: - overview = pd.read_excel(file, sheet_name="DDH Dataset", index_col="Field") - nada = pd.read_excel(file, sheet_name="NADA", index_col="Field") - feature_catalog = pd.read_excel(file, sheet_name="Feature Catalog") - sources = pd.read_excel(file, sheet_name="Sources") - sources["Variables"] = sources.apply( - lambda x: ast.literal_eval(x["Variables"]), axis=1 - ) - return { - "overview": overview, - "nada": nada, - "feature_catalog": feature_catalog, - "sources": sources, - } - - -# Function to read the existing STAC collection -def load_existing_collection(collection_path: str) -> Collection: - return Collection.from_file(collection_path) - - -# Function to create a new STAC item -def create_new_item(sources: pd.DataFrame, column_types: dict, item_name: str) -> Item: - # Define geometry and bounding box (you may want to customize these) - geom = { - "type": "Polygon", - "coordinates": [ - [ - [-179.99999561620714, -89.98750455101016], - [-179.99999561620714, 89.98750455101016], - [179.99999096313272, 89.98750455101016], - [179.99999096313272, -89.98750455101016], - [-179.99999561620714, -89.98750455101016], - ] - ], - } - bbox = [ - -179.99999561620714, - -89.98750455101016, - 179.99999096313272, - 89.98750455101016, - ] - - # Get metadata for Population item - src_metadata = sources[sources["Name"] == "Nighttime Lights"].iloc[0] - - # Define the item - item = Item( - id=item_name, - geometry=geom, - bbox=bbox, - datetime=datetime.now(), - properties={ - "name": src_metadata["Name"], - "description": src_metadata["Description"], - "methodological_notes": src_metadata["Methodological Notes"], - "source_data": src_metadata["Source Data"], - "sci:citation": src_metadata["Citation source"], - "method": src_metadata["Method"], - "resolution": src_metadata["Resolution"], - "themes": src_metadata["Theme"], - }, - stac_extensions=[ - "https://stac-extensions.github.io/table/v1.2.0/schema.json", - "https://stac-extensions.github.io/scientific/v1.0.0/schema.json", - ], - ) - - # Add table columns as properties - TableExtension.add_to(item) - table_extension = TableExtension.ext(item, add_if_missing=True) - table_extension.columns = [ - {"name": col, "type": dtype} for col, dtype in column_types.items() - ] - - # Add asset - item.add_asset( - "api-docs", - Asset( - href="https://space2stats.ds.io/docs", - title="API Documentation", - media_type="text/html", - roles=["metadata"], - ), - ) - - return item - - -# Function to add the new item to the existing collection -def add_item_to_collection(collection: Collection, item: Item): - collection.add_item(item) - - -# Save the updated collection -def save_collection(collection: Collection, collection_path: str): - collection.normalize_hrefs(collection_path) - collection.save(catalog_type=CatalogType.RELATIVE_PUBLISHED) - - -# Main function -def main(): - git_root = get_git_root() - metadata_dir = join(git_root, "space2stats_api/src/space2stats_ingest/METADATA") - - # Paths and metadata setup - item_name = "space2stats_ntl_2013" - collection_path = join(metadata_dir, "stac/space2stats-collection/collection.json") - excel_path = join(metadata_dir, "Space2Stats Metadata Content.xlsx") - column_types_file = join(metadata_dir, "types.json") - - # Load metadata and column types - metadata = load_metadata(excel_path) - with open(column_types_file, "r") as f: - column_types = json.load(f) - - # Load existing collection - collection = load_existing_collection(collection_path) - - # Create a new item - new_item = create_new_item(metadata["sources"], column_types, item_name) - - # Add the new item to the collection - collection.add_item(new_item, title="Space2Stats NTL 2013 Data Item") - - # Save the updated collection - save_collection(collection, collection_path) - - -if __name__ == "__main__": - main() diff --git a/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/collection.json b/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/collection.json index 1de5433..04c8bdf 100644 --- a/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/collection.json +++ b/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/collection.json @@ -21,12 +21,6 @@ "href": "./space2stats_population_2020/space2stats_population_2020.json", "type": "application/json", "title": "Space2Stats Population Data Item" - }, - { - "rel": "item", - "href": "./space2stats_ntl_2013/space2stats_ntl_2013.json", - "type": "application/json", - "title": "Space2Stats NTL 2013 Data Item" } ], "Title": "Space2Stats Database", @@ -38,28 +32,12 @@ "hexagons", "global" ], - "title": "Space2Stats Collection", - "extent": { - "spatial": { - "bbox": [ - [ - -180.0, - -90.0, - 180.0, - 90.0 - ] - ] - }, - "temporal": { - "interval": [ - [ - "2020-01-01T00:00:00Z", - null - ] - ] + "summaries": { + "datetime": { + "min": "2020-01-01T00:00:00Z", + "max": null } }, - "license": "CC-BY-4.0", "providers": [ { "name": "World Bank", @@ -70,12 +48,6 @@ "url": "https://www.worldbank.org/" } ], - "summaries": { - "datetime": { - "min": "2020-01-01T00:00:00Z", - "max": null - } - }, "assets": { "documentation": { "href": "https://space2stats.ds.io/docs", @@ -85,5 +57,27 @@ "metadata" ] } - } + }, + "title": "Space2Stats Collection", + "extent": { + "spatial": { + "bbox": [ + [ + -180.0, + -90.0, + 180.0, + 90.0 + ] + ] + }, + "temporal": { + "interval": [ + [ + "2020-01-01T00:00:00Z", + null + ] + ] + } + }, + "license": "CC-BY-4.0" } \ No newline at end of file diff --git a/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/space2stats_ntl_2013/space2stats_ntl_2013.json b/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/space2stats_ntl_2013/space2stats_ntl_2013.json deleted file mode 100644 index f50c6c5..0000000 --- a/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/space2stats_ntl_2013/space2stats_ntl_2013.json +++ /dev/null @@ -1,283 +0,0 @@ -{ - "type": "Feature", - "stac_version": "1.0.0", - "stac_extensions": [ - "https://stac-extensions.github.io/table/v1.2.0/schema.json", - "https://stac-extensions.github.io/scientific/v1.0.0/schema.json" - ], - "id": "space2stats_ntl_2013", - "geometry": { - "type": "Polygon", - "coordinates": [ - [ - [ - -179.99999561620714, - -89.98750455101016 - ], - [ - -179.99999561620714, - 89.98750455101016 - ], - [ - 179.99999096313272, - 89.98750455101016 - ], - [ - 179.99999096313272, - -89.98750455101016 - ], - [ - -179.99999561620714, - -89.98750455101016 - ] - ] - ] - }, - "bbox": [ - -179.99999561620714, - -89.98750455101016, - 179.99999096313272, - 89.98750455101016 - ], - "properties": { - "name": "Nighttime Lights", - "description": "Sum of luminosity values measured by monthly composites from VIIRS satellite.", - "methodological_notes": "Monthly composites generated by NASA through the Lights Every Night partnership.", - "source_data": "World Bank - Light Every Night, https://registry.opendata.aws/wb-light-every-night/", - "sci:citation": null, - "method": "sum", - "resolution": "500 mts", - "themes": "Socio-economic", - "table:columns": [ - { - "name": "hex_id", - "description": "H3 unique identifier", - "type": "object" - }, - { - "name": "SUM_VIIRS_NTL_201301", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201301", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201301", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201301", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201302", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201302", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201302", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201302", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201303", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201303", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201303", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201303", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201304", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201304", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201304", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201304", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201305", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201305", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201305", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201305", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201306", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201306", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201306", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201306", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201307", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201307", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201307", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201307", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201308", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201308", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201308", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201308", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201309", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201309", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201309", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201309", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201310", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201310", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201310", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201310", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201311", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201311", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201311", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201311", - "type": "float64" - }, - { - "name": "SUM_VIIRS_NTL_201312", - "type": "float64" - }, - { - "name": "MIN_VIIRS_NTL_201312", - "type": "float64" - }, - { - "name": "MAX_VIIRS_NTL_201312", - "type": "float64" - }, - { - "name": "MEAN_VIIRS_NTL_201312", - "type": "float64" - } - ], - "datetime": "2024-10-30T17:00:16.514238Z" - }, - "links": [ - { - "rel": "root", - "href": "../../catalog.json", - "type": "application/json", - "title": "Space2Stats Database" - }, - { - "rel": "collection", - "href": "../collection.json", - "type": "application/json", - "title": "Space2Stats Collection" - }, - { - "rel": "parent", - "href": "../collection.json", - "type": "application/json", - "title": "Space2Stats Collection" - } - ], - "assets": { - "api-docs": { - "href": "https://space2stats.ds.io/docs", - "type": "text/html", - "title": "API Documentation", - "roles": [ - "metadata" - ] - } - }, - "collection": "space2stats-collection" -} \ No newline at end of file diff --git a/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/space2stats_population_2020/space2stats_population_2020.json b/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/space2stats_population_2020/space2stats_population_2020.json index 9febf04..b41a284 100644 --- a/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/space2stats_population_2020/space2stats_population_2020.json +++ b/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/space2stats_population_2020/space2stats_population_2020.json @@ -40,14 +40,14 @@ 89.98750455101016 ], "properties": { - "name": "Population", - "description": "Gridded population disaggregated by gender.", + "name": "Population Data", + "description": "Gridded population disaggregated by gender for the year 2020, with data available for different age groups.", "methodological_notes": "Global raster files are processed for each hexagonal grid using zonal statistics.", - "source_data": "WorldPop gridded population, 2020, Unconstrained, UN-Adjusted, https://www.worldpop.org/methods/top_down_constrained_vs_unconstrained/", - "sci:citation": "Stevens FR, Gaughan AE, Linard C, Tatem AJ (2015) Disaggregating Census Data for Population Mapping Using Random Forests with Remotely-Sensed and Ancillary Data. ", - "organization": "World Pop, https://www.worldpop.org/methods/populations", + "source_data": "WorldPop gridded population, 2020, Unconstrained, UN-Adjusted", + "sci:citation": "Stevens FR, Gaughan AE, Linard C, Tatem AJ (2015) Disaggregating Census Data for Population Mapping Using Random Forests with Remotely-Sensed and Ancillary Data.", + "organization": "WorldPop, https://www.worldpop.org", "method": "sum", - "resolution": "100 mts", + "resolution": "100 meters", "table:primary_geometry": "geometry", "table:columns": [ { @@ -296,8 +296,11 @@ "geometry": "geometry" } }, - "themes": "Demographics", - "datetime": "2024-10-30T13:43:45.940644Z" + "themes": [ + "Demographics", + "Population" + ], + "datetime": "2024-10-24T14:54:26.131129Z" }, "links": [ { diff --git a/space2stats_api/src/space2stats_ingest/METADATA/types.json b/space2stats_api/src/space2stats_ingest/METADATA/types.json index 3463637..29a504b 100644 --- a/space2stats_api/src/space2stats_ingest/METADATA/types.json +++ b/space2stats_api/src/space2stats_ingest/METADATA/types.json @@ -1,50 +1,42 @@ { - "SUM_VIIRS_NTL_201301": "float64", - "MIN_VIIRS_NTL_201301": "float64", - "MAX_VIIRS_NTL_201301": "float64", - "MEAN_VIIRS_NTL_201301": "float64", - "SUM_VIIRS_NTL_201302": "float64", - "MIN_VIIRS_NTL_201302": "float64", - "MAX_VIIRS_NTL_201302": "float64", - "MEAN_VIIRS_NTL_201302": "float64", - "SUM_VIIRS_NTL_201303": "float64", - "MIN_VIIRS_NTL_201303": "float64", - "MAX_VIIRS_NTL_201303": "float64", - "MEAN_VIIRS_NTL_201303": "float64", - "SUM_VIIRS_NTL_201304": "float64", - "MIN_VIIRS_NTL_201304": "float64", - "MAX_VIIRS_NTL_201304": "float64", - "MEAN_VIIRS_NTL_201304": "float64", - "SUM_VIIRS_NTL_201305": "float64", - "MIN_VIIRS_NTL_201305": "float64", - "MAX_VIIRS_NTL_201305": "float64", - "MEAN_VIIRS_NTL_201305": "float64", - "SUM_VIIRS_NTL_201306": "float64", - "MIN_VIIRS_NTL_201306": "float64", - "MAX_VIIRS_NTL_201306": "float64", - "MEAN_VIIRS_NTL_201306": "float64", - "SUM_VIIRS_NTL_201307": "float64", - "MIN_VIIRS_NTL_201307": "float64", - "MAX_VIIRS_NTL_201307": "float64", - "MEAN_VIIRS_NTL_201307": "float64", - "SUM_VIIRS_NTL_201308": "float64", - "MIN_VIIRS_NTL_201308": "float64", - "MAX_VIIRS_NTL_201308": "float64", - "MEAN_VIIRS_NTL_201308": "float64", - "SUM_VIIRS_NTL_201309": "float64", - "MIN_VIIRS_NTL_201309": "float64", - "MAX_VIIRS_NTL_201309": "float64", - "MEAN_VIIRS_NTL_201309": "float64", - "SUM_VIIRS_NTL_201310": "float64", - "MIN_VIIRS_NTL_201310": "float64", - "MAX_VIIRS_NTL_201310": "float64", - "MEAN_VIIRS_NTL_201310": "float64", - "SUM_VIIRS_NTL_201311": "float64", - "MIN_VIIRS_NTL_201311": "float64", - "MAX_VIIRS_NTL_201311": "float64", - "MEAN_VIIRS_NTL_201311": "float64", - "SUM_VIIRS_NTL_201312": "float64", - "MIN_VIIRS_NTL_201312": "float64", - "MAX_VIIRS_NTL_201312": "float64", - "MEAN_VIIRS_NTL_201312": "float64" + "hex_id": "object", + "sum_pop_f_0_2020": "float64", + "sum_pop_f_10_2020": "float64", + "sum_pop_f_15_2020": "float64", + "sum_pop_f_1_2020": "float64", + "sum_pop_f_20_2020": "float64", + "sum_pop_f_25_2020": "float64", + "sum_pop_f_30_2020": "float64", + "sum_pop_f_35_2020": "float64", + "sum_pop_f_40_2020": "float64", + "sum_pop_f_45_2020": "float64", + "sum_pop_f_50_2020": "float64", + "sum_pop_f_55_2020": "float64", + "sum_pop_f_5_2020": "float64", + "sum_pop_f_60_2020": "float64", + "sum_pop_f_65_2020": "float64", + "sum_pop_f_70_2020": "float64", + "sum_pop_f_75_2020": "float64", + "sum_pop_f_80_2020": "float64", + "sum_pop_m_0_2020": "float64", + "sum_pop_m_10_2020": "float64", + "sum_pop_m_15_2020": "float64", + "sum_pop_m_1_2020": "float64", + "sum_pop_m_20_2020": "float64", + "sum_pop_m_25_2020": "float64", + "sum_pop_m_30_2020": "float64", + "sum_pop_m_35_2020": "float64", + "sum_pop_m_40_2020": "float64", + "sum_pop_m_45_2020": "float64", + "sum_pop_m_50_2020": "float64", + "sum_pop_m_55_2020": "float64", + "sum_pop_m_5_2020": "float64", + "sum_pop_m_60_2020": "float64", + "sum_pop_m_65_2020": "float64", + "sum_pop_m_70_2020": "float64", + "sum_pop_m_75_2020": "float64", + "sum_pop_m_80_2020": "float64", + "sum_pop_f_2020": "float64", + "sum_pop_m_2020": "float64", + "sum_pop_2020": "float64" } \ No newline at end of file diff --git a/space2stats_api/src/space2stats_ingest/cli.py b/space2stats_api/src/space2stats_ingest/cli.py index e61427d..c23a838 100644 --- a/space2stats_api/src/space2stats_ingest/cli.py +++ b/space2stats_api/src/space2stats_ingest/cli.py @@ -38,7 +38,6 @@ def download(s3_path: str, local_path: str = typer.Option("local.parquet")): def load( connection_string: str, stac_catalog_path: str, # Add the STAC metadata file path as an argument - item_name: str, parquet_file: str = typer.Option("local.parquet"), chunksize: int = 64_000, ): @@ -46,9 +45,7 @@ def load( Load a Parquet file into a PostgreSQL database after verifying columns with the STAC metadata. """ typer.echo(f"Loading data into PostgreSQL database from {parquet_file}") - load_parquet_to_db( - parquet_file, connection_string, stac_catalog_path, item_name, chunksize - ) + load_parquet_to_db(parquet_file, connection_string, stac_catalog_path, chunksize) typer.echo("Data loaded successfully to PostgreSQL!") diff --git a/space2stats_api/src/space2stats_ingest/main.py b/space2stats_api/src/space2stats_ingest/main.py index 8fbe32b..e352789 100644 --- a/space2stats_api/src/space2stats_ingest/main.py +++ b/space2stats_api/src/space2stats_ingest/main.py @@ -7,7 +7,7 @@ from pystac import Catalog from tqdm import tqdm -TABLE_NAME = "NTL2013" +TABLE_NAME = "space2stats" def read_parquet_file(file_path: str): @@ -34,30 +34,23 @@ def read_parquet_file(file_path: str): return table -def get_all_stac_fields(stac_catalog_path: str, item: str) -> Set[str]: +def get_all_stac_fields(stac_catalog_path: str) -> Set[str]: catalog = Catalog.from_file(stac_catalog_path) items = catalog.get_items(recursive=True) columns = [] - - # Filter items to match the given item param for it in items: - if item in it.get_self_href(): - columns.extend( - [col["name"] for col in it.properties.get("table:columns", [])] - ) - break - + columns.extend([col["name"] for col in it.properties.get("table:columns")]) + print(columns) return set(columns) -def verify_columns(parquet_file: str, stac_catalog_path: str, item: str) -> bool: +def verify_columns(parquet_file: str, stac_catalog_path: str) -> bool: """ Verifies that the Parquet file columns match the STAC item metadata columns. Args: parquet_file (str): Path to the Parquet file. stac_metadata_file (str): Path to the STAC item metadata JSON file. - item (str): Name of the relevant STAC item. Returns: bool: True if the columns match, False otherwise. @@ -65,7 +58,8 @@ def verify_columns(parquet_file: str, stac_catalog_path: str, item: str) -> bool parquet_table = read_parquet_file(parquet_file) parquet_columns = set(parquet_table.column_names) - stac_fields = get_all_stac_fields(stac_catalog_path, item) + stac_fields = get_all_stac_fields(stac_catalog_path) + if parquet_columns != stac_fields: extra_in_parquet = parquet_columns - stac_fields extra_in_stac = stac_fields - parquet_columns @@ -94,11 +88,10 @@ def load_parquet_to_db( parquet_file: str, connection_string: str, stac_catalog_path: str, - item: str, chunksize: int = 64_000, ): # Verify column consistency between Parquet file and STAC metadata - if not verify_columns(parquet_file, stac_catalog_path, item): + if not verify_columns(parquet_file, stac_catalog_path): raise ValueError("Column mismatch between Parquet file and STAC metadata") table = pq.read_table(parquet_file) diff --git a/space2stats_api/src/tests/conftest.py b/space2stats_api/src/tests/conftest.py index 694cad0..81d9277 100644 --- a/space2stats_api/src/tests/conftest.py +++ b/space2stats_api/src/tests/conftest.py @@ -130,11 +130,10 @@ def stac_file_path(): @pytest.fixture -def metadata_excel_file_path(): +def types_json_file_path(): current_dir = os.path.dirname(os.path.abspath(__file__)) root_dir = os.path.abspath(os.path.join(current_dir, "../../..")) - metadata_excel_file_path = os.path.join( - root_dir, - "space2stats_api/src/space2stats_ingest/METADATA/Space2Stats Metadata Content.xlsx", + types_json_file_path = os.path.join( + root_dir, "space2stats_api/src/space2stats_ingest/METADATA/types.json" ) - return metadata_excel_file_path + return types_json_file_path diff --git a/space2stats_api/src/tests/metadata_tests/test_stac_columns.py b/space2stats_api/src/tests/metadata_tests/test_stac_columns.py index 18940e9..303790f 100644 --- a/space2stats_api/src/tests/metadata_tests/test_stac_columns.py +++ b/space2stats_api/src/tests/metadata_tests/test_stac_columns.py @@ -1,19 +1,10 @@ import json -import pandas as pd - -def test_stac_columns_vs_types_json(stac_file_path, metadata_excel_file_path): - # Load the expected column types from the Metadata Content Excel - feature_catalog = pd.read_excel( - metadata_excel_file_path, sheet_name="Feature Catalog" - ) - expected_columns = feature_catalog[feature_catalog["source"] == "Population"] - - # Convert the DataFrame to a dictionary for easier comparison - expected_columns_dict = dict( - zip(expected_columns["variable"], expected_columns["type"]) - ) +def test_stac_columns_vs_types_json(stac_file_path, types_json_file_path): + # Load the expected column types from the types JSON file + with open(types_json_file_path, "r") as f: + expected_columns = json.load(f) # Load the STAC item from the JSON file with open(stac_file_path, "r") as f: @@ -24,16 +15,16 @@ def test_stac_columns_vs_types_json(stac_file_path, metadata_excel_file_path): col["name"]: col["type"] for col in stac_item["properties"]["table:columns"] } - # Assert that the number of columns in the STAC file matches the number of columns in the types TABLE file + # Assert that the number of columns in the STAC file matches the number of columns in the types JSON file assert ( - len(stac_columns) == len(expected_columns_dict) - ), f"Mismatch in column count: STAC ({len(stac_columns)}) vs TABLE ({len(expected_columns_dict)})" + len(stac_columns) == len(expected_columns) + ), f"Mismatch in column count: STAC ({len(stac_columns)}) vs JSON ({len(expected_columns)})" # Assert that column names and types match - for column_name, column_type in expected_columns_dict.items(): + for column_name, column_type in expected_columns.items(): assert ( column_name in stac_columns ), f"Column {column_name} is missing in the STAC file" assert ( stac_columns[column_name] == column_type - ), f"Mismatch in column type for {column_name}: STAC ({stac_columns[column_name]}) vs TABLE ({column_type})" + ), f"Mismatch in column type for {column_name}: STAC ({stac_columns[column_name]}) vs JSON ({column_type})"