From d2c0901289c427a1e0510f9d2220897714709c5f Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 28 Feb 2024 10:36:10 +0000 Subject: [PATCH 01/60] Initial application of asset to scotland methods --- pyproject.toml | 1 + python/popgetter/__init__.py | 7 + python/popgetter/assets/__init__.py | 2 +- python/popgetter/assets/scotland/__init__.py | 22 +++ python/popgetter/assets/scotland/scotland.py | 175 +++++++++++++++++++ 5 files changed, 206 insertions(+), 1 deletion(-) create mode 100755 python/popgetter/assets/scotland/__init__.py create mode 100644 python/popgetter/assets/scotland/scotland.py diff --git a/pyproject.toml b/pyproject.toml index d73748b..9e4b01b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "pydantic<2.0.0", "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. "icecream >=2.1.3", # General debugging tool + "openpyxl", ] [project.optional-dependencies] diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 951e4df..65f247a 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -29,6 +29,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), + *load_assets_from_package_module(assets.scotland, group_name="scotland"), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( @@ -50,6 +51,12 @@ description="Downloads UK data.", ) +job_uk: UnresolvedAssetJobDefinition = define_asset_job( + name="job_scotland", + selection=AssetSelection.groups("scotland"), + description="Downloads Scotland data.", +) + defs: Definitions = Definitions( assets=all_assets, schedules=[], diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index e050bf8..7ecbf5d 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,3 +1,3 @@ from __future__ import annotations -from . import be, uk, us # noqa: F401 +from . import be, uk, us, scotland # noqa: F401 diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py new file mode 100755 index 0000000..fb102a3 --- /dev/null +++ b/python/popgetter/assets/scotland/__init__.py @@ -0,0 +1,22 @@ +#!/usr/bin/python3 +from __future__ import annotations + +from dagster import ( + asset, +) + +from popgetter.metadata import ( + CountryMetadata, +) + +from . import ( + scotland, +) + + +# @asset(key_prefix=asset_prefix) +# def get_country_metadata() -> CountryMetadata: +# """ +# Returns a CountryMetadata of metadata about the country. 
+# """ +# return country diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py new file mode 100644 index 0000000..8e7a0ab --- /dev/null +++ b/python/popgetter/assets/scotland/scotland.py @@ -0,0 +1,175 @@ +import subprocess +import requests +import zipfile +import os +import urllib +import pandas as pd +import geopandas +import numpy as np +import matplotlib.pyplot as plt + +from dagster import asset + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" +} + + +def download_file( + cache_dir: str, + url: str, + file_name: str | None = None, + headers: dict[str, str] = HEADERS, +) -> str: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = ( + os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name + ) + if not os.path.exists(file_name): + r = requests.get(url, allow_redirects=True, headers=headers) + open(file_name, "wb").write(r.content) + return file_name + + +""" +Notes: + - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial + publication + - Reusing some bits of code from UKCensusAPI: + https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py +""" + + +class Scotland: + cache_dir: str + lookup: pd.DataFrame + + URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" + URL1 = "https://www.scotlandscensus.gov.uk/" + URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" + URL_LOOKUP = ( + "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" + ) + URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" + + data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] + GeoCodeLookup = { + "LAD": 0, # "Council Area blk" + # MSOA (intermediate zone)? 
+ "LSOA11": 1, # "SNS Data Zone 2011 blk" + "OA11": 2, # "Output Area blk" + } + SCGeoCodes = ["CA", "DZ", "OA"] + + def __init__(self, cache_dir: str = "./cache/"): + """Init and get lookup.""" + self.cache_dir = cache_dir + os.makedirs(self.cache_dir, exist_ok=True) + lookup_path = download_file(self.cache_dir, self.URL_LOOKUP) + self.lookup = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") + + def __source_to_zip(self, source_name: str) -> str: + """Downloads if necessary and returns the name of the locally cached zip file + of the source data (replacing spaces with _)""" + file_name = os.path.join(self.cache_dir, source_name.replace(" ", "_") + ".zip") + if not os.path.isfile(file_name): + if source_name.split()[0] == "Council": + scotland_src = ( + self.URL1 + + "media/hjmd0oqr/" + + source_name.lower().replace(" ", "-") + + ".zip" + ) + else: + scotland_src = self.URL2 + urllib.parse.quote(source_name) + ".zip" + return download_file(self.cache_dir, scotland_src, file_name) + + def get_rawdata(self, table: str, resolution: str) -> pd.DataFrame: + """Gets the raw csv data and metadata.""" + if not os.path.exists(os.path.join(self.cache_dir, table + ".csv")): + try: + zf = self.__source_to_zip( + self.data_sources[self.GeoCodeLookup[resolution]] + ) + with zipfile.ZipFile(zf) as zip_ref: + zip_ref.extractall(self.cache_dir) + except NotImplementedError as _: + subprocess.run(["unzip", "-o", zf, "-d", self.cache_dir]) + + return pd.read_csv(os.path.join(self.cache_dir, table + ".csv")) + + def get_lc1117sc(self) -> pd.DataFrame: + """Gets LC1117SC age by sex table at OA11 resolution.""" + df = self.get_rawdata("LC1117SC", "OA11").rename( + columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} + ) + return df.loc[df["OA11"].isin(self.lookup["OutputArea2011Code"])] + + def get_shapefile(self) -> geopandas.GeoDataFrame: + """Gets the shape file for OA11 resolution.""" + file_name = download_file(self.cache_dir, self.URL_SHAPEFILE) + geo = geopandas.read_file(f"zip://{file_name}") + return geo[geo["geo_code"].isin(self.lookup["OutputArea2011Code"])] + +@asset +def download_data(): + cache_dir = "./cache/" + scotland = Scotland(cache_dir) + +@asset +def download_census() -> pd.DataFrame: + cache_dir = "./cache/" + scotland = Scotland(cache_dir) + return scotland.get_lc1117sc() + + +@asset +def download_shapefile() -> geopandas.GeoDataFrame: + cache_dir = "./cache/" + scotland = Scotland(cache_dir) + return scotland.get_shapefile() + +# @multi_asset( +# ins={ +# "individual_census_table": AssetIn( +# key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping +# ), +# # "individual_census_table": AssetIn(key_prefix=asset_prefix), +# "filter_needed_catalog": AssetIn(key_prefix=asset_prefix), +# }, +# def generate_plots(): +# geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") +# # Plot +# merged["log10 people"] = np.log10(merged["All people"]) +# merged[merged["Age bracket"] == "All people"].plot( +# column="log10 people", legend=True +# ) +# plt.show() + + +def main(): + cache_dir = "./cache/" + + # Make instance of Scotland + scotland = Scotland(cache_dir) + + # Get OA11 Age/Sex data + pop = scotland.get_lc1117sc() + + # Get shape file + geo = scotland.get_shapefile() + + # Merge + merged = geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") + print(merged) + + # Plot + merged["log10 people"] = np.log10(merged["All people"]) + merged[merged["Age bracket"] == "All people"].plot( + column="log10 people", legend=True + ) + plt.show() + + +if 
__name__ == "__main__": + main() From 714ceab12d8d371cd60c9bd61b1b511c4659c1f6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 6 Mar 2024 17:32:23 +0000 Subject: [PATCH 02/60] Add zipfile-deflate64 dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9e4b01b..ae68ef1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. "icecream >=2.1.3", # General debugging tool "openpyxl", + "zipfile-deflate64", ] [project.optional-dependencies] From cb8e86b664d48451299979a9906c8fb337c0afda Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 6 Mar 2024 17:33:22 +0000 Subject: [PATCH 03/60] Add key_prefix --- python/popgetter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 65f247a..a98dd3b 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -29,7 +29,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module(assets.scotland, group_name="scotland"), + *load_assets_from_package_module(assets.scotland, group_name="scotland", key_prefix="uk-scotland"), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( From 2e7429173b25a892d40b6ba32a295dc660f21e3d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 6 Mar 2024 17:37:29 +0000 Subject: [PATCH 04/60] Begin refactor with dagster --- python/popgetter/assets/scotland/scotland.py | 274 ++++++++++--------- 1 file changed, 152 insertions(+), 122 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 8e7a0ab..a7de2a6 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -1,14 +1,17 @@ import subprocess +import tempfile +from typing import Tuple import requests -import zipfile +# import zipfile +import zipfile_deflate64 as zipfile import os -import urllib +import urllib.parse as urlparse import pandas as pd import geopandas import numpy as np import matplotlib.pyplot as plt - -from dagster import asset +from icecream import ic +from dagster import AssetIn, AssetOut, DynamicPartitionsDefinition, MetadataValue, Output, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" @@ -40,103 +43,158 @@ def download_file( """ -class Scotland: - cache_dir: str - lookup: pd.DataFrame +PARTITIONS_DEF_NAME = "dataset_tables" +dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) - URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" - URL1 = "https://www.scotlandscensus.gov.uk/" - URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" - URL_LOOKUP = ( - "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" - ) - URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" - - data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] - GeoCodeLookup = { - "LAD": 0, # "Council Area blk" - # MSOA (intermediate zone)? 
- "LSOA11": 1, # "SNS Data Zone 2011 blk" - "OA11": 2, # "Output Area blk" +# cache_dir = tempfile.mkdtemp() +cache_dir = "./cache" + +URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" +URL1 = "https://www.scotlandscensus.gov.uk/" +URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" +URL_LOOKUP = ( + "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" +) +URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" + +data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] +GeoCodeLookup = { + "LAD": 0, # "Council Area blk" + # MSOA (intermediate zone)? + "LSOA11": 1, # "SNS Data Zone 2011 blk" + "OA11": 2, # "Output Area blk" +} +# SCGeoCodes = ["CA", "DZ", "OA"] + + +DATA_SOURCES = { + 0: { + "source": "Council Area blk", + "resolution": "LAD", + "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip" + }, + 1: { + "source": "SNS Data Zone 2011 blk", + "resolution": "LSOA11", + "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip" + }, + 2: { + "source": "Output Area blk", + "resolution": "OA11", + "url": URL2 + urlparse.quote("Output Area blk") + ".zip" } - SCGeoCodes = ["CA", "DZ", "OA"] - - def __init__(self, cache_dir: str = "./cache/"): - """Init and get lookup.""" - self.cache_dir = cache_dir - os.makedirs(self.cache_dir, exist_ok=True) - lookup_path = download_file(self.cache_dir, self.URL_LOOKUP) - self.lookup = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") - - def __source_to_zip(self, source_name: str) -> str: - """Downloads if necessary and returns the name of the locally cached zip file - of the source data (replacing spaces with _)""" - file_name = os.path.join(self.cache_dir, source_name.replace(" ", "_") + ".zip") - if not os.path.isfile(file_name): - if source_name.split()[0] == "Council": - scotland_src = ( - self.URL1 - + "media/hjmd0oqr/" - + source_name.lower().replace(" ", "-") - + ".zip" - ) - else: - scotland_src = self.URL2 + urllib.parse.quote(source_name) + ".zip" - return download_file(self.cache_dir, scotland_src, file_name) - - def get_rawdata(self, table: str, resolution: str) -> pd.DataFrame: - """Gets the raw csv data and metadata.""" - if not os.path.exists(os.path.join(self.cache_dir, table + ".csv")): - try: - zf = self.__source_to_zip( - self.data_sources[self.GeoCodeLookup[resolution]] - ) - with zipfile.ZipFile(zf) as zip_ref: - zip_ref.extractall(self.cache_dir) - except NotImplementedError as _: - subprocess.run(["unzip", "-o", zf, "-d", self.cache_dir]) - - return pd.read_csv(os.path.join(self.cache_dir, table + ".csv")) - - def get_lc1117sc(self) -> pd.DataFrame: - """Gets LC1117SC age by sex table at OA11 resolution.""" - df = self.get_rawdata("LC1117SC", "OA11").rename( - columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} - ) - return df.loc[df["OA11"].isin(self.lookup["OutputArea2011Code"])] - - def get_shapefile(self) -> geopandas.GeoDataFrame: - """Gets the shape file for OA11 resolution.""" - file_name = download_file(self.cache_dir, self.URL_SHAPEFILE) - geo = geopandas.read_file(f"zip://{file_name}") - return geo[geo["geo_code"].isin(self.lookup["OutputArea2011Code"])] +} -@asset -def download_data(): - cache_dir = "./cache/" - scotland = Scotland(cache_dir) + +# NB. 
Make sure no spaces in asset keys +@multi_asset( + outs={ + "oa_dz_iz_2011_lookup": AssetOut(), + "data_zone_2011_lookup": AssetOut(), + "intermediate_zone_2011_lookup": AssetOut(), + }, +) +def download_lookup(): + os.makedirs(cache_dir, exist_ok=True) + lookup_path = download_file(cache_dir, URL_LOOKUP) + df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") + df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") + df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") + return df1, df2, df3 + + +def source_to_zip(source_name: str, url: str) -> str: + """Downloads if necessary and returns the name of the locally cached zip file + of the source data (replacing spaces with _)""" + file_name = os.path.join(cache_dir, source_name.replace(" ", "_") + ".zip") + return download_file(cache_dir, url, file_name) @asset -def download_census() -> pd.DataFrame: - cache_dir = "./cache/" - scotland = Scotland(cache_dir) - return scotland.get_lc1117sc() +def make_catalog(context) -> pd.DataFrame: + records = [] + for data_source in DATA_SOURCES.values(): + resolution = data_source["resolution"] + source = data_source["source"] + url = data_source["url"] + with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: + for name in zip_ref.namelist(): + print(name) + record = { + "resolution": resolution, + "source": source, + "url": url, + "file_name": name, + } + records.append(record) + ic(record) + zip_ref.extract(name, cache_dir) + + for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): + context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) + + # Create a dynamic partition for the datasets listed in the catalog + catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) + partition_keys = catalog_df["file_name"].to_list() + context.instance.add_dynamic_partitions( + partitions_def_name=PARTITIONS_DEF_NAME, partition_keys=partition_keys + ) + context.add_output_metadata( + metadata={ + "num_records": len(catalog_df), + "ignored_datasets": "", + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) + ), + "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), + "preview": MetadataValue.md(catalog_df.to_markdown()), + } + ) + return catalog_df + + +def get_table(context, table_details) -> pd.DataFrame: + df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) + context.add_output_metadata( + metadata={ + "title": table_details["file_nae"].iloc[0], + # "title": "Test", + "num_records": len(df), # Metadata can be any key-value pair + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + } + ) + return df + +@asset(partitions_def=dataset_node_partition) +def individual_census_table(context, make_catalog: pd.DataFrame) -> pd.DataFrame: + partition_key = context.asset_partition_key_for_output() + ic(partition_key) + row = make_catalog.loc[make_catalog["file_name"].isin([partition_key])] + ic(row) + return get_table(context, table_details=row) + + +# # TODO: add to derived +# def get_lc1117sc(context, lookup, ) -> pd.DataFrame: +# """Gets LC1117SC age by sex table at OA11 resolution.""" +# df = get_rawdata("LC1117SC", "OA11").rename( +# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} +# ) +# return df.loc[df["OA11"].isin(lookup["OutputArea2011Code"])] -@asset -def download_shapefile() -> geopandas.GeoDataFrame: - cache_dir = 
"./cache/" - scotland = Scotland(cache_dir) - return scotland.get_shapefile() - -# @multi_asset( -# ins={ -# "individual_census_table": AssetIn( -# key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping -# ), -# # "individual_census_table": AssetIn(key_prefix=asset_prefix), -# "filter_needed_catalog": AssetIn(key_prefix=asset_prefix), -# }, +# # TODO: add shapefile +# def shapefile(context) -> geopandas.GeoDataFrame: +# """Gets the shape file for OA11 resolution.""" +# file_name = download_file(cache_dir, URL_SHAPEFILE) +# geo = geopandas.read_file(f"zip://{file_name}") +# return geo[geo["geo_code"].isin(lookup["OutputArea2011Code"])] + + +# # TODO: add plots +# @asset # def generate_plots(): # geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") # # Plot @@ -145,31 +203,3 @@ def download_shapefile() -> geopandas.GeoDataFrame: # column="log10 people", legend=True # ) # plt.show() - - -def main(): - cache_dir = "./cache/" - - # Make instance of Scotland - scotland = Scotland(cache_dir) - - # Get OA11 Age/Sex data - pop = scotland.get_lc1117sc() - - # Get shape file - geo = scotland.get_shapefile() - - # Merge - merged = geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") - print(merged) - - # Plot - merged["log10 people"] = np.log10(merged["All people"]) - merged[merged["Age bracket"] == "All people"].plot( - column="log10 people", legend=True - ) - plt.show() - - -if __name__ == "__main__": - main() From 9dadda2cca1fb21e9ea88a5e190c139ab967cb98 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 7 Mar 2024 22:19:13 +0000 Subject: [PATCH 05/60] Format, make partition keys unique, add geographies asset --- python/popgetter/assets/scotland/scotland.py | 157 ++++++++++++------- 1 file changed, 99 insertions(+), 58 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index a7de2a6..3885167 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -2,6 +2,7 @@ import tempfile from typing import Tuple import requests + # import zipfile import zipfile_deflate64 as zipfile import os @@ -11,28 +12,20 @@ import numpy as np import matplotlib.pyplot as plt from icecream import ic -from dagster import AssetIn, AssetOut, DynamicPartitionsDefinition, MetadataValue, Output, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset - -HEADERS = { - "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" -} - - -def download_file( - cache_dir: str, - url: str, - file_name: str | None = None, - headers: dict[str, str] = HEADERS, -) -> str: - """Downloads file checking first if exists in cache, returning file name.""" - file_name = ( - os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name - ) - if not os.path.exists(file_name): - r = requests.get(url, allow_redirects=True, headers=headers) - open(file_name, "wb").write(r.content) - return file_name - +import popgetter +from dagster import ( + AssetIn, + AssetKey, + AssetOut, + DynamicPartitionsDefinition, + MetadataValue, + Output, + SpecificPartitionsPartitionMapping, + StaticPartitionsDefinition, + asset, + multi_asset, + op, +) """ Notes: @@ -64,28 +57,46 @@ def download_file( "LSOA11": 1, # "SNS Data Zone 2011 blk" "OA11": 2, # "Output Area blk" } -# SCGeoCodes = ["CA", "DZ", "OA"] - DATA_SOURCES = { 0: { "source": "Council Area blk", "resolution": "LAD", - "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip" + 
"url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", }, 1: { "source": "SNS Data Zone 2011 blk", "resolution": "LSOA11", - "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip" + "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", }, 2: { "source": "Output Area blk", "resolution": "OA11", - "url": URL2 + urlparse.quote("Output Area blk") + ".zip" - } + "url": URL2 + urlparse.quote("Output Area blk") + ".zip", + }, +} + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" } +def download_file( + cache_dir: str, + url: str, + file_name: str | None = None, + headers: dict[str, str] = HEADERS, +) -> str: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = ( + os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name + ) + if not os.path.exists(file_name): + r = requests.get(url, allow_redirects=True, headers=headers) + open(file_name, "wb").write(r.content) + return file_name + + # NB. Make sure no spaces in asset keys @multi_asset( outs={ @@ -109,34 +120,39 @@ def source_to_zip(source_name: str, url: str) -> str: file_name = os.path.join(cache_dir, source_name.replace(" ", "_") + ".zip") return download_file(cache_dir, url, file_name) + @asset -def make_catalog(context) -> pd.DataFrame: +def catalog(context) -> pd.DataFrame: records = [] for data_source in DATA_SOURCES.values(): resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] - with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: + with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: for name in zip_ref.namelist(): print(name) record = { - "resolution": resolution, - "source": source, - "url": url, - "file_name": name, - } + "resolution": resolution, + "source": source, + "url": url, + "file_name": name, + } records.append(record) ic(record) zip_ref.extract(name, cache_dir) - + + # TODO: check if required for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - partition_keys = catalog_df["file_name"].to_list() + catalog_df["partition_keys"] = ( + catalog_df[["resolution", "file_name"]].agg("/".join, axis=1).to_list() + ) context.instance.add_dynamic_partitions( - partitions_def_name=PARTITIONS_DEF_NAME, partition_keys=partition_keys + partitions_def_name=PARTITIONS_DEF_NAME, + partition_keys=catalog_df["partition_keys"].to_list(), ) context.add_output_metadata( metadata={ @@ -156,9 +172,8 @@ def get_table(context, table_details) -> pd.DataFrame: df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) context.add_output_metadata( metadata={ - "title": table_details["file_nae"].iloc[0], - # "title": "Test", - "num_records": len(df), # Metadata can be any key-value pair + "title": table_details["partition_keys"].iloc[0], + "num_records": len(df), "columns": MetadataValue.md( "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) ), @@ -167,32 +182,58 @@ def get_table(context, table_details) -> pd.DataFrame: ) return df + @asset(partitions_def=dataset_node_partition) -def individual_census_table(context, make_catalog: pd.DataFrame) -> pd.DataFrame: +def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: partition_key = context.asset_partition_key_for_output() - 
ic(partition_key) - row = make_catalog.loc[make_catalog["file_name"].isin([partition_key])] - ic(row) + context.log.info(partition_key) + row = catalog.loc[catalog["partition_keys"].isin([partition_key])] + context.log.info(row) return get_table(context, table_details=row) -# # TODO: add to derived -# def get_lc1117sc(context, lookup, ) -> pd.DataFrame: +# @op +# def lc1117sc(context, individual_census_table, oa_dz_iz_2011_lookup) -> pd.DataFrame: # """Gets LC1117SC age by sex table at OA11 resolution.""" -# df = get_rawdata("LC1117SC", "OA11").rename( -# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} -# ) -# return df.loc[df["OA11"].isin(lookup["OutputArea2011Code"])] +# from popgetter import defs +# with defs.get_asset_value_loader(instance=context.instance) as loader: +# df = loader.load_asset_value(AssetKey(["uk-scotland", "individual_census_table"]), partition_key="LC1117SC.csv") +# df = df.rename( +# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} +# ) +# df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] +# context.add_output_metadata( +# metadata = { +# "title": df["file_name"].iloc[0], +# "num_records": len(df), +# "columns": MetadataValue.md( +# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) +# ), +# "preview": MetadataValue.md(df.head().to_markdown()), +# } +# ) +# return df + +@asset +def geometry(context, oa_dz_iz_2011_lookup) -> geopandas.GeoDataFrame: + """Gets the shape file for OA11 resolution.""" + file_name = download_file(cache_dir, URL_SHAPEFILE) + geo = geopandas.read_file(f"zip://{file_name}") + # TODO: add metadat for geopandas + # context.add_output_metadata( + # metadata={ + # "title": table_details["partition_keys"].iloc[0], + # "num_records": len(df), + # "columns": MetadataValue.md( + # "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + # ), + # "preview": MetadataValue.md(df.head().to_markdown()), + # } + # ) + return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] -# # TODO: add shapefile -# def shapefile(context) -> geopandas.GeoDataFrame: -# """Gets the shape file for OA11 resolution.""" -# file_name = download_file(cache_dir, URL_SHAPEFILE) -# geo = geopandas.read_file(f"zip://{file_name}") -# return geo[geo["geo_code"].isin(lookup["OutputArea2011Code"])] - # # TODO: add plots # @asset # def generate_plots(): From b6e8e1d7809dc20faa185426eab22345fcb01a43 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 8 Mar 2024 10:47:17 +0000 Subject: [PATCH 06/60] Add config to asset_job --- python/popgetter/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index a98dd3b..8f0e279 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -55,6 +55,16 @@ name="job_scotland", selection=AssetSelection.groups("scotland"), description="Downloads Scotland data.", + # https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#asset-based-jobs + config={ + "execution": { + "config": { + "multiprocess": { + "max_concurrent": 20, # limits concurrent assets + }, + } + } + } ) defs: Definitions = Definitions( From 65d27049ac71ae62e36874219afae29ee42a88c1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 9 Mar 2024 08:39:17 +0000 Subject: [PATCH 07/60] Initial dagster rewrite for Scotland --- python/popgetter/assets/scotland/scotland.py | 161 +++++++++++-------- 1 file changed, 98 insertions(+), 63 deletions(-) diff --git 
a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 3885167..b54b9ab 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -1,14 +1,14 @@ +import base64 +from io import BytesIO import subprocess import tempfile from typing import Tuple import requests - -# import zipfile import zipfile_deflate64 as zipfile import os import urllib.parse as urlparse import pandas as pd -import geopandas +import geopandas as gpd import numpy as np import matplotlib.pyplot as plt from icecream import ic @@ -18,8 +18,10 @@ AssetKey, AssetOut, DynamicPartitionsDefinition, + MaterializeResult, MetadataValue, Output, + Partition, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, @@ -58,23 +60,23 @@ "OA11": 2, # "Output Area blk" } -DATA_SOURCES = { - 0: { +DATA_SOURCES = [ + { "source": "Council Area blk", "resolution": "LAD", "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", }, - 1: { + { "source": "SNS Data Zone 2011 blk", "resolution": "LSOA11", "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", }, - 2: { + { "source": "Output Area blk", "resolution": "OA11", "url": URL2 + urlparse.quote("Output Area blk") + ".zip", }, -} +] HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" @@ -105,7 +107,8 @@ def download_file( "intermediate_zone_2011_lookup": AssetOut(), }, ) -def download_lookup(): +def lookups(): + """Creates lookup dataframes.""" os.makedirs(cache_dir, exist_ok=True) lookup_path = download_file(cache_dir, URL_LOOKUP) df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") @@ -123,22 +126,22 @@ def source_to_zip(source_name: str, url: str) -> str: @asset def catalog(context) -> pd.DataFrame: + """Creates a catalog of the individual census tables from all data sources.""" records = [] - for data_source in DATA_SOURCES.values(): + for data_source in DATA_SOURCES: resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: for name in zip_ref.namelist(): - print(name) record = { "resolution": resolution, "source": source, "url": url, "file_name": name, } + context.log.debug(record) records.append(record) - ic(record) zip_ref.extract(name, cache_dir) # TODO: check if required @@ -147,11 +150,12 @@ def catalog(context) -> pd.DataFrame: # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_keys"] = ( - catalog_df[["resolution", "file_name"]].agg("/".join, axis=1).to_list() + catalog_df["partition_keys"] = catalog_df[["resolution", "file_name"]].agg( + lambda s: "/".join(s).rsplit(".")[0], axis=1 ) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, + # To ensure this is unique, prepend the resolution partition_keys=catalog_df["partition_keys"].to_list(), ) context.add_output_metadata( @@ -185,62 +189,93 @@ def get_table(context, table_details) -> pd.DataFrame: @asset(partitions_def=dataset_node_partition) def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: + """Creates individual census tables as dataframe.""" partition_key = context.asset_partition_key_for_output() context.log.info(partition_key) - row = catalog.loc[catalog["partition_keys"].isin([partition_key])] - context.log.info(row) - return get_table(context, table_details=row) - - -# @op -# def 
lc1117sc(context, individual_census_table, oa_dz_iz_2011_lookup) -> pd.DataFrame: -# """Gets LC1117SC age by sex table at OA11 resolution.""" -# from popgetter import defs -# with defs.get_asset_value_loader(instance=context.instance) as loader: -# df = loader.load_asset_value(AssetKey(["uk-scotland", "individual_census_table"]), partition_key="LC1117SC.csv") -# df = df.rename( -# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} -# ) -# df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] -# context.add_output_metadata( -# metadata = { -# "title": df["file_name"].iloc[0], -# "num_records": len(df), -# "columns": MetadataValue.md( -# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) -# ), -# "preview": MetadataValue.md(df.head().to_markdown()), -# } -# ) -# return df + table_details = catalog.loc[catalog["partition_keys"].isin([partition_key])] + context.log.info(table_details) + return get_table(context, table_details) + + +_subset = [ + { + "partition_keys": "OA11/LC1117SC", + }, +] +_subset_partition_keys: list[str] = [r["partition_keys"] for r in _subset] +subset_mapping = SpecificPartitionsPartitionMapping(_subset_partition_keys) +subset_partition = StaticPartitionsDefinition(_subset_partition_keys) + + +@multi_asset( + ins={ + "individual_census_table": AssetIn(partition_mapping=subset_mapping), + }, + outs={ + "oa11_lc1117sc": AssetOut(), + }, + partitions_def=dataset_node_partition, +) +def oa11_lc1117sc( + context, individual_census_table, oa_dz_iz_2011_lookup +) -> pd.DataFrame: + """Gets LC1117SC age by sex table at OA11 resolution.""" + df = individual_census_table.rename( + columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} + ) + df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] + context.add_output_metadata( + metadata={ + "title": _subset_partition_keys, + "num_records": len(df), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + } + ) + return df @asset -def geometry(context, oa_dz_iz_2011_lookup) -> geopandas.GeoDataFrame: +def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: """Gets the shape file for OA11 resolution.""" file_name = download_file(cache_dir, URL_SHAPEFILE) - geo = geopandas.read_file(f"zip://{file_name}") - # TODO: add metadat for geopandas - # context.add_output_metadata( - # metadata={ - # "title": table_details["partition_keys"].iloc[0], - # "num_records": len(df), - # "columns": MetadataValue.md( - # "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - # ), - # "preview": MetadataValue.md(df.head().to_markdown()), - # } - # ) + geo = gpd.read_file(f"zip://{file_name}") + context.add_output_metadata( + metadata={ + "title": "Geometry file", + "num_records": len(geo), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in geo.columns.to_list()]) + ), + "preview": MetadataValue.md(geo.head().to_markdown()), + } + ) return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] -# # TODO: add plots -# @asset -# def generate_plots(): -# geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") -# # Plot -# merged["log10 people"] = np.log10(merged["All people"]) -# merged[merged["Age bracket"] == "All people"].plot( -# column="log10 people", legend=True -# ) -# plt.show() +@multi_asset( + ins={ + "oa11_lc1117sc": AssetIn(partition_mapping=subset_mapping), + "geometry": AssetIn(partition_mapping=subset_mapping), + }, + outs={ + 
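        # Single output; the rendered map is attached as inline markdown metadata rather than written to disk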
"plot": AssetOut(), + }, + partitions_def=dataset_node_partition, +) +def plot(geometry: gpd.GeoDataFrame, oa11_lc1117sc: pd.DataFrame): + """Plots map with log density of people.""" + merged = geometry.merge( + oa11_lc1117sc, left_on="geo_code", right_on="OA11", how="left" + ) + merged["log10 people"] = np.log10(merged["All people"]) + merged[merged["Age bracket"] == "All people"].plot( + column="log10 people", legend=True + ) + buffer = BytesIO() + plt.savefig(buffer, format="png") + image_data = base64.b64encode(buffer.getvalue()) + md_content = f"![img](data:image/png;base64,{image_data.decode()})" + return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) From 6b20ab1de89b5604f193771d047d13f3db6891f0 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 15 Mar 2024 18:49:03 +0000 Subject: [PATCH 08/60] Add function to add metadata and metadata index asset --- python/popgetter/__init__.py | 2 + python/popgetter/assets/scotland/scotland.py | 60 ++++++++++---------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 8f0e279..37e5c7c 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -70,6 +70,8 @@ defs: Definitions = Definitions( assets=all_assets, schedules=[], + # Example with multiple configs including for production: + # https://docs.dagster.io/guides/dagster/transitioning-data-pipelines-from-development-to-production#production resources={"pipes_subprocess_client": PipesSubprocessClient()}, jobs=[job_be, job_us, job_uk], ) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index b54b9ab..33e4c3f 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -51,6 +51,7 @@ "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" +URL_METADATA_INDEX = "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] GeoCodeLookup = { @@ -124,6 +125,30 @@ def source_to_zip(source_name: str, url: str) -> str: return download_file(cache_dir, url, file_name) +def add_metadata(context, df: pd.DataFrame | gpd.DataFrame, title: str | list[str]): + context.add_output_metadata( + metadata={ + "title": title, + "num_records": len(df), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + } + ) + +@asset +def metadata_index(context) -> pd.DataFrame: + dfs = pd.read_excel( + URL_METADATA_INDEX, + sheet_name=None, + storage_options={"User-Agent": "Mozilla/5.0"}, + ) + df = dfs["Index"] + add_metadata(context, df, "Metadata for census tables") + return df + + @asset def catalog(context) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" @@ -153,6 +178,8 @@ def catalog(context) -> pd.DataFrame: catalog_df["partition_keys"] = catalog_df[["resolution", "file_name"]].agg( lambda s: "/".join(s).rsplit(".")[0], axis=1 ) + # TODO: consider filtering here based on a set of keys to keep derived from + # config (i.e. 
backend/frontend modes) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, # To ensure this is unique, prepend the resolution @@ -174,16 +201,7 @@ def catalog(context) -> pd.DataFrame: def get_table(context, table_details) -> pd.DataFrame: df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) - context.add_output_metadata( - metadata={ - "title": table_details["partition_keys"].iloc[0], - "num_records": len(df), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - ), - "preview": MetadataValue.md(df.head().to_markdown()), - } - ) + add_metadata(context, df, table_details["partition_keys"].iloc[0]) return df @@ -224,16 +242,7 @@ def oa11_lc1117sc( columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} ) df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] - context.add_output_metadata( - metadata={ - "title": _subset_partition_keys, - "num_records": len(df), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - ), - "preview": MetadataValue.md(df.head().to_markdown()), - } - ) + add_metadata(context, df, _subset_partition_keys) return df @@ -242,16 +251,7 @@ def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: """Gets the shape file for OA11 resolution.""" file_name = download_file(cache_dir, URL_SHAPEFILE) geo = gpd.read_file(f"zip://{file_name}") - context.add_output_metadata( - metadata={ - "title": "Geometry file", - "num_records": len(geo), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in geo.columns.to_list()]) - ), - "preview": MetadataValue.md(geo.head().to_markdown()), - } - ) + add_metadata(context, geo, "Geometry file") return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] From 9cef0c38779fb659841616e0bc6f656e3277cceb Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 18 Mar 2024 09:37:10 +0000 Subject: [PATCH 09/60] Use markdown_from_plot util --- python/popgetter/assets/scotland/scotland.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 33e4c3f..1ac7de2 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -3,6 +3,7 @@ import subprocess import tempfile from typing import Tuple +from popgetter.utils import markdown_from_plot import requests import zipfile_deflate64 as zipfile import os @@ -125,7 +126,7 @@ def source_to_zip(source_name: str, url: str) -> str: return download_file(cache_dir, url, file_name) -def add_metadata(context, df: pd.DataFrame | gpd.DataFrame, title: str | list[str]): +def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list[str]): context.add_output_metadata( metadata={ "title": title, @@ -274,8 +275,5 @@ def plot(geometry: gpd.GeoDataFrame, oa11_lc1117sc: pd.DataFrame): merged[merged["Age bracket"] == "All people"].plot( column="log10 people", legend=True ) - buffer = BytesIO() - plt.savefig(buffer, format="png") - image_data = base64.b64encode(buffer.getvalue()) - md_content = f"![img](data:image/png;base64,{image_data.decode()})" + md_content = markdown_from_plot(plt) return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) From eafa2388618e3b9f408b234ad60af033adfaeb80 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 17 Apr 2024 19:24:57 +0100 Subject: [PATCH 10/60] Add required tables --- 
python/popgetter/assets/scotland/scotland.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 1ac7de2..4f19773 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -52,7 +52,9 @@ "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_METADATA_INDEX = "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +URL_METADATA_INDEX = ( + "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +) data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] GeoCodeLookup = { @@ -62,6 +64,10 @@ "OA11": 2, # "Output Area blk" } +# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 +REQUIRED_TABLES = ["QS103SC", "QS104SC", "KS201SC", "DC1117SC", "DC2101SC", "DC6206SC"] +REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) + DATA_SOURCES = [ { "source": "Council Area blk", @@ -180,11 +186,15 @@ def catalog(context) -> pd.DataFrame: lambda s: "/".join(s).rsplit(".")[0], axis=1 ) # TODO: consider filtering here based on a set of keys to keep derived from - # config (i.e. backend/frontend modes) + # config (i.e. backend/frontend modes) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, # To ensure this is unique, prepend the resolution - partition_keys=catalog_df["partition_keys"].to_list(), + # partition_keys=catalog_df["partition_keys"].to_list(), + partition_keys=catalog_df.loc[ + catalog_df["partition_keys"].str.contains(REQUIRED_TABLES_REGEX), + "partition_keys", + ].to_list(), ) context.add_output_metadata( metadata={ From 5f65c914383d0be3fc38572b56e74ef3d7def9ba Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 18 Apr 2024 13:20:28 +0100 Subject: [PATCH 11/60] Add catalog_metadata, revise catalog towards metric metadata --- python/popgetter/assets/scotland/scotland.py | 172 +++++++++++++------ 1 file changed, 122 insertions(+), 50 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 4f19773..70a1296 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -1,35 +1,28 @@ -import base64 -from io import BytesIO -import subprocess -import tempfile -from typing import Tuple -from popgetter.utils import markdown_from_plot -import requests -import zipfile_deflate64 as zipfile -import os +from __future__ import annotations + import urllib.parse as urlparse -import pandas as pd +from pathlib import Path + import geopandas as gpd -import numpy as np import matplotlib.pyplot as plt -from icecream import ic -import popgetter +import numpy as np +import pandas as pd +import requests +import zipfile_deflate64 as zipfile from dagster import ( AssetIn, - AssetKey, AssetOut, DynamicPartitionsDefinition, MaterializeResult, MetadataValue, - Output, - Partition, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset, - op, ) +from popgetter.utils import markdown_from_plot + """ Notes: - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial @@ -52,7 +45,7 @@ 
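    # Lookup between 2011 output areas (OA), data zones (DZ) and intermediate zones (IZ)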
"https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_METADATA_INDEX = ( +URL_CATALOG_METADATA = ( "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" ) @@ -94,16 +87,15 @@ def download_file( cache_dir: str, url: str, - file_name: str | None = None, + file_name: Path | None = None, headers: dict[str, str] = HEADERS, -) -> str: +) -> Path: """Downloads file checking first if exists in cache, returning file name.""" - file_name = ( - os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name - ) - if not os.path.exists(file_name): + file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name + if not Path(file_name).exists(): r = requests.get(url, allow_redirects=True, headers=headers) - open(file_name, "wb").write(r.content) + with Path(file_name).open("wb") as fp: + fp.write(r.content) return file_name @@ -117,7 +109,7 @@ def download_file( ) def lookups(): """Creates lookup dataframes.""" - os.makedirs(cache_dir, exist_ok=True) + Path(cache_dir).mkdir(parents=True, exist_ok=True) lookup_path = download_file(cache_dir, URL_LOOKUP) df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") @@ -125,10 +117,10 @@ def lookups(): return df1, df2, df3 -def source_to_zip(source_name: str, url: str) -> str: +def source_to_zip(source_name: str, url: str) -> Path: """Downloads if necessary and returns the name of the locally cached zip file of the source data (replacing spaces with _)""" - file_name = os.path.join(cache_dir, source_name.replace(" ", "_") + ".zip") + file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") return download_file(cache_dir, url, file_name) @@ -144,20 +136,59 @@ def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list } ) + @asset -def metadata_index(context) -> pd.DataFrame: - dfs = pd.read_excel( - URL_METADATA_INDEX, +def catalog_metadata(context) -> pd.DataFrame: + catalog_metadata_df = pd.read_excel( + URL_CATALOG_METADATA, sheet_name=None, + header=None, storage_options={"User-Agent": "Mozilla/5.0"}, + )["Index"].rename( + columns={ + 0: "census_release", + 1: "table_name", + 2: "description", + 3: "population_coverage", + 4: "variable", + 5: "catalog_resolution", + 6: "year", + 7: "additional_url", + 8: "population_coverage_and_variable", + } ) - df = dfs["Index"] - add_metadata(context, df, "Metadata for census tables") - return df + add_metadata(context, catalog_metadata_df, "Metadata for census tables") + return catalog_metadata_df + + +def get_table_metadata( + catalog_metadata: pd.DataFrame, table_name: str +) -> dict[str, str]: + """Returns a dict of table metadata for a given table name.""" + rows = catalog_metadata.loc[catalog_metadata.loc[:, "table_name"].eq(table_name)] + census_release = rows.loc[:, "description"].unique()[0] + description = rows.loc[:, "description"].unique()[0] + population_coverage = rows.loc[:, "description"].unique()[0] + variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) + catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] + year = int(rows.loc[:, "year"].unique()[0]) + return { + "census_release": census_release, + "description": description, + "population_coverage": population_coverage, + "variables": variables, + "catalog_resolution": catalog_resolution, + "year": 
str(year), + "human_readable_name": f"{description} ({population_coverage})", + } + + +def get_table_name(file_name: str) -> str: + return file_name.rsplit(".csv")[0] @asset -def catalog(context) -> pd.DataFrame: +def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" records = [] for data_source in DATA_SOURCES: @@ -165,16 +196,53 @@ def catalog(context) -> pd.DataFrame: source = data_source["source"] url = data_source["url"] with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: - for name in zip_ref.namelist(): + for file_name in zip_ref.namelist(): + # Get table name + table_name = get_table_name(file_name) + + # Skip bulk output files and missing tables from catalog_metadata + if ( + "bulk_output" in file_name.lower() + or catalog_metadata.loc[:, "table_name"].ne(table_name).all() + ): + continue + + # Get table metadata + table_metadata = get_table_metadata(catalog_metadata, table_name) + + # Create a record for each census table use same keys as MetricMetadata + # where possible since this makes it simpler to populate derived + # metrics downstream record = { "resolution": resolution, + "catalog_resolution": table_metadata["catalog_resolution"], "source": source, "url": url, - "file_name": name, + "file_name": file_name, + "table_name": table_name, + "year": table_metadata["year"], + # Use constructed name of description and coverage + "human_readable_name": table_metadata["human_readable_name"], + "source_metric_id": None, + # Use catalog_metadata description + "description": table_metadata["description"], + "hxl_tag": None, + "metric_parquet_file_url": None, + "parquet_column_name": None, + "parquet_margin_of_error_column": None, + "parquet_margin_of_error_file": None, + "potential_denominator_ids": None, + "parent_metric_id": None, + # TODO: check this is not an ID but a name + "source_data_release_id": table_metadata["census_release"], + "source_download_url": url, + # TODO: what should this be? + "source_archive_file_path": None, + "source_documentation_url": URL_CATALOG_METADATA, } context.log.debug(record) records.append(record) - zip_ref.extract(name, cache_dir) + zip_ref.extract(file_name, cache_dir) # TODO: check if required for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): @@ -182,15 +250,16 @@ def catalog(context) -> pd.DataFrame: # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_keys"] = catalog_df[["resolution", "file_name"]].agg( - lambda s: "/".join(s).rsplit(".")[0], axis=1 + catalog_df["partition_keys"] = ( + catalog_df[["year", "resolution", "table_name"]] + .astype(str) + .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) ) # TODO: consider filtering here based on a set of keys to keep derived from # config (i.e. 
backend/frontend modes) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, - # To ensure this is unique, prepend the resolution - # partition_keys=catalog_df["partition_keys"].to_list(), + # To ensure this is unique, prepend the resolution, partition_keys=catalog_df.loc[ catalog_df["partition_keys"].str.contains(REQUIRED_TABLES_REGEX), "partition_keys", @@ -211,9 +280,9 @@ def catalog(context) -> pd.DataFrame: def get_table(context, table_details) -> pd.DataFrame: - df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) - add_metadata(context, df, table_details["partition_keys"].iloc[0]) - return df + table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) + add_metadata(context, table_df, table_details["partition_keys"].iloc[0]) + return table_df @asset(partitions_def=dataset_node_partition) @@ -228,7 +297,7 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: _subset = [ { - "partition_keys": "OA11/LC1117SC", + "partition_keys": "2011/DCLC1117SC", }, ] _subset_partition_keys: list[str] = [r["partition_keys"] for r in _subset] @@ -236,6 +305,7 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: subset_partition = StaticPartitionsDefinition(_subset_partition_keys) +# TODO: revise to include all partitions and extract column name for metadata from catalog @multi_asset( ins={ "individual_census_table": AssetIn(partition_mapping=subset_mapping), @@ -249,12 +319,14 @@ def oa11_lc1117sc( context, individual_census_table, oa_dz_iz_2011_lookup ) -> pd.DataFrame: """Gets LC1117SC age by sex table at OA11 resolution.""" - df = individual_census_table.rename( + derived_census_table = individual_census_table.rename( columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} ) - df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] - add_metadata(context, df, _subset_partition_keys) - return df + derived_census_table = derived_census_table.loc[ + derived_census_table["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"]) + ] + add_metadata(context, derived_census_table, _subset_partition_keys) + return derived_census_table @asset From 687c33fca5c71f072df1be58b3577b8ab3756db6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 18 Apr 2024 20:58:41 +0100 Subject: [PATCH 12/60] Fix extracted zip file names --- python/popgetter/assets/scotland/scotland.py | 30 +++++++++++--------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 70a1296..c042df6 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -58,7 +58,15 @@ } # From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 -REQUIRED_TABLES = ["QS103SC", "QS104SC", "KS201SC", "DC1117SC", "DC2101SC", "DC6206SC"] +REQUIRED_TABLES = [ + "QS103SC", + "QS104SC", + "KS201SC", + "DC1117SC", + "DC2101SC", + "DC6206SC", + "LC1117SC", +] REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) DATA_SOURCES = [ @@ -195,7 +203,8 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] - with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: + zip_file_name = source_to_zip(source, url) + with zipfile.ZipFile(zip_file_name) as zip_ref: for file_name in 
zip_ref.namelist(): # Get table name table_name = get_table_name(file_name) @@ -218,7 +227,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: "catalog_resolution": table_metadata["catalog_resolution"], "source": source, "url": url, - "file_name": file_name, + "file_name": Path(source) / file_name, "table_name": table_name, "year": table_metadata["year"], # Use constructed name of description and coverage @@ -242,7 +251,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: } context.log.debug(record) records.append(record) - zip_ref.extract(file_name, cache_dir) + zip_ref.extract(file_name, Path(cache_dir) / source) # TODO: check if required for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): @@ -295,14 +304,9 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: return get_table(context, table_details) -_subset = [ - { - "partition_keys": "2011/DCLC1117SC", - }, -] -_subset_partition_keys: list[str] = [r["partition_keys"] for r in _subset] -subset_mapping = SpecificPartitionsPartitionMapping(_subset_partition_keys) -subset_partition = StaticPartitionsDefinition(_subset_partition_keys) +subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] +subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) +subset_partition = StaticPartitionsDefinition(subset_partition_keys) # TODO: revise to include all partitions and extract column name for metadata from catalog @@ -325,7 +329,7 @@ def oa11_lc1117sc( derived_census_table = derived_census_table.loc[ derived_census_table["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"]) ] - add_metadata(context, derived_census_table, _subset_partition_keys) + add_metadata(context, derived_census_table, subset_partition_keys) return derived_census_table From 739c9d1a784fec5e5dce6a637752fb5e6141e1a8 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 22 Apr 2024 21:26:18 +0100 Subject: [PATCH 13/60] Rename as df column to partition_key --- python/popgetter/assets/scotland/scotland.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index c042df6..1029394 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -259,7 +259,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_keys"] = ( + catalog_df["partition_key"] = ( catalog_df[["year", "resolution", "table_name"]] .astype(str) .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) @@ -270,8 +270,8 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: partitions_def_name=PARTITIONS_DEF_NAME, # To ensure this is unique, prepend the resolution, partition_keys=catalog_df.loc[ - catalog_df["partition_keys"].str.contains(REQUIRED_TABLES_REGEX), - "partition_keys", + catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), + "partition_key", ].to_list(), ) context.add_output_metadata( @@ -290,7 +290,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: def get_table(context, table_details) -> pd.DataFrame: table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) - add_metadata(context, table_df, table_details["partition_keys"].iloc[0]) + add_metadata(context, table_df, 
table_details["partition_key"].iloc[0]) return table_df @@ -299,7 +299,7 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: """Creates individual census tables as dataframe.""" partition_key = context.asset_partition_key_for_output() context.log.info(partition_key) - table_details = catalog.loc[catalog["partition_keys"].isin([partition_key])] + table_details = catalog.loc[catalog["partition_key"].isin([partition_key])] context.log.info(table_details) return get_table(context, table_details) From b9e76b0bdc51b4cbfb61239a88250d885c838cc5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 22 Apr 2024 21:29:37 +0100 Subject: [PATCH 14/60] Add initial census_derived for Scotland --- .../assets/scotland/census_derived.py | 272 ++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 python/popgetter/assets/scotland/census_derived.py diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py new file mode 100644 index 0000000..102929d --- /dev/null +++ b/python/popgetter/assets/scotland/census_derived.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import pandas as pd +from dagster import ( + AssetIn, + AssetOut, + SpecificPartitionsPartitionMapping, + StaticPartitionsDefinition, + asset, + multi_asset, +) + +from ...metadata import MetricMetadata +from .scotland import add_metadata, dataset_node_partition + + +def get_lc1117sc_metric( + lc1117sc: pd.DataFrame, col: str, subset: list[str] +) -> pd.DataFrame: + lc1117sc_transformed = lc1117sc.rename( + columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} + ) + lc1117sc_transformed = lc1117sc_transformed.loc[ + ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : + ] + return ( + lc1117sc_transformed.loc[ + lc1117sc_transformed["Age Category"].isin(subset), + ["OA11CD", col], + ] + .groupby("OA11CD") + .agg("sum") + .rename(columns={col: "Count"}) + ) + + +ALL_PEOPLE = ["All people"] +INFANTS_AGE_0_TO_4 = ["0 to 4"] +CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +ADULTS = [ + "18 to 19", + "20 to 24", + "25 to 29", + "30 to 34", + "35 to 39", + "40 to 44", + "45 to 49", + "50 to 54", + "55 to 59", + "60 to 64", + "65 to 69", + "70 to 74", + "75 to 79", + "80 to 84", + "85 to 89", + "90 to 94", + "95 and over", +] + +needed_dataset_list = [ + { + # Population by OA11, Period: 2011 + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "#population+oa11+2011", + # TODO: this partition key does not have a single column for source + "source_column": "", + } +] +needed_dataset_partions_keys: list[str] = [ + r["partition_key"] for r in needed_dataset_list +] +needed_dataset_mapping = SpecificPartitionsPartitionMapping( + needed_dataset_partions_keys +) +needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) + +# Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) +_derived_columns: list[dict] = [ + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_children_age5_17", + "filter_func": lambda df: get_lc1117sc_metric( + df, "All people", CHILDREN_AGE_5_TO_17 + ), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_infants_age0_4", + "filter_func": lambda df: get_lc1117sc_metric( + df, "All people", INFANTS_AGE_0_TO_4 + ), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": 
"population_children_age0_17", + "filter_func": lambda df: get_lc1117sc_metric( + df, "All people", CHILDREN_AGE_0_TO_17 + ), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_adults_f", + "filter_func": lambda df: get_lc1117sc_metric(df, "Females", ADULTS), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_adults_m", + "filter_func": lambda df: get_lc1117sc_metric(df, "Males", ADULTS), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_adults", + "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ADULTS), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_ind", + "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ALL_PEOPLE), + }, +] + +derived_columns = pd.DataFrame( + _derived_columns, columns=["node", "hxltag", "filter_func"] +) + + +# record = { +# "resolution": resolution, +# "catalog_resolution": table_metadata["catalog_resolution"], +# "source": source, +# "url": url, +# "file_name": Path(source) / file_name, +# "table_name": table_name, +# "year": table_metadata["year"], +# # Use constructed name of description and coverage +# "human_readable_name": table_metadata["human_readable_name"], +# "source_metric_id": None, +# # Use catalog_metadata description +# "description": table_metadata["description"], +# "hxl_tag": None, +# "metric_parquet_file_url": None, +# "parquet_column_name": None, +# "parquet_margin_of_error_column": None, +# "parquet_margin_of_error_file": None, +# "potential_denominator_ids": None, +# "parent_metric_id": None, +# # TODO: check this is not an ID but a name +# "source_data_release_id": table_metadata["census_release"], +# "source_download_url": url, +# # TODO: what should this be? +# "source_archive_file_path": None, +# "source_documentation_url": URL_CATALOG_METADATA, +# } + + +def census_table_metadata(catalog_row: dict) -> MetricMetadata: + return MetricMetadata( + human_readable_name=catalog_row["human_readable_name"], + source_download_url=catalog_row["source_download_url"], + source_archive_file_path=catalog_row["source_archive_file_path"], + source_documentation_url=catalog_row["source_documentation_url"], + source_data_release_id="TODO", + # TODO - this is a placeholder + parent_metric_id="unknown_at_this_stage", + potential_denominator_ids=None, + parquet_margin_of_error_file=None, + parquet_margin_of_error_column=None, + parquet_column_name=catalog_row["source_column"], + # TODO - this is a placeholder + metric_parquet_file_url="unknown_at_this_stage", + hxl_tag=catalog_row["hxltag"], + description=catalog_row["description"], + source_metric_id=catalog_row["hxltag"], + ) + + +@asset( + ins={ + "catalog": AssetIn(partition_mapping=needed_dataset_mapping), + }, +) +def filter_needed_catalog( + context, needed_datasets, catalog: pd.DataFrame +) -> pd.DataFrame: + needed_df = needed_datasets.merge(catalog, how="inner", on="partition_key") + add_metadata(context, needed_df, "needed_df") + return needed_df + + +@asset +def needed_datasets(context) -> pd.DataFrame: + needed_df = pd.DataFrame( + needed_dataset_list, + columns=["partition_key", "hxltag", "source_column", "derived_columns"], + dtype="string", + ) + add_metadata(context, needed_df, "needed_datasets") + return needed_df + + +@multi_asset( + ins={ + "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), + "filter_needed_catalog": AssetIn(), + }, + outs={ + "source_table": AssetOut(), + "source_mmd": AssetOut(), + }, + 
partitions_def=dataset_node_partition, +) +def get_enriched_tables_scotland( + context, individual_census_table, filter_needed_catalog +) -> tuple[pd.DataFrame, MetricMetadata]: + partition_keys = context.asset_partition_keys_for_input( + input_name="individual_census_table" + ) + output_partition = context.asset_partition_key_for_output("source_table") + if output_partition not in partition_keys: + err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" + raise ValueError(err_msg) + + if output_partition not in individual_census_table: + err_msg = ( + f"Partition key {output_partition} not found in individual_census_table\n" + f"Available keys are {individual_census_table.keys()}" + ) + raise ValueError(err_msg) + result_df = individual_census_table[output_partition] + catalog_row = filter_needed_catalog[ + filter_needed_catalog["partition_key"].eq(output_partition) + ] + catalog_row = catalog_row.to_dict(orient="index") + catalog_row = catalog_row.popitem()[1] + result_mmd = census_table_metadata(catalog_row) + return result_df, result_mmd + + +# TODO: from here + + +@multi_asset( + partitions_def=dataset_node_partition, + ins={ + "source_table": AssetIn(partition_mapping=needed_dataset_mapping), + "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), + }, + outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, +) +def transform_data( + context, + source_table: dict[str, pd.DataFrame], + source_mmd: dict[str, MetricMetadata], +) -> tuple[pd.DataFrame, list[MetricMetadata]]: + partition_key = context.asset_partition_key_for_output("derived_table") + census_table = source_table[partition_key] + parent_mmd = source_mmd[partition_key] + # source_column = parent_mmd.parquet_column_name + metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] + new_series: list[pd.Series] = [] + new_mmds: list[MetricMetadata] = [] + for row_tuple in metrics.itertuples(): + _, _, col_name, group_by_column, filter = row_tuple + new_series.append(filter(census_table)) + new_mmd = parent_mmd.copy() + new_mmd.parent_metric_id = parent_mmd.source_metric_id + new_mmd.hxl_tag = col_name + new_mmds.append(new_mmd) + new_table: pd.DataFrame = pd.concat(new_series, axis=1) + add_metadata(context, new_table, "derived_table") + return new_table, new_mmds From b365a1496dc65ddb561b4dbae1baff5425a99ee0 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:12:28 +0100 Subject: [PATCH 15/60] Add ISO3116-2 field, move download_file to module import --- python/popgetter/assets/scotland/__init__.py | 47 ++++++++++++++++---- python/popgetter/metadata.py | 3 ++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index fb102a3..044a2cb 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -1,22 +1,53 @@ #!/usr/bin/python3 from __future__ import annotations +from pathlib import Path + +import requests from dagster import ( asset, ) +# from popgetter.assets.scotland import country +# from . import ( +# scotland, +# ) from popgetter.metadata import ( CountryMetadata, ) -from . 
import ( - scotland, +country: CountryMetadata = CountryMetadata( + name_short_en="Scotland", + name_official="Kingdom of Belgium", + iso3="GBR", + iso2="GB", + iso3116_2="GB-SCT", ) +WORKING_DIR = Path("scotland") + + +@asset() +def get_country_metadata() -> CountryMetadata: + """Returns a CountryMetadata of metadata about the country.""" + return country + + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" +} + -# @asset(key_prefix=asset_prefix) -# def get_country_metadata() -> CountryMetadata: -# """ -# Returns a CountryMetadata of metadata about the country. -# """ -# return country +def download_file( + cache_dir: str, + url: str, + file_name: Path | None = None, + headers: dict[str, str] = HEADERS, +) -> Path: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name + if not Path(file_name).exists(): + r = requests.get(url, allow_redirects=True, headers=headers) + with Path(file_name).open("wb") as fp: + fp.write(r.content) + return file_name diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index aa6178a..237ca09 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -15,6 +15,9 @@ class CountryMetadata(BaseModel): ) iso3: str = Field(description="The ISO3 code of the country (for example 'BEL').") iso2: str = Field(description="The ISO2 code of the country (for example 'BE').") + iso3116_2: str | None = Field( + description="The ISO3116-2 code for the names of the principal subdivisions (for example 'GB-SCT')." + ) class DataPublisher(BaseModel): From a44e744cc2df6b7b88741b5137d40042363ed4b3 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:13:49 +0100 Subject: [PATCH 16/60] Add ISO3116-2 field to Belgium and UK --- python/popgetter/assets/be/belgium.py | 1 + python/popgetter/assets/uk/united_kingdom.py | 1 + 2 files changed, 2 insertions(+) diff --git a/python/popgetter/assets/be/belgium.py b/python/popgetter/assets/be/belgium.py index 211e0b2..3253e9d 100644 --- a/python/popgetter/assets/be/belgium.py +++ b/python/popgetter/assets/be/belgium.py @@ -9,6 +9,7 @@ name_official="Kingdom of Belgium", iso3="BEL", iso2="BE", + iso3116_2=None, ) WORKING_DIR = Path("belgium") diff --git a/python/popgetter/assets/uk/united_kingdom.py b/python/popgetter/assets/uk/united_kingdom.py index e6ab99f..3f12b39 100644 --- a/python/popgetter/assets/uk/united_kingdom.py +++ b/python/popgetter/assets/uk/united_kingdom.py @@ -7,6 +7,7 @@ name_official="The United Kingdom of Great Britain and Northern Ireland", iso3="GBR", iso2="GB", + iso3116_2=None, ) asset_prefix = "uk" From f65cbaec025f4022b87ac07ee3d0ae7ca7a5b269 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:39:58 +0100 Subject: [PATCH 17/60] Rename country metadata asset --- python/popgetter/assets/scotland/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 044a2cb..a84b2bc 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -8,10 +8,6 @@ asset, ) -# from popgetter.assets.scotland import country -# from . 
import ( -# scotland, -# ) from popgetter.metadata import ( CountryMetadata, ) @@ -28,7 +24,7 @@ @asset() -def get_country_metadata() -> CountryMetadata: +def country_metadata() -> CountryMetadata: """Returns a CountryMetadata of metadata about the country.""" return country From dd5dcfd865f6227beba04455912569ba9337d0ce Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:56:23 +0100 Subject: [PATCH 18/60] Refactor and fix derived module, add geometry module --- .../assets/scotland/census_derived.py | 111 +++++++++++++----- .../assets/scotland/census_geometry.py | 17 +++ python/popgetter/assets/scotland/scotland.py | 92 ++------------- 3 files changed, 105 insertions(+), 115 deletions(-) create mode 100644 python/popgetter/assets/scotland/census_geometry.py diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 102929d..90e94d6 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -1,21 +1,29 @@ from __future__ import annotations +import geopandas as gpd +import numpy as np import pandas as pd from dagster import ( AssetIn, AssetOut, + MaterializeResult, + MetadataValue, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset, ) +from icecream import ic +from matplotlib import pyplot as plt + +from popgetter.utils import markdown_from_plot from ...metadata import MetricMetadata from .scotland import add_metadata, dataset_node_partition def get_lc1117sc_metric( - lc1117sc: pd.DataFrame, col: str, subset: list[str] + lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] ) -> pd.DataFrame: lc1117sc_transformed = lc1117sc.rename( columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} @@ -30,7 +38,7 @@ def get_lc1117sc_metric( ] .groupby("OA11CD") .agg("sum") - .rename(columns={col: "Count"}) + .rename(columns={col: output_col}) ) @@ -80,48 +88,56 @@ def get_lc1117sc_metric( { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_children_age5_17", - "filter_func": lambda df: get_lc1117sc_metric( - df, "All people", CHILDREN_AGE_5_TO_17 + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, CHILDREN_AGE_5_TO_17 ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_infants_age0_4", - "filter_func": lambda df: get_lc1117sc_metric( - df, "All people", INFANTS_AGE_0_TO_4 + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, INFANTS_AGE_0_TO_4 ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_children_age0_17", - "filter_func": lambda df: get_lc1117sc_metric( - df, "All people", CHILDREN_AGE_0_TO_17 + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, CHILDREN_AGE_0_TO_17 ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_adults_f", - "filter_func": lambda df: get_lc1117sc_metric(df, "Females", ADULTS), + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "Females", output_col, ADULTS + ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_adults_m", - "filter_func": lambda df: get_lc1117sc_metric(df, "Males", ADULTS), + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "Males", output_col, ADULTS + ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_adults", - "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ADULTS), + "filter_func": lambda df, 
output_col: get_lc1117sc_metric( + df, "All people", output_col, ADULTS + ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_ind", - "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ALL_PEOPLE), + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, ALL_PEOPLE + ), }, ] derived_columns = pd.DataFrame( - _derived_columns, columns=["node", "hxltag", "filter_func"] + _derived_columns, columns=["partition_key", "hxltag", "filter_func"] ) @@ -214,20 +230,19 @@ def get_enriched_tables_scotland( context, individual_census_table, filter_needed_catalog ) -> tuple[pd.DataFrame, MetricMetadata]: partition_keys = context.asset_partition_keys_for_input( - input_name="individual_census_table" + input_name="individual_census_table", ) output_partition = context.asset_partition_key_for_output("source_table") + ic(partition_keys) + ic(len(partition_keys)) + ic(output_partition) + ic(type(output_partition)) + ic(individual_census_table) if output_partition not in partition_keys: err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" raise ValueError(err_msg) - if output_partition not in individual_census_table: - err_msg = ( - f"Partition key {output_partition} not found in individual_census_table\n" - f"Available keys are {individual_census_table.keys()}" - ) - raise ValueError(err_msg) - result_df = individual_census_table[output_partition] + result_df = individual_census_table catalog_row = filter_needed_catalog[ filter_needed_catalog["partition_key"].eq(output_partition) ] @@ -237,9 +252,6 @@ def get_enriched_tables_scotland( return result_df, result_mmd -# TODO: from here - - @multi_asset( partitions_def=dataset_node_partition, ins={ @@ -250,23 +262,58 @@ def get_enriched_tables_scotland( ) def transform_data( context, - source_table: dict[str, pd.DataFrame], - source_mmd: dict[str, MetricMetadata], + source_table: pd.DataFrame, + source_mmd: MetricMetadata, ) -> tuple[pd.DataFrame, list[MetricMetadata]]: partition_key = context.asset_partition_key_for_output("derived_table") - census_table = source_table[partition_key] - parent_mmd = source_mmd[partition_key] + census_table = source_table.copy() + parent_mmd = source_mmd.copy() # source_column = parent_mmd.parquet_column_name metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] new_series: list[pd.Series] = [] new_mmds: list[MetricMetadata] = [] - for row_tuple in metrics.itertuples(): - _, _, col_name, group_by_column, filter = row_tuple - new_series.append(filter(census_table)) + for _, _, col_name, filter in metrics.itertuples(): + # Create column + column: pd.Series = filter(census_table, col_name) + ic(f"col_name: {col_name}") + new_series.append(column) + + # Construct metadata new_mmd = parent_mmd.copy() new_mmd.parent_metric_id = parent_mmd.source_metric_id new_mmd.hxl_tag = col_name new_mmds.append(new_mmd) + + # Merge series new_table: pd.DataFrame = pd.concat(new_series, axis=1) - add_metadata(context, new_table, "derived_table") + add_metadata( + context, + df=new_table, + title=f"Derived table ({partition_key})", + output_name="derived_table", + ) return new_table, new_mmds + + +@multi_asset( + ins={ + "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), + "geometry": AssetIn(partition_mapping=needed_dataset_mapping), + }, + outs={ + "plot": AssetOut(), + }, + partitions_def=dataset_node_partition, +) +def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): + 
"""Plots map with log density of people.""" + merged = geometry.merge( + derived_table[["population_ind"]], + left_on="geo_code", + right_index=True, + how="left", + ) + merged["log10 people"] = np.log10(merged["population_ind"]) + merged.plot(column="log10 people", legend=True) + md_content = markdown_from_plot(plt) + return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py new file mode 100644 index 0000000..544ca1b --- /dev/null +++ b/python/popgetter/assets/scotland/census_geometry.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import geopandas as gpd +from dagster import asset + +from popgetter.assets.scotland import download_file + +from .scotland import URL_SHAPEFILE, add_metadata, cache_dir + + +@asset +def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: + """Gets the shape file for OA11 resolution.""" + file_name = download_file(cache_dir, URL_SHAPEFILE) + geo = gpd.read_file(f"zip://{file_name}") + add_metadata(context, geo, "Geometry file") + return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 1029394..e81a95c 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -4,16 +4,11 @@ from pathlib import Path import geopandas as gpd -import matplotlib.pyplot as plt -import numpy as np import pandas as pd -import requests import zipfile_deflate64 as zipfile from dagster import ( - AssetIn, AssetOut, DynamicPartitionsDefinition, - MaterializeResult, MetadataValue, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, @@ -21,7 +16,7 @@ multi_asset, ) -from popgetter.utils import markdown_from_plot +from popgetter.assets.scotland import download_file """ Notes: @@ -87,25 +82,6 @@ }, ] -HEADERS = { - "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" -} - - -def download_file( - cache_dir: str, - url: str, - file_name: Path | None = None, - headers: dict[str, str] = HEADERS, -) -> Path: - """Downloads file checking first if exists in cache, returning file name.""" - file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name - if not Path(file_name).exists(): - r = requests.get(url, allow_redirects=True, headers=headers) - with Path(file_name).open("wb") as fp: - fp.write(r.content) - return file_name - # NB. 
Make sure no spaces in asset keys @multi_asset( @@ -132,7 +108,12 @@ def source_to_zip(source_name: str, url: str) -> Path: return download_file(cache_dir, url, file_name) -def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list[str]): +def add_metadata( + context, + df: pd.DataFrame | gpd.GeoDataFrame, + title: str | list[str], + output_name: str | None = None, +): context.add_output_metadata( metadata={ "title": title, @@ -141,7 +122,8 @@ def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) ), "preview": MetadataValue.md(df.head().to_markdown()), - } + }, + output_name=output_name, ) @@ -307,59 +289,3 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) subset_partition = StaticPartitionsDefinition(subset_partition_keys) - - -# TODO: revise to include all partitions and extract column name for metadata from catalog -@multi_asset( - ins={ - "individual_census_table": AssetIn(partition_mapping=subset_mapping), - }, - outs={ - "oa11_lc1117sc": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def oa11_lc1117sc( - context, individual_census_table, oa_dz_iz_2011_lookup -) -> pd.DataFrame: - """Gets LC1117SC age by sex table at OA11 resolution.""" - derived_census_table = individual_census_table.rename( - columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} - ) - derived_census_table = derived_census_table.loc[ - derived_census_table["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"]) - ] - add_metadata(context, derived_census_table, subset_partition_keys) - return derived_census_table - - -@asset -def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: - """Gets the shape file for OA11 resolution.""" - file_name = download_file(cache_dir, URL_SHAPEFILE) - geo = gpd.read_file(f"zip://{file_name}") - add_metadata(context, geo, "Geometry file") - return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] - - -@multi_asset( - ins={ - "oa11_lc1117sc": AssetIn(partition_mapping=subset_mapping), - "geometry": AssetIn(partition_mapping=subset_mapping), - }, - outs={ - "plot": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def plot(geometry: gpd.GeoDataFrame, oa11_lc1117sc: pd.DataFrame): - """Plots map with log density of people.""" - merged = geometry.merge( - oa11_lc1117sc, left_on="geo_code", right_on="OA11", how="left" - ) - merged["log10 people"] = np.log10(merged["All people"]) - merged[merged["Age bracket"] == "All people"].plot( - column="log10 people", legend=True - ) - md_content = markdown_from_plot(plt) - return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) From 6844c6f456d918134cfea403c9736da5f1cfc27e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 16:43:53 +0100 Subject: [PATCH 19/60] Rename modules to match Belgium --- python/popgetter/assets/scotland/__init__.py | 2 -- python/popgetter/assets/scotland/census_geometry.py | 2 +- .../popgetter/assets/scotland/{scotland.py => census_tables.py} | 0 3 files changed, 1 insertion(+), 3 deletions(-) rename python/popgetter/assets/scotland/{scotland.py => census_tables.py} (100%) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index a84b2bc..7953386 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ 
b/python/popgetter/assets/scotland/__init__.py @@ -20,8 +20,6 @@ iso3116_2="GB-SCT", ) -WORKING_DIR = Path("scotland") - @asset() def country_metadata() -> CountryMetadata: diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py index 544ca1b..5187181 100644 --- a/python/popgetter/assets/scotland/census_geometry.py +++ b/python/popgetter/assets/scotland/census_geometry.py @@ -5,7 +5,7 @@ from popgetter.assets.scotland import download_file -from .scotland import URL_SHAPEFILE, add_metadata, cache_dir +from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir @asset diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/census_tables.py similarity index 100% rename from python/popgetter/assets/scotland/scotland.py rename to python/popgetter/assets/scotland/census_tables.py From c4bd1ef19a65a44a64cfe212a669198cd696bf0a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 17:27:13 +0100 Subject: [PATCH 20/60] Fix imports, refactor Scotland catalog asset names --- python/popgetter/__init__.py | 21 +++++++---- python/popgetter/assets/scotland/__init__.py | 2 +- .../assets/scotland/census_derived.py | 10 +++--- .../assets/scotland/census_tables.py | 36 ++++++++++--------- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 31d2355..3e1a49c 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -3,7 +3,8 @@ from collections.abc import Sequence from pathlib import Path -from python.popgetter.utils import StagingDirResource +# from python.popgetter.utils import StagingDirResource +from popgetter.utils import StagingDirResource __version__ = "0.1.0" @@ -27,12 +28,17 @@ ) from popgetter import assets +from popgetter.assets.scotland.census_tables import ( + dataset_node_partition as dataset_partition_scotland, +) all_assets: Sequence[AssetsDefinition | SourceAsset | CacheableAssetsDefinition] = [ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module(assets.scotland, group_name="scotland", key_prefix="uk-scotland"), + *load_assets_from_package_module( + assets.scotland, group_name="scotland", key_prefix="uk-scotland" + ), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( @@ -54,20 +60,21 @@ description="Downloads UK data.", ) -job_uk: UnresolvedAssetJobDefinition = define_asset_job( +job_scotland: UnresolvedAssetJobDefinition = define_asset_job( name="job_scotland", selection=AssetSelection.groups("scotland"), description="Downloads Scotland data.", - # https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#asset-based-jobs + partitions_def=dataset_partition_scotland, + # https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#asset-based-jobs config={ "execution": { "config": { "multiprocess": { - "max_concurrent": 20, # limits concurrent assets + "max_concurrent": 20, # limits concurrent assets }, } } - } + }, ) @@ -82,5 +89,5 @@ staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) ), }, - jobs=[job_be, job_us, job_uk], + jobs=[job_be, job_us, job_uk, job_scotland], ) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 7953386..77371e3 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ 
b/python/popgetter/assets/scotland/__init__.py @@ -14,7 +14,7 @@ country: CountryMetadata = CountryMetadata( name_short_en="Scotland", - name_official="Kingdom of Belgium", + name_official="Scotland", iso3="GBR", iso2="GB", iso3116_2="GB-SCT", diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 90e94d6..724f292 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -19,7 +19,7 @@ from popgetter.utils import markdown_from_plot from ...metadata import MetricMetadata -from .scotland import add_metadata, dataset_node_partition +from .census_tables import add_metadata, dataset_node_partition def get_lc1117sc_metric( @@ -193,13 +193,15 @@ def census_table_metadata(catalog_row: dict) -> MetricMetadata: @asset( ins={ - "catalog": AssetIn(partition_mapping=needed_dataset_mapping), + "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), }, ) def filter_needed_catalog( - context, needed_datasets, catalog: pd.DataFrame + context, needed_datasets, catalog_as_dataframe: pd.DataFrame ) -> pd.DataFrame: - needed_df = needed_datasets.merge(catalog, how="inner", on="partition_key") + needed_df = needed_datasets.merge( + catalog_as_dataframe, how="inner", on="partition_key" + ) add_metadata(context, needed_df, "needed_df") return needed_df diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index e81a95c..73ae065 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -40,7 +40,7 @@ "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_CATALOG_METADATA = ( +URL_CATALOG = ( "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" ) @@ -128,9 +128,9 @@ def add_metadata( @asset -def catalog_metadata(context) -> pd.DataFrame: - catalog_metadata_df = pd.read_excel( - URL_CATALOG_METADATA, +def catalog_reference(context) -> pd.DataFrame: + catalog_reference = pd.read_excel( + URL_CATALOG, sheet_name=None, header=None, storage_options={"User-Agent": "Mozilla/5.0"}, @@ -147,15 +147,15 @@ def catalog_metadata(context) -> pd.DataFrame: 8: "population_coverage_and_variable", } ) - add_metadata(context, catalog_metadata_df, "Metadata for census tables") - return catalog_metadata_df + add_metadata(context, catalog_reference, "Metadata for census tables") + return catalog_reference def get_table_metadata( - catalog_metadata: pd.DataFrame, table_name: str + catalog_reference: pd.DataFrame, table_name: str ) -> dict[str, str]: """Returns a dict of table metadata for a given table name.""" - rows = catalog_metadata.loc[catalog_metadata.loc[:, "table_name"].eq(table_name)] + rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] census_release = rows.loc[:, "description"].unique()[0] description = rows.loc[:, "description"].unique()[0] population_coverage = rows.loc[:, "description"].unique()[0] @@ -178,7 +178,7 @@ def get_table_name(file_name: str) -> str: @asset -def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: +def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" records = [] for data_source in DATA_SOURCES: @@ 
-191,15 +191,15 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: # Get table name table_name = get_table_name(file_name) - # Skip bulk output files and missing tables from catalog_metadata + # Skip bulk output files and missing tables from catalog_reference if ( "bulk_output" in file_name.lower() - or catalog_metadata.loc[:, "table_name"].ne(table_name).all() + or catalog_reference.loc[:, "table_name"].ne(table_name).all() ): continue # Get table metadata - table_metadata = get_table_metadata(catalog_metadata, table_name) + table_metadata = get_table_metadata(catalog_reference, table_name) # Create a record for each census table use same keys as MetricMetadata # where possible since this makes it simpler to populate derived @@ -215,7 +215,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: # Use constructed name of description and coverage "human_readable_name": table_metadata["human_readable_name"], "source_metric_id": None, - # Use catalog_metadata description + # Use catalog_reference description "description": table_metadata["description"], "hxl_tag": None, "metric_parquet_file_url": None, @@ -229,7 +229,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: "source_download_url": url, # TODO: what should this be? "source_archive_file_path": None, - "source_documentation_url": URL_CATALOG_METADATA, + "source_documentation_url": URL_CATALOG, } context.log.debug(record) records.append(record) @@ -277,11 +277,15 @@ def get_table(context, table_details) -> pd.DataFrame: @asset(partitions_def=dataset_node_partition) -def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: +def individual_census_table( + context, catalog_as_dataframe: pd.DataFrame +) -> pd.DataFrame: """Creates individual census tables as dataframe.""" partition_key = context.asset_partition_key_for_output() context.log.info(partition_key) - table_details = catalog.loc[catalog["partition_key"].isin([partition_key])] + table_details = catalog_as_dataframe.loc[ + catalog_as_dataframe["partition_key"].isin([partition_key]) + ] context.log.info(table_details) return get_table(context, table_details) From 596729cd9f682bf970d0af4f6e98f4916b7d2888 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 17:43:55 +0100 Subject: [PATCH 21/60] Add data publisher for Scotland --- python/popgetter/assets/scotland/__init__.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 77371e3..d85fcdd 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -8,9 +8,7 @@ asset, ) -from popgetter.metadata import ( - CountryMetadata, -) +from popgetter.metadata import CountryMetadata, DataPublisher country: CountryMetadata = CountryMetadata( name_short_en="Scotland", @@ -20,6 +18,16 @@ iso3116_2="GB-SCT", ) +publisher: DataPublisher = DataPublisher( + name="National Records of Scotland", + url="https://www.nrscotland.gov.uk/", + description="National Records of Scotland (NRS) is a Non-Ministerial Department of " + "the Scottish Government. 
Our purpose is to collect, preserve and " + "produce information about Scotland's people and history and make it " + "available to inform current and future generations.", + countries_of_interest=[country], +) + @asset() def country_metadata() -> CountryMetadata: @@ -27,6 +35,12 @@ def country_metadata() -> CountryMetadata: return country +@asset() +def publisher_metadata(): + """Returns a DataPublisher of metadata about the publisher.""" + return publisher + + HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" } From 6a2d1b696426d8aac9acab44bd176c5cbdc97ac6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 25 Apr 2024 10:52:26 +0100 Subject: [PATCH 22/60] Fix column names --- python/popgetter/assets/scotland/census_tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index 73ae065..af6610f 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -156,9 +156,9 @@ def get_table_metadata( ) -> dict[str, str]: """Returns a dict of table metadata for a given table name.""" rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] - census_release = rows.loc[:, "description"].unique()[0] + census_release = rows.loc[:, "census_release"].unique()[0] description = rows.loc[:, "description"].unique()[0] - population_coverage = rows.loc[:, "description"].unique()[0] + population_coverage = rows.loc[:, "population_coverage"].unique()[0] variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] year = int(rows.loc[:, "year"].unique()[0]) From af3d00d5a2c4947def279a75e9f16c9f3080a2b4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 25 Apr 2024 13:26:56 +0100 Subject: [PATCH 23/60] Add source data releases metadata, fix parquet_column_name field --- python/popgetter/assets/scotland/__init__.py | 85 ++++++++++++++++++- .../assets/scotland/census_derived.py | 11 ++- .../assets/scotland/census_tables.py | 24 +++--- 3 files changed, 103 insertions(+), 17 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index d85fcdd..91d131e 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 from __future__ import annotations +from datetime import date from pathlib import Path import requests @@ -8,7 +9,7 @@ asset, ) -from popgetter.metadata import CountryMetadata, DataPublisher +from popgetter.metadata import CountryMetadata, DataPublisher, SourceDataRelease country: CountryMetadata = CountryMetadata( name_short_en="Scotland", @@ -41,6 +42,88 @@ def publisher_metadata(): return publisher +# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 +REQUIRED_TABLES = [ + "QS103SC", + "QS104SC", + "KS201SC", + "DC1117SC", + "DC2101SC", + "DC6206SC", + "LC1117SC", +] +REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) +# Currently including only releases matching tables included +REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] +GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" +CENSUS_REFERENCE_PERIOD = (date(2011, 3, 27), None) +CENSUS_COLLECTION_PERIOD = (date(2011, 
3, 27), None) +CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) + +sources: dict[str, SourceDataRelease] = { + "3A": SourceDataRelease( + name="Census 2011: Release 3A", + date_published=date(2014, 2, 27), + reference_period=CENSUS_REFERENCE_PERIOD, + collection_period=CENSUS_COLLECTION_PERIOD, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), + "3I": SourceDataRelease( + name="Census 2011: Release 3I", + date_published=date(2014, 9, 24), + reference_period=(date(2015, 10, 22), None), + collection_period=(date(2011, 10, 22), None), + expect_next_update=date(2022, 1, 1), + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), + "2A": SourceDataRelease( + name="Census 2011: Release 2A", + date_published=date(2013, 9, 26), + reference_period=(date(2015, 10, 22), None), + collection_period=(date(2011, 10, 22), None), + expect_next_update=date(2022, 1, 1), + url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), + "3C": SourceDataRelease( + name="Census 2011: Release 3C", + date_published=date(2014, 4, 9), + reference_period=(date(2015, 10, 22), None), + collection_period=(date(2011, 10, 22), None), + expect_next_update=date(2022, 1, 1), + url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), +} +# Init +for source in sources: + sources[source].update_forward_refs() + + +# Move to tests HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" } diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 724f292..9158e91 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -84,6 +84,8 @@ def get_lc1117sc_metric( needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) # Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) +# TODO: add human readable names for each column as the MetricMetadata currently receives the +# catalog row (table) human readable name. 
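As a sketch of how the filter_func entries below are applied (assuming the
get_lc1117sc_metric helper defined earlier in this module; the toy frame and
counts here are invented for illustration only):

import pandas as pd

# Toy stand-in for a raw LC1117SC CSV: the two unnamed columns hold the
# output-area code and the age bracket, with counts split by sex.
toy_lc1117sc = pd.DataFrame(
    {
        "Unnamed: 0": ["S00000001", "S00000001", "S00000002"],
        "Unnamed: 1": ["0 to 4", "5 to 9", "0 to 4"],
        "All people": [10, 12, 7],
        "Females": [5, 6, 3],
        "Males": [5, 6, 4],
    }
)

# Equivalent to the "population_infants_age0_4" entry below: sum "All people"
# over the "0 to 4" bracket, grouped by output area.
infants = get_lc1117sc_metric(
    toy_lc1117sc, "All people", "population_infants_age0_4", ["0 to 4"]
)
# infants is indexed by OA11CD with a single summed count column
# (S00000001 -> 10, S00000002 -> 7).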
_derived_columns: list[dict] = [ { "partition_key": "2011/OA11/LC1117SC", @@ -176,13 +178,16 @@ def census_table_metadata(catalog_row: dict) -> MetricMetadata: source_download_url=catalog_row["source_download_url"], source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id="TODO", + source_data_release_id=catalog_row["source_data_release_id"], # TODO - this is a placeholder parent_metric_id="unknown_at_this_stage", potential_denominator_ids=None, parquet_margin_of_error_file=None, parquet_margin_of_error_column=None, - parquet_column_name=catalog_row["source_column"], + # TODO: currently setting to rename the derived column name equal to 'hxltag' + # and not related to the source_column + # parquet_column_name=catalog_row["source_column"], + parquet_column_name=catalog_row["hxltag"], # TODO - this is a placeholder metric_parquet_file_url="unknown_at_this_stage", hxl_tag=catalog_row["hxltag"], @@ -250,7 +255,9 @@ def get_enriched_tables_scotland( ] catalog_row = catalog_row.to_dict(orient="index") catalog_row = catalog_row.popitem()[1] + ic(catalog_row) result_mmd = census_table_metadata(catalog_row) + ic(result_mmd) return result_df, result_mmd diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index af6610f..7f61fa5 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -16,7 +16,7 @@ multi_asset, ) -from popgetter.assets.scotland import download_file +from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources """ Notes: @@ -52,18 +52,6 @@ "OA11": 2, # "Output Area blk" } -# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 -REQUIRED_TABLES = [ - "QS103SC", - "QS104SC", - "KS201SC", - "DC1117SC", - "DC2101SC", - "DC6206SC", - "LC1117SC", -] -REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) - DATA_SOURCES = [ { "source": "Council Area blk", @@ -201,6 +189,14 @@ def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFra # Get table metadata table_metadata = get_table_metadata(catalog_reference, table_name) + # Get source release metadata if available + source_data_release = sources.get( + table_metadata["census_release"], None + ) + source_data_release_id = ( + None if source_data_release is None else source_data_release.id + ) + # Create a record for each census table use same keys as MetricMetadata # where possible since this makes it simpler to populate derived # metrics downstream @@ -225,7 +221,7 @@ def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFra "potential_denominator_ids": None, "parent_metric_id": None, # TODO: check this is not an ID but a name - "source_data_release_id": table_metadata["census_release"], + "source_data_release_id": source_data_release_id, "source_download_url": url, # TODO: what should this be? 
"source_archive_file_path": None, From 06f0e10e963d203e0097997fe308dc5f7f809f4d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 25 Apr 2024 13:56:05 +0100 Subject: [PATCH 24/60] Fix for CI --- python/popgetter/assets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index 7ecbf5d..e357bc2 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,3 +1,3 @@ from __future__ import annotations -from . import be, uk, us, scotland # noqa: F401 +from . import be, scotland, uk, us # noqa: F401 From d7991e2dc877d88c549df0dfbd4b4bc6f783ec88 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Thu, 2 May 2024 18:34:37 +0100 Subject: [PATCH 25/60] Update Scotland metadata to match new changes in #82 --- python/popgetter/assets/scotland/__init__.py | 55 ++++++++++---------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 91d131e..0a96e80 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -16,7 +16,7 @@ name_official="Scotland", iso3="GBR", iso2="GB", - iso3116_2="GB-SCT", + iso3166_2="GB-SCT", ) publisher: DataPublisher = DataPublisher( @@ -26,7 +26,7 @@ "the Scottish Government. Our purpose is to collect, preserve and " "produce information about Scotland's people and history and make it " "available to inform current and future generations.", - countries_of_interest=[country], + countries_of_interest=[country.id], ) @@ -56,71 +56,72 @@ def publisher_metadata(): # Currently including only releases matching tables included REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" -CENSUS_REFERENCE_PERIOD = (date(2011, 3, 27), None) -CENSUS_COLLECTION_PERIOD = (date(2011, 3, 27), None) +CENSUS_REFERENCE_DATE = date(2011, 3, 27) +CENSUS_COLLECTION_DATE = date(2011, 3, 27) CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) sources: dict[str, SourceDataRelease] = { "3A": SourceDataRelease( name="Census 2011: Release 3A", date_published=date(2014, 2, 27), - reference_period=CENSUS_REFERENCE_PERIOD, - collection_period=CENSUS_COLLECTION_PERIOD, + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), "3I": SourceDataRelease( name="Census 2011: Release 3I", date_published=date(2014, 9, 24), - reference_period=(date(2015, 10, 22), None), - collection_period=(date(2011, 10, 22), None), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2011, 10, 22), + collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # 
available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), "2A": SourceDataRelease( name="Census 2011: Release 2A", date_published=date(2013, 9, 26), - reference_period=(date(2015, 10, 22), None), - collection_period=(date(2011, 10, 22), None), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2011, 10, 22), + collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), "3C": SourceDataRelease( name="Census 2011: Release 3C", date_published=date(2014, 4, 9), - reference_period=(date(2015, 10, 22), None), - collection_period=(date(2011, 10, 22), None), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2011, 10, 22), + collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), } -# Init -for source in sources: - sources[source].update_forward_refs() # Move to tests From 629548f6f8590411dd287fb0e62450c77f94c621 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 21:29:30 +0100 Subject: [PATCH 26/60] Add todo --- python/popgetter/assets/ni/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index f5830e1..19d6d3b 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -371,6 +371,7 @@ def _geometry( for level_details in NI_GEO_LEVELS.values(): # TODO: get correct values geometry_metadata = GeometryMetadata( + # TODO: check values for dates for the geometries validity_period_start=CENSUS_COLLECTION_DATE, validity_period_end=CENSUS_COLLECTION_DATE, level=level_details.level, From 8f3c7da19eff17b5f536e8ee04af10fd88f17cce Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 21:30:12 +0100 Subject: [PATCH 27/60] Comment out old versions --- .../assets/scotland/census_derived.py | 602 +++++++++--------- .../assets/scotland/census_geometry.py | 24 +- .../assets/scotland/census_tables.py | 582 ++++++++--------- 3 files changed, 604 insertions(+), 604 deletions(-) diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 9158e91..43277a0 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -1,328 +1,328 @@ -from __future__ import annotations +# from __future__ import annotations -import geopandas as gpd -import numpy as np -import pandas as pd -from dagster import ( - AssetIn, - AssetOut, - MaterializeResult, - MetadataValue, - SpecificPartitionsPartitionMapping, - StaticPartitionsDefinition, - asset, - multi_asset, -) -from icecream import ic -from matplotlib import pyplot as plt +# import geopandas as gpd +# import numpy as np +# import pandas as pd +# from dagster import ( +# 
AssetIn, +# AssetOut, +# MaterializeResult, +# MetadataValue, +# SpecificPartitionsPartitionMapping, +# StaticPartitionsDefinition, +# asset, +# multi_asset, +# ) +# from icecream import ic +# from matplotlib import pyplot as plt -from popgetter.utils import markdown_from_plot +# from popgetter.utils import markdown_from_plot -from ...metadata import MetricMetadata -from .census_tables import add_metadata, dataset_node_partition +# from ...metadata import MetricMetadata +# from .census_tables import add_metadata, dataset_node_partition -def get_lc1117sc_metric( - lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] -) -> pd.DataFrame: - lc1117sc_transformed = lc1117sc.rename( - columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} - ) - lc1117sc_transformed = lc1117sc_transformed.loc[ - ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : - ] - return ( - lc1117sc_transformed.loc[ - lc1117sc_transformed["Age Category"].isin(subset), - ["OA11CD", col], - ] - .groupby("OA11CD") - .agg("sum") - .rename(columns={col: output_col}) - ) +# def get_lc1117sc_metric( +# lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] +# ) -> pd.DataFrame: +# lc1117sc_transformed = lc1117sc.rename( +# columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} +# ) +# lc1117sc_transformed = lc1117sc_transformed.loc[ +# ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : +# ] +# return ( +# lc1117sc_transformed.loc[ +# lc1117sc_transformed["Age Category"].isin(subset), +# ["OA11CD", col], +# ] +# .groupby("OA11CD") +# .agg("sum") +# .rename(columns={col: output_col}) +# ) -ALL_PEOPLE = ["All people"] -INFANTS_AGE_0_TO_4 = ["0 to 4"] -CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -ADULTS = [ - "18 to 19", - "20 to 24", - "25 to 29", - "30 to 34", - "35 to 39", - "40 to 44", - "45 to 49", - "50 to 54", - "55 to 59", - "60 to 64", - "65 to 69", - "70 to 74", - "75 to 79", - "80 to 84", - "85 to 89", - "90 to 94", - "95 and over", -] +# ALL_PEOPLE = ["All people"] +# INFANTS_AGE_0_TO_4 = ["0 to 4"] +# CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +# CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +# ADULTS = [ +# "18 to 19", +# "20 to 24", +# "25 to 29", +# "30 to 34", +# "35 to 39", +# "40 to 44", +# "45 to 49", +# "50 to 54", +# "55 to 59", +# "60 to 64", +# "65 to 69", +# "70 to 74", +# "75 to 79", +# "80 to 84", +# "85 to 89", +# "90 to 94", +# "95 and over", +# ] -needed_dataset_list = [ - { - # Population by OA11, Period: 2011 - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "#population+oa11+2011", - # TODO: this partition key does not have a single column for source - "source_column": "", - } -] -needed_dataset_partions_keys: list[str] = [ - r["partition_key"] for r in needed_dataset_list -] -needed_dataset_mapping = SpecificPartitionsPartitionMapping( - needed_dataset_partions_keys -) -needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) +# needed_dataset_list = [ +# { +# # Population by OA11, Period: 2011 +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "#population+oa11+2011", +# # TODO: this partition key does not have a single column for source +# "source_column": "", +# } +# ] +# needed_dataset_partions_keys: list[str] = [ +# r["partition_key"] for r in needed_dataset_list +# ] +# needed_dataset_mapping = 
SpecificPartitionsPartitionMapping( +# needed_dataset_partions_keys +# ) +# needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) -# Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) -# TODO: add human readable names for each column as the MetricMetadata currently receives the -# catalog row (table) human readable name. -_derived_columns: list[dict] = [ - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_children_age5_17", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, CHILDREN_AGE_5_TO_17 - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_infants_age0_4", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, INFANTS_AGE_0_TO_4 - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_children_age0_17", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, CHILDREN_AGE_0_TO_17 - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_adults_f", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "Females", output_col, ADULTS - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_adults_m", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "Males", output_col, ADULTS - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_adults", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, ADULTS - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_ind", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, ALL_PEOPLE - ), - }, -] +# # Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) +# # TODO: add human readable names for each column as the MetricMetadata currently receives the +# # catalog row (table) human readable name. 
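+# # The `filter_func` entries below wrap `get_lc1117sc_metric` defined above; a
+# # minimal sketch of one such call on a toy frame in the raw LC1117SC layout
+# # (the toy values are invented for illustration):
+# #
+# #   toy = pd.DataFrame(
+# #       {"Unnamed: 0": ["S00000001"], "Unnamed: 1": ["0 to 4"], "All people": [12]}
+# #   )
+# #   get_lc1117sc_metric(toy, "All people", "population_infants_age0_4", INFANTS_AGE_0_TO_4)
+# #   # -> one-column DataFrame indexed by OA11CD, with counts summed per output area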
+# _derived_columns: list[dict] = [ +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_children_age5_17", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, CHILDREN_AGE_5_TO_17 +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_infants_age0_4", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, INFANTS_AGE_0_TO_4 +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_children_age0_17", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, CHILDREN_AGE_0_TO_17 +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_adults_f", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "Females", output_col, ADULTS +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_adults_m", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "Males", output_col, ADULTS +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_adults", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, ADULTS +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_ind", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, ALL_PEOPLE +# ), +# }, +# ] -derived_columns = pd.DataFrame( - _derived_columns, columns=["partition_key", "hxltag", "filter_func"] -) +# derived_columns = pd.DataFrame( +# _derived_columns, columns=["partition_key", "hxltag", "filter_func"] +# ) -# record = { -# "resolution": resolution, -# "catalog_resolution": table_metadata["catalog_resolution"], -# "source": source, -# "url": url, -# "file_name": Path(source) / file_name, -# "table_name": table_name, -# "year": table_metadata["year"], -# # Use constructed name of description and coverage -# "human_readable_name": table_metadata["human_readable_name"], -# "source_metric_id": None, -# # Use catalog_metadata description -# "description": table_metadata["description"], -# "hxl_tag": None, -# "metric_parquet_file_url": None, -# "parquet_column_name": None, -# "parquet_margin_of_error_column": None, -# "parquet_margin_of_error_file": None, -# "potential_denominator_ids": None, -# "parent_metric_id": None, -# # TODO: check this is not an ID but a name -# "source_data_release_id": table_metadata["census_release"], -# "source_download_url": url, -# # TODO: what should this be? 
-# "source_archive_file_path": None, -# "source_documentation_url": URL_CATALOG_METADATA, -# } +# # record = { +# # "resolution": resolution, +# # "catalog_resolution": table_metadata["catalog_resolution"], +# # "source": source, +# # "url": url, +# # "file_name": Path(source) / file_name, +# # "table_name": table_name, +# # "year": table_metadata["year"], +# # # Use constructed name of description and coverage +# # "human_readable_name": table_metadata["human_readable_name"], +# # "source_metric_id": None, +# # # Use catalog_metadata description +# # "description": table_metadata["description"], +# # "hxl_tag": None, +# # "metric_parquet_file_url": None, +# # "parquet_column_name": None, +# # "parquet_margin_of_error_column": None, +# # "parquet_margin_of_error_file": None, +# # "potential_denominator_ids": None, +# # "parent_metric_id": None, +# # # TODO: check this is not an ID but a name +# # "source_data_release_id": table_metadata["census_release"], +# # "source_download_url": url, +# # # TODO: what should this be? +# # "source_archive_file_path": None, +# # "source_documentation_url": URL_CATALOG_METADATA, +# # } -def census_table_metadata(catalog_row: dict) -> MetricMetadata: - return MetricMetadata( - human_readable_name=catalog_row["human_readable_name"], - source_download_url=catalog_row["source_download_url"], - source_archive_file_path=catalog_row["source_archive_file_path"], - source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=catalog_row["source_data_release_id"], - # TODO - this is a placeholder - parent_metric_id="unknown_at_this_stage", - potential_denominator_ids=None, - parquet_margin_of_error_file=None, - parquet_margin_of_error_column=None, - # TODO: currently setting to rename the derived column name equal to 'hxltag' - # and not related to the source_column - # parquet_column_name=catalog_row["source_column"], - parquet_column_name=catalog_row["hxltag"], - # TODO - this is a placeholder - metric_parquet_file_url="unknown_at_this_stage", - hxl_tag=catalog_row["hxltag"], - description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], - ) +# def census_table_metadata(catalog_row: dict) -> MetricMetadata: +# return MetricMetadata( +# human_readable_name=catalog_row["human_readable_name"], +# source_download_url=catalog_row["source_download_url"], +# source_archive_file_path=catalog_row["source_archive_file_path"], +# source_documentation_url=catalog_row["source_documentation_url"], +# source_data_release_id=catalog_row["source_data_release_id"], +# # TODO - this is a placeholder +# parent_metric_id="unknown_at_this_stage", +# potential_denominator_ids=None, +# parquet_margin_of_error_file=None, +# parquet_margin_of_error_column=None, +# # TODO: currently setting to rename the derived column name equal to 'hxltag' +# # and not related to the source_column +# # parquet_column_name=catalog_row["source_column"], +# parquet_column_name=catalog_row["hxltag"], +# # TODO - this is a placeholder +# metric_parquet_file_url="unknown_at_this_stage", +# hxl_tag=catalog_row["hxltag"], +# description=catalog_row["description"], +# source_metric_id=catalog_row["hxltag"], +# ) -@asset( - ins={ - "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), - }, -) -def filter_needed_catalog( - context, needed_datasets, catalog_as_dataframe: pd.DataFrame -) -> pd.DataFrame: - needed_df = needed_datasets.merge( - catalog_as_dataframe, how="inner", on="partition_key" - ) - add_metadata(context, needed_df, "needed_df") 
- return needed_df +# @asset( +# ins={ +# "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), +# }, +# ) +# def filter_needed_catalog( +# context, needed_datasets, catalog_as_dataframe: pd.DataFrame +# ) -> pd.DataFrame: +# needed_df = needed_datasets.merge( +# catalog_as_dataframe, how="inner", on="partition_key" +# ) +# add_metadata(context, needed_df, "needed_df") +# return needed_df -@asset -def needed_datasets(context) -> pd.DataFrame: - needed_df = pd.DataFrame( - needed_dataset_list, - columns=["partition_key", "hxltag", "source_column", "derived_columns"], - dtype="string", - ) - add_metadata(context, needed_df, "needed_datasets") - return needed_df +# @asset +# def needed_datasets(context) -> pd.DataFrame: +# needed_df = pd.DataFrame( +# needed_dataset_list, +# columns=["partition_key", "hxltag", "source_column", "derived_columns"], +# dtype="string", +# ) +# add_metadata(context, needed_df, "needed_datasets") +# return needed_df -@multi_asset( - ins={ - "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), - "filter_needed_catalog": AssetIn(), - }, - outs={ - "source_table": AssetOut(), - "source_mmd": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def get_enriched_tables_scotland( - context, individual_census_table, filter_needed_catalog -) -> tuple[pd.DataFrame, MetricMetadata]: - partition_keys = context.asset_partition_keys_for_input( - input_name="individual_census_table", - ) - output_partition = context.asset_partition_key_for_output("source_table") - ic(partition_keys) - ic(len(partition_keys)) - ic(output_partition) - ic(type(output_partition)) - ic(individual_census_table) - if output_partition not in partition_keys: - err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" - raise ValueError(err_msg) +# @multi_asset( +# ins={ +# "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), +# "filter_needed_catalog": AssetIn(), +# }, +# outs={ +# "source_table": AssetOut(), +# "source_mmd": AssetOut(), +# }, +# partitions_def=dataset_node_partition, +# ) +# def get_enriched_tables_scotland( +# context, individual_census_table, filter_needed_catalog +# ) -> tuple[pd.DataFrame, MetricMetadata]: +# partition_keys = context.asset_partition_keys_for_input( +# input_name="individual_census_table", +# ) +# output_partition = context.asset_partition_key_for_output("source_table") +# ic(partition_keys) +# ic(len(partition_keys)) +# ic(output_partition) +# ic(type(output_partition)) +# ic(individual_census_table) +# if output_partition not in partition_keys: +# err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" +# raise ValueError(err_msg) - result_df = individual_census_table - catalog_row = filter_needed_catalog[ - filter_needed_catalog["partition_key"].eq(output_partition) - ] - catalog_row = catalog_row.to_dict(orient="index") - catalog_row = catalog_row.popitem()[1] - ic(catalog_row) - result_mmd = census_table_metadata(catalog_row) - ic(result_mmd) - return result_df, result_mmd +# result_df = individual_census_table +# catalog_row = filter_needed_catalog[ +# filter_needed_catalog["partition_key"].eq(output_partition) +# ] +# catalog_row = catalog_row.to_dict(orient="index") +# catalog_row = catalog_row.popitem()[1] +# ic(catalog_row) +# result_mmd = census_table_metadata(catalog_row) +# ic(result_mmd) +# return result_df, result_mmd -@multi_asset( - 
partitions_def=dataset_node_partition, - ins={ - "source_table": AssetIn(partition_mapping=needed_dataset_mapping), - "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), - }, - outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, -) -def transform_data( - context, - source_table: pd.DataFrame, - source_mmd: MetricMetadata, -) -> tuple[pd.DataFrame, list[MetricMetadata]]: - partition_key = context.asset_partition_key_for_output("derived_table") - census_table = source_table.copy() - parent_mmd = source_mmd.copy() - # source_column = parent_mmd.parquet_column_name - metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] - new_series: list[pd.Series] = [] - new_mmds: list[MetricMetadata] = [] - for _, _, col_name, filter in metrics.itertuples(): - # Create column - column: pd.Series = filter(census_table, col_name) - ic(f"col_name: {col_name}") - new_series.append(column) +# @multi_asset( +# partitions_def=dataset_node_partition, +# ins={ +# "source_table": AssetIn(partition_mapping=needed_dataset_mapping), +# "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), +# }, +# outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, +# ) +# def transform_data( +# context, +# source_table: pd.DataFrame, +# source_mmd: MetricMetadata, +# ) -> tuple[pd.DataFrame, list[MetricMetadata]]: +# partition_key = context.asset_partition_key_for_output("derived_table") +# census_table = source_table.copy() +# parent_mmd = source_mmd.copy() +# # source_column = parent_mmd.parquet_column_name +# metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] +# new_series: list[pd.Series] = [] +# new_mmds: list[MetricMetadata] = [] +# for _, _, col_name, filter in metrics.itertuples(): +# # Create column +# column: pd.Series = filter(census_table, col_name) +# ic(f"col_name: {col_name}") +# new_series.append(column) - # Construct metadata - new_mmd = parent_mmd.copy() - new_mmd.parent_metric_id = parent_mmd.source_metric_id - new_mmd.hxl_tag = col_name - new_mmds.append(new_mmd) +# # Construct metadata +# new_mmd = parent_mmd.copy() +# new_mmd.parent_metric_id = parent_mmd.source_metric_id +# new_mmd.hxl_tag = col_name +# new_mmds.append(new_mmd) - # Merge series - new_table: pd.DataFrame = pd.concat(new_series, axis=1) - add_metadata( - context, - df=new_table, - title=f"Derived table ({partition_key})", - output_name="derived_table", - ) - return new_table, new_mmds +# # Merge series +# new_table: pd.DataFrame = pd.concat(new_series, axis=1) +# add_metadata( +# context, +# df=new_table, +# title=f"Derived table ({partition_key})", +# output_name="derived_table", +# ) +# return new_table, new_mmds -@multi_asset( - ins={ - "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), - "geometry": AssetIn(partition_mapping=needed_dataset_mapping), - }, - outs={ - "plot": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): - """Plots map with log density of people.""" - merged = geometry.merge( - derived_table[["population_ind"]], - left_on="geo_code", - right_index=True, - how="left", - ) - merged["log10 people"] = np.log10(merged["population_ind"]) - merged.plot(column="log10 people", legend=True) - md_content = markdown_from_plot(plt) - return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) +# @multi_asset( +# ins={ +# "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), +# "geometry": 
AssetIn(partition_mapping=needed_dataset_mapping), +# }, +# outs={ +# "plot": AssetOut(), +# }, +# partitions_def=dataset_node_partition, +# ) +# def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): +# """Plots map with log density of people.""" +# merged = geometry.merge( +# derived_table[["population_ind"]], +# left_on="geo_code", +# right_index=True, +# how="left", +# ) +# merged["log10 people"] = np.log10(merged["population_ind"]) +# merged.plot(column="log10 people", legend=True) +# md_content = markdown_from_plot(plt) +# return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py index 5187181..28e5afa 100644 --- a/python/popgetter/assets/scotland/census_geometry.py +++ b/python/popgetter/assets/scotland/census_geometry.py @@ -1,17 +1,17 @@ -from __future__ import annotations +# from __future__ import annotations -import geopandas as gpd -from dagster import asset +# import geopandas as gpd +# from dagster import asset -from popgetter.assets.scotland import download_file +# from popgetter.assets.scotland import download_file -from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir +# from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir -@asset -def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: - """Gets the shape file for OA11 resolution.""" - file_name = download_file(cache_dir, URL_SHAPEFILE) - geo = gpd.read_file(f"zip://{file_name}") - add_metadata(context, geo, "Geometry file") - return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] +# # @asset +# # def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: +# # """Gets the shape file for OA11 resolution.""" +# # file_name = download_file(cache_dir, URL_SHAPEFILE) +# # geo = gpd.read_file(f"zip://{file_name}") +# # add_metadata(context, geo, "Geometry file") +# # return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index 7f61fa5..5efda94 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -1,291 +1,291 @@ -from __future__ import annotations - -import urllib.parse as urlparse -from pathlib import Path - -import geopandas as gpd -import pandas as pd -import zipfile_deflate64 as zipfile -from dagster import ( - AssetOut, - DynamicPartitionsDefinition, - MetadataValue, - SpecificPartitionsPartitionMapping, - StaticPartitionsDefinition, - asset, - multi_asset, -) - -from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources - -""" -Notes: - - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial - publication - - Reusing some bits of code from UKCensusAPI: - https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py -""" - - -PARTITIONS_DEF_NAME = "dataset_tables" -dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) - -# cache_dir = tempfile.mkdtemp() -cache_dir = "./cache" - -URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" -URL1 = "https://www.scotlandscensus.gov.uk/" -URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" -URL_LOOKUP = ( - "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" -) -URL_SHAPEFILE = 
"https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_CATALOG = ( - "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" -) - -data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] -GeoCodeLookup = { - "LAD": 0, # "Council Area blk" - # MSOA (intermediate zone)? - "LSOA11": 1, # "SNS Data Zone 2011 blk" - "OA11": 2, # "Output Area blk" -} - -DATA_SOURCES = [ - { - "source": "Council Area blk", - "resolution": "LAD", - "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", - }, - { - "source": "SNS Data Zone 2011 blk", - "resolution": "LSOA11", - "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", - }, - { - "source": "Output Area blk", - "resolution": "OA11", - "url": URL2 + urlparse.quote("Output Area blk") + ".zip", - }, -] - - -# NB. Make sure no spaces in asset keys -@multi_asset( - outs={ - "oa_dz_iz_2011_lookup": AssetOut(), - "data_zone_2011_lookup": AssetOut(), - "intermediate_zone_2011_lookup": AssetOut(), - }, -) -def lookups(): - """Creates lookup dataframes.""" - Path(cache_dir).mkdir(parents=True, exist_ok=True) - lookup_path = download_file(cache_dir, URL_LOOKUP) - df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") - df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") - df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") - return df1, df2, df3 - - -def source_to_zip(source_name: str, url: str) -> Path: - """Downloads if necessary and returns the name of the locally cached zip file - of the source data (replacing spaces with _)""" - file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") - return download_file(cache_dir, url, file_name) - - -def add_metadata( - context, - df: pd.DataFrame | gpd.GeoDataFrame, - title: str | list[str], - output_name: str | None = None, -): - context.add_output_metadata( - metadata={ - "title": title, - "num_records": len(df), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - ), - "preview": MetadataValue.md(df.head().to_markdown()), - }, - output_name=output_name, - ) - - -@asset -def catalog_reference(context) -> pd.DataFrame: - catalog_reference = pd.read_excel( - URL_CATALOG, - sheet_name=None, - header=None, - storage_options={"User-Agent": "Mozilla/5.0"}, - )["Index"].rename( - columns={ - 0: "census_release", - 1: "table_name", - 2: "description", - 3: "population_coverage", - 4: "variable", - 5: "catalog_resolution", - 6: "year", - 7: "additional_url", - 8: "population_coverage_and_variable", - } - ) - add_metadata(context, catalog_reference, "Metadata for census tables") - return catalog_reference - - -def get_table_metadata( - catalog_reference: pd.DataFrame, table_name: str -) -> dict[str, str]: - """Returns a dict of table metadata for a given table name.""" - rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] - census_release = rows.loc[:, "census_release"].unique()[0] - description = rows.loc[:, "description"].unique()[0] - population_coverage = rows.loc[:, "population_coverage"].unique()[0] - variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) - catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] - year = int(rows.loc[:, "year"].unique()[0]) - return { - "census_release": census_release, - "description": description, - "population_coverage": population_coverage, - "variables": variables, - "catalog_resolution": catalog_resolution, - "year": 
str(year), - "human_readable_name": f"{description} ({population_coverage})", - } - - -def get_table_name(file_name: str) -> str: - return file_name.rsplit(".csv")[0] - - -@asset -def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFrame: - """Creates a catalog of the individual census tables from all data sources.""" - records = [] - for data_source in DATA_SOURCES: - resolution = data_source["resolution"] - source = data_source["source"] - url = data_source["url"] - zip_file_name = source_to_zip(source, url) - with zipfile.ZipFile(zip_file_name) as zip_ref: - for file_name in zip_ref.namelist(): - # Get table name - table_name = get_table_name(file_name) - - # Skip bulk output files and missing tables from catalog_reference - if ( - "bulk_output" in file_name.lower() - or catalog_reference.loc[:, "table_name"].ne(table_name).all() - ): - continue - - # Get table metadata - table_metadata = get_table_metadata(catalog_reference, table_name) - - # Get source release metadata if available - source_data_release = sources.get( - table_metadata["census_release"], None - ) - source_data_release_id = ( - None if source_data_release is None else source_data_release.id - ) - - # Create a record for each census table use same keys as MetricMetadata - # where possible since this makes it simpler to populate derived - # metrics downstream - record = { - "resolution": resolution, - "catalog_resolution": table_metadata["catalog_resolution"], - "source": source, - "url": url, - "file_name": Path(source) / file_name, - "table_name": table_name, - "year": table_metadata["year"], - # Use constructed name of description and coverage - "human_readable_name": table_metadata["human_readable_name"], - "source_metric_id": None, - # Use catalog_reference description - "description": table_metadata["description"], - "hxl_tag": None, - "metric_parquet_file_url": None, - "parquet_column_name": None, - "parquet_margin_of_error_column": None, - "parquet_margin_of_error_file": None, - "potential_denominator_ids": None, - "parent_metric_id": None, - # TODO: check this is not an ID but a name - "source_data_release_id": source_data_release_id, - "source_download_url": url, - # TODO: what should this be? - "source_archive_file_path": None, - "source_documentation_url": URL_CATALOG, - } - context.log.debug(record) - records.append(record) - zip_ref.extract(file_name, Path(cache_dir) / source) - - # TODO: check if required - for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): - context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) - - # Create a dynamic partition for the datasets listed in the catalog - catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_key"] = ( - catalog_df[["year", "resolution", "table_name"]] - .astype(str) - .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) - ) - # TODO: consider filtering here based on a set of keys to keep derived from - # config (i.e. 
backend/frontend modes) - context.instance.add_dynamic_partitions( - partitions_def_name=PARTITIONS_DEF_NAME, - # To ensure this is unique, prepend the resolution, - partition_keys=catalog_df.loc[ - catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), - "partition_key", - ].to_list(), - ) - context.add_output_metadata( - metadata={ - "num_records": len(catalog_df), - "ignored_datasets": "", - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) - ), - "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), - "preview": MetadataValue.md(catalog_df.to_markdown()), - } - ) - return catalog_df - - -def get_table(context, table_details) -> pd.DataFrame: - table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) - add_metadata(context, table_df, table_details["partition_key"].iloc[0]) - return table_df - - -@asset(partitions_def=dataset_node_partition) -def individual_census_table( - context, catalog_as_dataframe: pd.DataFrame -) -> pd.DataFrame: - """Creates individual census tables as dataframe.""" - partition_key = context.asset_partition_key_for_output() - context.log.info(partition_key) - table_details = catalog_as_dataframe.loc[ - catalog_as_dataframe["partition_key"].isin([partition_key]) - ] - context.log.info(table_details) - return get_table(context, table_details) - - -subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] -subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) -subset_partition = StaticPartitionsDefinition(subset_partition_keys) +# from __future__ import annotations + +# import urllib.parse as urlparse +# from pathlib import Path + +# import geopandas as gpd +# import pandas as pd +# import zipfile_deflate64 as zipfile +# from dagster import ( +# AssetOut, +# DynamicPartitionsDefinition, +# MetadataValue, +# SpecificPartitionsPartitionMapping, +# StaticPartitionsDefinition, +# asset, +# multi_asset, +# ) + +# from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources + +# """ +# Notes: +# - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial +# publication +# - Reusing some bits of code from UKCensusAPI: +# https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py +# """ + + +# PARTITIONS_DEF_NAME = "dataset_tables" +# dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) + +# # cache_dir = tempfile.mkdtemp() +# cache_dir = "./cache" + +# URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" +# URL1 = "https://www.scotlandscensus.gov.uk/" +# URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" +# URL_LOOKUP = ( +# "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" +# ) +# URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" +# URL_CATALOG = ( +# "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +# ) + +# data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] +# GeoCodeLookup = { +# "LAD": 0, # "Council Area blk" +# # MSOA (intermediate zone)? 
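+# # The integer values index into `data_sources` above; for example (a sketch):
+# #
+# #   data_sources[GeoCodeLookup["LAD"]]  # -> "Council Area blk"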
+# "LSOA11": 1, # "SNS Data Zone 2011 blk" +# "OA11": 2, # "Output Area blk" +# } + +# DATA_SOURCES = [ +# { +# "source": "Council Area blk", +# "resolution": "LAD", +# "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", +# }, +# { +# "source": "SNS Data Zone 2011 blk", +# "resolution": "LSOA11", +# "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", +# }, +# { +# "source": "Output Area blk", +# "resolution": "OA11", +# "url": URL2 + urlparse.quote("Output Area blk") + ".zip", +# }, +# ] + + +# # NB. Make sure no spaces in asset keys +# @multi_asset( +# outs={ +# "oa_dz_iz_2011_lookup": AssetOut(), +# "data_zone_2011_lookup": AssetOut(), +# "intermediate_zone_2011_lookup": AssetOut(), +# }, +# ) +# def lookups(): +# """Creates lookup dataframes.""" +# Path(cache_dir).mkdir(parents=True, exist_ok=True) +# lookup_path = download_file(cache_dir, URL_LOOKUP) +# df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") +# df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") +# df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") +# return df1, df2, df3 + + +# def source_to_zip(source_name: str, url: str) -> Path: +# """Downloads if necessary and returns the name of the locally cached zip file +# of the source data (replacing spaces with _)""" +# file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") +# return download_file(cache_dir, url, file_name) + + +# def add_metadata( +# context, +# df: pd.DataFrame | gpd.GeoDataFrame, +# title: str | list[str], +# output_name: str | None = None, +# ): +# context.add_output_metadata( +# metadata={ +# "title": title, +# "num_records": len(df), +# "columns": MetadataValue.md( +# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) +# ), +# "preview": MetadataValue.md(df.head().to_markdown()), +# }, +# output_name=output_name, +# ) + + +# @asset +# def catalog_reference(context) -> pd.DataFrame: +# catalog_reference = pd.read_excel( +# URL_CATALOG, +# sheet_name=None, +# header=None, +# storage_options={"User-Agent": "Mozilla/5.0"}, +# )["Index"].rename( +# columns={ +# 0: "census_release", +# 1: "table_name", +# 2: "description", +# 3: "population_coverage", +# 4: "variable", +# 5: "catalog_resolution", +# 6: "year", +# 7: "additional_url", +# 8: "population_coverage_and_variable", +# } +# ) +# add_metadata(context, catalog_reference, "Metadata for census tables") +# return catalog_reference + + +# def get_table_metadata( +# catalog_reference: pd.DataFrame, table_name: str +# ) -> dict[str, str]: +# """Returns a dict of table metadata for a given table name.""" +# rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] +# census_release = rows.loc[:, "census_release"].unique()[0] +# description = rows.loc[:, "description"].unique()[0] +# population_coverage = rows.loc[:, "population_coverage"].unique()[0] +# variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) +# catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] +# year = int(rows.loc[:, "year"].unique()[0]) +# return { +# "census_release": census_release, +# "description": description, +# "population_coverage": population_coverage, +# "variables": variables, +# "catalog_resolution": catalog_resolution, +# "year": str(year), +# "human_readable_name": f"{description} ({population_coverage})", +# } + + +# def get_table_name(file_name: str) -> str: +# return file_name.rsplit(".csv")[0] + + +# @asset +# def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) 
-> pd.DataFrame: +# """Creates a catalog of the individual census tables from all data sources.""" +# records = [] +# for data_source in DATA_SOURCES: +# resolution = data_source["resolution"] +# source = data_source["source"] +# url = data_source["url"] +# zip_file_name = source_to_zip(source, url) +# with zipfile.ZipFile(zip_file_name) as zip_ref: +# for file_name in zip_ref.namelist(): +# # Get table name +# table_name = get_table_name(file_name) + +# # Skip bulk output files and missing tables from catalog_reference +# if ( +# "bulk_output" in file_name.lower() +# or catalog_reference.loc[:, "table_name"].ne(table_name).all() +# ): +# continue + +# # Get table metadata +# table_metadata = get_table_metadata(catalog_reference, table_name) + +# # Get source release metadata if available +# source_data_release = sources.get( +# table_metadata["census_release"], None +# ) +# source_data_release_id = ( +# None if source_data_release is None else source_data_release.id +# ) + +# # Create a record for each census table use same keys as MetricMetadata +# # where possible since this makes it simpler to populate derived +# # metrics downstream +# record = { +# "resolution": resolution, +# "catalog_resolution": table_metadata["catalog_resolution"], +# "source": source, +# "url": url, +# "file_name": Path(source) / file_name, +# "table_name": table_name, +# "year": table_metadata["year"], +# # Use constructed name of description and coverage +# "human_readable_name": table_metadata["human_readable_name"], +# "source_metric_id": None, +# # Use catalog_reference description +# "description": table_metadata["description"], +# "hxl_tag": None, +# "metric_parquet_file_url": None, +# "parquet_column_name": None, +# "parquet_margin_of_error_column": None, +# "parquet_margin_of_error_file": None, +# "potential_denominator_ids": None, +# "parent_metric_id": None, +# # TODO: check this is not an ID but a name +# "source_data_release_id": source_data_release_id, +# "source_download_url": url, +# # TODO: what should this be? +# "source_archive_file_path": None, +# "source_documentation_url": URL_CATALOG, +# } +# context.log.debug(record) +# records.append(record) +# zip_ref.extract(file_name, Path(cache_dir) / source) + +# # TODO: check if required +# for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): +# context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) + +# # Create a dynamic partition for the datasets listed in the catalog +# catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) +# catalog_df["partition_key"] = ( +# catalog_df[["year", "resolution", "table_name"]] +# .astype(str) +# .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) +# ) +# # TODO: consider filtering here based on a set of keys to keep derived from +# # config (i.e. 
backend/frontend modes) +# context.instance.add_dynamic_partitions( +# partitions_def_name=PARTITIONS_DEF_NAME, +# # To ensure this is unique, prepend the resolution, +# partition_keys=catalog_df.loc[ +# catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), +# "partition_key", +# ].to_list(), +# ) +# context.add_output_metadata( +# metadata={ +# "num_records": len(catalog_df), +# "ignored_datasets": "", +# "columns": MetadataValue.md( +# "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) +# ), +# "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), +# "preview": MetadataValue.md(catalog_df.to_markdown()), +# } +# ) +# return catalog_df + + +# def get_table(context, table_details) -> pd.DataFrame: +# table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) +# add_metadata(context, table_df, table_details["partition_key"].iloc[0]) +# return table_df + + +# @asset(partitions_def=dataset_node_partition) +# def individual_census_table( +# context, catalog_as_dataframe: pd.DataFrame +# ) -> pd.DataFrame: +# """Creates individual census tables as dataframe.""" +# partition_key = context.asset_partition_key_for_output() +# context.log.info(partition_key) +# table_details = catalog_as_dataframe.loc[ +# catalog_as_dataframe["partition_key"].isin([partition_key]) +# ] +# context.log.info(table_details) +# return get_table(context, table_details) + + +# subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] +# subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) +# subset_partition = StaticPartitionsDefinition(subset_partition_keys) From 30af440d18655d34134ddbc3d531af8f8af1f86a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 21:34:19 +0100 Subject: [PATCH 28/60] Add update to use country class --- python/popgetter/assets/__init__.py | 4 +- python/popgetter/assets/scotland/__init__.py | 617 +++++++++++++++++-- 2 files changed, 570 insertions(+), 51 deletions(-) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index 55e91dc..0cbb5cf 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations -from . import be, ni, uk, us +from . 
import be, ni, scotland, uk, us -countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us]] +countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us, scotland]] __all__ = ["countries"] diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 0a96e80..a52207d 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -1,46 +1,32 @@ #!/usr/bin/python3 from __future__ import annotations +import urllib.parse as urlparse +from collections.abc import Callable +from dataclasses import dataclass from datetime import date from pathlib import Path +from typing import ClassVar +import geopandas as gpd +import matplotlib.pyplot as plt +import pandas as pd import requests +import zipfile_deflate64 as zipfile from dagster import ( - asset, + MetadataValue, ) +from icecream import ic -from popgetter.metadata import CountryMetadata, DataPublisher, SourceDataRelease - -country: CountryMetadata = CountryMetadata( - name_short_en="Scotland", - name_official="Scotland", - iso3="GBR", - iso2="GB", - iso3166_2="GB-SCT", -) - -publisher: DataPublisher = DataPublisher( - name="National Records of Scotland", - url="https://www.nrscotland.gov.uk/", - description="National Records of Scotland (NRS) is a Non-Ministerial Department of " - "the Scottish Government. Our purpose is to collect, preserve and " - "produce information about Scotland's people and history and make it " - "available to inform current and future generations.", - countries_of_interest=[country.id], +from popgetter.assets.country import Country +from popgetter.metadata import ( + CountryMetadata, + DataPublisher, + GeometryMetadata, + MetricMetadata, + SourceDataRelease, ) - - -@asset() -def country_metadata() -> CountryMetadata: - """Returns a CountryMetadata of metadata about the country.""" - return country - - -@asset() -def publisher_metadata(): - """Returns a DataPublisher of metadata about the publisher.""" - return publisher - +from popgetter.utils import add_metadata, markdown_from_plot # From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 REQUIRED_TABLES = [ @@ -60,7 +46,7 @@ def publisher_metadata(): CENSUS_COLLECTION_DATE = date(2011, 3, 27) CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) -sources: dict[str, SourceDataRelease] = { +SOURCE_DATA_RELEASES: dict[str, SourceDataRelease] = { "3A": SourceDataRelease( name="Census 2011: Release 3A", date_published=date(2014, 2, 27), @@ -70,11 +56,12 @@ def publisher_metadata(): collection_period_end=CENSUS_COLLECTION_DATE, expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + # geography_file="TBC", + # geography_level="TBC", + geometry_metadata_id="TBC", + # countries_of_interest=[country.id], ), "3I": SourceDataRelease( name="Census 2011: Release 3I", @@ -85,11 +72,12 @@ def publisher_metadata(): collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + # geography_file="TBC", + # 
geography_level="TBC", + geometry_metadata_id="TBC", + # countries_of_interest=[country.id], ), "2A": SourceDataRelease( name="Census 2011: Release 2A", @@ -100,11 +88,12 @@ def publisher_metadata(): collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + # geography_file="TBC", + # geography_level="TBC", + geometry_metadata_id="", + # countries_of_interest=[country.id], ), "3C": SourceDataRelease( name="Census 2011: Release 3C", @@ -115,11 +104,12 @@ def publisher_metadata(): collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + geometry_metadata_id="", + # geography_file="TBC", + # geography_level="TBC", + # countries_of_interest=[country.id], ), } @@ -143,3 +133,532 @@ def download_file( with Path(file_name).open("wb") as fp: fp.write(r.content) return file_name + + +URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" +URL1 = "https://www.scotlandscensus.gov.uk/" +URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" +URL_LOOKUP = ( + "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" +) +URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" +URL_CATALOG = ( + "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +) + + +DATA_SOURCES = [ + { + "source": "Council Area blk", + "resolution": "LAD", + "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", + }, + { + "source": "SNS Data Zone 2011 blk", + "resolution": "LSOA11", + "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", + }, + { + "source": "Output Area blk", + "resolution": "OA11", + "url": URL2 + urlparse.quote("Output Area blk") + ".zip", + }, +] + + +@dataclass +class ScotlandGeometryLevel: + level: str + hxl_tag: str + geo_id_column: str + census_table_column: str + name_columns: dict[str, str] # keys = language codes, values = column names + url: str + lookup_url: str | None + lookup_sheet: str | None + left_on: str | None + right_on: str | None + + +SCOTLAND_GEO_LEVELS = { + "OA11": ScotlandGeometryLevel( + level="OA11", + hxl_tag="TBD", + geo_id_column="OA_CODE", + census_table_column="TODO", + # census_table_column="Census 2021 Data Zone Code", + name_columns={"en": "OA_CODE"}, # TODO + # url=URL_SHAPEFILE, + url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", + lookup_url=None, + lookup_sheet=None, + left_on=None, + right_on=None, + ) +} + + +# cache_dir = tempfile.mkdtemp() +cache_dir = "./cache" + + +@dataclass +class DerivedColumn: + hxltag: str + filter_func: Callable[[pd.DataFrame], pd.DataFrame] + output_column_name: str + human_readable_name: str + + +@dataclass +class SourceTable: + hxltag: str + geo_level: str + geo_column: str + source_column: str + + +# Config for each partition to be derived +age_code = "`Age Code`" +sex_label = "`Sex Label`" +DERIVED_COLUMNS = [ + DerivedColumn( + hxltag="#population+children+age5_17", + filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 
18"), + output_column_name="children_5_17", + human_readable_name="Children aged 5 to 17", + ), + DerivedColumn( + hxltag="#population+infants+age0_4", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), + output_column_name="infants_0_4", + human_readable_name="Infants aged 0 to 4", + ), + DerivedColumn( + hxltag="#population+children+age0_17", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), + output_column_name="children_0_17", + human_readable_name="Children aged 0 to 17", + ), + DerivedColumn( + hxltag="#population+adults+f", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_label} == 'Female'" + ), + output_column_name="adults_f", + human_readable_name="Female adults", + ), + DerivedColumn( + hxltag="#population+adults+m", + filter_func=lambda df: df.query(f"{age_code} >= 18 and {sex_label} == 'Male'"), + output_column_name="adults_m", + human_readable_name="Male adults", + ), + DerivedColumn( + hxltag="#population+adults", + filter_func=lambda df: df.query(f"{age_code} >= 18"), + output_column_name="adults", + human_readable_name="Adults", + ), + DerivedColumn( + hxltag="#population+ind", + filter_func=lambda df: df, + output_column_name="individuals", + human_readable_name="Total individuals", + ), +] + +TABLES_TO_PROCESS: list[str] = [ + "QS103SC", + "QS104SC", + "KS201SC", + "DC1117SC", + "DC2101SC", + "DC6206SC", + "LC1117SC", +] + +PARTITIONS_TO_PUBLISH: list[str] = ["2011/OA11/LC1117SC"] + + +DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = { + PARTITIONS_TO_PUBLISH[0]: DERIVED_COLUMNS, +} + + +class Scotland(Country): + key_prefix: str = "uk-scotland" + geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) + tables_to_process: list[str] | None = TABLES_TO_PROCESS + + def _catalog(self, context) -> pd.DataFrame: + """Creates a catalog of the individual census tables from all data sources.""" + + def source_to_zip(source_name: str, url: str) -> Path: + """Downloads if necessary and returns the name of the locally cached zip file + of the source data (replacing spaces with _)""" + file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") + return download_file(cache_dir, url, file_name) + + def get_table_name(file_name: str) -> str: + return file_name.rsplit(".csv")[0] + + def get_table_metadata( + catalog_reference: pd.DataFrame, table_name: str + ) -> dict[str, str]: + """Returns a dict of table metadata for a given table name.""" + rows = catalog_reference.loc[ + catalog_reference.loc[:, "table_name"].eq(table_name) + ] + census_release = rows.loc[:, "census_release"].unique()[0] + description = rows.loc[:, "description"].unique()[0] + population_coverage = rows.loc[:, "population_coverage"].unique()[0] + variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) + catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] + year = int(rows.loc[:, "year"].unique()[0]) + return { + "census_release": census_release, + "description": description, + "population_coverage": population_coverage, + "variables": variables, + "catalog_resolution": catalog_resolution, + "year": str(year), + "human_readable_name": f"{description} ({population_coverage})", + } + + # Download catalog reference + catalog_reference = pd.read_excel( + URL_CATALOG, + sheet_name=None, + header=None, + storage_options={"User-Agent": "Mozilla/5.0"}, + )["Index"].rename( + columns={ + 0: "census_release", + 1: "table_name", + 2: "description", + 3: "population_coverage", + 4: "variable", + 5: 
"catalog_resolution", + 6: "year", + 7: "additional_url", + 8: "population_coverage_and_variable", + } + ) + # Remove all keys + self.remove_all_partition_keys(context) + + records = [] + for data_source in DATA_SOURCES: + resolution = data_source["resolution"] + source = data_source["source"] + url = data_source["url"] + zip_file_name = source_to_zip(source, url) + with zipfile.ZipFile(zip_file_name) as zip_ref: + for file_name in zip_ref.namelist(): + # Get table name + table_name = get_table_name(file_name) + + # Skip bulk output files and missing tables from catalog_reference + if ( + "bulk_output" in file_name.lower() + or catalog_reference.loc[:, "table_name"].ne(table_name).all() + ): + continue + + # Get table metadata + table_metadata = get_table_metadata(catalog_reference, table_name) + + # Get source release metadata if available + source_data_release = SOURCE_DATA_RELEASES.get( + table_metadata["census_release"], None + ) + source_data_release_id = ( + None if source_data_release is None else source_data_release.id + ) + + # Create a record for each census table use same keys as MetricMetadata + # where possible since this makes it simpler to populate derived + # metrics downstream + record = { + "resolution": resolution, + "catalog_resolution": table_metadata["catalog_resolution"], + "source": source, + "url": url, + "file_name": Path(source) / file_name, + "table_name": table_name, + "year": table_metadata["year"], + # Use constructed name of description and coverage + "human_readable_name": table_metadata["human_readable_name"], + "source_metric_id": None, + # Use catalog_reference description + "description": table_metadata["description"], + "hxl_tag": None, + "metric_parquet_file_url": None, + "parquet_column_name": None, + "parquet_margin_of_error_column": None, + "parquet_margin_of_error_file": None, + "potential_denominator_ids": None, + "parent_metric_id": None, + # TODO: check this is not an ID but a name + "source_data_release_id": source_data_release_id, + "census_release": table_metadata["census_release"], + "source_download_url": url, + # TODO: what should this be? + "source_archive_file_path": None, + "source_documentation_url": URL_CATALOG, + } + context.log.debug(record) + records.append(record) + zip_ref.extract(file_name, Path(cache_dir) / source) + + # Create a dynamic partition for the datasets listed in the catalog + catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) + catalog_df["partition_key"] = ( + catalog_df[["year", "resolution", "table_name"]] + .astype(str) + .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) + ) + + # TODO: add filter for prod vs. 
dev mode
+        self.add_partition_keys(context, catalog_df["partition_key"].to_list())
+        context.add_output_metadata(
+            metadata={
+                "num_records": len(catalog_df),
+                "ignored_datasets": "",
+                "columns": MetadataValue.md(
+                    "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()])
+                ),
+                "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()),
+                "preview": MetadataValue.md(catalog_df.to_markdown()),
+            }
+        )
+        return catalog_df
+
+    def _country_metadata(self, _context) -> CountryMetadata:
+        return CountryMetadata(
+            name_short_en="Scotland",
+            name_official="Scotland",
+            iso3="GBR",
+            iso2="GB",
+            iso3166_2="GB-SCT",
+        )
+
+    def _data_publisher(
+        self, _context, country_metadata: CountryMetadata
+    ) -> DataPublisher:
+        return DataPublisher(
+            name="National Records of Scotland",
+            url="https://www.nrscotland.gov.uk/",
+            description="National Records of Scotland (NRS) is a Non-Ministerial Department of "
+            "the Scottish Government. Our purpose is to collect, preserve and "
+            "produce information about Scotland's people and history and make it "
+            "available to inform current and future generations.",
+            countries_of_interest=[country_metadata.id],
+        )
+
+    def _geometry(
+        self, context
+    ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]:
+        """Gets the shapefile for each configured geography level."""
+        geometries_to_return = []
+        for level_details in SCOTLAND_GEO_LEVELS.values():
+            # TODO: get correct values
+            geometry_metadata = GeometryMetadata(
+                validity_period_start=CENSUS_COLLECTION_DATE,
+                validity_period_end=CENSUS_COLLECTION_DATE,
+                level=level_details.level,
+                hxl_tag=level_details.hxl_tag,
+            )
+            file_name = download_file(cache_dir, level_details.url)
+            region_geometries_raw: gpd.GeoDataFrame = gpd.read_file(
+                f"zip://{file_name}"
+            )
+            if level_details.lookup_url is not None:
+                lookup = pd.read_excel(
+                    level_details.lookup_url, sheet_name=level_details.lookup_sheet
+                )
+                region_geometries_raw = region_geometries_raw.merge(
+                    lookup,
+                    left_on=level_details.left_on,
+                    right_on=level_details.right_on,
+                    how="outer",
+                )
+
+            region_geometries_raw = region_geometries_raw.dissolve(
+                by=level_details.geo_id_column
+            ).reset_index()
+
+            context.log.debug(ic(region_geometries_raw.head()))
+            region_geometries = region_geometries_raw.rename(
+                columns={level_details.geo_id_column: "GEO_ID"}
+            ).loc[:, ["geometry", "GEO_ID"]]
+            region_names = (
+                region_geometries_raw.rename(
+                    columns={
+                        level_details.geo_id_column: "GEO_ID",
+                        level_details.name_columns["en"]: "en",
+                    }
+                )
+                .loc[:, ["GEO_ID", "en"]]
+                .drop_duplicates()
+            )
+            geometries_to_return.append(
+                (geometry_metadata, region_geometries, region_names)
+            )
+
+        # Add output metadata
+        first_metadata, first_gdf, first_names = geometries_to_return[0]
+        first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID")
+        ax = first_joined_gdf.plot(column="en", legend=False)
+        ax.set_title(f"Scotland 2011 {first_metadata.level}")
+        md_plot = markdown_from_plot(plt)
+        context.add_output_metadata(
+            metadata={
+                "all_geom_levels": MetadataValue.md(
+                    ",".join(
+                        [metadata.level for metadata, _, _ in geometries_to_return]
+                    )
+                ),
+                "first_geometry_plot": MetadataValue.md(md_plot),
+                "first_names_preview": MetadataValue.md(
+                    first_names.head().to_markdown()
+                ),
+            }
+        )
+
+        return geometries_to_return
+
+    @staticmethod
+    def _get_geo_level_and_source_data_release(
+        geo_level: str, census_release: str
+    ) -> str:
+        return geo_level + "_" + census_release
+
+    def _source_data_releases(
+        self,
+        _context,
+        geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
+        data_publisher: DataPublisher,
+        # TODO: consider version without inputs so only output type specified
+        # **kwargs,
+    ) -> dict[str, SourceDataRelease]:
+        source_data_releases = {}
+        for geo_metadata, _, _ in geometry:
+            for (
+                source_data_release_id,
+                source_data_release,
+            ) in SOURCE_DATA_RELEASES.items():
+                source_data_release_new: SourceDataRelease = SourceDataRelease(
+                    name=source_data_release.name,
+                    date_published=source_data_release.date_published,
+                    reference_period_start=source_data_release.reference_period_start,
+                    reference_period_end=source_data_release.reference_period_end,
+                    collection_period_start=source_data_release.collection_period_start,
+                    collection_period_end=source_data_release.collection_period_end,
+                    expect_next_update=source_data_release.expect_next_update,
+                    url=source_data_release.url,
+                    data_publisher_id=data_publisher.id,
+                    description=source_data_release.description,
+                    geometry_metadata_id=geo_metadata.id,
+                )
+                combined_level_and_release_id = (
+                    self._get_geo_level_and_source_data_release(
+                        geo_metadata.level, source_data_release_id
+                    )
+                )
+                source_data_releases[
+                    combined_level_and_release_id
+                ] = source_data_release_new
+        return source_data_releases
+
+    @staticmethod
+    def get_table(context, table_details) -> pd.DataFrame:
+        table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0])
+        add_metadata(context, table_df, table_details["partition_key"].iloc[0])
+        return table_df
+
+    def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame:
+        """Creates an individual census table as a dataframe."""
+        partition_key = context.asset_partition_key_for_output()
+        context.log.info(partition_key)
+        table_details = catalog.loc[catalog["partition_key"].isin([partition_key])]
+        context.log.info(table_details)
+        return self.get_table(context, table_details)
+
+    # subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"]
+    # subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys)
+    # subset_partition = StaticPartitionsDefinition(subset_partition_keys)
+
+    @staticmethod
+    def census_table_metadata(
+        catalog_row: dict[str, str],
+        source_table: SourceTable,
+        source_data_releases: dict[str, SourceDataRelease],
+    ) -> MetricMetadata:
+        return MetricMetadata(
+            human_readable_name=catalog_row["human_readable_name"],
+            source_download_url=catalog_row["source_download_url"],
+            source_archive_file_path=catalog_row["source_archive_file_path"],
+            source_documentation_url=catalog_row["source_documentation_url"],
+            source_data_release_id=source_data_releases[source_table.geo_level].id,
+            # TODO - this is a placeholder
+            parent_metric_id="unknown_at_this_stage",
+            potential_denominator_ids=None,
+            parquet_margin_of_error_file=None,
+            parquet_margin_of_error_column=None,
+            parquet_column_name=source_table.source_column,
+            # TODO - this is a placeholder
+            metric_parquet_path="unknown_at_this_stage",
+            hxl_tag=source_table.hxltag,
+            description=catalog_row["description"],
+            source_metric_id=source_table.hxltag,
+        )
+
+    def _source_metric_metadata(
+        self,
+        context,
+        catalog: pd.DataFrame,
+        source_data_releases: dict[str, SourceDataRelease],
+    ) -> MetricMetadata:
+        partition_key = context.partition_key
+        catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict(
+            orient="records"
+        )[0]
+
+        geo_level = partition_key.split("/")[1]
+        source_table = SourceTable(
+            # TODO: work out how to set this programmatically
+            hxltag="TBD",
+            geo_level=geo_level,
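+            # Note: partition keys here take the form "2011/OA11/LC1117SC"
+            # (year/geo level/table name), so the geo level bound above is the
+            # middle component; a sketch:
+            #   "2011/OA11/LC1117SC".split("/")[1] == "OA11"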
geo_column=SCOTLAND_GEO_LEVELS[geo_level].geo_id_column, + # TODO: update this + source_column="Count", + ) + + return self.census_table_metadata( + catalog_row, + source_table, + source_data_releases, + ) + + def _derived_metrics( + self, + context, + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, + ) -> tuple[list[MetricMetadata], pd.DataFrame]: + ... + + +# Create assets +scotland = Scotland() +country_metadata = scotland.create_country_metadata() +data_publisher = scotland.create_data_publisher() +geometry = scotland.create_geometry() +source_data_releases = scotland.create_source_data_releases() +catalog = scotland.create_catalog() +census_tables = scotland.create_census_tables() +source_metric_metadata = scotland.create_source_metric_metadata() +derived_metrics = scotland.create_derived_metrics() +metrics = scotland.create_metrics() From 479324ce3785e4779d60ec0c2ef468e9517bc586 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 22:12:42 +0100 Subject: [PATCH 29/60] Fix geometry --- python/popgetter/assets/scotland/__init__.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index a52207d..4c4e954 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -287,7 +287,7 @@ class SourceTable: class Scotland(Country): - key_prefix: str = "uk-scotland" + key_prefix: str = "scotland" geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) tables_to_process: list[str] | None = TABLES_TO_PROCESS @@ -377,6 +377,13 @@ def get_table_metadata( None if source_data_release is None else source_data_release.id ) + # Skip if not required + if ( + self.tables_to_process is not None + and table_name not in self.tables_to_process + ): + continue + # Create a record for each census table use same keys as MetricMetadata # where possible since this makes it simpler to populate derived # metrics downstream @@ -474,6 +481,7 @@ def _geometry( region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( f"zip://{file_name}" ) + ic(region_geometries_raw.head()) if level_details.lookup_url is not None: lookup = pd.read_excel( level_details.lookup_url, sheet_name=level_details.lookup_sheet @@ -493,11 +501,16 @@ def _geometry( region_geometries = region_geometries_raw.rename( columns={level_details.geo_id_column: "GEO_ID"} ).loc[:, ["geometry", "GEO_ID"]] + + # Note: Make copy of IDs as names for now + region_geometries_raw["GEO_ID_2"] = region_geometries_raw[ + level_details.geo_id_column + ].copy() region_names = ( region_geometries_raw.rename( columns={ level_details.geo_id_column: "GEO_ID", - level_details.name_columns["en"]: "en", + "GEO_ID_2": "en", } ) .loc[:, ["GEO_ID", "en"]] From 582f57887769536c001672ad93c121825adc5d8d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 13:28:02 +0100 Subject: [PATCH 30/60] Revise geographies with overload providing lookups --- python/popgetter/assets/scotland/__init__.py | 391 +++++++++++++++---- 1 file changed, 306 insertions(+), 85 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 4c4e954..74eca9c 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -5,6 +5,7 @@ from collections.abc import Callable from dataclasses import dataclass from datetime import date +from functools import reduce from 
pathlib import Path from typing import ClassVar @@ -15,16 +16,19 @@ import zipfile_deflate64 as zipfile from dagster import ( MetadataValue, + asset, ) from icecream import ic from popgetter.assets.country import Country +from popgetter.cloud_outputs import send_to_geometry_sensor from popgetter.metadata import ( CountryMetadata, DataPublisher, GeometryMetadata, MetricMetadata, SourceDataRelease, + metadata_to_dataframe, ) from popgetter.utils import add_metadata, markdown_from_plot @@ -150,17 +154,20 @@ def download_file( DATA_SOURCES = [ { "source": "Council Area blk", - "resolution": "LAD", + # "resolution": "LAD", + "resolution": "CouncilArea2011", "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", }, { "source": "SNS Data Zone 2011 blk", - "resolution": "LSOA11", + # "resolution": "LSOA11", + "resolution": "DataZone2011", "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", }, { "source": "Output Area blk", - "resolution": "OA11", + # "resolution": "OA11", + "resolution": "OutputArea2011", "url": URL2 + urlparse.quote("Output Area blk") + ".zip", }, ] @@ -181,20 +188,62 @@ class ScotlandGeometryLevel: SCOTLAND_GEO_LEVELS = { - "OA11": ScotlandGeometryLevel( - level="OA11", + "OutputArea2011": ScotlandGeometryLevel( + level="OutputArea2011", hxl_tag="TBD", geo_id_column="OA_CODE", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "OA_CODE"}, # TODO + name_columns={"en": "OutputArea2011Name"}, # TODO # url=URL_SHAPEFILE, url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", lookup_url=None, lookup_sheet=None, - left_on=None, - right_on=None, - ) + left_on="OA_CODE", + right_on="OutputArea2011Code", + ), + # LSOA11 + "DataZone2011": ScotlandGeometryLevel( + level="DataZone2011", + hxl_tag="TBD", + geo_id_column="DataZone", + census_table_column="TODO", + # census_table_column="Census 2021 Data Zone Code", + name_columns={"en": "Name"}, + url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", + lookup_url=None, + lookup_sheet=None, + left_on="DataZone", + right_on="DataZone2011Code", + ), + # "MSOA11": ScotlandGeometryLevel( + # level="OA11", + # hxl_tag="TBD", + # geo_id_column="OA_CODE", + # census_table_column="TODO", + # # census_table_column="Census 2021 Data Zone Code", + # name_columns={"en": "OA_CODE"}, + # # url=URL_SHAPEFILE, + # url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", + # lookup_url=None, + # lookup_sheet=None, + # left_on=None, + # right_on=None, + # ), + # LAD + "CouncilArea2011": ScotlandGeometryLevel( + level="CouncilArea2011", + hxl_tag="TBD", + geo_id_column="CouncilArea2011Code", + census_table_column="TODO", + # census_table_column="Census 2021 Data Zone Code", + name_columns={"en": "CouncilArea2011Name"}, + url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", + lookup_url=None, + lookup_sheet=None, + left_on="DataZone", + right_on="DataZone2011Code", + ), } @@ -278,7 +327,7 @@ class SourceTable: "LC1117SC", ] -PARTITIONS_TO_PUBLISH: list[str] = ["2011/OA11/LC1117SC"] +PARTITIONS_TO_PUBLISH: list[str] = ["2011/OutputArea2011/LC1117SC"] DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = { @@ -286,6 +335,10 @@ class SourceTable: } +def get_source_data_release(geo_level: str, cenesus_release: str) -> str: + return geo_level + "_" + cenesus_release + + class Scotland(Country): key_prefix: str = "scotland" geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) @@ -464,89 +517,138 @@ def 
_data_publisher( countries_of_interest=[country_metdata.id], ) - def _geometry( - self, context - ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: - """Gets the shape file for OA11 resolution.""" - geometries_to_return = [] - for level_details in SCOTLAND_GEO_LEVELS.values(): - # TODO: get correct values - geometry_metadata = GeometryMetadata( - validity_period_start=CENSUS_COLLECTION_DATE, - validity_period_end=CENSUS_COLLECTION_DATE, - level=level_details.level, - hxl_tag=level_details.hxl_tag, + def create_lookup(self): + @asset(key_prefix=self.key_prefix) + def lookup(context): + url = "https://www.nrscotland.gov.uk/files/geography/2011-census/geog-2011-cen-supp-info-oldoa-newoa-lookup.xls" + df_oa_to_council = ( + pd.read_excel(url, sheet_name="2011OA_Lookup", storage_options=HEADERS) + .iloc[:-2] + .loc[:, ["OutputArea2011Code", "CouncilArea2011Code"]] + ) + url = "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" + df_oa_to_dz_iz = pd.read_excel( + url, sheet_name="OA_DZ_IZ_2011 Lookup", storage_options=HEADERS + ) + df_dz_nm = pd.read_excel( + url, sheet_name="DataZone2011Lookup", storage_options=HEADERS + ) + df_iz_nm = pd.read_excel( + url, sheet_name="IntermediateZone2011Lookup", storage_options=HEADERS + ) + combined = ( + df_oa_to_council.merge(df_oa_to_dz_iz, on=["OutputArea2011Code"]) + .merge(df_dz_nm, on=["DataZone2011Code"]) + .merge(df_iz_nm, on=["IntermediateZone2011Code"]) ) - file_name = download_file(cache_dir, level_details.url) - region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( - f"zip://{file_name}" + combined["OutputArea2011Name"] = combined["OutputArea2011Code"].copy() + df_council_name = pd.read_excel( + "https://www.nrscotland.gov.uk/files//geography/2011-census/oa2011-to-hba2014.xls", + sheet_name="HealthBoard2014_Council2011", + storage_options=HEADERS, ) - ic(region_geometries_raw.head()) - if level_details.lookup_url is not None: - lookup = pd.read_excel( - level_details.lookup_url, sheet_name=level_details.lookup_sheet + combined = combined.merge( + df_council_name[["CouncilArea2011Code", "NRSCouncilAreaName"]], + on="CouncilArea2011Code", + ).rename(columns={"NRSCouncilAreaName": "CouncilArea2011Name"}) + context.add_output_metadata( + metadata={ + "lookup_shape": f"{combined.shape[0]} rows x {combined.shape[1]} columns", + "lookup_preview": MetadataValue.md(combined.head().to_markdown()), + }, + ) + return combined + + return lookup + + def create_geometry(self): + """ + Creates an asset providing a list of geometries, metadata and names + at different resolutions. 
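+
+        Each returned element pairs a GeometryMetadata record with a
+        GeoDataFrame of dissolved boundaries (columns "GEO_ID" and
+        "geometry") and a DataFrame of region names keyed on "GEO_ID".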
+ """ + + @send_to_geometry_sensor + @asset(key_prefix=self.key_prefix) + def geometry( + context, lookup: pd.DataFrame + ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + """List of geometries, metadata and names at different resolutions.""" + geometries_to_return = [] + for level_details in SCOTLAND_GEO_LEVELS.values(): + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=CENSUS_COLLECTION_DATE, + validity_period_end=CENSUS_COLLECTION_DATE, + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + file_name = download_file(cache_dir, level_details.url) + region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( + f"zip://{file_name}" ) - region_geometries_raw = region_geometries_raw.merge( + context.log.debug(ic(region_geometries_raw.head())) + context.log.debug(ic(region_geometries_raw.columns)) + context.log.debug(ic(lookup.columns)) + region_geometries_merge = region_geometries_raw.merge( lookup, left_on=level_details.left_on, right_on=level_details.right_on, - how="outer", ) - region_geometries_raw = region_geometries_raw.dissolve( - by=level_details.geo_id_column - ).reset_index() - - context.log.debug(ic(region_geometries_raw.head())) - region_geometries = region_geometries_raw.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] - - # Note: Make copy of IDs as names for now - region_geometries_raw["GEO_ID_2"] = region_geometries_raw[ - level_details.geo_id_column - ].copy() - region_names = ( - region_geometries_raw.rename( - columns={ - level_details.geo_id_column: "GEO_ID", - "GEO_ID_2": "en", - } + region_geometries_merge = region_geometries_merge.dissolve( + by=level_details.geo_id_column + ).reset_index() + + context.log.debug(ic(region_geometries_merge.head())) + context.log.debug(ic(region_geometries_merge.columns)) + region_geometries = region_geometries_merge.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + + region_names = ( + region_geometries_merge.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + } + | { + value: key + for key, value in level_details.name_columns.items() + } + ) + .loc[:, ["GEO_ID", *list(level_details.name_columns.keys())]] + .drop_duplicates() ) - .loc[:, ["GEO_ID", "en"]] - .drop_duplicates() - ) - geometries_to_return.append( - (geometry_metadata, region_geometries, region_names) + geometries_to_return.append( + (geometry_metadata, region_geometries, region_names) + ) + + # Add output metadata + first_metadata, first_gdf, first_names = geometries_to_return[0] + first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + ax = first_joined_gdf.plot(column="en", legend=False) + ax.set_title(f"Scotland 2011 {first_metadata.level}") + md_plot = markdown_from_plot(plt) + context.add_output_metadata( + metadata={ + "all_geom_levels": MetadataValue.md( + ",".join( + [metadata.level for metadata, _, _ in geometries_to_return] + ) + ), + "first_geometry_plot": MetadataValue.md(md_plot), + "first_names_preview": MetadataValue.md( + first_names.head().to_markdown() + ), + } ) - # Add output metadata - first_metadata, first_gdf, first_names = geometries_to_return[0] - first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") - ax = first_joined_gdf.plot(column="en", legend=False) - ax.set_title(f"Scotland 2011 {first_metadata.level}") - md_plot = markdown_from_plot(plt) - context.add_output_metadata( - metadata={ - "all_geom_levels": MetadataValue.md( - ",".join( - [metadata.level for 
metadata, _, _ in geometries_to_return] - ) - ), - "first_geometry_plot": MetadataValue.md(md_plot), - "first_names_preview": MetadataValue.md( - first_names.head().to_markdown() - ), - } - ) + return geometries_to_return - return geometries_to_return + return geometry - @staticmethod - def _get_geo_level_and_source_data_release( - geo_level: str, cenesus_release: str - ) -> str: - return geo_level + "_" + cenesus_release + def _geometry(self, context): + # Not required as geometry overridden + pass def _source_data_releases( self, @@ -575,10 +677,8 @@ def _source_data_releases( description=source_data_release.description, geometry_metadata_id=geo_metadata.id, ) - combined_level_and_release_id = ( - self._get_geo_level_and_source_data_release( - geo_metadata.level, source_data_release_id - ) + combined_level_and_release_id = get_source_data_release( + geo_metadata.level, source_data_release_id ) source_data_releases[ combined_level_and_release_id @@ -614,7 +714,11 @@ def census_table_metadata( source_download_url=catalog_row["source_download_url"], source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=source_data_releases[source_table.geo_level].id, + source_data_release_id=source_data_releases[ + get_source_data_release( + source_table.geo_level, catalog_row["census_release"] + ) + ].id, # TODO - this is a placeholder parent_metric_id="unknown_at_this_stage", potential_denominator_ids=None, @@ -662,12 +766,129 @@ def _derived_metrics( source_metric_metadata: MetricMetadata, ) -> tuple[list[MetricMetadata], pd.DataFrame]: ... + SEP = "__" + partition_key = context.partition_key + source_mmd = source_metric_metadata + parquet_file_name = ( + "".join(c for c in partition_key if c.isalnum()) + ".parquet" + ) + derived_metrics, derived_mmd = [], [] + + # If derived metrics + # try: + # metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] + # source_column = source_mmd.parquet_column_name + # for metric_spec in metric_specs: + # new_table = ( + # census_tables.pipe(metric_spec.filter_func) + # .groupby(by="GEO_ID", as_index=True) + # .sum() + # .rename(columns={source_column: metric_spec.output_column_name}) + # .filter(items=["GEO_ID", metric_spec.output_column_name]) + # ) + # derived_metrics.append(new_table) + # new_mmd = source_mmd.copy() + # new_mmd.parent_metric_id = source_mmd.source_metric_id + # new_mmd.metric_parquet_path = parquet_file_name + # new_mmd.hxl_tag = metric_spec.hxltag + # new_mmd.parquet_column_name = metric_spec.output_column_name + # new_mmd.human_readable_name = metric_spec.human_readable_name + # derived_mmd.append(new_mmd) + # except KeyError: + # # No extra derived metrics specified for this partition -- only use + # # those from pivoted data + # pass + + # Batch + def make_pivot(df: pd.DataFrame) -> pd.DataFrame: + # TODO: reshape based on Unnamed: 1 to Unnamed N + pivot_cols = [ + col + for col in census_tables.columns + if col != "Unnamed: 0" and col.startswith("Unnamed: ") + ] + pivot = df.pivot_table( + index="Unnamed: 0", columns=pivot_cols, aggfunc="sum" + ) + + # FLattent multi-index + if isinstance(pivot.columns, pd.MultiIndex): + pivot.columns = [ + SEP.join(list(map(str, col))).strip() + for col in pivot.columns.to_numpy() + ] + # Ensure columns are string + else: + pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] + + pivot.index = pivot.index.rename("GEO_ID") + + return pivot + + new_table = make_pivot(census_tables) + 
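+            # The pivoted column labels are compound strings joined with SEP
+            # (e.g. "Females__16 to 17" for a sex-by-age table, assuming that
+            # table layout); out_cols below recovers one variable name per
+            # component by splitting the table description on " by ",
+            # reversed to match the order of the compound labels.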
out_cols = [ + "".join(x for x in col.title() if not x.isspace()) + for col in source_mmd.description.split(" by ")[::-1] + ] + + for metric_col in new_table.columns: + metric_df = new_table.loc[:, metric_col].to_frame() + ic(metric_df) + derived_metrics.append(metric_df) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + + # TODO: fix automating the hxltag + key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + def gen_hxltag(kv: dict[str, str]) -> str: + out = ["#population"] + for key, value in kv.items(): + out += [ + "".join(c for c in key if c.isalnum()) + + "_" + + "".join(c for c in value if c.isalnum()) + ] + return "+".join(out) + + new_mmd.hxl_tag = gen_hxltag(key_val) + new_mmd.parquet_column_name = metric_col + # TODO: Update after fixing hxltag + new_mmd.human_readable_name = "; ".join( + [ + f"Variable: '{key}'; Value: '{value}'" + for key, value in key_val.items() + ] + ) + derived_mmd.append(new_mmd) + + joined_metrics = reduce( + lambda left, right: left.merge( + right, on="GEO_ID", how="inner", validate="one_to_one" + ), + derived_metrics, + ) + + context.add_output_metadata( + metadata={ + "metadata_preview": MetadataValue.md( + metadata_to_dataframe(derived_mmd).head().to_markdown() + ), + "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + "metrics_preview": MetadataValue.md( + joined_metrics.head().to_markdown() + ), + }, + ) + return derived_mmd, joined_metrics # Create assets scotland = Scotland() country_metadata = scotland.create_country_metadata() data_publisher = scotland.create_data_publisher() +lookup = scotland.create_lookup() geometry = scotland.create_geometry() source_data_releases = scotland.create_source_data_releases() catalog = scotland.create_catalog() From 3a3ba280c31ab819d8eef59dd08b19bc42669870 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 14:41:22 +0100 Subject: [PATCH 31/60] Add dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index e575447..9608a27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ dependencies = [ "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages "openpyxl >=3.1.3", # For reading Excel files + "xlrd >=2.0.1", # For reading Excel files ] From 2281b6f9df9336f52abfd4b201cfd84483f79cfb Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 14:41:46 +0100 Subject: [PATCH 32/60] Fix arg, add todo --- python/popgetter/assets/ni/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 19d6d3b..d13de82 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -561,7 +561,7 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: # Ensure columns are string else: pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] - out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] + out_cols = [col.replace(end, "").strip() for col in pivot_cols] return out_cols, pivot # Pivot for codes and labels @@ -575,6 +575,7 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: new_mmd = source_mmd.copy() new_mmd.parent_metric_id = source_mmd.source_metric_id new_mmd.metric_parquet_path = parquet_file_name + # 
TODO: check this
             key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True))
 
             def gen_hxltag(kv: dict[str, str]) -> str:

From e518f207200c31b0a82c1876605a3edd80876e9d Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Sat, 22 Jun 2024 15:59:04 +0100
Subject: [PATCH 33/60] Update derived metrics

---
 python/popgetter/assets/scotland/__init__.py | 105 ++++++++++++-------
 1 file changed, 69 insertions(+), 36 deletions(-)

diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py
index 74eca9c..4731032 100755
--- a/python/popgetter/assets/scotland/__init__.py
+++ b/python/popgetter/assets/scotland/__init__.py
@@ -268,50 +268,57 @@ class SourceTable:
 
 
 # Config for each partition to be derived
-age_code = "`Age Code`"
+age_code = "`Age Category`"
 sex_label = "`Sex Label`"
+infants = ["0 to 4"]
+children_5_to_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"]
+children = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"]
+adults = ["18 to 19"] + [f"{i} to {i+4}" for i in range(20, 91, 5)] + ["95 and over"]
+people = ["All people"]
 DERIVED_COLUMNS = [
     DerivedColumn(
         hxltag="#population+children+age5_17",
-        filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"),
+        filter_func=lambda df: df.query(f"{age_code} in @children_5_to_17"),
         output_column_name="children_5_17",
         human_readable_name="Children aged 5 to 17",
     ),
     DerivedColumn(
         hxltag="#population+infants+age0_4",
-        filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"),
+        filter_func=lambda df: df.query(f"{age_code} in @infants"),
         output_column_name="infants_0_4",
         human_readable_name="Infants aged 0 to 4",
     ),
     DerivedColumn(
         hxltag="#population+children+age0_17",
-        filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"),
+        filter_func=lambda df: df.query(f"{age_code} in @children"),
         output_column_name="children_0_17",
         human_readable_name="Children aged 0 to 17",
     ),
     DerivedColumn(
         hxltag="#population+adults+f",
         filter_func=lambda df: df.query(
-            f"{age_code} >= 18 and {sex_label} == 'Female'"
+            f"{age_code} in @adults and {sex_label} == 'Female'"
         ),
         output_column_name="adults_f",
         human_readable_name="Female adults",
     ),
     DerivedColumn(
         hxltag="#population+adults+m",
-        filter_func=lambda df: df.query(f"{age_code} >= 18 and {sex_label} == 'Male'"),
+        filter_func=lambda df: df.query(
+            f"{age_code} in @adults and {sex_label} == 'Male'"
+        ),
         output_column_name="adults_m",
         human_readable_name="Male adults",
     ),
     DerivedColumn(
         hxltag="#population+adults",
-        filter_func=lambda df: df.query(f"{age_code} >= 18"),
+        filter_func=lambda df: df.query(f"{age_code} in @adults"),
        output_column_name="adults",
         human_readable_name="Adults",
     ),
     DerivedColumn(
         hxltag="#population+ind",
-        filter_func=lambda df: df,
+        filter_func=lambda df: df.query(f"{age_code} in @people"),
         output_column_name="individuals",
         human_readable_name="Total individuals",
     ),
@@ -775,36 +782,50 @@ def _derived_metrics(
         derived_metrics, derived_mmd = [], []
 
         # If derived metrics
-        # try:
-        #     metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key]
-        #     source_column = source_mmd.parquet_column_name
-        #     for metric_spec in metric_specs:
-        #         new_table = (
-        #             census_tables.pipe(metric_spec.filter_func)
-        #             .groupby(by="GEO_ID", as_index=True)
-        #             .sum()
-        #             .rename(columns={source_column: metric_spec.output_column_name})
-        #             .filter(items=["GEO_ID", metric_spec.output_column_name])
-        #         )
-        #         derived_metrics.append(new_table)
-        #         new_mmd = source_mmd.copy()
-        #         new_mmd.parent_metric_id = 
source_mmd.source_metric_id - # new_mmd.metric_parquet_path = parquet_file_name - # new_mmd.hxl_tag = metric_spec.hxltag - # new_mmd.parquet_column_name = metric_spec.output_column_name - # new_mmd.human_readable_name = metric_spec.human_readable_name - # derived_mmd.append(new_mmd) - # except KeyError: - # # No extra derived metrics specified for this partition -- only use - # # those from pivoted data - # pass + try: + metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] + + def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: + df_to_reshape = df_to_reshape.rename( + columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} + ).drop(columns=["All people"]) + df_to_reshape = df_to_reshape.melt( + ["GEO_ID", "Age Category"], var_name="Sex Label", value_name="Count" + ) + df_to_reshape["Sex Label"] = df_to_reshape["Sex Label"].map( + {"Males": "Male", "Females": "Female"} + ) + return df_to_reshape + + census_tables_for_derived_metrics = reshape(census_tables) + source_column = source_mmd.parquet_column_name + for metric_spec in metric_specs: + new_table = ( + census_tables_for_derived_metrics.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + except KeyError: + # No extra derived metrics specified for this partition -- only use + # those from pivoted data + pass # Batch def make_pivot(df: pd.DataFrame) -> pd.DataFrame: # TODO: reshape based on Unnamed: 1 to Unnamed N pivot_cols = [ col - for col in census_tables.columns + for col in df.columns if col != "Unnamed: 0" and col.startswith("Unnamed: ") ] pivot = df.pivot_table( @@ -826,10 +847,22 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: return pivot new_table = make_pivot(census_tables) - out_cols = [ - "".join(x for x in col.title() if not x.isspace()) - for col in source_mmd.description.split(" by ")[::-1] - ] + + # Split for description of metrics + exceptions = { + "Age by single year": ["Age by single year"], + "National Statistics Socio-economic Classification (NS-SeC) by ethnic group by sex by age": [ + "Ethnic group", + "Sex and Age", + "National Statistics Socio-economic Classification (NS-SeC)", + ], + } + if source_mmd.description not in exceptions: + split = source_mmd.description.split(" by ")[::-1] + else: + split = exceptions[source_mmd.description] + out_cols = ["".join(x for x in col.title() if not x.isspace()) for col in split] + context.log.debug(ic(out_cols)) for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() From 24a4dd5ec2dd48b1979ae3a08a030c5e155ff07e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 17:38:15 +0100 Subject: [PATCH 34/60] Fix non-integer cases --- python/popgetter/assets/scotland/__init__.py | 24 ++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 4731032..506bca8 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ 
-863,7 +863,8 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: split = exceptions[source_mmd.description] out_cols = ["".join(x for x in col.title() if not x.isspace()) for col in split] context.log.debug(ic(out_cols)) - + ic("----") + ic(new_table.columns) for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() ic(metric_df) @@ -903,6 +904,24 @@ def gen_hxltag(kv: dict[str, str]) -> str: derived_metrics, ) + def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: + for col in maybe_non_int_df: + if maybe_non_int_df[col].dtype == "object": + maybe_non_int_df[col] = ( + maybe_non_int_df[col] + .str.replace(",", "") + .str.replace("-", "0") + .fillna("0") + .astype(int) + ) + return maybe_non_int_df + + # Fix format + joined_metrics = make_int(joined_metrics) + + # Filter out whole country Scotland + joined_metrics = joined_metrics.loc[~joined_metrics.index.isin(["S92000003"])] + context.add_output_metadata( metadata={ "metadata_preview": MetadataValue.md( @@ -928,4 +947,5 @@ def gen_hxltag(kv: dict[str, str]) -> str: census_tables = scotland.create_census_tables() source_metric_metadata = scotland.create_source_metric_metadata() derived_metrics = scotland.create_derived_metrics() -metrics = scotland.create_metrics() +# Publish all partitions +metrics = scotland.create_metrics(None) From 86ba91b6dfe591f1c7e7db710fc89b278f11ee62 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 17:45:58 +0100 Subject: [PATCH 35/60] Remove obsolete modules --- .../assets/scotland/census_derived.py | 328 ------------------ .../assets/scotland/census_geometry.py | 17 - .../assets/scotland/census_tables.py | 291 ---------------- 3 files changed, 636 deletions(-) delete mode 100644 python/popgetter/assets/scotland/census_derived.py delete mode 100644 python/popgetter/assets/scotland/census_geometry.py delete mode 100644 python/popgetter/assets/scotland/census_tables.py diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py deleted file mode 100644 index 43277a0..0000000 --- a/python/popgetter/assets/scotland/census_derived.py +++ /dev/null @@ -1,328 +0,0 @@ -# from __future__ import annotations - -# import geopandas as gpd -# import numpy as np -# import pandas as pd -# from dagster import ( -# AssetIn, -# AssetOut, -# MaterializeResult, -# MetadataValue, -# SpecificPartitionsPartitionMapping, -# StaticPartitionsDefinition, -# asset, -# multi_asset, -# ) -# from icecream import ic -# from matplotlib import pyplot as plt - -# from popgetter.utils import markdown_from_plot - -# from ...metadata import MetricMetadata -# from .census_tables import add_metadata, dataset_node_partition - - -# def get_lc1117sc_metric( -# lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] -# ) -> pd.DataFrame: -# lc1117sc_transformed = lc1117sc.rename( -# columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} -# ) -# lc1117sc_transformed = lc1117sc_transformed.loc[ -# ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : -# ] -# return ( -# lc1117sc_transformed.loc[ -# lc1117sc_transformed["Age Category"].isin(subset), -# ["OA11CD", col], -# ] -# .groupby("OA11CD") -# .agg("sum") -# .rename(columns={col: output_col}) -# ) - - -# ALL_PEOPLE = ["All people"] -# INFANTS_AGE_0_TO_4 = ["0 to 4"] -# CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -# CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -# ADULTS = [ -# "18 to 19", -# 
"20 to 24", -# "25 to 29", -# "30 to 34", -# "35 to 39", -# "40 to 44", -# "45 to 49", -# "50 to 54", -# "55 to 59", -# "60 to 64", -# "65 to 69", -# "70 to 74", -# "75 to 79", -# "80 to 84", -# "85 to 89", -# "90 to 94", -# "95 and over", -# ] - -# needed_dataset_list = [ -# { -# # Population by OA11, Period: 2011 -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "#population+oa11+2011", -# # TODO: this partition key does not have a single column for source -# "source_column": "", -# } -# ] -# needed_dataset_partions_keys: list[str] = [ -# r["partition_key"] for r in needed_dataset_list -# ] -# needed_dataset_mapping = SpecificPartitionsPartitionMapping( -# needed_dataset_partions_keys -# ) -# needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) - -# # Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) -# # TODO: add human readable names for each column as the MetricMetadata currently receives the -# # catalog row (table) human readable name. -# _derived_columns: list[dict] = [ -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_children_age5_17", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, CHILDREN_AGE_5_TO_17 -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_infants_age0_4", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, INFANTS_AGE_0_TO_4 -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_children_age0_17", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, CHILDREN_AGE_0_TO_17 -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_adults_f", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "Females", output_col, ADULTS -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_adults_m", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "Males", output_col, ADULTS -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_adults", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, ADULTS -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_ind", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, ALL_PEOPLE -# ), -# }, -# ] - -# derived_columns = pd.DataFrame( -# _derived_columns, columns=["partition_key", "hxltag", "filter_func"] -# ) - - -# # record = { -# # "resolution": resolution, -# # "catalog_resolution": table_metadata["catalog_resolution"], -# # "source": source, -# # "url": url, -# # "file_name": Path(source) / file_name, -# # "table_name": table_name, -# # "year": table_metadata["year"], -# # # Use constructed name of description and coverage -# # "human_readable_name": table_metadata["human_readable_name"], -# # "source_metric_id": None, -# # # Use catalog_metadata description -# # "description": table_metadata["description"], -# # "hxl_tag": None, -# # "metric_parquet_file_url": None, -# # "parquet_column_name": None, -# # "parquet_margin_of_error_column": None, -# # "parquet_margin_of_error_file": None, -# # "potential_denominator_ids": None, -# # "parent_metric_id": None, -# # # TODO: check this is not an ID but a name -# # "source_data_release_id": table_metadata["census_release"], -# # "source_download_url": url, -# # # 
TODO: what should this be? -# # "source_archive_file_path": None, -# # "source_documentation_url": URL_CATALOG_METADATA, -# # } - - -# def census_table_metadata(catalog_row: dict) -> MetricMetadata: -# return MetricMetadata( -# human_readable_name=catalog_row["human_readable_name"], -# source_download_url=catalog_row["source_download_url"], -# source_archive_file_path=catalog_row["source_archive_file_path"], -# source_documentation_url=catalog_row["source_documentation_url"], -# source_data_release_id=catalog_row["source_data_release_id"], -# # TODO - this is a placeholder -# parent_metric_id="unknown_at_this_stage", -# potential_denominator_ids=None, -# parquet_margin_of_error_file=None, -# parquet_margin_of_error_column=None, -# # TODO: currently setting to rename the derived column name equal to 'hxltag' -# # and not related to the source_column -# # parquet_column_name=catalog_row["source_column"], -# parquet_column_name=catalog_row["hxltag"], -# # TODO - this is a placeholder -# metric_parquet_file_url="unknown_at_this_stage", -# hxl_tag=catalog_row["hxltag"], -# description=catalog_row["description"], -# source_metric_id=catalog_row["hxltag"], -# ) - - -# @asset( -# ins={ -# "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), -# }, -# ) -# def filter_needed_catalog( -# context, needed_datasets, catalog_as_dataframe: pd.DataFrame -# ) -> pd.DataFrame: -# needed_df = needed_datasets.merge( -# catalog_as_dataframe, how="inner", on="partition_key" -# ) -# add_metadata(context, needed_df, "needed_df") -# return needed_df - - -# @asset -# def needed_datasets(context) -> pd.DataFrame: -# needed_df = pd.DataFrame( -# needed_dataset_list, -# columns=["partition_key", "hxltag", "source_column", "derived_columns"], -# dtype="string", -# ) -# add_metadata(context, needed_df, "needed_datasets") -# return needed_df - - -# @multi_asset( -# ins={ -# "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), -# "filter_needed_catalog": AssetIn(), -# }, -# outs={ -# "source_table": AssetOut(), -# "source_mmd": AssetOut(), -# }, -# partitions_def=dataset_node_partition, -# ) -# def get_enriched_tables_scotland( -# context, individual_census_table, filter_needed_catalog -# ) -> tuple[pd.DataFrame, MetricMetadata]: -# partition_keys = context.asset_partition_keys_for_input( -# input_name="individual_census_table", -# ) -# output_partition = context.asset_partition_key_for_output("source_table") -# ic(partition_keys) -# ic(len(partition_keys)) -# ic(output_partition) -# ic(type(output_partition)) -# ic(individual_census_table) -# if output_partition not in partition_keys: -# err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" -# raise ValueError(err_msg) - -# result_df = individual_census_table -# catalog_row = filter_needed_catalog[ -# filter_needed_catalog["partition_key"].eq(output_partition) -# ] -# catalog_row = catalog_row.to_dict(orient="index") -# catalog_row = catalog_row.popitem()[1] -# ic(catalog_row) -# result_mmd = census_table_metadata(catalog_row) -# ic(result_mmd) -# return result_df, result_mmd - - -# @multi_asset( -# partitions_def=dataset_node_partition, -# ins={ -# "source_table": AssetIn(partition_mapping=needed_dataset_mapping), -# "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), -# }, -# outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, -# ) -# def transform_data( -# context, -# source_table: pd.DataFrame, -# source_mmd: MetricMetadata, -# ) -> 
tuple[pd.DataFrame, list[MetricMetadata]]: -# partition_key = context.asset_partition_key_for_output("derived_table") -# census_table = source_table.copy() -# parent_mmd = source_mmd.copy() -# # source_column = parent_mmd.parquet_column_name -# metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] -# new_series: list[pd.Series] = [] -# new_mmds: list[MetricMetadata] = [] -# for _, _, col_name, filter in metrics.itertuples(): -# # Create column -# column: pd.Series = filter(census_table, col_name) -# ic(f"col_name: {col_name}") -# new_series.append(column) - -# # Construct metadata -# new_mmd = parent_mmd.copy() -# new_mmd.parent_metric_id = parent_mmd.source_metric_id -# new_mmd.hxl_tag = col_name -# new_mmds.append(new_mmd) - -# # Merge series -# new_table: pd.DataFrame = pd.concat(new_series, axis=1) -# add_metadata( -# context, -# df=new_table, -# title=f"Derived table ({partition_key})", -# output_name="derived_table", -# ) -# return new_table, new_mmds - - -# @multi_asset( -# ins={ -# "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), -# "geometry": AssetIn(partition_mapping=needed_dataset_mapping), -# }, -# outs={ -# "plot": AssetOut(), -# }, -# partitions_def=dataset_node_partition, -# ) -# def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): -# """Plots map with log density of people.""" -# merged = geometry.merge( -# derived_table[["population_ind"]], -# left_on="geo_code", -# right_index=True, -# how="left", -# ) -# merged["log10 people"] = np.log10(merged["population_ind"]) -# merged.plot(column="log10 people", legend=True) -# md_content = markdown_from_plot(plt) -# return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py deleted file mode 100644 index 28e5afa..0000000 --- a/python/popgetter/assets/scotland/census_geometry.py +++ /dev/null @@ -1,17 +0,0 @@ -# from __future__ import annotations - -# import geopandas as gpd -# from dagster import asset - -# from popgetter.assets.scotland import download_file - -# from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir - - -# # @asset -# # def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: -# # """Gets the shape file for OA11 resolution.""" -# # file_name = download_file(cache_dir, URL_SHAPEFILE) -# # geo = gpd.read_file(f"zip://{file_name}") -# # add_metadata(context, geo, "Geometry file") -# # return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py deleted file mode 100644 index 5efda94..0000000 --- a/python/popgetter/assets/scotland/census_tables.py +++ /dev/null @@ -1,291 +0,0 @@ -# from __future__ import annotations - -# import urllib.parse as urlparse -# from pathlib import Path - -# import geopandas as gpd -# import pandas as pd -# import zipfile_deflate64 as zipfile -# from dagster import ( -# AssetOut, -# DynamicPartitionsDefinition, -# MetadataValue, -# SpecificPartitionsPartitionMapping, -# StaticPartitionsDefinition, -# asset, -# multi_asset, -# ) - -# from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources - -# """ -# Notes: -# - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial -# publication -# - Reusing some bits of code from UKCensusAPI: -# 
https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py -# """ - - -# PARTITIONS_DEF_NAME = "dataset_tables" -# dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) - -# # cache_dir = tempfile.mkdtemp() -# cache_dir = "./cache" - -# URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" -# URL1 = "https://www.scotlandscensus.gov.uk/" -# URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" -# URL_LOOKUP = ( -# "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" -# ) -# URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -# URL_CATALOG = ( -# "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" -# ) - -# data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] -# GeoCodeLookup = { -# "LAD": 0, # "Council Area blk" -# # MSOA (intermediate zone)? -# "LSOA11": 1, # "SNS Data Zone 2011 blk" -# "OA11": 2, # "Output Area blk" -# } - -# DATA_SOURCES = [ -# { -# "source": "Council Area blk", -# "resolution": "LAD", -# "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", -# }, -# { -# "source": "SNS Data Zone 2011 blk", -# "resolution": "LSOA11", -# "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", -# }, -# { -# "source": "Output Area blk", -# "resolution": "OA11", -# "url": URL2 + urlparse.quote("Output Area blk") + ".zip", -# }, -# ] - - -# # NB. Make sure no spaces in asset keys -# @multi_asset( -# outs={ -# "oa_dz_iz_2011_lookup": AssetOut(), -# "data_zone_2011_lookup": AssetOut(), -# "intermediate_zone_2011_lookup": AssetOut(), -# }, -# ) -# def lookups(): -# """Creates lookup dataframes.""" -# Path(cache_dir).mkdir(parents=True, exist_ok=True) -# lookup_path = download_file(cache_dir, URL_LOOKUP) -# df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") -# df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") -# df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") -# return df1, df2, df3 - - -# def source_to_zip(source_name: str, url: str) -> Path: -# """Downloads if necessary and returns the name of the locally cached zip file -# of the source data (replacing spaces with _)""" -# file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") -# return download_file(cache_dir, url, file_name) - - -# def add_metadata( -# context, -# df: pd.DataFrame | gpd.GeoDataFrame, -# title: str | list[str], -# output_name: str | None = None, -# ): -# context.add_output_metadata( -# metadata={ -# "title": title, -# "num_records": len(df), -# "columns": MetadataValue.md( -# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) -# ), -# "preview": MetadataValue.md(df.head().to_markdown()), -# }, -# output_name=output_name, -# ) - - -# @asset -# def catalog_reference(context) -> pd.DataFrame: -# catalog_reference = pd.read_excel( -# URL_CATALOG, -# sheet_name=None, -# header=None, -# storage_options={"User-Agent": "Mozilla/5.0"}, -# )["Index"].rename( -# columns={ -# 0: "census_release", -# 1: "table_name", -# 2: "description", -# 3: "population_coverage", -# 4: "variable", -# 5: "catalog_resolution", -# 6: "year", -# 7: "additional_url", -# 8: "population_coverage_and_variable", -# } -# ) -# add_metadata(context, catalog_reference, "Metadata for census tables") -# return catalog_reference - - -# def get_table_metadata( -# catalog_reference: pd.DataFrame, table_name: str -# ) -> dict[str, str]: 
-# """Returns a dict of table metadata for a given table name.""" -# rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] -# census_release = rows.loc[:, "census_release"].unique()[0] -# description = rows.loc[:, "description"].unique()[0] -# population_coverage = rows.loc[:, "population_coverage"].unique()[0] -# variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) -# catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] -# year = int(rows.loc[:, "year"].unique()[0]) -# return { -# "census_release": census_release, -# "description": description, -# "population_coverage": population_coverage, -# "variables": variables, -# "catalog_resolution": catalog_resolution, -# "year": str(year), -# "human_readable_name": f"{description} ({population_coverage})", -# } - - -# def get_table_name(file_name: str) -> str: -# return file_name.rsplit(".csv")[0] - - -# @asset -# def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFrame: -# """Creates a catalog of the individual census tables from all data sources.""" -# records = [] -# for data_source in DATA_SOURCES: -# resolution = data_source["resolution"] -# source = data_source["source"] -# url = data_source["url"] -# zip_file_name = source_to_zip(source, url) -# with zipfile.ZipFile(zip_file_name) as zip_ref: -# for file_name in zip_ref.namelist(): -# # Get table name -# table_name = get_table_name(file_name) - -# # Skip bulk output files and missing tables from catalog_reference -# if ( -# "bulk_output" in file_name.lower() -# or catalog_reference.loc[:, "table_name"].ne(table_name).all() -# ): -# continue - -# # Get table metadata -# table_metadata = get_table_metadata(catalog_reference, table_name) - -# # Get source release metadata if available -# source_data_release = sources.get( -# table_metadata["census_release"], None -# ) -# source_data_release_id = ( -# None if source_data_release is None else source_data_release.id -# ) - -# # Create a record for each census table use same keys as MetricMetadata -# # where possible since this makes it simpler to populate derived -# # metrics downstream -# record = { -# "resolution": resolution, -# "catalog_resolution": table_metadata["catalog_resolution"], -# "source": source, -# "url": url, -# "file_name": Path(source) / file_name, -# "table_name": table_name, -# "year": table_metadata["year"], -# # Use constructed name of description and coverage -# "human_readable_name": table_metadata["human_readable_name"], -# "source_metric_id": None, -# # Use catalog_reference description -# "description": table_metadata["description"], -# "hxl_tag": None, -# "metric_parquet_file_url": None, -# "parquet_column_name": None, -# "parquet_margin_of_error_column": None, -# "parquet_margin_of_error_file": None, -# "potential_denominator_ids": None, -# "parent_metric_id": None, -# # TODO: check this is not an ID but a name -# "source_data_release_id": source_data_release_id, -# "source_download_url": url, -# # TODO: what should this be? 
-# "source_archive_file_path": None, -# "source_documentation_url": URL_CATALOG, -# } -# context.log.debug(record) -# records.append(record) -# zip_ref.extract(file_name, Path(cache_dir) / source) - -# # TODO: check if required -# for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): -# context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) - -# # Create a dynamic partition for the datasets listed in the catalog -# catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) -# catalog_df["partition_key"] = ( -# catalog_df[["year", "resolution", "table_name"]] -# .astype(str) -# .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) -# ) -# # TODO: consider filtering here based on a set of keys to keep derived from -# # config (i.e. backend/frontend modes) -# context.instance.add_dynamic_partitions( -# partitions_def_name=PARTITIONS_DEF_NAME, -# # To ensure this is unique, prepend the resolution, -# partition_keys=catalog_df.loc[ -# catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), -# "partition_key", -# ].to_list(), -# ) -# context.add_output_metadata( -# metadata={ -# "num_records": len(catalog_df), -# "ignored_datasets": "", -# "columns": MetadataValue.md( -# "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) -# ), -# "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), -# "preview": MetadataValue.md(catalog_df.to_markdown()), -# } -# ) -# return catalog_df - - -# def get_table(context, table_details) -> pd.DataFrame: -# table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) -# add_metadata(context, table_df, table_details["partition_key"].iloc[0]) -# return table_df - - -# @asset(partitions_def=dataset_node_partition) -# def individual_census_table( -# context, catalog_as_dataframe: pd.DataFrame -# ) -> pd.DataFrame: -# """Creates individual census tables as dataframe.""" -# partition_key = context.asset_partition_key_for_output() -# context.log.info(partition_key) -# table_details = catalog_as_dataframe.loc[ -# catalog_as_dataframe["partition_key"].isin([partition_key]) -# ] -# context.log.info(table_details) -# return get_table(context, table_details) - - -# subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] -# subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) -# subset_partition = StaticPartitionsDefinition(subset_partition_keys) From 2149324633984df9688864e3ef399190bd3fb2b1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 17:46:34 +0100 Subject: [PATCH 36/60] Remove obsolete code --- python/popgetter/assets/scotland/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 506bca8..c6831bf 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -706,10 +706,6 @@ def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame: context.log.info(table_details) return self.get_table(context, table_details) - # subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] - # subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) - # subset_partition = StaticPartitionsDefinition(subset_partition_keys) - @staticmethod def census_table_metadata( catalog_row: dict[str, str], @@ -863,7 +859,6 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: split = exceptions[source_mmd.description] out_cols = ["".join(x for x in col.title() if not x.isspace()) 
for col in split] context.log.debug(ic(out_cols)) - ic("----") ic(new_table.columns) for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() From dfbef8770b56f273b48a5ddc8ab735a2ec61ba53 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 26 Jun 2024 21:54:37 +0100 Subject: [PATCH 37/60] Rename module, add country metadata to class --- python/popgetter/assets/__init__.py | 4 ++-- .../assets/{scotland => gb_sct}/__init__.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) rename python/popgetter/assets/{scotland => gb_sct}/__init__.py (99%) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index a766e2a..b7cc89f 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,9 +1,9 @@ from __future__ import annotations -from . import bel, gb_nir, scotland, uk, us +from . import bel, gb_nir, gb_sct, uk, us countries = [ - (mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us, scotland] + (mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us, gb_sct] ] __all__ = ["countries"] diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/gb_sct/__init__.py similarity index 99% rename from python/popgetter/assets/scotland/__init__.py rename to python/popgetter/assets/gb_sct/__init__.py index c6831bf..b316984 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -347,7 +347,13 @@ def get_source_data_release(geo_level: str, cenesus_release: str) -> str: class Scotland(Country): - key_prefix: str = "scotland" + country_metadata: ClassVar[CountryMetadata] = CountryMetadata( + name_short_en="Scotland", + name_official="Scotland", + iso3="GBR", + iso2="GB", + iso3166_2="GB-SCT", + ) geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) tables_to_process: list[str] | None = TABLES_TO_PROCESS @@ -503,13 +509,7 @@ def get_table_metadata( return catalog_df def _country_metadata(self, _context) -> CountryMetadata: - return CountryMetadata( - name_short_en="Scotland", - name_official="Scotland", - iso3="GBR", - iso2="GB", - iso3166_2="GB-SCT", - ) + return self.country_metadata def _data_publisher( self, _context, country_metdata: CountryMetadata @@ -584,6 +584,7 @@ def geometry( for level_details in SCOTLAND_GEO_LEVELS.values(): # TODO: get correct values geometry_metadata = GeometryMetadata( + country_metadata=self.country_metadata, validity_period_start=CENSUS_COLLECTION_DATE, validity_period_end=CENSUS_COLLECTION_DATE, level=level_details.level, From 450d7cf22ea2454b6a7c26cb0f582e7a59abc709 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 26 Jun 2024 21:57:39 +0100 Subject: [PATCH 38/60] Update metrics file name --- python/popgetter/assets/gb_sct/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index b316984..ef42d27 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -774,7 +774,8 @@ def _derived_metrics( partition_key = context.partition_key source_mmd = source_metric_metadata parquet_file_name = ( - "".join(c for c in partition_key if c.isalnum()) + ".parquet" + f"{self.key_prefix}/metrics/" + f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}" ) derived_metrics, derived_mmd = [], [] From ceccf58a8332be818ff23ca30bf71fbaa812270e Mon Sep 17 00:00:00 2001 From: 
Sam Greenbury Date: Thu, 27 Jun 2024 06:58:41 +0100 Subject: [PATCH 39/60] Fix loop over geometry --- python/popgetter/assets/gb_sct/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index ef42d27..9154557 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -21,7 +21,7 @@ from icecream import ic from popgetter.assets.country import Country -from popgetter.cloud_outputs import send_to_geometry_sensor +from popgetter.cloud_outputs import GeometryOutput, send_to_geometry_sensor from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -661,13 +661,13 @@ def _geometry(self, context): def _source_data_releases( self, _context, - geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], + geometry: list[GeometryOutput], data_publisher: DataPublisher, # TODO: consider version without inputs so only output type specified # **kwargs, ) -> dict[str, SourceDataRelease]: source_data_releases = {} - for geo_metadata, _, _ in geometry: + for geo in geometry: for ( source_data_release_id, source_data_release, @@ -683,10 +683,10 @@ def _source_data_releases( url=source_data_release.url, data_publisher_id=data_publisher.id, description=source_data_release.description, - geometry_metadata_id=geo_metadata.id, + geometry_metadata_id=geo.metadata.id, ) combined_level_and_release_id = get_source_data_release( - geo_metadata.level, source_data_release_id + geo.metadata.level, source_data_release_id ) source_data_releases[ combined_level_and_release_id From 2812184713327c61d0742a284b7ee13563534b38 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 07:00:24 +0100 Subject: [PATCH 40/60] Replace 'en' with 'eng' --- python/popgetter/assets/gb_sct/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 9154557..25574bf 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -194,7 +194,7 @@ class ScotlandGeometryLevel: geo_id_column="OA_CODE", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "OutputArea2011Name"}, # TODO + name_columns={"eng": "OutputArea2011Name"}, # TODO # url=URL_SHAPEFILE, url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", lookup_url=None, @@ -209,7 +209,7 @@ class ScotlandGeometryLevel: geo_id_column="DataZone", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "Name"}, + name_columns={"eng": "Name"}, url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, lookup_sheet=None, @@ -222,7 +222,7 @@ class ScotlandGeometryLevel: # geo_id_column="OA_CODE", # census_table_column="TODO", # # census_table_column="Census 2021 Data Zone Code", - # name_columns={"en": "OA_CODE"}, + # name_columns={"eng": "OA_CODE"}, # # url=URL_SHAPEFILE, # url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", # lookup_url=None, @@ -237,7 +237,7 @@ class ScotlandGeometryLevel: geo_id_column="CouncilArea2011Code", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "CouncilArea2011Name"}, + name_columns={"eng": "CouncilArea2011Name"}, 
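         # Note: council-area geometries are obtained by dissolving the
         # data-zone boundaries (URL below) on CouncilArea2011Code via the
         # lookup asset, rather than from a dedicated council-area shapefile.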
url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, lookup_sheet=None, @@ -633,7 +633,7 @@ def geometry( # Add output metadata first_metadata, first_gdf, first_names = geometries_to_return[0] first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") - ax = first_joined_gdf.plot(column="en", legend=False) + ax = first_joined_gdf.plot(column="eng", legend=False) ax.set_title(f"Scotland 2011 {first_metadata.level}") md_plot = markdown_from_plot(plt) context.add_output_metadata( From 452ad096e1720ef14bc248b99ac640e221f8dd51 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 08:50:36 +0100 Subject: [PATCH 41/60] Add source_data_releases, fix geo output --- python/popgetter/assets/gb_sct/__init__.py | 265 +++++++++++++++++---- 1 file changed, 222 insertions(+), 43 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 25574bf..b3e6c31 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -50,6 +50,14 @@ CENSUS_COLLECTION_DATE = date(2011, 3, 27) CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) + +# Source releases for 2011: +# '3A','3I', '2A', '3C', '3D', '3E', '3L', '3K', '3N', +# '3B', '3J', '3M', '3G', '3H', '2C', '2B', '2D', +# Others: +# '2001 Census', +# 'nan', '75+', 'Daytime Tables', +# '1991 Census', '1992 Census', SOURCE_DATA_RELEASES: dict[str, SourceDataRelease] = { "3A": SourceDataRelease( name="Census 2011: Release 3A", @@ -61,59 +69,229 @@ expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", data_publisher_id="TBD", - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - geometry_metadata_id="TBC", - # countries_of_interest=[country.id], + description="TBD", + geometry_metadata_id="TBD", ), "3I": SourceDataRelease( name="Census 2011: Release 3I", date_published=date(2014, 9, 24), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2011, 10, 22), - collection_period_end=date(2011, 10, 22), - expect_next_update=date(2022, 1, 1), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", data_publisher_id="TBD", - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - geometry_metadata_id="TBC", - # countries_of_interest=[country.id], + description="TBD", + geometry_metadata_id="TBD", ), "2A": SourceDataRelease( name="Census 2011: Release 2A", date_published=date(2013, 9, 26), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2011, 10, 22), - collection_period_end=date(2011, 10, 22), - expect_next_update=date(2022, 1, 1), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", data_publisher_id="TBD", - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - geometry_metadata_id="", - # countries_of_interest=[country.id], + description="TBD", + 
geometry_metadata_id="TBD", ), "3C": SourceDataRelease( name="Census 2011: Release 3C", date_published=date(2014, 4, 9), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2011, 10, 22), - collection_period_end=date(2011, 10, 22), - expect_next_update=date(2022, 1, 1), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", data_publisher_id="TBD", - description="TBC", - geometry_metadata_id="", - # geography_file="TBC", - # geography_level="TBC", - # countries_of_interest=[country.id], + description="TBD", + geometry_metadata_id="TBD", + ), + "3D": SourceDataRelease( + name="Census 2011: Release 3D", + date_published=date(2014, 5, 15), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3d", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3E": SourceDataRelease( + name="Census 2011: Release 3E", + date_published=date(2014, 6, 4), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3e", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3F": SourceDataRelease( + name="Census 2011: Release 3F", + date_published=date(2014, 6, 25), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-release-3f", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3L": SourceDataRelease( + name="Census 2011: Release 3L", + date_published=date(2014, 11, 27), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3l", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3K": SourceDataRelease( + name="Census 2011: Release 3K", + date_published=date(2014, 11, 6), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3k", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3N": SourceDataRelease( + name="Census 2011: Release 3N", + date_published=date(2015, 1, 29), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + 
collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2015/census-2011-release-3n", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3B": SourceDataRelease( + name="Census 2011: Release 3B", + date_published=date(2014, 3, 19), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-detailed-characteristics-on-ethnicity-identity-language-and-religion-in-scotland-%E2%80%93-release-3b", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3J": SourceDataRelease( + name="Census 2011: Release 3J", + date_published=date(2014, 10, 16), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3j", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3M": SourceDataRelease( + name="Census 2011: Release 3M", + date_published=date(2014, 12, 18), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3m", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3G": SourceDataRelease( + name="Census 2011: Release 3G", + date_published=date(2014, 7, 23), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3g", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3H": SourceDataRelease( + name="Census 2011: Release 3H", + date_published=date(2014, 8, 13), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3h", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "2C": SourceDataRelease( + name="Census 2011: Release 2C", + date_published=date(2013, 12, 18), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2c", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "2B": SourceDataRelease( + name="Census 2011: Release 2B", + date_published=date(2013, 11, 14), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + 
collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2b", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "2D": SourceDataRelease( + name="Census 2011: Release 2D", + date_published=date(2014, 4, 9), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", ), } @@ -576,9 +754,7 @@ def create_geometry(self): @send_to_geometry_sensor @asset(key_prefix=self.key_prefix) - def geometry( - context, lookup: pd.DataFrame - ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: """List of geometries, metadata and names at different resolutions.""" geometries_to_return = [] for level_details in SCOTLAND_GEO_LEVELS.values(): @@ -627,11 +803,16 @@ def geometry( .drop_duplicates() ) geometries_to_return.append( - (geometry_metadata, region_geometries, region_names) + GeometryOutput( + metadata=geometry_metadata, + gdf=region_geometries, + names_df=region_names, + ) ) # Add output metadata - first_metadata, first_gdf, first_names = geometries_to_return[0] + geo: GeometryOutput = geometries_to_return[0] + first_metadata, first_gdf, first_names = geo.metadata, geo.gdf, geo.names_df first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") ax = first_joined_gdf.plot(column="eng", legend=False) ax.set_title(f"Scotland 2011 {first_metadata.level}") @@ -639,9 +820,7 @@ def geometry( context.add_output_metadata( metadata={ "all_geom_levels": MetadataValue.md( - ",".join( - [metadata.level for metadata, _, _ in geometries_to_return] - ) + ",".join([geo.metadata.level for geo in geometries_to_return]) ), "first_geometry_plot": MetadataValue.md(md_plot), "first_names_preview": MetadataValue.md( From 1d28c55d956cadd2495b6022228bf6a727d6736b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 08:59:53 +0100 Subject: [PATCH 42/60] Fix derived metric output --- python/popgetter/assets/gb_sct/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index b3e6c31..5b8fb3e 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -21,7 +21,11 @@ from icecream import ic from popgetter.assets.country import Country -from popgetter.cloud_outputs import GeometryOutput, send_to_geometry_sensor +from popgetter.cloud_outputs import ( + GeometryOutput, + MetricsOutput, + send_to_geometry_sensor, +) from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -947,7 +951,7 @@ def _derived_metrics( context, census_tables: pd.DataFrame, source_metric_metadata: MetricMetadata, - ) -> tuple[list[MetricMetadata], pd.DataFrame]: + ) -> MetricsOutput: ... 
SEP = "__" partition_key = context.partition_key @@ -1109,7 +1113,7 @@ def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: ), }, ) - return derived_mmd, joined_metrics + return MetricsOutput(metadata=derived_mmd, metrics=joined_metrics) # Create assets From 786bcd3c311a4034aa5f566bf9667d7d0f1b9731 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 09:03:13 +0100 Subject: [PATCH 43/60] Fix module name --- tests/test_be.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_be.py b/tests/test_be.py index fae975b..b08a6da 100644 --- a/tests/test_be.py +++ b/tests/test_be.py @@ -12,7 +12,7 @@ from rdflib import Graph from rdflib.namespace import DCAT -from popgetter.assets import be +from popgetter.assets import bel @pytest.fixture(scope="module") @@ -36,7 +36,7 @@ def demo_catalog() -> Graph: @pytest.fixture(scope="module") def demo_catalog_df(demo_catalog) -> pd.DataFrame: context = build_asset_context() - return be.census_tables.catalog_as_dataframe(context, demo_catalog) + return bel.census_tables.catalog_as_dataframe(context, demo_catalog) @pytest.mark.skip( @@ -46,7 +46,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors): # Test the that the row count is correctly added to the metadata context = build_asset_context() - actual_municipalities = be.census_geometry.aggregate_sectors_to_municipalities( + actual_municipalities = bel.census_geometry.aggregate_sectors_to_municipalities( context, demo_sectors ) @@ -62,7 +62,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors): @pytest.mark.skip(reason="Fix test_get_population_details_per_municipality first") def test_get_population_details_per_municipality(): with build_asset_context() as muni_context: - stat_muni = be.census_tables.get_population_details_per_municipality( + stat_muni = bel.census_tables.get_population_details_per_municipality( muni_context ) @@ -87,7 +87,7 @@ def test_pivot_population(): ) # Get the geometries - stat_muni = be.census_tables.get_population_details_per_municipality( + stat_muni = bel.census_tables.get_population_details_per_municipality( muni_context ) @@ -99,7 +99,7 @@ def test_pivot_population(): with build_asset_context() as pivot_context: # Pivot the population - pivoted = be.pivot_population(pivot_context, stat_muni) + pivoted = bel.pivot_population(pivot_context, stat_muni) expected_number_of_municipalities = 581 @@ -115,7 +115,7 @@ def test_demo_catalog(demo_catalog): actual_length = len( list( demo_catalog.objects( - subject=be.census_tables.opendata_catalog_root, + subject=bel.census_tables.opendata_catalog_root, predicate=DCAT.dataset, unique=False, ) @@ -128,7 +128,7 @@ def test_demo_catalog(demo_catalog): def test_catalog_metadata_details(demo_catalog_df): # Get the metadata for a specific dataset in the demo catalogue: # https://statbel.fgov.be/node/4151 "Population by Statistical sector" - # mmd = be.census_tables.get_mmd_from_dataset_node( + # mmd = bel.census_tables.get_mmd_from_dataset_node( # demo_catalog, dataset_node=URIRef("https://statbel.fgov.be/node/4151") # ) @@ -179,7 +179,7 @@ def test_catalog_as_dataframe(demo_catalog_df): # # Convert the demo catalog to a DataFrame # with build_asset_context() as context: - # catalog_df = be.census_tables.catalog_as_dataframe(context, demo_catalog_df) + # catalog_df = bel.census_tables.catalog_as_dataframe(context, demo_catalog_df) # # Check that the catalog has been converted to a DataFrame # assert isinstance(catalog_df, pd.DataFrame) @@ -228,7 +228,7 @@ 
def test_filter_known_failing_datasets():
         "2676",
     ]
 
-    actual_list = be.census_tables.filter_known_failing_datasets(mock_catalog)
+    actual_list = bel.census_tables.filter_known_failing_datasets(mock_catalog)
 
     assert mock_catalog != expected_list
     assert actual_list != mock_catalog

From fec9af84f3837562603514b5a9e78d03bbf40e3a Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 09:06:51 +0100
Subject: [PATCH 44/60] Fix test

---
 tests/test_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index bc272b2..502f186 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -57,7 +57,7 @@ def test_source_data_release_hash():
     )
     assert (
         source_data_release.id
-        == "9ec7e234d73664339e4c1f04bfa485dbb17e204dd72dc3ffbb9cab6870475597"
+        == "4d61bfe401ba17becd02d6b3912152c135daa9ecaebc9bd45a589dc831a85217"
     )
 
     source_data_release2 = SourceDataRelease(

From 31ac586fce5ceb73e27db5409fd6586b08081006 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 10:49:28 +0100
Subject: [PATCH 45/60] Add first modifications to ensure that it runs for all
 tables

---
 python/popgetter/assets/gb_sct/__init__.py | 59 +++++++++++++---------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index 5b8fb3e..b98a6ac 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -36,19 +36,7 @@
 )
 from popgetter.utils import add_metadata, markdown_from_plot
 
-# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32
-REQUIRED_TABLES = [
-    "QS103SC",
-    "QS104SC",
-    "KS201SC",
-    "DC1117SC",
-    "DC2101SC",
-    "DC6206SC",
-    "LC1117SC",
-]
-REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES)
 # Currently including only releases matching tables included
-REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"]
 GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf"
 CENSUS_REFERENCE_DATE = date(2011, 3, 27)
 CENSUS_COLLECTION_DATE = date(2011, 3, 27)
@@ -297,6 +285,20 @@
         description="TBD",
         geometry_metadata_id="TBD",
     ),
+    "75+": SourceDataRelease(
+        name="Census 2011: 75+",
+        # TODO: unable to find published date for 75+ release
+        date_published=date(2014, 1, 1),
+        reference_period_start=CENSUS_REFERENCE_DATE,
+        reference_period_end=CENSUS_REFERENCE_DATE,
+        collection_period_start=CENSUS_COLLECTION_DATE,
+        collection_period_end=CENSUS_COLLECTION_DATE,
+        expect_next_update=CENSUS_EXPECT_NEXT_UPDATE,
+        url="TBD",
+        data_publisher_id="TBD",
+        description="TBD",
+        geometry_metadata_id="TBD",
+    ),
 }
 
 
@@ -321,6 +323,7 @@ def download_file(
     return file_name
 
 
+# TODO: remove ones no longer used
 URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html"
 URL1 = "https://www.scotlandscensus.gov.uk/"
 URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/"
@@ -506,21 +509,23 @@ class SourceTable:
     ),
 ]
 
-TABLES_TO_PROCESS: list[str] = [
-    "QS103SC",
-    "QS104SC",
-    "KS201SC",
-    "DC1117SC",
-    "DC2101SC",
-    "DC6206SC",
-    "LC1117SC",
-]
-
-PARTITIONS_TO_PUBLISH: list[str] = ["2011/OutputArea2011/LC1117SC"]
-
+# For all available:
+TABLES_TO_PROCESS = None
+# For a subset:
+# TABLES_TO_PROCESS: list[str] = [
+#     "QS103SC",
+#     "QS104SC",
+#     "KS201SC",
+#     "DC1117SC",
+#     "DC2101SC",
+#     "DC6206SC",
+#     "LC1117SC",
+# ]
 DERIVED_COLUMN_SPECIFICATIONS: dict[str,
list[DerivedColumn]] = {
-    PARTITIONS_TO_PUBLISH[0]: DERIVED_COLUMNS,
+    "2011/OutputArea2011/LC1117SC": DERIVED_COLUMNS,
+    "2011/DataZone2011/LC1117SC": DERIVED_COLUMNS,
+    "2011/CouncilArea2011/LC1117SC": DERIVED_COLUMNS,
 }
@@ -632,6 +637,10 @@ def get_table_metadata(
         ):
             continue
 
+        # Fix case with missing data for release
+        if resolution == "CouncilArea2011" and table_name == "DC6102SC":
+            table_metadata["census_release"] = "3I"
+
         # Create a record for each census table use same keys as MetricMetadata
         # where possible since this makes it simpler to populate derived
         # metrics downstream

From 5522d170e640ca63bc64f078dc18c7a74fb69d96 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 12:38:15 +0100
Subject: [PATCH 46/60] Filter missing partition from catalog

---
 python/popgetter/assets/gb_sct/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index b98a6ac..e2aded9 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -637,6 +637,11 @@ def get_table_metadata(
         ):
             continue
 
+        # Remove failing case (no data in census table):
+        # "2011/DataZone2011/QS421SC"
+        if table_name == "QS421SC" and resolution == "DataZone2011":
+            continue
+
         # Fix case with missing data for release
         if resolution == "CouncilArea2011" and table_name == "DC6102SC":
             table_metadata["census_release"] = "3I"

From 33832b31cbd391a1d90a1bbebe88084b88b7ccf3 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 18:02:51 +0100
Subject: [PATCH 47/60] Add option to allow ok return from derived_metrics if
 partition fails

---
 python/popgetter/assets/gb_sct/__init__.py | 95 ++++++++++++++--------
 1 file changed, 60 insertions(+), 35 deletions(-)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index e2aded9..da4f3dd 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -543,6 +543,7 @@ class Scotland(Country):
     )
     geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys())
     tables_to_process: list[str] | None = TABLES_TO_PROCESS
+    allow_missing_derived_metrics: ClassVar[bool] = True
 
     def _catalog(self, context) -> pd.DataFrame:
         """Creates a catalog of the individual census tables from all data sources."""
@@ -1051,6 +1052,19 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame:
                 "Sex and Age",
                 "National Statistics Socio-economic Classification (NS-SeC)",
             ],
+            # 2011/CouncilArea2011/DC1104SC
+            "Residence type by sex by age": ["Residence type and Sex", "Age"],
+            # 2011/CouncilArea2011/DC1106SC
+            "Schoolchildren and full-time students living away from home during term time by sex by age": [
+                "Schoolchildren and full-time students living away from home during term time and Sex",
+                "Age",
+            ],
+            # 2011/CouncilArea2011/DC1112SC
+            "Dependent children by household type by sex by age": [
+                "Dependent children by household type",
+                "Sex",
+                "Age",
+            ],
         }
         if source_mmd.description not in exceptions:
             split = source_mmd.description.split(" by ")[::-1]
@@ -1059,44 +1073,55 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame:
         out_cols = [
             "".join(x for x in col.title() if not x.isspace()) for col in split
         ]
         context.log.debug(ic(out_cols))
         ic(new_table.columns)
-        for metric_col in new_table.columns:
-            metric_df = new_table.loc[:, metric_col].to_frame()
-            ic(metric_df)
-            derived_metrics.append(metric_df)
-            new_mmd = source_mmd.copy()
-            new_mmd.parent_metric_id
= source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - - # TODO: fix automating the hxltag - key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) - - def gen_hxltag(kv: dict[str, str]) -> str: - out = ["#population"] - for key, value in kv.items(): - out += [ - "".join(c for c in key if c.isalnum()) - + "_" - + "".join(c for c in value if c.isalnum()) + try: + for metric_col in new_table.columns: + metric_df = new_table.loc[:, metric_col].to_frame() + ic(metric_df) + derived_metrics.append(metric_df) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + + # TODO: fix automating the hxltag + key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + def gen_hxltag(kv: dict[str, str]) -> str: + out = ["#population"] + for key, value in kv.items(): + out += [ + "".join(c for c in key if c.isalnum()) + + "_" + + "".join(c for c in value if c.isalnum()) + ] + return "+".join(out) + + new_mmd.hxl_tag = gen_hxltag(key_val) + new_mmd.parquet_column_name = metric_col + context.log.debug(ic(key_val)) + # TODO: Update after fixing hxltag + new_mmd.human_readable_name = "; ".join( + [ + f"Variable: '{key}'; Value: '{value}'" + for key, value in key_val.items() ] - return "+".join(out) - - new_mmd.hxl_tag = gen_hxltag(key_val) - new_mmd.parquet_column_name = metric_col - # TODO: Update after fixing hxltag - new_mmd.human_readable_name = "; ".join( - [ - f"Variable: '{key}'; Value: '{value}'" - for key, value in key_val.items() - ] + ) + derived_mmd.append(new_mmd) + + joined_metrics = reduce( + lambda left, right: left.merge( + right, on="GEO_ID", how="inner", validate="one_to_one" + ), + derived_metrics, ) - derived_mmd.append(new_mmd) - joined_metrics = reduce( - lambda left, right: left.merge( - right, on="GEO_ID", how="inner", validate="one_to_one" - ), - derived_metrics, - ) + except Exception as err: + err_msg = ( + f"Failed to automatically derive levels and description for " + f"'{partition_key}', error:\n{err}" + ) + context.log.error(err_msg) + if self.allow_missing_derived_metrics: + return MetricsOutput(metadata=[], metrics=pd.DataFrame()) def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: for col in maybe_non_int_df: From 9b2485686127f5e50f3d408b1e00fc17e7576c56 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 21:00:28 +0100 Subject: [PATCH 48/60] Create try/except to optionally allow derived metrics with failures --- python/popgetter/assets/gb_sct/__init__.py | 277 +++++++++++---------- 1 file changed, 143 insertions(+), 134 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index da4f3dd..dfea3d5 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -967,113 +967,119 @@ def _derived_metrics( census_tables: pd.DataFrame, source_metric_metadata: MetricMetadata, ) -> MetricsOutput: - ... 
- SEP = "__" - partition_key = context.partition_key - source_mmd = source_metric_metadata - parquet_file_name = ( - f"{self.key_prefix}/metrics/" - f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}" - ) - derived_metrics, derived_mmd = [], [] - - # If derived metrics try: - metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] - - def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: - df_to_reshape = df_to_reshape.rename( - columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} - ).drop(columns=["All people"]) - df_to_reshape = df_to_reshape.melt( - ["GEO_ID", "Age Category"], var_name="Sex Label", value_name="Count" - ) - df_to_reshape["Sex Label"] = df_to_reshape["Sex Label"].map( - {"Males": "Male", "Females": "Female"} - ) - return df_to_reshape - - census_tables_for_derived_metrics = reshape(census_tables) - source_column = source_mmd.parquet_column_name - for metric_spec in metric_specs: - new_table = ( - census_tables_for_derived_metrics.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) - .sum() - .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) - ) - derived_metrics.append(new_table) - new_mmd = source_mmd.copy() - new_mmd.parent_metric_id = source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - new_mmd.hxl_tag = metric_spec.hxltag - new_mmd.parquet_column_name = metric_spec.output_column_name - new_mmd.human_readable_name = metric_spec.human_readable_name - derived_mmd.append(new_mmd) - except KeyError: - # No extra derived metrics specified for this partition -- only use - # those from pivoted data - pass - - # Batch - def make_pivot(df: pd.DataFrame) -> pd.DataFrame: - # TODO: reshape based on Unnamed: 1 to Unnamed N - pivot_cols = [ - col - for col in df.columns - if col != "Unnamed: 0" and col.startswith("Unnamed: ") - ] - pivot = df.pivot_table( - index="Unnamed: 0", columns=pivot_cols, aggfunc="sum" + SEP = "__" + partition_key = context.partition_key + source_mmd = source_metric_metadata + parquet_file_name = ( + f"{self.key_prefix}/metrics/" + f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}" ) - - # FLattent multi-index - if isinstance(pivot.columns, pd.MultiIndex): - pivot.columns = [ - SEP.join(list(map(str, col))).strip() - for col in pivot.columns.to_numpy() + derived_metrics, derived_mmd = [], [] + + # If derived metrics + try: + metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] + + def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: + df_to_reshape = df_to_reshape.rename( + columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} + ).drop(columns=["All people"]) + df_to_reshape = df_to_reshape.melt( + ["GEO_ID", "Age Category"], + var_name="Sex Label", + value_name="Count", + ) + df_to_reshape["Sex Label"] = df_to_reshape["Sex Label"].map( + {"Males": "Male", "Females": "Female"} + ) + return df_to_reshape + + census_tables_for_derived_metrics = reshape(census_tables) + source_column = source_mmd.parquet_column_name + for metric_spec in metric_specs: + new_table = ( + census_tables_for_derived_metrics.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = 
parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + except KeyError: + # No extra derived metrics specified for this partition -- only use + # those from pivoted data + pass + + # Batch + def make_pivot(df: pd.DataFrame) -> pd.DataFrame: + # TODO: reshape based on Unnamed: 1 to Unnamed N + pivot_cols = [ + col + for col in df.columns + if col != "Unnamed: 0" and col.startswith("Unnamed: ") ] - # Ensure columns are string + pivot = df.pivot_table( + index="Unnamed: 0", columns=pivot_cols, aggfunc="sum" + ) + + # FLattent multi-index + if isinstance(pivot.columns, pd.MultiIndex): + pivot.columns = [ + SEP.join(list(map(str, col))).strip() + for col in pivot.columns.to_numpy() + ] + # Ensure columns are string + else: + pivot.columns = [ + str(col).strip() for col in pivot.columns.to_numpy() + ] + + pivot.index = pivot.index.rename("GEO_ID") + + return pivot + + new_table = make_pivot(census_tables) + + # Split for description of metrics + exceptions = { + "Age by single year": ["Age by single year"], + "National Statistics Socio-economic Classification (NS-SeC) by ethnic group by sex by age": [ + "Ethnic group", + "Sex and Age", + "National Statistics Socio-economic Classification (NS-SeC)", + ], + # 2011/CouncilArea2011/DC1104SC + "Residence type by sex by age": ["Residence type and Sex", "Age"], + # 2011/CouncilArea2011/DC1106SC + "Schoolchildren and full-time students living away from home during term time by sex by age": [ + "Schoolchildren and full-time students living away from home during term time and Sex", + "Age", + ], + # 2011/CouncilArea2011/DC1112SC + "Dependent children by household type by sex by age": [ + "Dependent children by household type", + "Sex", + "Age", + ], + } + if source_mmd.description not in exceptions: + split = source_mmd.description.split(" by ")[::-1] else: - pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] - - pivot.index = pivot.index.rename("GEO_ID") - - return pivot - - new_table = make_pivot(census_tables) - - # Split for description of metrics - exceptions = { - "Age by single year": ["Age by single year"], - "National Statistics Socio-economic Classification (NS-SeC) by ethnic group by sex by age": [ - "Ethnic group", - "Sex and Age", - "National Statistics Socio-economic Classification (NS-SeC)", - ], - # 2011/CouncilArea2011/DC1104SC - "Residence type by sex by age": ["Residence type and Sex", "Age"], - # 2011/CouncilArea2011/DC1106SC - "Schoolchildren and full-time students living away from home during term time by sex by age": [ - "Schoolchildren and full-time students living away from home during term time and Sex", - "Age", - ], - # 2011/CouncilArea2011/DC1112SC - "Dependent children by household type by sex by age": [ - "Dependent children by household type", - "Sex", - "Age", - ], - } - if source_mmd.description not in exceptions: - split = source_mmd.description.split(" by ")[::-1] - else: - split = exceptions[source_mmd.description] - out_cols = ["".join(x for x in col.title() if not x.isspace()) for col in split] - context.log.debug(ic(out_cols)) - ic(new_table.columns) - try: + split = exceptions[source_mmd.description] + out_cols = [ + "".join(x for x in col.title() if not x.isspace()) for col in split + ] + context.log.debug(ic(out_cols)) + ic(new_table.columns) + for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() 
ic(metric_df) @@ -1114,44 +1120,47 @@ def gen_hxltag(kv: dict[str, str]) -> str: derived_metrics, ) + def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: + for col in maybe_non_int_df: + if maybe_non_int_df[col].dtype == "object": + maybe_non_int_df[col] = ( + maybe_non_int_df[col] + .str.replace(",", "") + .str.replace("-", "0") + .fillna("0") + .astype(int) + ) + return maybe_non_int_df + + # Fix format + joined_metrics = make_int(joined_metrics) + + # Filter out whole country Scotland + joined_metrics = joined_metrics.loc[ + ~joined_metrics.index.isin(["S92000003"]) + ] + + context.add_output_metadata( + metadata={ + "metadata_preview": MetadataValue.md( + metadata_to_dataframe(derived_mmd).head().to_markdown() + ), + "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + "metrics_preview": MetadataValue.md( + joined_metrics.head().to_markdown() + ), + }, + ) + except Exception as err: err_msg = ( - f"Failed to automatically derive levels and description for " - f"'{partition_key}', error:\n{err}" + f"Failed to automatically derive metrics for '{partition_key}' with " + f"error: {err}" ) context.log.error(err_msg) if self.allow_missing_derived_metrics: return MetricsOutput(metadata=[], metrics=pd.DataFrame()) - def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: - for col in maybe_non_int_df: - if maybe_non_int_df[col].dtype == "object": - maybe_non_int_df[col] = ( - maybe_non_int_df[col] - .str.replace(",", "") - .str.replace("-", "0") - .fillna("0") - .astype(int) - ) - return maybe_non_int_df - - # Fix format - joined_metrics = make_int(joined_metrics) - - # Filter out whole country Scotland - joined_metrics = joined_metrics.loc[~joined_metrics.index.isin(["S92000003"])] - - context.add_output_metadata( - metadata={ - "metadata_preview": MetadataValue.md( - metadata_to_dataframe(derived_mmd).head().to_markdown() - ), - "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", - "metrics_preview": MetadataValue.md( - joined_metrics.head().to_markdown() - ), - }, - ) return MetricsOutput(metadata=derived_mmd, metrics=joined_metrics) From 6ebdf7287369b4419378036be7bbe6bbadf4615a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 18:32:56 +0100 Subject: [PATCH 49/60] Replace GEO_ID with COL enum --- python/popgetter/assets/gb_sct/__init__.py | 33 +++++++++++++--------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index dfea3d5..31c9cd9 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -10,7 +10,6 @@ from typing import ClassVar import geopandas as gpd -import matplotlib.pyplot as plt import pandas as pd import requests import zipfile_deflate64 as zipfile @@ -27,6 +26,7 @@ send_to_geometry_sensor, ) from popgetter.metadata import ( + COL, CountryMetadata, DataPublisher, GeometryMetadata, @@ -805,20 +805,22 @@ def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: context.log.debug(ic(region_geometries_merge.head())) context.log.debug(ic(region_geometries_merge.columns)) region_geometries = region_geometries_merge.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] + columns={level_details.geo_id_column: COL.GEO_ID.value} + ).loc[:, ["geometry", COL.GEO_ID.value]] region_names = ( region_geometries_merge.rename( columns={ - level_details.geo_id_column: "GEO_ID", + 
level_details.geo_id_column: COL.GEO_ID.value, } | { value: key for key, value in level_details.name_columns.items() } ) - .loc[:, ["GEO_ID", *list(level_details.name_columns.keys())]] + .loc[ + :, [COL.GEO_ID.value, *list(level_details.name_columns.keys())] + ] .drop_duplicates() ) geometries_to_return.append( @@ -832,10 +834,10 @@ def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: # Add output metadata geo: GeometryOutput = geometries_to_return[0] first_metadata, first_gdf, first_names = geo.metadata, geo.gdf, geo.names_df - first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + first_joined_gdf = first_gdf.merge(first_names, on=COL.GEO_ID.value) ax = first_joined_gdf.plot(column="eng", legend=False) ax.set_title(f"Scotland 2011 {first_metadata.level}") - md_plot = markdown_from_plot(plt) + md_plot = markdown_from_plot() context.add_output_metadata( metadata={ "all_geom_levels": MetadataValue.md( @@ -983,10 +985,13 @@ def _derived_metrics( def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: df_to_reshape = df_to_reshape.rename( - columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} + columns={ + "Unnamed: 0": COL.GEO_ID.value, + "Unnamed: 1": "Age Category", + } ).drop(columns=["All people"]) df_to_reshape = df_to_reshape.melt( - ["GEO_ID", "Age Category"], + [COL.GEO_ID.value, "Age Category"], var_name="Sex Label", value_name="Count", ) @@ -1000,10 +1005,12 @@ def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: for metric_spec in metric_specs: new_table = ( census_tables_for_derived_metrics.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) + .groupby(by=COL.GEO_ID.value, as_index=True) .sum() .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) + .filter( + items=[COL.GEO_ID.value, metric_spec.output_column_name] + ) ) derived_metrics.append(new_table) new_mmd = source_mmd.copy() @@ -1042,7 +1049,7 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: str(col).strip() for col in pivot.columns.to_numpy() ] - pivot.index = pivot.index.rename("GEO_ID") + pivot.index = pivot.index.rename(COL.GEO_ID.value) return pivot @@ -1115,7 +1122,7 @@ def gen_hxltag(kv: dict[str, str]) -> str: joined_metrics = reduce( lambda left, right: left.merge( - right, on="GEO_ID", how="inner", validate="one_to_one" + right, on=COL.GEO_ID.value, how="inner", validate="one_to_one" ), derived_metrics, ) From d62bfc80992e6e664ae90b42bd4aacf198cd6b5d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 18:38:23 +0100 Subject: [PATCH 50/60] Use tempfile.mkdtemp() for cache_dir --- python/popgetter/assets/gb_sct/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 31c9cd9..a65c47e 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 from __future__ import annotations +import tempfile import urllib.parse as urlparse from collections.abc import Callable from dataclasses import dataclass @@ -432,8 +433,8 @@ class ScotlandGeometryLevel: } -# cache_dir = tempfile.mkdtemp() -cache_dir = "./cache" +# Use temporary directory for `cache_dir`` +cache_dir = tempfile.mkdtemp() @dataclass From 070c8175bd8b96ce3283757013164fffc3d847ca Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 20:42:39 +0100 Subject: [PATCH 51/60] Add utils, rename static variables 
upper case --- python/popgetter/assets/gb_sct/__init__.py | 70 ++++++++-------------- python/popgetter/assets/gb_sct/utils.py | 24 ++++++++ 2 files changed, 49 insertions(+), 45 deletions(-) create mode 100644 python/popgetter/assets/gb_sct/utils.py diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index a65c47e..845bbf7 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -12,7 +12,6 @@ import geopandas as gpd import pandas as pd -import requests import zipfile_deflate64 as zipfile from dagster import ( MetadataValue, @@ -21,6 +20,7 @@ from icecream import ic from popgetter.assets.country import Country +from popgetter.assets.gb_sct.utils import HEADERS, download_file from popgetter.cloud_outputs import ( GeometryOutput, MetricsOutput, @@ -303,27 +303,6 @@ } -# Move to tests -HEADERS = { - "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" -} - - -def download_file( - cache_dir: str, - url: str, - file_name: Path | None = None, - headers: dict[str, str] = HEADERS, -) -> Path: - """Downloads file checking first if exists in cache, returning file name.""" - file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name - if not Path(file_name).exists(): - r = requests.get(url, allow_redirects=True, headers=headers) - with Path(file_name).open("wb") as fp: - fp.write(r.content) - return file_name - - # TODO: remove ones no longer used URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" URL1 = "https://www.scotlandscensus.gov.uk/" @@ -433,8 +412,8 @@ class ScotlandGeometryLevel: } -# Use temporary directory for `cache_dir`` -cache_dir = tempfile.mkdtemp() +# Use temporary directory +CACHE_DIR = tempfile.mkdtemp() @dataclass @@ -454,36 +433,36 @@ class SourceTable: # Config for each partition to be derived -age_code = "`Age Category`" -sex_label = "`Sex Label`" -infants = ["0 to 4"] -children_5_to_17 = ["5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] -children = ["0 to 4", "5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] -adults = ["18 to 19"] + [f"{i} to {i+4}" for i in range(20, 91, 5)] + ["95 and over"] -people = ["All people"] +AGE_CODE = "`Age Category`" +SEX_LABEL = "`Sex Label`" +INFANTS = ["0 to 4"] +CHILDREN_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] +CHILDREN = ["0 to 4", "5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] +ADULTS = ["18 to 19"] + [f"{i} to {i+4}" for i in range(20, 91, 5)] + ["95 and over"] +PEOPLE = ["All people"] DERIVED_COLUMNS = [ DerivedColumn( hxltag="#population+children+age5_17", - filter_func=lambda df: df.query(f"{age_code} in @children_5_to_17"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @children_5_to_17"), output_column_name="children_5_17", human_readable_name="Children aged 5 to 17", ), DerivedColumn( hxltag="#population+infants+age0_4", - filter_func=lambda df: df.query(f"{age_code} in @infants"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @infants"), output_column_name="infants_0_4", human_readable_name="Infants aged 0 to 4", ), DerivedColumn( hxltag="#population+children+age0_17", - filter_func=lambda df: df.query(f"{age_code} in @children"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @children"), output_column_name="children_0_17", human_readable_name="Children aged 0 to 17", ), DerivedColumn( hxltag="#population+adults+f", filter_func=lambda df: df.query( - f"{age_code} in @adults and {sex_label} == 
'Female'" + f"{AGE_CODE} in @adults and {SEX_LABEL} == 'Female'" ), output_column_name="adults_f", human_readable_name="Female adults", @@ -491,27 +470,25 @@ class SourceTable: DerivedColumn( hxltag="#population+adults+m", filter_func=lambda df: df.query( - f"{age_code} in @adults and {sex_label} == 'Male'" + f"{AGE_CODE} in @adults and {SEX_LABEL} == 'Male'" ), output_column_name="adults_m", human_readable_name="Male adults", ), DerivedColumn( hxltag="#population+adults", - filter_func=lambda df: df.query(f"{age_code} in @adults"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @adults"), output_column_name="adults", human_readable_name="Adults", ), DerivedColumn( hxltag="#population+ind", - filter_func=lambda df: df.query(f"{age_code} in @people"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @people"), output_column_name="individuals", human_readable_name="Total individuals", ), ] -# For all available: -TABLES_TO_PROCESS = None # For a subset: # TABLES_TO_PROCESS: list[str] = [ # "QS103SC", @@ -523,6 +500,9 @@ class SourceTable: # "LC1117SC", # ] +# For all available: +TABLES_TO_PROCESS = None + DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = { "2011/OutputArea2011/LC1117SC": DERIVED_COLUMNS, "2011/DataZone2011/LC1117SC": DERIVED_COLUMNS, @@ -552,8 +532,8 @@ def _catalog(self, context) -> pd.DataFrame: def source_to_zip(source_name: str, url: str) -> Path: """Downloads if necessary and returns the name of the locally cached zip file of the source data (replacing spaces with _)""" - file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") - return download_file(cache_dir, url, file_name) + file_name = Path(CACHE_DIR) / (source_name.replace(" ", "_") + ".zip") + return download_file(CACHE_DIR, url, file_name) def get_table_name(file_name: str) -> str: return file_name.rsplit(".csv")[0] @@ -681,7 +661,7 @@ def get_table_metadata( } context.log.debug(record) records.append(record) - zip_ref.extract(file_name, Path(cache_dir) / source) + zip_ref.extract(file_name, Path(CACHE_DIR) / source) # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) @@ -786,7 +766,7 @@ def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: level=level_details.level, hxl_tag=level_details.hxl_tag, ) - file_name = download_file(cache_dir, level_details.url) + file_name = download_file(CACHE_DIR, level_details.url) region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( f"zip://{file_name}" ) @@ -896,7 +876,7 @@ def _source_data_releases( @staticmethod def get_table(context, table_details) -> pd.DataFrame: - table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) + table_df = pd.read_csv(Path(CACHE_DIR) / table_details["file_name"].iloc[0]) add_metadata(context, table_df, table_details["partition_key"].iloc[0]) return table_df diff --git a/python/popgetter/assets/gb_sct/utils.py b/python/popgetter/assets/gb_sct/utils.py new file mode 100644 index 0000000..671d17c --- /dev/null +++ b/python/popgetter/assets/gb_sct/utils.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from pathlib import Path + +import requests + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" +} + + +def download_file( + cache_dir: str, + url: str, + file_name: Path | None = None, + headers: dict[str, str] = HEADERS, +) -> Path: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = Path(cache_dir) 
/ url.split("/")[-1] if file_name is None else file_name + if not Path(file_name).exists(): + r = requests.get(url, allow_redirects=True, headers=headers) + with Path(file_name).open("wb") as fp: + fp.write(r.content) + return file_name From e15ddff65d6412f7660ba8f1137a6ee9ad237761 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 21:28:21 +0100 Subject: [PATCH 52/60] Revert tempfile --- python/popgetter/assets/gb_sct/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 845bbf7..c0bc9dd 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -1,7 +1,6 @@ #!/usr/bin/python3 from __future__ import annotations -import tempfile import urllib.parse as urlparse from collections.abc import Callable from dataclasses import dataclass @@ -412,8 +411,10 @@ class ScotlandGeometryLevel: } +# TODO: identify better tempfile option # Use temporary directory -CACHE_DIR = tempfile.mkdtemp() +# CACHE_DIR = tempfile.mkdtemp() +CACHE_DIR = "./cache" @dataclass From d561c406d1c61fa6c09afd5eddad431d1974ce39 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 21:46:57 +0100 Subject: [PATCH 53/60] Ensure CACHE_DIR made --- python/popgetter/assets/gb_sct/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index c0bc9dd..88c0c70 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -527,6 +527,12 @@ class Scotland(Country): tables_to_process: list[str] | None = TABLES_TO_PROCESS allow_missing_derived_metrics: ClassVar[bool] = True + def __init__(self): + super().__init__() + + # Make temp directory + Path(CACHE_DIR).mkdir(parents=True) + def _catalog(self, context) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" From 27514ecc88591d8fa90e4516912a1c075c35220e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 14:59:41 +0100 Subject: [PATCH 54/60] Update deps --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index db220c7..550ef4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,8 +47,7 @@ dependencies = [ "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. 
"icecream >=2.1.3", # General debugging tool "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names - "openpyxl", - "zipfile-deflate64", + "zipfile-deflate64 >= 0.2.0", # Required for handling zipped files in Scotland DAG "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages "openpyxl >=3.1.3", # For reading Excel files From de1422eab4bc25ce57e4768fae360c2ec9b1dc11 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:03:21 +0100 Subject: [PATCH 55/60] Remove URL constants --- python/popgetter/assets/gb_sct/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 88c0c70..5784514 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -304,8 +304,6 @@ # TODO: remove ones no longer used URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" -URL1 = "https://www.scotlandscensus.gov.uk/" -URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" URL_LOOKUP = ( "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) @@ -320,19 +318,23 @@ "source": "Council Area blk", # "resolution": "LAD", "resolution": "CouncilArea2011", - "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", + "url": "https://www.scotlandscensus.gov.uk/media/hjmd0oqr/council-area-blk.zip", }, { "source": "SNS Data Zone 2011 blk", # "resolution": "LSOA11", "resolution": "DataZone2011", - "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", + "url": "https://nrscensusprodumb.blob.core.windows.net/downloads/" + + urlparse.quote("SNS Data Zone 2011 blk") + + ".zip", }, { "source": "Output Area blk", # "resolution": "OA11", "resolution": "OutputArea2011", - "url": URL2 + urlparse.quote("Output Area blk") + ".zip", + "url": "https://nrscensusprodumb.blob.core.windows.net/downloads/" + + urlparse.quote("Output Area blk") + + ".zip", }, ] From e0c86d37eae728474b743b27d092c8efcc3eb8e6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:04:27 +0100 Subject: [PATCH 56/60] Rename variable --- python/popgetter/assets/gb_sct/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 5784514..8ad4b35 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -313,7 +313,7 @@ ) -DATA_SOURCES = [ +CENSUS_TABLE_DATA_SOURCES = [ { "source": "Council Area blk", # "resolution": "LAD", @@ -593,7 +593,7 @@ def get_table_metadata( self.remove_all_partition_keys(context) records = [] - for data_source in DATA_SOURCES: + for data_source in CENSUS_TABLE_DATA_SOURCES: resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] From b7baea2d0fbd17b49038a5f03fdeafa63113eb29 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:11:17 +0100 Subject: [PATCH 57/60] Change minimum version requirement --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 550ef4b..693bbc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. 
"icecream >=2.1.3", # General debugging tool "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names - "zipfile-deflate64 >= 0.2.0", # Required for handling zipped files in Scotland DAG + "zipfile-deflate64 >= 0.1.0", # Required for handling zipped files in Scotland DAG "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages "openpyxl >=3.1.3", # For reading Excel files From 068920a8ffa122d7a99c04c53729fe44a70c9b51 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:54:50 +0100 Subject: [PATCH 58/60] Fix deprecated warning --- python/popgetter/assets/gb_sct/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 8ad4b35..d02c56e 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -1003,7 +1003,7 @@ def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: ) ) derived_metrics.append(new_table) - new_mmd = source_mmd.copy() + new_mmd = source_mmd.model_copy(deep=True) new_mmd.parent_metric_id = source_mmd.source_metric_id new_mmd.metric_parquet_path = parquet_file_name new_mmd.hxl_tag = metric_spec.hxltag From 8dacb88f7b5fefd7af0bbf23dcd8d6adc047e552 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:59:27 +0100 Subject: [PATCH 59/60] Remove obsolete class field --- python/popgetter/assets/gb_sct/__init__.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index d02c56e..092cc78 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -344,7 +344,6 @@ class ScotlandGeometryLevel: level: str hxl_tag: str geo_id_column: str - census_table_column: str name_columns: dict[str, str] # keys = language codes, values = column names url: str lookup_url: str | None @@ -358,10 +357,7 @@ class ScotlandGeometryLevel: level="OutputArea2011", hxl_tag="TBD", geo_id_column="OA_CODE", - census_table_column="TODO", - # census_table_column="Census 2021 Data Zone Code", - name_columns={"eng": "OutputArea2011Name"}, # TODO - # url=URL_SHAPEFILE, + name_columns={"eng": "OutputArea2011Name"}, url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", lookup_url=None, lookup_sheet=None, @@ -373,8 +369,6 @@ class ScotlandGeometryLevel: level="DataZone2011", hxl_tag="TBD", geo_id_column="DataZone", - census_table_column="TODO", - # census_table_column="Census 2021 Data Zone Code", name_columns={"eng": "Name"}, url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, @@ -386,8 +380,6 @@ class ScotlandGeometryLevel: # level="OA11", # hxl_tag="TBD", # geo_id_column="OA_CODE", - # census_table_column="TODO", - # # census_table_column="Census 2021 Data Zone Code", # name_columns={"eng": "OA_CODE"}, # # url=URL_SHAPEFILE, # url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", @@ -401,8 +393,6 @@ class ScotlandGeometryLevel: level="CouncilArea2011", hxl_tag="TBD", geo_id_column="CouncilArea2011Code", - census_table_column="TODO", - # census_table_column="Census 2021 Data Zone Code", name_columns={"eng": "CouncilArea2011Name"}, url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, From ad3e787d4adac9baa87a44ba49e04e2d56c78233 
Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 10 Jul 2024 16:00:44 +0100
Subject: [PATCH 60/60] Allow cache dir to already exist

---
 python/popgetter/assets/gb_sct/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index 092cc78..a72fc24 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -523,7 +523,7 @@ def __init__(self):
         super().__init__()
 
         # Make temp directory
-        Path(CACHE_DIR).mkdir(parents=True)
+        Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
     def _catalog(self, context) -> pd.DataFrame:
        """Creates a catalog of the individual census tables from all data sources."""
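
Editor's worked examples for the series above. First, the pivot-and-flatten step that patches 47 and 48 wrap in a try/except is easier to see on a toy table. This is a minimal, self-contained sketch of the same technique with invented data; the real make_pivot additionally runs inside _derived_metrics and is not reproduced here.

import pandas as pd

# Minimal sketch (toy data, assumed shapes) of the make_pivot technique:
# pivot on the "Unnamed: N" columns, then flatten the MultiIndex with SEP.
SEP = "__"
df = pd.DataFrame(
    {
        "Unnamed: 0": ["S00000001", "S00000001", "S00000002", "S00000002"],
        "Unnamed: 1": ["Male", "Female", "Male", "Female"],
        "0 to 4": [10, 12, 7, 9],
        "5 to 9": [11, 8, 6, 10],
    }
)
pivot_cols = [
    col for col in df.columns if col != "Unnamed: 0" and col.startswith("Unnamed: ")
]
pivot = df.pivot_table(index="Unnamed: 0", columns=pivot_cols, aggfunc="sum")
if isinstance(pivot.columns, pd.MultiIndex):
    # Each flattened column becomes one derived metric downstream
    pivot.columns = [
        SEP.join(map(str, col)).strip() for col in pivot.columns.to_numpy()
    ]
pivot.index = pivot.index.rename("GEO_ID")
print(pivot.columns.tolist())
# ['0 to 4__Female', '0 to 4__Male', '5 to 9__Female', '5 to 9__Male']

The SEP-joined column names are what the series later recycles into the hxl_tag and human-readable name for each derived metric.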
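
The make_int cleanup that follows the pivot deserves a note of its own: the published tables encode counts as strings with thousands separators and use "-" as a zero placeholder. A sketch of the same coercion on invented data:

import pandas as pd

# Sketch of the make_int coercion: object columns with "1,234"-style counts
# and "-" placeholders become plain integers; numeric columns pass through.
df = pd.DataFrame({"adults": ["1,234", "-", None], "children": [5, 6, 7]})
for col in df:
    if df[col].dtype == "object":
        df[col] = (
            df[col].str.replace(",", "").str.replace("-", "0").fillna("0").astype(int)
        )
print(df.dtypes)  # both columns now hold integers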
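
The DerivedColumn entries rely on pandas.query with backtick-quoted column names and @-references to module-level lists. One caveat on PATCH 51: @name is resolved in the caller's scope at call time, so after the lists are renamed to upper case, query strings that still read, e.g., @children_5_to_17 appear to point at names that no longer exist and would raise an UndefinedVariableError when the lambda runs. The sketch below (invented data, a consistently named constant) shows the intended filter-and-aggregate pattern:

import pandas as pd

# Sketch of one derived metric: filter the reshaped long-format table with a
# query (backticks allow spaces in column names), then sum counts per GEO_ID.
AGE_CODE = "`Age Category`"
INFANTS = ["0 to 4"]
long_df = pd.DataFrame(
    {
        "GEO_ID": ["S00000001", "S00000001", "S00000002", "S00000002"],
        "Age Category": ["0 to 4", "5 to 9", "0 to 4", "5 to 9"],
        "Sex Label": ["Male", "Male", "Female", "Female"],
        "Count": [10, 11, 9, 10],
    }
)
infants_0_4 = (
    long_df.query(f"{AGE_CODE} in @INFANTS")  # @INFANTS matches the constant above
    .groupby("GEO_ID", as_index=True)
    .sum(numeric_only=True)
    .rename(columns={"Count": "infants_0_4"})
)
print(infants_0_4)  # one row per GEO_ID: S00000001 -> 10, S00000002 -> 9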
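
To exercise the caching helper that PATCH 51 moves into popgetter.assets.gb_sct.utils: the first call downloads and writes the file, and repeat calls short-circuit on the existing path. A usage sketch, assuming the Output Area source URL from CENSUS_TABLE_DATA_SOURCES and a throwaway temp directory rather than the DAG's CACHE_DIR:

import tempfile
from pathlib import Path

from popgetter.assets.gb_sct.utils import download_file

cache = tempfile.mkdtemp()  # illustrative only, not the DAG's CACHE_DIR
url = (
    "https://nrscensusprodumb.blob.core.windows.net/downloads/"
    "Output%20Area%20blk.zip"
)
first = download_file(cache, url)   # downloads and writes into the cache
second = download_file(cache, url)  # cache hit: no request, same path returned
assert first == second and Path(first).exists()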
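
Finally, the deprecation fix in PATCH 58 is the pydantic v2 rename of BaseModel.copy() to model_copy(); deep=True matters because each derived metadata record is mutated after copying. A minimal sketch with a stand-in model (not popgetter's actual MetricMetadata class):

from pydantic import BaseModel

class MetricMeta(BaseModel):  # stand-in model for illustration only
    hxl_tag: str
    parquet_column_name: str

src = MetricMeta(hxl_tag="#population", parquet_column_name="All people")
new = src.model_copy(deep=True)
new.parquet_column_name = "infants_0_4"
assert src.parquet_column_name == "All people"  # source metadata is untouched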