From d2c0901289c427a1e0510f9d2220897714709c5f Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 28 Feb 2024 10:36:10 +0000 Subject: [PATCH 01/60] Initial application of asset to scotland methods --- pyproject.toml | 1 + python/popgetter/__init__.py | 7 + python/popgetter/assets/__init__.py | 2 +- python/popgetter/assets/scotland/__init__.py | 22 +++ python/popgetter/assets/scotland/scotland.py | 175 +++++++++++++++++++ 5 files changed, 206 insertions(+), 1 deletion(-) create mode 100755 python/popgetter/assets/scotland/__init__.py create mode 100644 python/popgetter/assets/scotland/scotland.py diff --git a/pyproject.toml b/pyproject.toml index d73748b..9e4b01b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "pydantic<2.0.0", "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. "icecream >=2.1.3", # General debugging tool + "openpyxl", ] [project.optional-dependencies] diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 951e4df..65f247a 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -29,6 +29,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), + *load_assets_from_package_module(assets.scotland, group_name="scotland"), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( @@ -50,6 +51,12 @@ description="Downloads UK data.", ) +job_uk: UnresolvedAssetJobDefinition = define_asset_job( + name="job_scotland", + selection=AssetSelection.groups("scotland"), + description="Downloads Scotland data.", +) + defs: Definitions = Definitions( assets=all_assets, schedules=[], diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index e050bf8..7ecbf5d 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,3 +1,3 @@ from __future__ import annotations -from . import be, uk, us # noqa: F401 +from . import be, uk, us, scotland # noqa: F401 diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py new file mode 100755 index 0000000..fb102a3 --- /dev/null +++ b/python/popgetter/assets/scotland/__init__.py @@ -0,0 +1,22 @@ +#!/usr/bin/python3 +from __future__ import annotations + +from dagster import ( + asset, +) + +from popgetter.metadata import ( + CountryMetadata, +) + +from . import ( + scotland, +) + + +# @asset(key_prefix=asset_prefix) +# def get_country_metadata() -> CountryMetadata: +# """ +# Returns a CountryMetadata of metadata about the country. 
+# """ +# return country diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py new file mode 100644 index 0000000..8e7a0ab --- /dev/null +++ b/python/popgetter/assets/scotland/scotland.py @@ -0,0 +1,175 @@ +import subprocess +import requests +import zipfile +import os +import urllib +import pandas as pd +import geopandas +import numpy as np +import matplotlib.pyplot as plt + +from dagster import asset + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" +} + + +def download_file( + cache_dir: str, + url: str, + file_name: str | None = None, + headers: dict[str, str] = HEADERS, +) -> str: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = ( + os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name + ) + if not os.path.exists(file_name): + r = requests.get(url, allow_redirects=True, headers=headers) + open(file_name, "wb").write(r.content) + return file_name + + +""" +Notes: + - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial + publication + - Reusing some bits of code from UKCensusAPI: + https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py +""" + + +class Scotland: + cache_dir: str + lookup: pd.DataFrame + + URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" + URL1 = "https://www.scotlandscensus.gov.uk/" + URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" + URL_LOOKUP = ( + "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" + ) + URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" + + data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] + GeoCodeLookup = { + "LAD": 0, # "Council Area blk" + # MSOA (intermediate zone)? 
+ "LSOA11": 1, # "SNS Data Zone 2011 blk" + "OA11": 2, # "Output Area blk" + } + SCGeoCodes = ["CA", "DZ", "OA"] + + def __init__(self, cache_dir: str = "./cache/"): + """Init and get lookup.""" + self.cache_dir = cache_dir + os.makedirs(self.cache_dir, exist_ok=True) + lookup_path = download_file(self.cache_dir, self.URL_LOOKUP) + self.lookup = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") + + def __source_to_zip(self, source_name: str) -> str: + """Downloads if necessary and returns the name of the locally cached zip file + of the source data (replacing spaces with _)""" + file_name = os.path.join(self.cache_dir, source_name.replace(" ", "_") + ".zip") + if not os.path.isfile(file_name): + if source_name.split()[0] == "Council": + scotland_src = ( + self.URL1 + + "media/hjmd0oqr/" + + source_name.lower().replace(" ", "-") + + ".zip" + ) + else: + scotland_src = self.URL2 + urllib.parse.quote(source_name) + ".zip" + return download_file(self.cache_dir, scotland_src, file_name) + + def get_rawdata(self, table: str, resolution: str) -> pd.DataFrame: + """Gets the raw csv data and metadata.""" + if not os.path.exists(os.path.join(self.cache_dir, table + ".csv")): + try: + zf = self.__source_to_zip( + self.data_sources[self.GeoCodeLookup[resolution]] + ) + with zipfile.ZipFile(zf) as zip_ref: + zip_ref.extractall(self.cache_dir) + except NotImplementedError as _: + subprocess.run(["unzip", "-o", zf, "-d", self.cache_dir]) + + return pd.read_csv(os.path.join(self.cache_dir, table + ".csv")) + + def get_lc1117sc(self) -> pd.DataFrame: + """Gets LC1117SC age by sex table at OA11 resolution.""" + df = self.get_rawdata("LC1117SC", "OA11").rename( + columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} + ) + return df.loc[df["OA11"].isin(self.lookup["OutputArea2011Code"])] + + def get_shapefile(self) -> geopandas.GeoDataFrame: + """Gets the shape file for OA11 resolution.""" + file_name = download_file(self.cache_dir, self.URL_SHAPEFILE) + geo = geopandas.read_file(f"zip://{file_name}") + return geo[geo["geo_code"].isin(self.lookup["OutputArea2011Code"])] + +@asset +def download_data(): + cache_dir = "./cache/" + scotland = Scotland(cache_dir) + +@asset +def download_census() -> pd.DataFrame: + cache_dir = "./cache/" + scotland = Scotland(cache_dir) + return scotland.get_lc1117sc() + + +@asset +def download_shapefile() -> geopandas.GeoDataFrame: + cache_dir = "./cache/" + scotland = Scotland(cache_dir) + return scotland.get_shapefile() + +# @multi_asset( +# ins={ +# "individual_census_table": AssetIn( +# key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping +# ), +# # "individual_census_table": AssetIn(key_prefix=asset_prefix), +# "filter_needed_catalog": AssetIn(key_prefix=asset_prefix), +# }, +# def generate_plots(): +# geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") +# # Plot +# merged["log10 people"] = np.log10(merged["All people"]) +# merged[merged["Age bracket"] == "All people"].plot( +# column="log10 people", legend=True +# ) +# plt.show() + + +def main(): + cache_dir = "./cache/" + + # Make instance of Scotland + scotland = Scotland(cache_dir) + + # Get OA11 Age/Sex data + pop = scotland.get_lc1117sc() + + # Get shape file + geo = scotland.get_shapefile() + + # Merge + merged = geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") + print(merged) + + # Plot + merged["log10 people"] = np.log10(merged["All people"]) + merged[merged["Age bracket"] == "All people"].plot( + column="log10 people", legend=True + ) + plt.show() + + +if 
__name__ == "__main__": + main() From 714ceab12d8d371cd60c9bd61b1b511c4659c1f6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 6 Mar 2024 17:32:23 +0000 Subject: [PATCH 02/60] Add zipfile-deflate64 dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9e4b01b..ae68ef1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. "icecream >=2.1.3", # General debugging tool "openpyxl", + "zipfile-deflate64", ] [project.optional-dependencies] From cb8e86b664d48451299979a9906c8fb337c0afda Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 6 Mar 2024 17:33:22 +0000 Subject: [PATCH 03/60] Add key_prefix --- python/popgetter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 65f247a..a98dd3b 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -29,7 +29,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module(assets.scotland, group_name="scotland"), + *load_assets_from_package_module(assets.scotland, group_name="scotland", key_prefix="uk-scotland"), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( From 2e7429173b25a892d40b6ba32a295dc660f21e3d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 6 Mar 2024 17:37:29 +0000 Subject: [PATCH 04/60] Begin refactor with dagster --- python/popgetter/assets/scotland/scotland.py | 274 ++++++++++--------- 1 file changed, 152 insertions(+), 122 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 8e7a0ab..a7de2a6 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -1,14 +1,17 @@ import subprocess +import tempfile +from typing import Tuple import requests -import zipfile +# import zipfile +import zipfile_deflate64 as zipfile import os -import urllib +import urllib.parse as urlparse import pandas as pd import geopandas import numpy as np import matplotlib.pyplot as plt - -from dagster import asset +from icecream import ic +from dagster import AssetIn, AssetOut, DynamicPartitionsDefinition, MetadataValue, Output, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" @@ -40,103 +43,158 @@ def download_file( """ -class Scotland: - cache_dir: str - lookup: pd.DataFrame +PARTITIONS_DEF_NAME = "dataset_tables" +dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) - URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" - URL1 = "https://www.scotlandscensus.gov.uk/" - URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" - URL_LOOKUP = ( - "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" - ) - URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" - - data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] - GeoCodeLookup = { - "LAD": 0, # "Council Area blk" - # MSOA (intermediate zone)? 
- "LSOA11": 1, # "SNS Data Zone 2011 blk" - "OA11": 2, # "Output Area blk" +# cache_dir = tempfile.mkdtemp() +cache_dir = "./cache" + +URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" +URL1 = "https://www.scotlandscensus.gov.uk/" +URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" +URL_LOOKUP = ( + "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" +) +URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" + +data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] +GeoCodeLookup = { + "LAD": 0, # "Council Area blk" + # MSOA (intermediate zone)? + "LSOA11": 1, # "SNS Data Zone 2011 blk" + "OA11": 2, # "Output Area blk" +} +# SCGeoCodes = ["CA", "DZ", "OA"] + + +DATA_SOURCES = { + 0: { + "source": "Council Area blk", + "resolution": "LAD", + "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip" + }, + 1: { + "source": "SNS Data Zone 2011 blk", + "resolution": "LSOA11", + "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip" + }, + 2: { + "source": "Output Area blk", + "resolution": "OA11", + "url": URL2 + urlparse.quote("Output Area blk") + ".zip" } - SCGeoCodes = ["CA", "DZ", "OA"] - - def __init__(self, cache_dir: str = "./cache/"): - """Init and get lookup.""" - self.cache_dir = cache_dir - os.makedirs(self.cache_dir, exist_ok=True) - lookup_path = download_file(self.cache_dir, self.URL_LOOKUP) - self.lookup = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") - - def __source_to_zip(self, source_name: str) -> str: - """Downloads if necessary and returns the name of the locally cached zip file - of the source data (replacing spaces with _)""" - file_name = os.path.join(self.cache_dir, source_name.replace(" ", "_") + ".zip") - if not os.path.isfile(file_name): - if source_name.split()[0] == "Council": - scotland_src = ( - self.URL1 - + "media/hjmd0oqr/" - + source_name.lower().replace(" ", "-") - + ".zip" - ) - else: - scotland_src = self.URL2 + urllib.parse.quote(source_name) + ".zip" - return download_file(self.cache_dir, scotland_src, file_name) - - def get_rawdata(self, table: str, resolution: str) -> pd.DataFrame: - """Gets the raw csv data and metadata.""" - if not os.path.exists(os.path.join(self.cache_dir, table + ".csv")): - try: - zf = self.__source_to_zip( - self.data_sources[self.GeoCodeLookup[resolution]] - ) - with zipfile.ZipFile(zf) as zip_ref: - zip_ref.extractall(self.cache_dir) - except NotImplementedError as _: - subprocess.run(["unzip", "-o", zf, "-d", self.cache_dir]) - - return pd.read_csv(os.path.join(self.cache_dir, table + ".csv")) - - def get_lc1117sc(self) -> pd.DataFrame: - """Gets LC1117SC age by sex table at OA11 resolution.""" - df = self.get_rawdata("LC1117SC", "OA11").rename( - columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} - ) - return df.loc[df["OA11"].isin(self.lookup["OutputArea2011Code"])] - - def get_shapefile(self) -> geopandas.GeoDataFrame: - """Gets the shape file for OA11 resolution.""" - file_name = download_file(self.cache_dir, self.URL_SHAPEFILE) - geo = geopandas.read_file(f"zip://{file_name}") - return geo[geo["geo_code"].isin(self.lookup["OutputArea2011Code"])] +} -@asset -def download_data(): - cache_dir = "./cache/" - scotland = Scotland(cache_dir) + +# NB. 
Make sure no spaces in asset keys +@multi_asset( + outs={ + "oa_dz_iz_2011_lookup": AssetOut(), + "data_zone_2011_lookup": AssetOut(), + "intermediate_zone_2011_lookup": AssetOut(), + }, +) +def download_lookup(): + os.makedirs(cache_dir, exist_ok=True) + lookup_path = download_file(cache_dir, URL_LOOKUP) + df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") + df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") + df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") + return df1, df2, df3 + + +def source_to_zip(source_name: str, url: str) -> str: + """Downloads if necessary and returns the name of the locally cached zip file + of the source data (replacing spaces with _)""" + file_name = os.path.join(cache_dir, source_name.replace(" ", "_") + ".zip") + return download_file(cache_dir, url, file_name) @asset -def download_census() -> pd.DataFrame: - cache_dir = "./cache/" - scotland = Scotland(cache_dir) - return scotland.get_lc1117sc() +def make_catalog(context) -> pd.DataFrame: + records = [] + for data_source in DATA_SOURCES.values(): + resolution = data_source["resolution"] + source = data_source["source"] + url = data_source["url"] + with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: + for name in zip_ref.namelist(): + print(name) + record = { + "resolution": resolution, + "source": source, + "url": url, + "file_name": name, + } + records.append(record) + ic(record) + zip_ref.extract(name, cache_dir) + + for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): + context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) + + # Create a dynamic partition for the datasets listed in the catalog + catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) + partition_keys = catalog_df["file_name"].to_list() + context.instance.add_dynamic_partitions( + partitions_def_name=PARTITIONS_DEF_NAME, partition_keys=partition_keys + ) + context.add_output_metadata( + metadata={ + "num_records": len(catalog_df), + "ignored_datasets": "", + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) + ), + "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), + "preview": MetadataValue.md(catalog_df.to_markdown()), + } + ) + return catalog_df + + +def get_table(context, table_details) -> pd.DataFrame: + df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) + context.add_output_metadata( + metadata={ + "title": table_details["file_nae"].iloc[0], + # "title": "Test", + "num_records": len(df), # Metadata can be any key-value pair + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + } + ) + return df + +@asset(partitions_def=dataset_node_partition) +def individual_census_table(context, make_catalog: pd.DataFrame) -> pd.DataFrame: + partition_key = context.asset_partition_key_for_output() + ic(partition_key) + row = make_catalog.loc[make_catalog["file_name"].isin([partition_key])] + ic(row) + return get_table(context, table_details=row) + + +# # TODO: add to derived +# def get_lc1117sc(context, lookup, ) -> pd.DataFrame: +# """Gets LC1117SC age by sex table at OA11 resolution.""" +# df = get_rawdata("LC1117SC", "OA11").rename( +# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} +# ) +# return df.loc[df["OA11"].isin(lookup["OutputArea2011Code"])] -@asset -def download_shapefile() -> geopandas.GeoDataFrame: - cache_dir = 
"./cache/" - scotland = Scotland(cache_dir) - return scotland.get_shapefile() - -# @multi_asset( -# ins={ -# "individual_census_table": AssetIn( -# key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping -# ), -# # "individual_census_table": AssetIn(key_prefix=asset_prefix), -# "filter_needed_catalog": AssetIn(key_prefix=asset_prefix), -# }, +# # TODO: add shapefile +# def shapefile(context) -> geopandas.GeoDataFrame: +# """Gets the shape file for OA11 resolution.""" +# file_name = download_file(cache_dir, URL_SHAPEFILE) +# geo = geopandas.read_file(f"zip://{file_name}") +# return geo[geo["geo_code"].isin(lookup["OutputArea2011Code"])] + + +# # TODO: add plots +# @asset # def generate_plots(): # geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") # # Plot @@ -145,31 +203,3 @@ def download_shapefile() -> geopandas.GeoDataFrame: # column="log10 people", legend=True # ) # plt.show() - - -def main(): - cache_dir = "./cache/" - - # Make instance of Scotland - scotland = Scotland(cache_dir) - - # Get OA11 Age/Sex data - pop = scotland.get_lc1117sc() - - # Get shape file - geo = scotland.get_shapefile() - - # Merge - merged = geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") - print(merged) - - # Plot - merged["log10 people"] = np.log10(merged["All people"]) - merged[merged["Age bracket"] == "All people"].plot( - column="log10 people", legend=True - ) - plt.show() - - -if __name__ == "__main__": - main() From 9dadda2cca1fb21e9ea88a5e190c139ab967cb98 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 7 Mar 2024 22:19:13 +0000 Subject: [PATCH 05/60] Format, make partition keys unique, add geographies asset --- python/popgetter/assets/scotland/scotland.py | 157 ++++++++++++------- 1 file changed, 99 insertions(+), 58 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index a7de2a6..3885167 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -2,6 +2,7 @@ import tempfile from typing import Tuple import requests + # import zipfile import zipfile_deflate64 as zipfile import os @@ -11,28 +12,20 @@ import numpy as np import matplotlib.pyplot as plt from icecream import ic -from dagster import AssetIn, AssetOut, DynamicPartitionsDefinition, MetadataValue, Output, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset - -HEADERS = { - "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" -} - - -def download_file( - cache_dir: str, - url: str, - file_name: str | None = None, - headers: dict[str, str] = HEADERS, -) -> str: - """Downloads file checking first if exists in cache, returning file name.""" - file_name = ( - os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name - ) - if not os.path.exists(file_name): - r = requests.get(url, allow_redirects=True, headers=headers) - open(file_name, "wb").write(r.content) - return file_name - +import popgetter +from dagster import ( + AssetIn, + AssetKey, + AssetOut, + DynamicPartitionsDefinition, + MetadataValue, + Output, + SpecificPartitionsPartitionMapping, + StaticPartitionsDefinition, + asset, + multi_asset, + op, +) """ Notes: @@ -64,28 +57,46 @@ def download_file( "LSOA11": 1, # "SNS Data Zone 2011 blk" "OA11": 2, # "Output Area blk" } -# SCGeoCodes = ["CA", "DZ", "OA"] - DATA_SOURCES = { 0: { "source": "Council Area blk", "resolution": "LAD", - "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip" + 
"url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", }, 1: { "source": "SNS Data Zone 2011 blk", "resolution": "LSOA11", - "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip" + "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", }, 2: { "source": "Output Area blk", "resolution": "OA11", - "url": URL2 + urlparse.quote("Output Area blk") + ".zip" - } + "url": URL2 + urlparse.quote("Output Area blk") + ".zip", + }, +} + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" } +def download_file( + cache_dir: str, + url: str, + file_name: str | None = None, + headers: dict[str, str] = HEADERS, +) -> str: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = ( + os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name + ) + if not os.path.exists(file_name): + r = requests.get(url, allow_redirects=True, headers=headers) + open(file_name, "wb").write(r.content) + return file_name + + # NB. Make sure no spaces in asset keys @multi_asset( outs={ @@ -109,34 +120,39 @@ def source_to_zip(source_name: str, url: str) -> str: file_name = os.path.join(cache_dir, source_name.replace(" ", "_") + ".zip") return download_file(cache_dir, url, file_name) + @asset -def make_catalog(context) -> pd.DataFrame: +def catalog(context) -> pd.DataFrame: records = [] for data_source in DATA_SOURCES.values(): resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] - with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: + with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: for name in zip_ref.namelist(): print(name) record = { - "resolution": resolution, - "source": source, - "url": url, - "file_name": name, - } + "resolution": resolution, + "source": source, + "url": url, + "file_name": name, + } records.append(record) ic(record) zip_ref.extract(name, cache_dir) - + + # TODO: check if required for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - partition_keys = catalog_df["file_name"].to_list() + catalog_df["partition_keys"] = ( + catalog_df[["resolution", "file_name"]].agg("/".join, axis=1).to_list() + ) context.instance.add_dynamic_partitions( - partitions_def_name=PARTITIONS_DEF_NAME, partition_keys=partition_keys + partitions_def_name=PARTITIONS_DEF_NAME, + partition_keys=catalog_df["partition_keys"].to_list(), ) context.add_output_metadata( metadata={ @@ -156,9 +172,8 @@ def get_table(context, table_details) -> pd.DataFrame: df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) context.add_output_metadata( metadata={ - "title": table_details["file_nae"].iloc[0], - # "title": "Test", - "num_records": len(df), # Metadata can be any key-value pair + "title": table_details["partition_keys"].iloc[0], + "num_records": len(df), "columns": MetadataValue.md( "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) ), @@ -167,32 +182,58 @@ def get_table(context, table_details) -> pd.DataFrame: ) return df + @asset(partitions_def=dataset_node_partition) -def individual_census_table(context, make_catalog: pd.DataFrame) -> pd.DataFrame: +def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: partition_key = context.asset_partition_key_for_output() - 
ic(partition_key) - row = make_catalog.loc[make_catalog["file_name"].isin([partition_key])] - ic(row) + context.log.info(partition_key) + row = catalog.loc[catalog["partition_keys"].isin([partition_key])] + context.log.info(row) return get_table(context, table_details=row) -# # TODO: add to derived -# def get_lc1117sc(context, lookup, ) -> pd.DataFrame: +# @op +# def lc1117sc(context, individual_census_table, oa_dz_iz_2011_lookup) -> pd.DataFrame: # """Gets LC1117SC age by sex table at OA11 resolution.""" -# df = get_rawdata("LC1117SC", "OA11").rename( -# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} -# ) -# return df.loc[df["OA11"].isin(lookup["OutputArea2011Code"])] +# from popgetter import defs +# with defs.get_asset_value_loader(instance=context.instance) as loader: +# df = loader.load_asset_value(AssetKey(["uk-scotland", "individual_census_table"]), partition_key="LC1117SC.csv") +# df = df.rename( +# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} +# ) +# df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] +# context.add_output_metadata( +# metadata = { +# "title": df["file_name"].iloc[0], +# "num_records": len(df), +# "columns": MetadataValue.md( +# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) +# ), +# "preview": MetadataValue.md(df.head().to_markdown()), +# } +# ) +# return df + +@asset +def geometry(context, oa_dz_iz_2011_lookup) -> geopandas.GeoDataFrame: + """Gets the shape file for OA11 resolution.""" + file_name = download_file(cache_dir, URL_SHAPEFILE) + geo = geopandas.read_file(f"zip://{file_name}") + # TODO: add metadat for geopandas + # context.add_output_metadata( + # metadata={ + # "title": table_details["partition_keys"].iloc[0], + # "num_records": len(df), + # "columns": MetadataValue.md( + # "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + # ), + # "preview": MetadataValue.md(df.head().to_markdown()), + # } + # ) + return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] -# # TODO: add shapefile -# def shapefile(context) -> geopandas.GeoDataFrame: -# """Gets the shape file for OA11 resolution.""" -# file_name = download_file(cache_dir, URL_SHAPEFILE) -# geo = geopandas.read_file(f"zip://{file_name}") -# return geo[geo["geo_code"].isin(lookup["OutputArea2011Code"])] - # # TODO: add plots # @asset # def generate_plots(): From b6e8e1d7809dc20faa185426eab22345fcb01a43 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 8 Mar 2024 10:47:17 +0000 Subject: [PATCH 06/60] Add config to asset_job --- python/popgetter/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index a98dd3b..8f0e279 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -55,6 +55,16 @@ name="job_scotland", selection=AssetSelection.groups("scotland"), description="Downloads Scotland data.", + # https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#asset-based-jobs + config={ + "execution": { + "config": { + "multiprocess": { + "max_concurrent": 20, # limits concurrent assets + }, + } + } + } ) defs: Definitions = Definitions( From 65d27049ac71ae62e36874219afae29ee42a88c1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 9 Mar 2024 08:39:17 +0000 Subject: [PATCH 07/60] Initial dagster rewrite for Scotland --- python/popgetter/assets/scotland/scotland.py | 161 +++++++++++-------- 1 file changed, 98 insertions(+), 63 deletions(-) diff --git 
a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 3885167..b54b9ab 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -1,14 +1,14 @@ +import base64 +from io import BytesIO import subprocess import tempfile from typing import Tuple import requests - -# import zipfile import zipfile_deflate64 as zipfile import os import urllib.parse as urlparse import pandas as pd -import geopandas +import geopandas as gpd import numpy as np import matplotlib.pyplot as plt from icecream import ic @@ -18,8 +18,10 @@ AssetKey, AssetOut, DynamicPartitionsDefinition, + MaterializeResult, MetadataValue, Output, + Partition, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, @@ -58,23 +60,23 @@ "OA11": 2, # "Output Area blk" } -DATA_SOURCES = { - 0: { +DATA_SOURCES = [ + { "source": "Council Area blk", "resolution": "LAD", "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", }, - 1: { + { "source": "SNS Data Zone 2011 blk", "resolution": "LSOA11", "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", }, - 2: { + { "source": "Output Area blk", "resolution": "OA11", "url": URL2 + urlparse.quote("Output Area blk") + ".zip", }, -} +] HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" @@ -105,7 +107,8 @@ def download_file( "intermediate_zone_2011_lookup": AssetOut(), }, ) -def download_lookup(): +def lookups(): + """Creates lookup dataframes.""" os.makedirs(cache_dir, exist_ok=True) lookup_path = download_file(cache_dir, URL_LOOKUP) df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") @@ -123,22 +126,22 @@ def source_to_zip(source_name: str, url: str) -> str: @asset def catalog(context) -> pd.DataFrame: + """Creates a catalog of the individual census tables from all data sources.""" records = [] - for data_source in DATA_SOURCES.values(): + for data_source in DATA_SOURCES: resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: for name in zip_ref.namelist(): - print(name) record = { "resolution": resolution, "source": source, "url": url, "file_name": name, } + context.log.debug(record) records.append(record) - ic(record) zip_ref.extract(name, cache_dir) # TODO: check if required @@ -147,11 +150,12 @@ def catalog(context) -> pd.DataFrame: # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_keys"] = ( - catalog_df[["resolution", "file_name"]].agg("/".join, axis=1).to_list() + catalog_df["partition_keys"] = catalog_df[["resolution", "file_name"]].agg( + lambda s: "/".join(s).rsplit(".")[0], axis=1 ) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, + # To ensure this is unique, prepend the resolution partition_keys=catalog_df["partition_keys"].to_list(), ) context.add_output_metadata( @@ -185,62 +189,93 @@ def get_table(context, table_details) -> pd.DataFrame: @asset(partitions_def=dataset_node_partition) def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: + """Creates individual census tables as dataframe.""" partition_key = context.asset_partition_key_for_output() context.log.info(partition_key) - row = catalog.loc[catalog["partition_keys"].isin([partition_key])] - context.log.info(row) - return get_table(context, table_details=row) - - -# @op -# def 
lc1117sc(context, individual_census_table, oa_dz_iz_2011_lookup) -> pd.DataFrame: -# """Gets LC1117SC age by sex table at OA11 resolution.""" -# from popgetter import defs -# with defs.get_asset_value_loader(instance=context.instance) as loader: -# df = loader.load_asset_value(AssetKey(["uk-scotland", "individual_census_table"]), partition_key="LC1117SC.csv") -# df = df.rename( -# columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} -# ) -# df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] -# context.add_output_metadata( -# metadata = { -# "title": df["file_name"].iloc[0], -# "num_records": len(df), -# "columns": MetadataValue.md( -# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) -# ), -# "preview": MetadataValue.md(df.head().to_markdown()), -# } -# ) -# return df + table_details = catalog.loc[catalog["partition_keys"].isin([partition_key])] + context.log.info(table_details) + return get_table(context, table_details) + + +_subset = [ + { + "partition_keys": "OA11/LC1117SC", + }, +] +_subset_partition_keys: list[str] = [r["partition_keys"] for r in _subset] +subset_mapping = SpecificPartitionsPartitionMapping(_subset_partition_keys) +subset_partition = StaticPartitionsDefinition(_subset_partition_keys) + + +@multi_asset( + ins={ + "individual_census_table": AssetIn(partition_mapping=subset_mapping), + }, + outs={ + "oa11_lc1117sc": AssetOut(), + }, + partitions_def=dataset_node_partition, +) +def oa11_lc1117sc( + context, individual_census_table, oa_dz_iz_2011_lookup +) -> pd.DataFrame: + """Gets LC1117SC age by sex table at OA11 resolution.""" + df = individual_census_table.rename( + columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} + ) + df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] + context.add_output_metadata( + metadata={ + "title": _subset_partition_keys, + "num_records": len(df), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + } + ) + return df @asset -def geometry(context, oa_dz_iz_2011_lookup) -> geopandas.GeoDataFrame: +def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: """Gets the shape file for OA11 resolution.""" file_name = download_file(cache_dir, URL_SHAPEFILE) - geo = geopandas.read_file(f"zip://{file_name}") - # TODO: add metadat for geopandas - # context.add_output_metadata( - # metadata={ - # "title": table_details["partition_keys"].iloc[0], - # "num_records": len(df), - # "columns": MetadataValue.md( - # "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - # ), - # "preview": MetadataValue.md(df.head().to_markdown()), - # } - # ) + geo = gpd.read_file(f"zip://{file_name}") + context.add_output_metadata( + metadata={ + "title": "Geometry file", + "num_records": len(geo), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in geo.columns.to_list()]) + ), + "preview": MetadataValue.md(geo.head().to_markdown()), + } + ) return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] -# # TODO: add plots -# @asset -# def generate_plots(): -# geo.merge(pop, left_on="geo_code", right_on="OA11", how="left") -# # Plot -# merged["log10 people"] = np.log10(merged["All people"]) -# merged[merged["Age bracket"] == "All people"].plot( -# column="log10 people", legend=True -# ) -# plt.show() +@multi_asset( + ins={ + "oa11_lc1117sc": AssetIn(partition_mapping=subset_mapping), + "geometry": AssetIn(partition_mapping=subset_mapping), + }, + outs={ + 
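        # Single output; the rendered map is attached as inline markdown metadata rather than written to disk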
"plot": AssetOut(), + }, + partitions_def=dataset_node_partition, +) +def plot(geometry: gpd.GeoDataFrame, oa11_lc1117sc: pd.DataFrame): + """Plots map with log density of people.""" + merged = geometry.merge( + oa11_lc1117sc, left_on="geo_code", right_on="OA11", how="left" + ) + merged["log10 people"] = np.log10(merged["All people"]) + merged[merged["Age bracket"] == "All people"].plot( + column="log10 people", legend=True + ) + buffer = BytesIO() + plt.savefig(buffer, format="png") + image_data = base64.b64encode(buffer.getvalue()) + md_content = f"![img](data:image/png;base64,{image_data.decode()})" + return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) From 6b20ab1de89b5604f193771d047d13f3db6891f0 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 15 Mar 2024 18:49:03 +0000 Subject: [PATCH 08/60] Add function to add metadata and metadata index asset --- python/popgetter/__init__.py | 2 + python/popgetter/assets/scotland/scotland.py | 60 ++++++++++---------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 8f0e279..37e5c7c 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -70,6 +70,8 @@ defs: Definitions = Definitions( assets=all_assets, schedules=[], + # Example with multiple configs including for production: + # https://docs.dagster.io/guides/dagster/transitioning-data-pipelines-from-development-to-production#production resources={"pipes_subprocess_client": PipesSubprocessClient()}, jobs=[job_be, job_us, job_uk], ) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index b54b9ab..33e4c3f 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -51,6 +51,7 @@ "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" +URL_METADATA_INDEX = "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] GeoCodeLookup = { @@ -124,6 +125,30 @@ def source_to_zip(source_name: str, url: str) -> str: return download_file(cache_dir, url, file_name) +def add_metadata(context, df: pd.DataFrame | gpd.DataFrame, title: str | list[str]): + context.add_output_metadata( + metadata={ + "title": title, + "num_records": len(df), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + } + ) + +@asset +def metadata_index(context) -> pd.DataFrame: + dfs = pd.read_excel( + URL_METADATA_INDEX, + sheet_name=None, + storage_options={"User-Agent": "Mozilla/5.0"}, + ) + df = dfs["Index"] + add_metadata(context, df, "Metadata for census tables") + return df + + @asset def catalog(context) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" @@ -153,6 +178,8 @@ def catalog(context) -> pd.DataFrame: catalog_df["partition_keys"] = catalog_df[["resolution", "file_name"]].agg( lambda s: "/".join(s).rsplit(".")[0], axis=1 ) + # TODO: consider filtering here based on a set of keys to keep derived from + # config (i.e. 
backend/frontend modes) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, # To ensure this is unique, prepend the resolution @@ -174,16 +201,7 @@ def catalog(context) -> pd.DataFrame: def get_table(context, table_details) -> pd.DataFrame: df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) - context.add_output_metadata( - metadata={ - "title": table_details["partition_keys"].iloc[0], - "num_records": len(df), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - ), - "preview": MetadataValue.md(df.head().to_markdown()), - } - ) + add_metadata(context, df, table_details["partition_keys"].iloc[0]) return df @@ -224,16 +242,7 @@ def oa11_lc1117sc( columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} ) df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] - context.add_output_metadata( - metadata={ - "title": _subset_partition_keys, - "num_records": len(df), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - ), - "preview": MetadataValue.md(df.head().to_markdown()), - } - ) + add_metadata(context, df, _subset_partition_keys) return df @@ -242,16 +251,7 @@ def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: """Gets the shape file for OA11 resolution.""" file_name = download_file(cache_dir, URL_SHAPEFILE) geo = gpd.read_file(f"zip://{file_name}") - context.add_output_metadata( - metadata={ - "title": "Geometry file", - "num_records": len(geo), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in geo.columns.to_list()]) - ), - "preview": MetadataValue.md(geo.head().to_markdown()), - } - ) + add_metadata(context, geo, "Geometry file") return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] From 9cef0c38779fb659841616e0bc6f656e3277cceb Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 18 Mar 2024 09:37:10 +0000 Subject: [PATCH 09/60] Use markdown_from_plot util --- python/popgetter/assets/scotland/scotland.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 33e4c3f..1ac7de2 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -3,6 +3,7 @@ import subprocess import tempfile from typing import Tuple +from popgetter.utils import markdown_from_plot import requests import zipfile_deflate64 as zipfile import os @@ -125,7 +126,7 @@ def source_to_zip(source_name: str, url: str) -> str: return download_file(cache_dir, url, file_name) -def add_metadata(context, df: pd.DataFrame | gpd.DataFrame, title: str | list[str]): +def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list[str]): context.add_output_metadata( metadata={ "title": title, @@ -274,8 +275,5 @@ def plot(geometry: gpd.GeoDataFrame, oa11_lc1117sc: pd.DataFrame): merged[merged["Age bracket"] == "All people"].plot( column="log10 people", legend=True ) - buffer = BytesIO() - plt.savefig(buffer, format="png") - image_data = base64.b64encode(buffer.getvalue()) - md_content = f"![img](data:image/png;base64,{image_data.decode()})" + md_content = markdown_from_plot(plt) return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) From eafa2388618e3b9f408b234ad60af033adfaeb80 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 17 Apr 2024 19:24:57 +0100 Subject: [PATCH 10/60] Add required tables --- 
python/popgetter/assets/scotland/scotland.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 1ac7de2..4f19773 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -52,7 +52,9 @@ "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_METADATA_INDEX = "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +URL_METADATA_INDEX = ( + "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +) data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] GeoCodeLookup = { @@ -62,6 +64,10 @@ "OA11": 2, # "Output Area blk" } +# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 +REQUIRED_TABLES = ["QS103SC", "QS104SC", "KS201SC", "DC1117SC", "DC2101SC", "DC6206SC"] +REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) + DATA_SOURCES = [ { "source": "Council Area blk", @@ -180,11 +186,15 @@ def catalog(context) -> pd.DataFrame: lambda s: "/".join(s).rsplit(".")[0], axis=1 ) # TODO: consider filtering here based on a set of keys to keep derived from - # config (i.e. backend/frontend modes) + # config (i.e. backend/frontend modes) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, # To ensure this is unique, prepend the resolution - partition_keys=catalog_df["partition_keys"].to_list(), + # partition_keys=catalog_df["partition_keys"].to_list(), + partition_keys=catalog_df.loc[ + catalog_df["partition_keys"].str.contains(REQUIRED_TABLES_REGEX), + "partition_keys", + ].to_list(), ) context.add_output_metadata( metadata={ From 5f65c914383d0be3fc38572b56e74ef3d7def9ba Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 18 Apr 2024 13:20:28 +0100 Subject: [PATCH 11/60] Add catalog_metadata, revise catalog towards metric metadata --- python/popgetter/assets/scotland/scotland.py | 172 +++++++++++++------ 1 file changed, 122 insertions(+), 50 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 4f19773..70a1296 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -1,35 +1,28 @@ -import base64 -from io import BytesIO -import subprocess -import tempfile -from typing import Tuple -from popgetter.utils import markdown_from_plot -import requests -import zipfile_deflate64 as zipfile -import os +from __future__ import annotations + import urllib.parse as urlparse -import pandas as pd +from pathlib import Path + import geopandas as gpd -import numpy as np import matplotlib.pyplot as plt -from icecream import ic -import popgetter +import numpy as np +import pandas as pd +import requests +import zipfile_deflate64 as zipfile from dagster import ( AssetIn, - AssetKey, AssetOut, DynamicPartitionsDefinition, MaterializeResult, MetadataValue, - Output, - Partition, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset, - op, ) +from popgetter.utils import markdown_from_plot + """ Notes: - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial @@ -52,7 +45,7 @@ 
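    # Lookup between 2011 output areas (OA), data zones (DZ) and intermediate zones (IZ)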
"https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_METADATA_INDEX = ( +URL_CATALOG_METADATA = ( "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" ) @@ -94,16 +87,15 @@ def download_file( cache_dir: str, url: str, - file_name: str | None = None, + file_name: Path | None = None, headers: dict[str, str] = HEADERS, -) -> str: +) -> Path: """Downloads file checking first if exists in cache, returning file name.""" - file_name = ( - os.path.join(cache_dir, url.split("/")[-1]) if file_name is None else file_name - ) - if not os.path.exists(file_name): + file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name + if not Path(file_name).exists(): r = requests.get(url, allow_redirects=True, headers=headers) - open(file_name, "wb").write(r.content) + with Path(file_name).open("wb") as fp: + fp.write(r.content) return file_name @@ -117,7 +109,7 @@ def download_file( ) def lookups(): """Creates lookup dataframes.""" - os.makedirs(cache_dir, exist_ok=True) + Path(cache_dir).mkdir(parents=True, exist_ok=True) lookup_path = download_file(cache_dir, URL_LOOKUP) df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") @@ -125,10 +117,10 @@ def lookups(): return df1, df2, df3 -def source_to_zip(source_name: str, url: str) -> str: +def source_to_zip(source_name: str, url: str) -> Path: """Downloads if necessary and returns the name of the locally cached zip file of the source data (replacing spaces with _)""" - file_name = os.path.join(cache_dir, source_name.replace(" ", "_") + ".zip") + file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") return download_file(cache_dir, url, file_name) @@ -144,20 +136,59 @@ def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list } ) + @asset -def metadata_index(context) -> pd.DataFrame: - dfs = pd.read_excel( - URL_METADATA_INDEX, +def catalog_metadata(context) -> pd.DataFrame: + catalog_metadata_df = pd.read_excel( + URL_CATALOG_METADATA, sheet_name=None, + header=None, storage_options={"User-Agent": "Mozilla/5.0"}, + )["Index"].rename( + columns={ + 0: "census_release", + 1: "table_name", + 2: "description", + 3: "population_coverage", + 4: "variable", + 5: "catalog_resolution", + 6: "year", + 7: "additional_url", + 8: "population_coverage_and_variable", + } ) - df = dfs["Index"] - add_metadata(context, df, "Metadata for census tables") - return df + add_metadata(context, catalog_metadata_df, "Metadata for census tables") + return catalog_metadata_df + + +def get_table_metadata( + catalog_metadata: pd.DataFrame, table_name: str +) -> dict[str, str]: + """Returns a dict of table metadata for a given table name.""" + rows = catalog_metadata.loc[catalog_metadata.loc[:, "table_name"].eq(table_name)] + census_release = rows.loc[:, "description"].unique()[0] + description = rows.loc[:, "description"].unique()[0] + population_coverage = rows.loc[:, "description"].unique()[0] + variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) + catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] + year = int(rows.loc[:, "year"].unique()[0]) + return { + "census_release": census_release, + "description": description, + "population_coverage": population_coverage, + "variables": variables, + "catalog_resolution": catalog_resolution, + "year": 
str(year), + "human_readable_name": f"{description} ({population_coverage})", + } + + +def get_table_name(file_name: str) -> str: + return file_name.rsplit(".csv")[0] @asset -def catalog(context) -> pd.DataFrame: +def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" records = [] for data_source in DATA_SOURCES: @@ -165,16 +196,53 @@ def catalog(context) -> pd.DataFrame: source = data_source["source"] url = data_source["url"] with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: - for name in zip_ref.namelist(): + for file_name in zip_ref.namelist(): + # Get table name + table_name = get_table_name(file_name) + + # Skip bulk output files and missing tables from catalog_metadata + if ( + "bulk_output" in file_name.lower() + or catalog_metadata.loc[:, "table_name"].ne(table_name).all() + ): + continue + + # Get table metadata + table_metadata = get_table_metadata(catalog_metadata, table_name) + + # Create a record for each census table use same keys as MetricMetadata + # where possible since this makes it simpler to populate derived + # metrics downstream record = { "resolution": resolution, + "catalog_resolution": table_metadata["catalog_resolution"], "source": source, "url": url, - "file_name": name, + "file_name": file_name, + "table_name": table_name, + "year": table_metadata["year"], + # Use constructed name of description and coverage + "human_readable_name": table_metadata["human_readable_name"], + "source_metric_id": None, + # Use catalog_metadata description + "description": table_metadata["description"], + "hxl_tag": None, + "metric_parquet_file_url": None, + "parquet_column_name": None, + "parquet_margin_of_error_column": None, + "parquet_margin_of_error_file": None, + "potential_denominator_ids": None, + "parent_metric_id": None, + # TODO: check this is not an ID but a name + "source_data_release_id": table_metadata["census_release"], + "source_download_url": url, + # TODO: what should this be? + "source_archive_file_path": None, + "source_documentation_url": URL_CATALOG_METADATA, } context.log.debug(record) records.append(record) - zip_ref.extract(name, cache_dir) + zip_ref.extract(file_name, cache_dir) # TODO: check if required for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): @@ -182,15 +250,16 @@ def catalog(context) -> pd.DataFrame: # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_keys"] = catalog_df[["resolution", "file_name"]].agg( - lambda s: "/".join(s).rsplit(".")[0], axis=1 + catalog_df["partition_keys"] = ( + catalog_df[["year", "resolution", "table_name"]] + .astype(str) + .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) ) # TODO: consider filtering here based on a set of keys to keep derived from # config (i.e. 
backend/frontend modes) context.instance.add_dynamic_partitions( partitions_def_name=PARTITIONS_DEF_NAME, - # To ensure this is unique, prepend the resolution - # partition_keys=catalog_df["partition_keys"].to_list(), + # To ensure this is unique, prepend the resolution, partition_keys=catalog_df.loc[ catalog_df["partition_keys"].str.contains(REQUIRED_TABLES_REGEX), "partition_keys", @@ -211,9 +280,9 @@ def catalog(context) -> pd.DataFrame: def get_table(context, table_details) -> pd.DataFrame: - df = pd.read_csv(os.path.join(cache_dir, table_details["file_name"].iloc[0])) - add_metadata(context, df, table_details["partition_keys"].iloc[0]) - return df + table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) + add_metadata(context, table_df, table_details["partition_keys"].iloc[0]) + return table_df @asset(partitions_def=dataset_node_partition) @@ -228,7 +297,7 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: _subset = [ { - "partition_keys": "OA11/LC1117SC", + "partition_keys": "2011/DCLC1117SC", }, ] _subset_partition_keys: list[str] = [r["partition_keys"] for r in _subset] @@ -236,6 +305,7 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: subset_partition = StaticPartitionsDefinition(_subset_partition_keys) +# TODO: revise to include all partitions and extract column name for metadata from catalog @multi_asset( ins={ "individual_census_table": AssetIn(partition_mapping=subset_mapping), @@ -249,12 +319,14 @@ def oa11_lc1117sc( context, individual_census_table, oa_dz_iz_2011_lookup ) -> pd.DataFrame: """Gets LC1117SC age by sex table at OA11 resolution.""" - df = individual_census_table.rename( + derived_census_table = individual_census_table.rename( columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} ) - df = df.loc[df["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] - add_metadata(context, df, _subset_partition_keys) - return df + derived_census_table = derived_census_table.loc[ + derived_census_table["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"]) + ] + add_metadata(context, derived_census_table, _subset_partition_keys) + return derived_census_table @asset From 687c33fca5c71f072df1be58b3577b8ab3756db6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 18 Apr 2024 20:58:41 +0100 Subject: [PATCH 12/60] Fix extracted zip file names --- python/popgetter/assets/scotland/scotland.py | 30 +++++++++++--------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 70a1296..c042df6 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -58,7 +58,15 @@ } # From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 -REQUIRED_TABLES = ["QS103SC", "QS104SC", "KS201SC", "DC1117SC", "DC2101SC", "DC6206SC"] +REQUIRED_TABLES = [ + "QS103SC", + "QS104SC", + "KS201SC", + "DC1117SC", + "DC2101SC", + "DC6206SC", + "LC1117SC", +] REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) DATA_SOURCES = [ @@ -195,7 +203,8 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] - with zipfile.ZipFile(source_to_zip(source, url)) as zip_ref: + zip_file_name = source_to_zip(source, url) + with zipfile.ZipFile(zip_file_name) as zip_ref: for file_name in 
zip_ref.namelist(): # Get table name table_name = get_table_name(file_name) @@ -218,7 +227,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: "catalog_resolution": table_metadata["catalog_resolution"], "source": source, "url": url, - "file_name": file_name, + "file_name": Path(source) / file_name, "table_name": table_name, "year": table_metadata["year"], # Use constructed name of description and coverage @@ -242,7 +251,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: } context.log.debug(record) records.append(record) - zip_ref.extract(file_name, cache_dir) + zip_ref.extract(file_name, Path(cache_dir) / source) # TODO: check if required for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): @@ -295,14 +304,9 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: return get_table(context, table_details) -_subset = [ - { - "partition_keys": "2011/DCLC1117SC", - }, -] -_subset_partition_keys: list[str] = [r["partition_keys"] for r in _subset] -subset_mapping = SpecificPartitionsPartitionMapping(_subset_partition_keys) -subset_partition = StaticPartitionsDefinition(_subset_partition_keys) +subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] +subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) +subset_partition = StaticPartitionsDefinition(subset_partition_keys) # TODO: revise to include all partitions and extract column name for metadata from catalog @@ -325,7 +329,7 @@ def oa11_lc1117sc( derived_census_table = derived_census_table.loc[ derived_census_table["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"]) ] - add_metadata(context, derived_census_table, _subset_partition_keys) + add_metadata(context, derived_census_table, subset_partition_keys) return derived_census_table From 739c9d1a784fec5e5dce6a637752fb5e6141e1a8 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 22 Apr 2024 21:26:18 +0100 Subject: [PATCH 13/60] Rename as df column to partition_key --- python/popgetter/assets/scotland/scotland.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index c042df6..1029394 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -259,7 +259,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_keys"] = ( + catalog_df["partition_key"] = ( catalog_df[["year", "resolution", "table_name"]] .astype(str) .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) @@ -270,8 +270,8 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: partitions_def_name=PARTITIONS_DEF_NAME, # To ensure this is unique, prepend the resolution, partition_keys=catalog_df.loc[ - catalog_df["partition_keys"].str.contains(REQUIRED_TABLES_REGEX), - "partition_keys", + catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), + "partition_key", ].to_list(), ) context.add_output_metadata( @@ -290,7 +290,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: def get_table(context, table_details) -> pd.DataFrame: table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) - add_metadata(context, table_df, table_details["partition_keys"].iloc[0]) + add_metadata(context, table_df, 
table_details["partition_key"].iloc[0]) return table_df @@ -299,7 +299,7 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: """Creates individual census tables as dataframe.""" partition_key = context.asset_partition_key_for_output() context.log.info(partition_key) - table_details = catalog.loc[catalog["partition_keys"].isin([partition_key])] + table_details = catalog.loc[catalog["partition_key"].isin([partition_key])] context.log.info(table_details) return get_table(context, table_details) From b9e76b0bdc51b4cbfb61239a88250d885c838cc5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 22 Apr 2024 21:29:37 +0100 Subject: [PATCH 14/60] Add initial census_derived for Scotland --- .../assets/scotland/census_derived.py | 272 ++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 python/popgetter/assets/scotland/census_derived.py diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py new file mode 100644 index 0000000..102929d --- /dev/null +++ b/python/popgetter/assets/scotland/census_derived.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import pandas as pd +from dagster import ( + AssetIn, + AssetOut, + SpecificPartitionsPartitionMapping, + StaticPartitionsDefinition, + asset, + multi_asset, +) + +from ...metadata import MetricMetadata +from .scotland import add_metadata, dataset_node_partition + + +def get_lc1117sc_metric( + lc1117sc: pd.DataFrame, col: str, subset: list[str] +) -> pd.DataFrame: + lc1117sc_transformed = lc1117sc.rename( + columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} + ) + lc1117sc_transformed = lc1117sc_transformed.loc[ + ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : + ] + return ( + lc1117sc_transformed.loc[ + lc1117sc_transformed["Age Category"].isin(subset), + ["OA11CD", col], + ] + .groupby("OA11CD") + .agg("sum") + .rename(columns={col: "Count"}) + ) + + +ALL_PEOPLE = ["All people"] +INFANTS_AGE_0_TO_4 = ["0 to 4"] +CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +ADULTS = [ + "18 to 19", + "20 to 24", + "25 to 29", + "30 to 34", + "35 to 39", + "40 to 44", + "45 to 49", + "50 to 54", + "55 to 59", + "60 to 64", + "65 to 69", + "70 to 74", + "75 to 79", + "80 to 84", + "85 to 89", + "90 to 94", + "95 and over", +] + +needed_dataset_list = [ + { + # Population by OA11, Period: 2011 + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "#population+oa11+2011", + # TODO: this partition key does not have a single column for source + "source_column": "", + } +] +needed_dataset_partions_keys: list[str] = [ + r["partition_key"] for r in needed_dataset_list +] +needed_dataset_mapping = SpecificPartitionsPartitionMapping( + needed_dataset_partions_keys +) +needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) + +# Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) +_derived_columns: list[dict] = [ + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_children_age5_17", + "filter_func": lambda df: get_lc1117sc_metric( + df, "All people", CHILDREN_AGE_5_TO_17 + ), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_infants_age0_4", + "filter_func": lambda df: get_lc1117sc_metric( + df, "All people", INFANTS_AGE_0_TO_4 + ), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": 
"population_children_age0_17", + "filter_func": lambda df: get_lc1117sc_metric( + df, "All people", CHILDREN_AGE_0_TO_17 + ), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_adults_f", + "filter_func": lambda df: get_lc1117sc_metric(df, "Females", ADULTS), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_adults_m", + "filter_func": lambda df: get_lc1117sc_metric(df, "Males", ADULTS), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_adults", + "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ADULTS), + }, + { + "partition_key": "2011/OA11/LC1117SC", + "hxltag": "population_ind", + "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ALL_PEOPLE), + }, +] + +derived_columns = pd.DataFrame( + _derived_columns, columns=["node", "hxltag", "filter_func"] +) + + +# record = { +# "resolution": resolution, +# "catalog_resolution": table_metadata["catalog_resolution"], +# "source": source, +# "url": url, +# "file_name": Path(source) / file_name, +# "table_name": table_name, +# "year": table_metadata["year"], +# # Use constructed name of description and coverage +# "human_readable_name": table_metadata["human_readable_name"], +# "source_metric_id": None, +# # Use catalog_metadata description +# "description": table_metadata["description"], +# "hxl_tag": None, +# "metric_parquet_file_url": None, +# "parquet_column_name": None, +# "parquet_margin_of_error_column": None, +# "parquet_margin_of_error_file": None, +# "potential_denominator_ids": None, +# "parent_metric_id": None, +# # TODO: check this is not an ID but a name +# "source_data_release_id": table_metadata["census_release"], +# "source_download_url": url, +# # TODO: what should this be? +# "source_archive_file_path": None, +# "source_documentation_url": URL_CATALOG_METADATA, +# } + + +def census_table_metadata(catalog_row: dict) -> MetricMetadata: + return MetricMetadata( + human_readable_name=catalog_row["human_readable_name"], + source_download_url=catalog_row["source_download_url"], + source_archive_file_path=catalog_row["source_archive_file_path"], + source_documentation_url=catalog_row["source_documentation_url"], + source_data_release_id="TODO", + # TODO - this is a placeholder + parent_metric_id="unknown_at_this_stage", + potential_denominator_ids=None, + parquet_margin_of_error_file=None, + parquet_margin_of_error_column=None, + parquet_column_name=catalog_row["source_column"], + # TODO - this is a placeholder + metric_parquet_file_url="unknown_at_this_stage", + hxl_tag=catalog_row["hxltag"], + description=catalog_row["description"], + source_metric_id=catalog_row["hxltag"], + ) + + +@asset( + ins={ + "catalog": AssetIn(partition_mapping=needed_dataset_mapping), + }, +) +def filter_needed_catalog( + context, needed_datasets, catalog: pd.DataFrame +) -> pd.DataFrame: + needed_df = needed_datasets.merge(catalog, how="inner", on="partition_key") + add_metadata(context, needed_df, "needed_df") + return needed_df + + +@asset +def needed_datasets(context) -> pd.DataFrame: + needed_df = pd.DataFrame( + needed_dataset_list, + columns=["partition_key", "hxltag", "source_column", "derived_columns"], + dtype="string", + ) + add_metadata(context, needed_df, "needed_datasets") + return needed_df + + +@multi_asset( + ins={ + "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), + "filter_needed_catalog": AssetIn(), + }, + outs={ + "source_table": AssetOut(), + "source_mmd": AssetOut(), + }, + 
partitions_def=dataset_node_partition, +) +def get_enriched_tables_scotland( + context, individual_census_table, filter_needed_catalog +) -> tuple[pd.DataFrame, MetricMetadata]: + partition_keys = context.asset_partition_keys_for_input( + input_name="individual_census_table" + ) + output_partition = context.asset_partition_key_for_output("source_table") + if output_partition not in partition_keys: + err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" + raise ValueError(err_msg) + + if output_partition not in individual_census_table: + err_msg = ( + f"Partition key {output_partition} not found in individual_census_table\n" + f"Available keys are {individual_census_table.keys()}" + ) + raise ValueError(err_msg) + result_df = individual_census_table[output_partition] + catalog_row = filter_needed_catalog[ + filter_needed_catalog["partition_key"].eq(output_partition) + ] + catalog_row = catalog_row.to_dict(orient="index") + catalog_row = catalog_row.popitem()[1] + result_mmd = census_table_metadata(catalog_row) + return result_df, result_mmd + + +# TODO: from here + + +@multi_asset( + partitions_def=dataset_node_partition, + ins={ + "source_table": AssetIn(partition_mapping=needed_dataset_mapping), + "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), + }, + outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, +) +def transform_data( + context, + source_table: dict[str, pd.DataFrame], + source_mmd: dict[str, MetricMetadata], +) -> tuple[pd.DataFrame, list[MetricMetadata]]: + partition_key = context.asset_partition_key_for_output("derived_table") + census_table = source_table[partition_key] + parent_mmd = source_mmd[partition_key] + # source_column = parent_mmd.parquet_column_name + metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] + new_series: list[pd.Series] = [] + new_mmds: list[MetricMetadata] = [] + for row_tuple in metrics.itertuples(): + _, _, col_name, group_by_column, filter = row_tuple + new_series.append(filter(census_table)) + new_mmd = parent_mmd.copy() + new_mmd.parent_metric_id = parent_mmd.source_metric_id + new_mmd.hxl_tag = col_name + new_mmds.append(new_mmd) + new_table: pd.DataFrame = pd.concat(new_series, axis=1) + add_metadata(context, new_table, "derived_table") + return new_table, new_mmds From b365a1496dc65ddb561b4dbae1baff5425a99ee0 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:12:28 +0100 Subject: [PATCH 15/60] Add ISO3116-2 field, move download_file to module import --- python/popgetter/assets/scotland/__init__.py | 47 ++++++++++++++++---- python/popgetter/metadata.py | 3 ++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index fb102a3..044a2cb 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -1,22 +1,53 @@ #!/usr/bin/python3 from __future__ import annotations +from pathlib import Path + +import requests from dagster import ( asset, ) +# from popgetter.assets.scotland import country +# from . import ( +# scotland, +# ) from popgetter.metadata import ( CountryMetadata, ) -from . 
import ( - scotland, +country: CountryMetadata = CountryMetadata( + name_short_en="Scotland", + name_official="Kingdom of Belgium", + iso3="GBR", + iso2="GB", + iso3116_2="GB-SCT", ) +WORKING_DIR = Path("scotland") + + +@asset() +def get_country_metadata() -> CountryMetadata: + """Returns a CountryMetadata of metadata about the country.""" + return country + + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" +} + -# @asset(key_prefix=asset_prefix) -# def get_country_metadata() -> CountryMetadata: -# """ -# Returns a CountryMetadata of metadata about the country. -# """ -# return country +def download_file( + cache_dir: str, + url: str, + file_name: Path | None = None, + headers: dict[str, str] = HEADERS, +) -> Path: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name + if not Path(file_name).exists(): + r = requests.get(url, allow_redirects=True, headers=headers) + with Path(file_name).open("wb") as fp: + fp.write(r.content) + return file_name diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index aa6178a..237ca09 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -15,6 +15,9 @@ class CountryMetadata(BaseModel): ) iso3: str = Field(description="The ISO3 code of the country (for example 'BEL').") iso2: str = Field(description="The ISO2 code of the country (for example 'BE').") + iso3116_2: str | None = Field( + description="The ISO3116-2 code for the names of the principal subdivisions (for example 'GB-SCT')." + ) class DataPublisher(BaseModel): From a44e744cc2df6b7b88741b5137d40042363ed4b3 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:13:49 +0100 Subject: [PATCH 16/60] Add ISO3116-2 field to Belgium and UK --- python/popgetter/assets/be/belgium.py | 1 + python/popgetter/assets/uk/united_kingdom.py | 1 + 2 files changed, 2 insertions(+) diff --git a/python/popgetter/assets/be/belgium.py b/python/popgetter/assets/be/belgium.py index 211e0b2..3253e9d 100644 --- a/python/popgetter/assets/be/belgium.py +++ b/python/popgetter/assets/be/belgium.py @@ -9,6 +9,7 @@ name_official="Kingdom of Belgium", iso3="BEL", iso2="BE", + iso3116_2=None, ) WORKING_DIR = Path("belgium") diff --git a/python/popgetter/assets/uk/united_kingdom.py b/python/popgetter/assets/uk/united_kingdom.py index e6ab99f..3f12b39 100644 --- a/python/popgetter/assets/uk/united_kingdom.py +++ b/python/popgetter/assets/uk/united_kingdom.py @@ -7,6 +7,7 @@ name_official="The United Kingdom of Great Britain and Northern Ireland", iso3="GBR", iso2="GB", + iso3116_2=None, ) asset_prefix = "uk" From f65cbaec025f4022b87ac07ee3d0ae7ca7a5b269 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:39:58 +0100 Subject: [PATCH 17/60] Rename country metadata asset --- python/popgetter/assets/scotland/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 044a2cb..a84b2bc 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -8,10 +8,6 @@ asset, ) -# from popgetter.assets.scotland import country -# from . 
import ( -# scotland, -# ) from popgetter.metadata import ( CountryMetadata, ) @@ -28,7 +24,7 @@ @asset() -def get_country_metadata() -> CountryMetadata: +def country_metadata() -> CountryMetadata: """Returns a CountryMetadata of metadata about the country.""" return country From dd5dcfd865f6227beba04455912569ba9337d0ce Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 14:56:23 +0100 Subject: [PATCH 18/60] Refactor and fix derived module, add geometry module --- .../assets/scotland/census_derived.py | 111 +++++++++++++----- .../assets/scotland/census_geometry.py | 17 +++ python/popgetter/assets/scotland/scotland.py | 92 ++------------- 3 files changed, 105 insertions(+), 115 deletions(-) create mode 100644 python/popgetter/assets/scotland/census_geometry.py diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 102929d..90e94d6 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -1,21 +1,29 @@ from __future__ import annotations +import geopandas as gpd +import numpy as np import pandas as pd from dagster import ( AssetIn, AssetOut, + MaterializeResult, + MetadataValue, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, multi_asset, ) +from icecream import ic +from matplotlib import pyplot as plt + +from popgetter.utils import markdown_from_plot from ...metadata import MetricMetadata from .scotland import add_metadata, dataset_node_partition def get_lc1117sc_metric( - lc1117sc: pd.DataFrame, col: str, subset: list[str] + lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] ) -> pd.DataFrame: lc1117sc_transformed = lc1117sc.rename( columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} @@ -30,7 +38,7 @@ def get_lc1117sc_metric( ] .groupby("OA11CD") .agg("sum") - .rename(columns={col: "Count"}) + .rename(columns={col: output_col}) ) @@ -80,48 +88,56 @@ def get_lc1117sc_metric( { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_children_age5_17", - "filter_func": lambda df: get_lc1117sc_metric( - df, "All people", CHILDREN_AGE_5_TO_17 + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, CHILDREN_AGE_5_TO_17 ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_infants_age0_4", - "filter_func": lambda df: get_lc1117sc_metric( - df, "All people", INFANTS_AGE_0_TO_4 + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, INFANTS_AGE_0_TO_4 ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_children_age0_17", - "filter_func": lambda df: get_lc1117sc_metric( - df, "All people", CHILDREN_AGE_0_TO_17 + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, CHILDREN_AGE_0_TO_17 ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_adults_f", - "filter_func": lambda df: get_lc1117sc_metric(df, "Females", ADULTS), + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "Females", output_col, ADULTS + ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_adults_m", - "filter_func": lambda df: get_lc1117sc_metric(df, "Males", ADULTS), + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "Males", output_col, ADULTS + ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_adults", - "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ADULTS), + "filter_func": lambda df, 
output_col: get_lc1117sc_metric( + df, "All people", output_col, ADULTS + ), }, { "partition_key": "2011/OA11/LC1117SC", "hxltag": "population_ind", - "filter_func": lambda df: get_lc1117sc_metric(df, "All people", ALL_PEOPLE), + "filter_func": lambda df, output_col: get_lc1117sc_metric( + df, "All people", output_col, ALL_PEOPLE + ), }, ] derived_columns = pd.DataFrame( - _derived_columns, columns=["node", "hxltag", "filter_func"] + _derived_columns, columns=["partition_key", "hxltag", "filter_func"] ) @@ -214,20 +230,19 @@ def get_enriched_tables_scotland( context, individual_census_table, filter_needed_catalog ) -> tuple[pd.DataFrame, MetricMetadata]: partition_keys = context.asset_partition_keys_for_input( - input_name="individual_census_table" + input_name="individual_census_table", ) output_partition = context.asset_partition_key_for_output("source_table") + ic(partition_keys) + ic(len(partition_keys)) + ic(output_partition) + ic(type(output_partition)) + ic(individual_census_table) if output_partition not in partition_keys: err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" raise ValueError(err_msg) - if output_partition not in individual_census_table: - err_msg = ( - f"Partition key {output_partition} not found in individual_census_table\n" - f"Available keys are {individual_census_table.keys()}" - ) - raise ValueError(err_msg) - result_df = individual_census_table[output_partition] + result_df = individual_census_table catalog_row = filter_needed_catalog[ filter_needed_catalog["partition_key"].eq(output_partition) ] @@ -237,9 +252,6 @@ def get_enriched_tables_scotland( return result_df, result_mmd -# TODO: from here - - @multi_asset( partitions_def=dataset_node_partition, ins={ @@ -250,23 +262,58 @@ def get_enriched_tables_scotland( ) def transform_data( context, - source_table: dict[str, pd.DataFrame], - source_mmd: dict[str, MetricMetadata], + source_table: pd.DataFrame, + source_mmd: MetricMetadata, ) -> tuple[pd.DataFrame, list[MetricMetadata]]: partition_key = context.asset_partition_key_for_output("derived_table") - census_table = source_table[partition_key] - parent_mmd = source_mmd[partition_key] + census_table = source_table.copy() + parent_mmd = source_mmd.copy() # source_column = parent_mmd.parquet_column_name metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] new_series: list[pd.Series] = [] new_mmds: list[MetricMetadata] = [] - for row_tuple in metrics.itertuples(): - _, _, col_name, group_by_column, filter = row_tuple - new_series.append(filter(census_table)) + for _, _, col_name, filter in metrics.itertuples(): + # Create column + column: pd.Series = filter(census_table, col_name) + ic(f"col_name: {col_name}") + new_series.append(column) + + # Construct metadata new_mmd = parent_mmd.copy() new_mmd.parent_metric_id = parent_mmd.source_metric_id new_mmd.hxl_tag = col_name new_mmds.append(new_mmd) + + # Merge series new_table: pd.DataFrame = pd.concat(new_series, axis=1) - add_metadata(context, new_table, "derived_table") + add_metadata( + context, + df=new_table, + title=f"Derived table ({partition_key})", + output_name="derived_table", + ) return new_table, new_mmds + + +@multi_asset( + ins={ + "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), + "geometry": AssetIn(partition_mapping=needed_dataset_mapping), + }, + outs={ + "plot": AssetOut(), + }, + partitions_def=dataset_node_partition, +) +def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): + 
"""Plots map with log density of people.""" + merged = geometry.merge( + derived_table[["population_ind"]], + left_on="geo_code", + right_index=True, + how="left", + ) + merged["log10 people"] = np.log10(merged["population_ind"]) + merged.plot(column="log10 people", legend=True) + md_content = markdown_from_plot(plt) + return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py new file mode 100644 index 0000000..544ca1b --- /dev/null +++ b/python/popgetter/assets/scotland/census_geometry.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import geopandas as gpd +from dagster import asset + +from popgetter.assets.scotland import download_file + +from .scotland import URL_SHAPEFILE, add_metadata, cache_dir + + +@asset +def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: + """Gets the shape file for OA11 resolution.""" + file_name = download_file(cache_dir, URL_SHAPEFILE) + geo = gpd.read_file(f"zip://{file_name}") + add_metadata(context, geo, "Geometry file") + return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/scotland.py index 1029394..e81a95c 100644 --- a/python/popgetter/assets/scotland/scotland.py +++ b/python/popgetter/assets/scotland/scotland.py @@ -4,16 +4,11 @@ from pathlib import Path import geopandas as gpd -import matplotlib.pyplot as plt -import numpy as np import pandas as pd -import requests import zipfile_deflate64 as zipfile from dagster import ( - AssetIn, AssetOut, DynamicPartitionsDefinition, - MaterializeResult, MetadataValue, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, @@ -21,7 +16,7 @@ multi_asset, ) -from popgetter.utils import markdown_from_plot +from popgetter.assets.scotland import download_file """ Notes: @@ -87,25 +82,6 @@ }, ] -HEADERS = { - "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" -} - - -def download_file( - cache_dir: str, - url: str, - file_name: Path | None = None, - headers: dict[str, str] = HEADERS, -) -> Path: - """Downloads file checking first if exists in cache, returning file name.""" - file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name - if not Path(file_name).exists(): - r = requests.get(url, allow_redirects=True, headers=headers) - with Path(file_name).open("wb") as fp: - fp.write(r.content) - return file_name - # NB. 
Make sure no spaces in asset keys @multi_asset( @@ -132,7 +108,12 @@ def source_to_zip(source_name: str, url: str) -> Path: return download_file(cache_dir, url, file_name) -def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list[str]): +def add_metadata( + context, + df: pd.DataFrame | gpd.GeoDataFrame, + title: str | list[str], + output_name: str | None = None, +): context.add_output_metadata( metadata={ "title": title, @@ -141,7 +122,8 @@ def add_metadata(context, df: pd.DataFrame | gpd.GeoDataFrame, title: str | list "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) ), "preview": MetadataValue.md(df.head().to_markdown()), - } + }, + output_name=output_name, ) @@ -307,59 +289,3 @@ def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) subset_partition = StaticPartitionsDefinition(subset_partition_keys) - - -# TODO: revise to include all partitions and extract column name for metadata from catalog -@multi_asset( - ins={ - "individual_census_table": AssetIn(partition_mapping=subset_mapping), - }, - outs={ - "oa11_lc1117sc": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def oa11_lc1117sc( - context, individual_census_table, oa_dz_iz_2011_lookup -) -> pd.DataFrame: - """Gets LC1117SC age by sex table at OA11 resolution.""" - derived_census_table = individual_census_table.rename( - columns={"Unnamed: 0": "OA11", "Unnamed: 1": "Age bracket"} - ) - derived_census_table = derived_census_table.loc[ - derived_census_table["OA11"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"]) - ] - add_metadata(context, derived_census_table, subset_partition_keys) - return derived_census_table - - -@asset -def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: - """Gets the shape file for OA11 resolution.""" - file_name = download_file(cache_dir, URL_SHAPEFILE) - geo = gpd.read_file(f"zip://{file_name}") - add_metadata(context, geo, "Geometry file") - return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] - - -@multi_asset( - ins={ - "oa11_lc1117sc": AssetIn(partition_mapping=subset_mapping), - "geometry": AssetIn(partition_mapping=subset_mapping), - }, - outs={ - "plot": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def plot(geometry: gpd.GeoDataFrame, oa11_lc1117sc: pd.DataFrame): - """Plots map with log density of people.""" - merged = geometry.merge( - oa11_lc1117sc, left_on="geo_code", right_on="OA11", how="left" - ) - merged["log10 people"] = np.log10(merged["All people"]) - merged[merged["Age bracket"] == "All people"].plot( - column="log10 people", legend=True - ) - md_content = markdown_from_plot(plt) - return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) From 6844c6f456d918134cfea403c9736da5f1cfc27e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 16:43:53 +0100 Subject: [PATCH 19/60] Rename modules to match Belgium --- python/popgetter/assets/scotland/__init__.py | 2 -- python/popgetter/assets/scotland/census_geometry.py | 2 +- .../popgetter/assets/scotland/{scotland.py => census_tables.py} | 0 3 files changed, 1 insertion(+), 3 deletions(-) rename python/popgetter/assets/scotland/{scotland.py => census_tables.py} (100%) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index a84b2bc..7953386 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ 
b/python/popgetter/assets/scotland/__init__.py @@ -20,8 +20,6 @@ iso3116_2="GB-SCT", ) -WORKING_DIR = Path("scotland") - @asset() def country_metadata() -> CountryMetadata: diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py index 544ca1b..5187181 100644 --- a/python/popgetter/assets/scotland/census_geometry.py +++ b/python/popgetter/assets/scotland/census_geometry.py @@ -5,7 +5,7 @@ from popgetter.assets.scotland import download_file -from .scotland import URL_SHAPEFILE, add_metadata, cache_dir +from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir @asset diff --git a/python/popgetter/assets/scotland/scotland.py b/python/popgetter/assets/scotland/census_tables.py similarity index 100% rename from python/popgetter/assets/scotland/scotland.py rename to python/popgetter/assets/scotland/census_tables.py From c4bd1ef19a65a44a64cfe212a669198cd696bf0a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 17:27:13 +0100 Subject: [PATCH 20/60] Fix imports, refactor Scotland catalog asset names --- python/popgetter/__init__.py | 21 +++++++---- python/popgetter/assets/scotland/__init__.py | 2 +- .../assets/scotland/census_derived.py | 10 +++--- .../assets/scotland/census_tables.py | 36 ++++++++++--------- 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 31d2355..3e1a49c 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -3,7 +3,8 @@ from collections.abc import Sequence from pathlib import Path -from python.popgetter.utils import StagingDirResource +# from python.popgetter.utils import StagingDirResource +from popgetter.utils import StagingDirResource __version__ = "0.1.0" @@ -27,12 +28,17 @@ ) from popgetter import assets +from popgetter.assets.scotland.census_tables import ( + dataset_node_partition as dataset_partition_scotland, +) all_assets: Sequence[AssetsDefinition | SourceAsset | CacheableAssetsDefinition] = [ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module(assets.scotland, group_name="scotland", key_prefix="uk-scotland"), + *load_assets_from_package_module( + assets.scotland, group_name="scotland", key_prefix="uk-scotland" + ), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( @@ -54,20 +60,21 @@ description="Downloads UK data.", ) -job_uk: UnresolvedAssetJobDefinition = define_asset_job( +job_scotland: UnresolvedAssetJobDefinition = define_asset_job( name="job_scotland", selection=AssetSelection.groups("scotland"), description="Downloads Scotland data.", - # https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#asset-based-jobs + partitions_def=dataset_partition_scotland, + # https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#asset-based-jobs config={ "execution": { "config": { "multiprocess": { - "max_concurrent": 20, # limits concurrent assets + "max_concurrent": 20, # limits concurrent assets }, } } - } + }, ) @@ -82,5 +89,5 @@ staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) ), }, - jobs=[job_be, job_us, job_uk], + jobs=[job_be, job_us, job_uk, job_scotland], ) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 7953386..77371e3 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ 
b/python/popgetter/assets/scotland/__init__.py @@ -14,7 +14,7 @@ country: CountryMetadata = CountryMetadata( name_short_en="Scotland", - name_official="Kingdom of Belgium", + name_official="Scotland", iso3="GBR", iso2="GB", iso3116_2="GB-SCT", diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 90e94d6..724f292 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -19,7 +19,7 @@ from popgetter.utils import markdown_from_plot from ...metadata import MetricMetadata -from .scotland import add_metadata, dataset_node_partition +from .census_tables import add_metadata, dataset_node_partition def get_lc1117sc_metric( @@ -193,13 +193,15 @@ def census_table_metadata(catalog_row: dict) -> MetricMetadata: @asset( ins={ - "catalog": AssetIn(partition_mapping=needed_dataset_mapping), + "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), }, ) def filter_needed_catalog( - context, needed_datasets, catalog: pd.DataFrame + context, needed_datasets, catalog_as_dataframe: pd.DataFrame ) -> pd.DataFrame: - needed_df = needed_datasets.merge(catalog, how="inner", on="partition_key") + needed_df = needed_datasets.merge( + catalog_as_dataframe, how="inner", on="partition_key" + ) add_metadata(context, needed_df, "needed_df") return needed_df diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index e81a95c..73ae065 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -40,7 +40,7 @@ "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_CATALOG_METADATA = ( +URL_CATALOG = ( "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" ) @@ -128,9 +128,9 @@ def add_metadata( @asset -def catalog_metadata(context) -> pd.DataFrame: - catalog_metadata_df = pd.read_excel( - URL_CATALOG_METADATA, +def catalog_reference(context) -> pd.DataFrame: + catalog_reference = pd.read_excel( + URL_CATALOG, sheet_name=None, header=None, storage_options={"User-Agent": "Mozilla/5.0"}, @@ -147,15 +147,15 @@ def catalog_metadata(context) -> pd.DataFrame: 8: "population_coverage_and_variable", } ) - add_metadata(context, catalog_metadata_df, "Metadata for census tables") - return catalog_metadata_df + add_metadata(context, catalog_reference, "Metadata for census tables") + return catalog_reference def get_table_metadata( - catalog_metadata: pd.DataFrame, table_name: str + catalog_reference: pd.DataFrame, table_name: str ) -> dict[str, str]: """Returns a dict of table metadata for a given table name.""" - rows = catalog_metadata.loc[catalog_metadata.loc[:, "table_name"].eq(table_name)] + rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] census_release = rows.loc[:, "description"].unique()[0] description = rows.loc[:, "description"].unique()[0] population_coverage = rows.loc[:, "description"].unique()[0] @@ -178,7 +178,7 @@ def get_table_name(file_name: str) -> str: @asset -def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: +def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" records = [] for data_source in DATA_SOURCES: @@ 
-191,15 +191,15 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: # Get table name table_name = get_table_name(file_name) - # Skip bulk output files and missing tables from catalog_metadata + # Skip bulk output files and missing tables from catalog_reference if ( "bulk_output" in file_name.lower() - or catalog_metadata.loc[:, "table_name"].ne(table_name).all() + or catalog_reference.loc[:, "table_name"].ne(table_name).all() ): continue # Get table metadata - table_metadata = get_table_metadata(catalog_metadata, table_name) + table_metadata = get_table_metadata(catalog_reference, table_name) # Create a record for each census table use same keys as MetricMetadata # where possible since this makes it simpler to populate derived @@ -215,7 +215,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: # Use constructed name of description and coverage "human_readable_name": table_metadata["human_readable_name"], "source_metric_id": None, - # Use catalog_metadata description + # Use catalog_reference description "description": table_metadata["description"], "hxl_tag": None, "metric_parquet_file_url": None, @@ -229,7 +229,7 @@ def catalog(context, catalog_metadata: pd.DataFrame) -> pd.DataFrame: "source_download_url": url, # TODO: what should this be? "source_archive_file_path": None, - "source_documentation_url": URL_CATALOG_METADATA, + "source_documentation_url": URL_CATALOG, } context.log.debug(record) records.append(record) @@ -277,11 +277,15 @@ def get_table(context, table_details) -> pd.DataFrame: @asset(partitions_def=dataset_node_partition) -def individual_census_table(context, catalog: pd.DataFrame) -> pd.DataFrame: +def individual_census_table( + context, catalog_as_dataframe: pd.DataFrame +) -> pd.DataFrame: """Creates individual census tables as dataframe.""" partition_key = context.asset_partition_key_for_output() context.log.info(partition_key) - table_details = catalog.loc[catalog["partition_key"].isin([partition_key])] + table_details = catalog_as_dataframe.loc[ + catalog_as_dataframe["partition_key"].isin([partition_key]) + ] context.log.info(table_details) return get_table(context, table_details) From 596729cd9f682bf970d0af4f6e98f4916b7d2888 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 23 Apr 2024 17:43:55 +0100 Subject: [PATCH 21/60] Add data publisher for Scotland --- python/popgetter/assets/scotland/__init__.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 77371e3..d85fcdd 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -8,9 +8,7 @@ asset, ) -from popgetter.metadata import ( - CountryMetadata, -) +from popgetter.metadata import CountryMetadata, DataPublisher country: CountryMetadata = CountryMetadata( name_short_en="Scotland", @@ -20,6 +18,16 @@ iso3116_2="GB-SCT", ) +publisher: DataPublisher = DataPublisher( + name="National Records of Scotland", + url="https://www.nrscotland.gov.uk/", + description="National Records of Scotland (NRS) is a Non-Ministerial Department of " + "the Scottish Government. 
Our purpose is to collect, preserve and " + "produce information about Scotland's people and history and make it " + "available to inform current and future generations.", + countries_of_interest=[country], +) + @asset() def country_metadata() -> CountryMetadata: @@ -27,6 +35,12 @@ def country_metadata() -> CountryMetadata: return country +@asset() +def publisher_metadata(): + """Returns a DataPublisher of metadata about the publisher.""" + return publisher + + HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" } From 6a2d1b696426d8aac9acab44bd176c5cbdc97ac6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 25 Apr 2024 10:52:26 +0100 Subject: [PATCH 22/60] Fix column names --- python/popgetter/assets/scotland/census_tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index 73ae065..af6610f 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -156,9 +156,9 @@ def get_table_metadata( ) -> dict[str, str]: """Returns a dict of table metadata for a given table name.""" rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] - census_release = rows.loc[:, "description"].unique()[0] + census_release = rows.loc[:, "census_release"].unique()[0] description = rows.loc[:, "description"].unique()[0] - population_coverage = rows.loc[:, "description"].unique()[0] + population_coverage = rows.loc[:, "population_coverage"].unique()[0] variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] year = int(rows.loc[:, "year"].unique()[0]) From af3d00d5a2c4947def279a75e9f16c9f3080a2b4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 25 Apr 2024 13:26:56 +0100 Subject: [PATCH 23/60] Add source data releases metadata, fix parquet_column_name field --- python/popgetter/assets/scotland/__init__.py | 85 ++++++++++++++++++- .../assets/scotland/census_derived.py | 11 ++- .../assets/scotland/census_tables.py | 24 +++--- 3 files changed, 103 insertions(+), 17 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index d85fcdd..91d131e 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 from __future__ import annotations +from datetime import date from pathlib import Path import requests @@ -8,7 +9,7 @@ asset, ) -from popgetter.metadata import CountryMetadata, DataPublisher +from popgetter.metadata import CountryMetadata, DataPublisher, SourceDataRelease country: CountryMetadata = CountryMetadata( name_short_en="Scotland", @@ -41,6 +42,88 @@ def publisher_metadata(): return publisher +# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 +REQUIRED_TABLES = [ + "QS103SC", + "QS104SC", + "KS201SC", + "DC1117SC", + "DC2101SC", + "DC6206SC", + "LC1117SC", +] +REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) +# Currently including only releases matching tables included +REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] +GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" +CENSUS_REFERENCE_PERIOD = (date(2011, 3, 27), None) +CENSUS_COLLECTION_PERIOD = (date(2011, 
3, 27), None) +CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) + +sources: dict[str, SourceDataRelease] = { + "3A": SourceDataRelease( + name="Census 2011: Release 3A", + date_published=date(2014, 2, 27), + reference_period=CENSUS_REFERENCE_PERIOD, + collection_period=CENSUS_COLLECTION_PERIOD, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), + "3I": SourceDataRelease( + name="Census 2011: Release 3I", + date_published=date(2014, 9, 24), + reference_period=(date(2015, 10, 22), None), + collection_period=(date(2011, 10, 22), None), + expect_next_update=date(2022, 1, 1), + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), + "2A": SourceDataRelease( + name="Census 2011: Release 2A", + date_published=date(2013, 9, 26), + reference_period=(date(2015, 10, 22), None), + collection_period=(date(2011, 10, 22), None), + expect_next_update=date(2022, 1, 1), + url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), + "3C": SourceDataRelease( + name="Census 2011: Release 3C", + date_published=date(2014, 4, 9), + reference_period=(date(2015, 10, 22), None), + collection_period=(date(2011, 10, 22), None), + expect_next_update=date(2022, 1, 1), + url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", + publishing_organisation=publisher, + description="TBC", + geography_file="TBC", + geography_level="TBC", + # available_metrics=None, + countries_of_interest=[country], + ), +} +# Init +for source in sources: + sources[source].update_forward_refs() + + +# Move to tests HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" } diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 724f292..9158e91 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -84,6 +84,8 @@ def get_lc1117sc_metric( needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) # Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) +# TODO: add human readable names for each column as the MetricMetadata currently receives the +# catalog row (table) human readable name. 
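As a sketch of how the filter_func entries below are applied (assuming the
get_lc1117sc_metric helper defined earlier in this module; the toy frame and
counts here are invented for illustration only):

import pandas as pd

# Toy stand-in for a raw LC1117SC CSV: the two unnamed columns hold the
# output-area code and the age bracket, with counts split by sex.
toy_lc1117sc = pd.DataFrame(
    {
        "Unnamed: 0": ["S00000001", "S00000001", "S00000002"],
        "Unnamed: 1": ["0 to 4", "5 to 9", "0 to 4"],
        "All people": [10, 12, 7],
        "Females": [5, 6, 3],
        "Males": [5, 6, 4],
    }
)

# Equivalent to the "population_infants_age0_4" entry below: sum "All people"
# over the "0 to 4" bracket, grouped by output area.
infants = get_lc1117sc_metric(
    toy_lc1117sc, "All people", "population_infants_age0_4", ["0 to 4"]
)
# infants is indexed by OA11CD with a single summed count column
# (S00000001 -> 10, S00000002 -> 7).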
_derived_columns: list[dict] = [ { "partition_key": "2011/OA11/LC1117SC", @@ -176,13 +178,16 @@ def census_table_metadata(catalog_row: dict) -> MetricMetadata: source_download_url=catalog_row["source_download_url"], source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id="TODO", + source_data_release_id=catalog_row["source_data_release_id"], # TODO - this is a placeholder parent_metric_id="unknown_at_this_stage", potential_denominator_ids=None, parquet_margin_of_error_file=None, parquet_margin_of_error_column=None, - parquet_column_name=catalog_row["source_column"], + # TODO: currently setting to rename the derived column name equal to 'hxltag' + # and not related to the source_column + # parquet_column_name=catalog_row["source_column"], + parquet_column_name=catalog_row["hxltag"], # TODO - this is a placeholder metric_parquet_file_url="unknown_at_this_stage", hxl_tag=catalog_row["hxltag"], @@ -250,7 +255,9 @@ def get_enriched_tables_scotland( ] catalog_row = catalog_row.to_dict(orient="index") catalog_row = catalog_row.popitem()[1] + ic(catalog_row) result_mmd = census_table_metadata(catalog_row) + ic(result_mmd) return result_df, result_mmd diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index af6610f..7f61fa5 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -16,7 +16,7 @@ multi_asset, ) -from popgetter.assets.scotland import download_file +from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources """ Notes: @@ -52,18 +52,6 @@ "OA11": 2, # "Output Area blk" } -# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 -REQUIRED_TABLES = [ - "QS103SC", - "QS104SC", - "KS201SC", - "DC1117SC", - "DC2101SC", - "DC6206SC", - "LC1117SC", -] -REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) - DATA_SOURCES = [ { "source": "Council Area blk", @@ -201,6 +189,14 @@ def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFra # Get table metadata table_metadata = get_table_metadata(catalog_reference, table_name) + # Get source release metadata if available + source_data_release = sources.get( + table_metadata["census_release"], None + ) + source_data_release_id = ( + None if source_data_release is None else source_data_release.id + ) + # Create a record for each census table use same keys as MetricMetadata # where possible since this makes it simpler to populate derived # metrics downstream @@ -225,7 +221,7 @@ def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFra "potential_denominator_ids": None, "parent_metric_id": None, # TODO: check this is not an ID but a name - "source_data_release_id": table_metadata["census_release"], + "source_data_release_id": source_data_release_id, "source_download_url": url, # TODO: what should this be? 
"source_archive_file_path": None, From 06f0e10e963d203e0097997fe308dc5f7f809f4d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 25 Apr 2024 13:56:05 +0100 Subject: [PATCH 24/60] Fix for CI --- python/popgetter/assets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index 7ecbf5d..e357bc2 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,3 +1,3 @@ from __future__ import annotations -from . import be, uk, us, scotland # noqa: F401 +from . import be, scotland, uk, us # noqa: F401 From d7991e2dc877d88c549df0dfbd4b4bc6f783ec88 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Thu, 2 May 2024 18:34:37 +0100 Subject: [PATCH 25/60] Update Scotland metadata to match new changes in #82 --- python/popgetter/assets/scotland/__init__.py | 55 ++++++++++---------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 91d131e..0a96e80 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -16,7 +16,7 @@ name_official="Scotland", iso3="GBR", iso2="GB", - iso3116_2="GB-SCT", + iso3166_2="GB-SCT", ) publisher: DataPublisher = DataPublisher( @@ -26,7 +26,7 @@ "the Scottish Government. Our purpose is to collect, preserve and " "produce information about Scotland's people and history and make it " "available to inform current and future generations.", - countries_of_interest=[country], + countries_of_interest=[country.id], ) @@ -56,71 +56,72 @@ def publisher_metadata(): # Currently including only releases matching tables included REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" -CENSUS_REFERENCE_PERIOD = (date(2011, 3, 27), None) -CENSUS_COLLECTION_PERIOD = (date(2011, 3, 27), None) +CENSUS_REFERENCE_DATE = date(2011, 3, 27) +CENSUS_COLLECTION_DATE = date(2011, 3, 27) CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) sources: dict[str, SourceDataRelease] = { "3A": SourceDataRelease( name="Census 2011: Release 3A", date_published=date(2014, 2, 27), - reference_period=CENSUS_REFERENCE_PERIOD, - collection_period=CENSUS_COLLECTION_PERIOD, + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), "3I": SourceDataRelease( name="Census 2011: Release 3I", date_published=date(2014, 9, 24), - reference_period=(date(2015, 10, 22), None), - collection_period=(date(2011, 10, 22), None), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2011, 10, 22), + collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # 
available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), "2A": SourceDataRelease( name="Census 2011: Release 2A", date_published=date(2013, 9, 26), - reference_period=(date(2015, 10, 22), None), - collection_period=(date(2011, 10, 22), None), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2011, 10, 22), + collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), "3C": SourceDataRelease( name="Census 2011: Release 3C", date_published=date(2014, 4, 9), - reference_period=(date(2015, 10, 22), None), - collection_period=(date(2011, 10, 22), None), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2011, 10, 22), + collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", - publishing_organisation=publisher, + data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="TBC", - # available_metrics=None, - countries_of_interest=[country], + countries_of_interest=[country.id], ), } -# Init -for source in sources: - sources[source].update_forward_refs() # Move to tests From 629548f6f8590411dd287fb0e62450c77f94c621 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 21:29:30 +0100 Subject: [PATCH 26/60] Add todo --- python/popgetter/assets/ni/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index f5830e1..19d6d3b 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -371,6 +371,7 @@ def _geometry( for level_details in NI_GEO_LEVELS.values(): # TODO: get correct values geometry_metadata = GeometryMetadata( + # TODO: check values for dates for the geometries validity_period_start=CENSUS_COLLECTION_DATE, validity_period_end=CENSUS_COLLECTION_DATE, level=level_details.level, From 8f3c7da19eff17b5f536e8ee04af10fd88f17cce Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 21:30:12 +0100 Subject: [PATCH 27/60] Comment out old versions --- .../assets/scotland/census_derived.py | 602 +++++++++--------- .../assets/scotland/census_geometry.py | 24 +- .../assets/scotland/census_tables.py | 582 ++++++++--------- 3 files changed, 604 insertions(+), 604 deletions(-) diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py index 9158e91..43277a0 100644 --- a/python/popgetter/assets/scotland/census_derived.py +++ b/python/popgetter/assets/scotland/census_derived.py @@ -1,328 +1,328 @@ -from __future__ import annotations +# from __future__ import annotations -import geopandas as gpd -import numpy as np -import pandas as pd -from dagster import ( - AssetIn, - AssetOut, - MaterializeResult, - MetadataValue, - SpecificPartitionsPartitionMapping, - StaticPartitionsDefinition, - asset, - multi_asset, -) -from icecream import ic -from matplotlib import pyplot as plt +# import geopandas as gpd +# import numpy as np +# import pandas as pd +# from dagster import ( +# 
AssetIn, +# AssetOut, +# MaterializeResult, +# MetadataValue, +# SpecificPartitionsPartitionMapping, +# StaticPartitionsDefinition, +# asset, +# multi_asset, +# ) +# from icecream import ic +# from matplotlib import pyplot as plt -from popgetter.utils import markdown_from_plot +# from popgetter.utils import markdown_from_plot -from ...metadata import MetricMetadata -from .census_tables import add_metadata, dataset_node_partition +# from ...metadata import MetricMetadata +# from .census_tables import add_metadata, dataset_node_partition -def get_lc1117sc_metric( - lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] -) -> pd.DataFrame: - lc1117sc_transformed = lc1117sc.rename( - columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} - ) - lc1117sc_transformed = lc1117sc_transformed.loc[ - ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : - ] - return ( - lc1117sc_transformed.loc[ - lc1117sc_transformed["Age Category"].isin(subset), - ["OA11CD", col], - ] - .groupby("OA11CD") - .agg("sum") - .rename(columns={col: output_col}) - ) +# def get_lc1117sc_metric( +# lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] +# ) -> pd.DataFrame: +# lc1117sc_transformed = lc1117sc.rename( +# columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} +# ) +# lc1117sc_transformed = lc1117sc_transformed.loc[ +# ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : +# ] +# return ( +# lc1117sc_transformed.loc[ +# lc1117sc_transformed["Age Category"].isin(subset), +# ["OA11CD", col], +# ] +# .groupby("OA11CD") +# .agg("sum") +# .rename(columns={col: output_col}) +# ) -ALL_PEOPLE = ["All people"] -INFANTS_AGE_0_TO_4 = ["0 to 4"] -CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -ADULTS = [ - "18 to 19", - "20 to 24", - "25 to 29", - "30 to 34", - "35 to 39", - "40 to 44", - "45 to 49", - "50 to 54", - "55 to 59", - "60 to 64", - "65 to 69", - "70 to 74", - "75 to 79", - "80 to 84", - "85 to 89", - "90 to 94", - "95 and over", -] +# ALL_PEOPLE = ["All people"] +# INFANTS_AGE_0_TO_4 = ["0 to 4"] +# CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +# CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] +# ADULTS = [ +# "18 to 19", +# "20 to 24", +# "25 to 29", +# "30 to 34", +# "35 to 39", +# "40 to 44", +# "45 to 49", +# "50 to 54", +# "55 to 59", +# "60 to 64", +# "65 to 69", +# "70 to 74", +# "75 to 79", +# "80 to 84", +# "85 to 89", +# "90 to 94", +# "95 and over", +# ] -needed_dataset_list = [ - { - # Population by OA11, Period: 2011 - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "#population+oa11+2011", - # TODO: this partition key does not have a single column for source - "source_column": "", - } -] -needed_dataset_partions_keys: list[str] = [ - r["partition_key"] for r in needed_dataset_list -] -needed_dataset_mapping = SpecificPartitionsPartitionMapping( - needed_dataset_partions_keys -) -needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) +# needed_dataset_list = [ +# { +# # Population by OA11, Period: 2011 +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "#population+oa11+2011", +# # TODO: this partition key does not have a single column for source +# "source_column": "", +# } +# ] +# needed_dataset_partions_keys: list[str] = [ +# r["partition_key"] for r in needed_dataset_list +# ] +# needed_dataset_mapping = 
SpecificPartitionsPartitionMapping( +# needed_dataset_partions_keys +# ) +# needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) -# Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) -# TODO: add human readable names for each column as the MetricMetadata currently receives the -# catalog row (table) human readable name. -_derived_columns: list[dict] = [ - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_children_age5_17", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, CHILDREN_AGE_5_TO_17 - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_infants_age0_4", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, INFANTS_AGE_0_TO_4 - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_children_age0_17", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, CHILDREN_AGE_0_TO_17 - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_adults_f", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "Females", output_col, ADULTS - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_adults_m", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "Males", output_col, ADULTS - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_adults", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, ADULTS - ), - }, - { - "partition_key": "2011/OA11/LC1117SC", - "hxltag": "population_ind", - "filter_func": lambda df, output_col: get_lc1117sc_metric( - df, "All people", output_col, ALL_PEOPLE - ), - }, -] +# # Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) +# # TODO: add human readable names for each column as the MetricMetadata currently receives the +# # catalog row (table) human readable name. 
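+# # The `filter_func` entries below wrap `get_lc1117sc_metric` defined above; a
+# # minimal sketch of one such call on a toy frame in the raw LC1117SC layout
+# # (the toy values are invented for illustration):
+# #
+# #   toy = pd.DataFrame(
+# #       {"Unnamed: 0": ["S00000001"], "Unnamed: 1": ["0 to 4"], "All people": [12]}
+# #   )
+# #   get_lc1117sc_metric(toy, "All people", "population_infants_age0_4", INFANTS_AGE_0_TO_4)
+# #   # -> one-column DataFrame indexed by OA11CD, with counts summed per output area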
+# _derived_columns: list[dict] = [ +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_children_age5_17", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, CHILDREN_AGE_5_TO_17 +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_infants_age0_4", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, INFANTS_AGE_0_TO_4 +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_children_age0_17", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, CHILDREN_AGE_0_TO_17 +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_adults_f", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "Females", output_col, ADULTS +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_adults_m", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "Males", output_col, ADULTS +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_adults", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, ADULTS +# ), +# }, +# { +# "partition_key": "2011/OA11/LC1117SC", +# "hxltag": "population_ind", +# "filter_func": lambda df, output_col: get_lc1117sc_metric( +# df, "All people", output_col, ALL_PEOPLE +# ), +# }, +# ] -derived_columns = pd.DataFrame( - _derived_columns, columns=["partition_key", "hxltag", "filter_func"] -) +# derived_columns = pd.DataFrame( +# _derived_columns, columns=["partition_key", "hxltag", "filter_func"] +# ) -# record = { -# "resolution": resolution, -# "catalog_resolution": table_metadata["catalog_resolution"], -# "source": source, -# "url": url, -# "file_name": Path(source) / file_name, -# "table_name": table_name, -# "year": table_metadata["year"], -# # Use constructed name of description and coverage -# "human_readable_name": table_metadata["human_readable_name"], -# "source_metric_id": None, -# # Use catalog_metadata description -# "description": table_metadata["description"], -# "hxl_tag": None, -# "metric_parquet_file_url": None, -# "parquet_column_name": None, -# "parquet_margin_of_error_column": None, -# "parquet_margin_of_error_file": None, -# "potential_denominator_ids": None, -# "parent_metric_id": None, -# # TODO: check this is not an ID but a name -# "source_data_release_id": table_metadata["census_release"], -# "source_download_url": url, -# # TODO: what should this be? 
-# "source_archive_file_path": None, -# "source_documentation_url": URL_CATALOG_METADATA, -# } +# # record = { +# # "resolution": resolution, +# # "catalog_resolution": table_metadata["catalog_resolution"], +# # "source": source, +# # "url": url, +# # "file_name": Path(source) / file_name, +# # "table_name": table_name, +# # "year": table_metadata["year"], +# # # Use constructed name of description and coverage +# # "human_readable_name": table_metadata["human_readable_name"], +# # "source_metric_id": None, +# # # Use catalog_metadata description +# # "description": table_metadata["description"], +# # "hxl_tag": None, +# # "metric_parquet_file_url": None, +# # "parquet_column_name": None, +# # "parquet_margin_of_error_column": None, +# # "parquet_margin_of_error_file": None, +# # "potential_denominator_ids": None, +# # "parent_metric_id": None, +# # # TODO: check this is not an ID but a name +# # "source_data_release_id": table_metadata["census_release"], +# # "source_download_url": url, +# # # TODO: what should this be? +# # "source_archive_file_path": None, +# # "source_documentation_url": URL_CATALOG_METADATA, +# # } -def census_table_metadata(catalog_row: dict) -> MetricMetadata: - return MetricMetadata( - human_readable_name=catalog_row["human_readable_name"], - source_download_url=catalog_row["source_download_url"], - source_archive_file_path=catalog_row["source_archive_file_path"], - source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=catalog_row["source_data_release_id"], - # TODO - this is a placeholder - parent_metric_id="unknown_at_this_stage", - potential_denominator_ids=None, - parquet_margin_of_error_file=None, - parquet_margin_of_error_column=None, - # TODO: currently setting to rename the derived column name equal to 'hxltag' - # and not related to the source_column - # parquet_column_name=catalog_row["source_column"], - parquet_column_name=catalog_row["hxltag"], - # TODO - this is a placeholder - metric_parquet_file_url="unknown_at_this_stage", - hxl_tag=catalog_row["hxltag"], - description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], - ) +# def census_table_metadata(catalog_row: dict) -> MetricMetadata: +# return MetricMetadata( +# human_readable_name=catalog_row["human_readable_name"], +# source_download_url=catalog_row["source_download_url"], +# source_archive_file_path=catalog_row["source_archive_file_path"], +# source_documentation_url=catalog_row["source_documentation_url"], +# source_data_release_id=catalog_row["source_data_release_id"], +# # TODO - this is a placeholder +# parent_metric_id="unknown_at_this_stage", +# potential_denominator_ids=None, +# parquet_margin_of_error_file=None, +# parquet_margin_of_error_column=None, +# # TODO: currently setting to rename the derived column name equal to 'hxltag' +# # and not related to the source_column +# # parquet_column_name=catalog_row["source_column"], +# parquet_column_name=catalog_row["hxltag"], +# # TODO - this is a placeholder +# metric_parquet_file_url="unknown_at_this_stage", +# hxl_tag=catalog_row["hxltag"], +# description=catalog_row["description"], +# source_metric_id=catalog_row["hxltag"], +# ) -@asset( - ins={ - "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), - }, -) -def filter_needed_catalog( - context, needed_datasets, catalog_as_dataframe: pd.DataFrame -) -> pd.DataFrame: - needed_df = needed_datasets.merge( - catalog_as_dataframe, how="inner", on="partition_key" - ) - add_metadata(context, needed_df, "needed_df") 
- return needed_df +# @asset( +# ins={ +# "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), +# }, +# ) +# def filter_needed_catalog( +# context, needed_datasets, catalog_as_dataframe: pd.DataFrame +# ) -> pd.DataFrame: +# needed_df = needed_datasets.merge( +# catalog_as_dataframe, how="inner", on="partition_key" +# ) +# add_metadata(context, needed_df, "needed_df") +# return needed_df -@asset -def needed_datasets(context) -> pd.DataFrame: - needed_df = pd.DataFrame( - needed_dataset_list, - columns=["partition_key", "hxltag", "source_column", "derived_columns"], - dtype="string", - ) - add_metadata(context, needed_df, "needed_datasets") - return needed_df +# @asset +# def needed_datasets(context) -> pd.DataFrame: +# needed_df = pd.DataFrame( +# needed_dataset_list, +# columns=["partition_key", "hxltag", "source_column", "derived_columns"], +# dtype="string", +# ) +# add_metadata(context, needed_df, "needed_datasets") +# return needed_df -@multi_asset( - ins={ - "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), - "filter_needed_catalog": AssetIn(), - }, - outs={ - "source_table": AssetOut(), - "source_mmd": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def get_enriched_tables_scotland( - context, individual_census_table, filter_needed_catalog -) -> tuple[pd.DataFrame, MetricMetadata]: - partition_keys = context.asset_partition_keys_for_input( - input_name="individual_census_table", - ) - output_partition = context.asset_partition_key_for_output("source_table") - ic(partition_keys) - ic(len(partition_keys)) - ic(output_partition) - ic(type(output_partition)) - ic(individual_census_table) - if output_partition not in partition_keys: - err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" - raise ValueError(err_msg) +# @multi_asset( +# ins={ +# "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), +# "filter_needed_catalog": AssetIn(), +# }, +# outs={ +# "source_table": AssetOut(), +# "source_mmd": AssetOut(), +# }, +# partitions_def=dataset_node_partition, +# ) +# def get_enriched_tables_scotland( +# context, individual_census_table, filter_needed_catalog +# ) -> tuple[pd.DataFrame, MetricMetadata]: +# partition_keys = context.asset_partition_keys_for_input( +# input_name="individual_census_table", +# ) +# output_partition = context.asset_partition_key_for_output("source_table") +# ic(partition_keys) +# ic(len(partition_keys)) +# ic(output_partition) +# ic(type(output_partition)) +# ic(individual_census_table) +# if output_partition not in partition_keys: +# err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" +# raise ValueError(err_msg) - result_df = individual_census_table - catalog_row = filter_needed_catalog[ - filter_needed_catalog["partition_key"].eq(output_partition) - ] - catalog_row = catalog_row.to_dict(orient="index") - catalog_row = catalog_row.popitem()[1] - ic(catalog_row) - result_mmd = census_table_metadata(catalog_row) - ic(result_mmd) - return result_df, result_mmd +# result_df = individual_census_table +# catalog_row = filter_needed_catalog[ +# filter_needed_catalog["partition_key"].eq(output_partition) +# ] +# catalog_row = catalog_row.to_dict(orient="index") +# catalog_row = catalog_row.popitem()[1] +# ic(catalog_row) +# result_mmd = census_table_metadata(catalog_row) +# ic(result_mmd) +# return result_df, result_mmd -@multi_asset( - 
partitions_def=dataset_node_partition, - ins={ - "source_table": AssetIn(partition_mapping=needed_dataset_mapping), - "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), - }, - outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, -) -def transform_data( - context, - source_table: pd.DataFrame, - source_mmd: MetricMetadata, -) -> tuple[pd.DataFrame, list[MetricMetadata]]: - partition_key = context.asset_partition_key_for_output("derived_table") - census_table = source_table.copy() - parent_mmd = source_mmd.copy() - # source_column = parent_mmd.parquet_column_name - metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] - new_series: list[pd.Series] = [] - new_mmds: list[MetricMetadata] = [] - for _, _, col_name, filter in metrics.itertuples(): - # Create column - column: pd.Series = filter(census_table, col_name) - ic(f"col_name: {col_name}") - new_series.append(column) +# @multi_asset( +# partitions_def=dataset_node_partition, +# ins={ +# "source_table": AssetIn(partition_mapping=needed_dataset_mapping), +# "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), +# }, +# outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, +# ) +# def transform_data( +# context, +# source_table: pd.DataFrame, +# source_mmd: MetricMetadata, +# ) -> tuple[pd.DataFrame, list[MetricMetadata]]: +# partition_key = context.asset_partition_key_for_output("derived_table") +# census_table = source_table.copy() +# parent_mmd = source_mmd.copy() +# # source_column = parent_mmd.parquet_column_name +# metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] +# new_series: list[pd.Series] = [] +# new_mmds: list[MetricMetadata] = [] +# for _, _, col_name, filter in metrics.itertuples(): +# # Create column +# column: pd.Series = filter(census_table, col_name) +# ic(f"col_name: {col_name}") +# new_series.append(column) - # Construct metadata - new_mmd = parent_mmd.copy() - new_mmd.parent_metric_id = parent_mmd.source_metric_id - new_mmd.hxl_tag = col_name - new_mmds.append(new_mmd) +# # Construct metadata +# new_mmd = parent_mmd.copy() +# new_mmd.parent_metric_id = parent_mmd.source_metric_id +# new_mmd.hxl_tag = col_name +# new_mmds.append(new_mmd) - # Merge series - new_table: pd.DataFrame = pd.concat(new_series, axis=1) - add_metadata( - context, - df=new_table, - title=f"Derived table ({partition_key})", - output_name="derived_table", - ) - return new_table, new_mmds +# # Merge series +# new_table: pd.DataFrame = pd.concat(new_series, axis=1) +# add_metadata( +# context, +# df=new_table, +# title=f"Derived table ({partition_key})", +# output_name="derived_table", +# ) +# return new_table, new_mmds -@multi_asset( - ins={ - "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), - "geometry": AssetIn(partition_mapping=needed_dataset_mapping), - }, - outs={ - "plot": AssetOut(), - }, - partitions_def=dataset_node_partition, -) -def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): - """Plots map with log density of people.""" - merged = geometry.merge( - derived_table[["population_ind"]], - left_on="geo_code", - right_index=True, - how="left", - ) - merged["log10 people"] = np.log10(merged["population_ind"]) - merged.plot(column="log10 people", legend=True) - md_content = markdown_from_plot(plt) - return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) +# @multi_asset( +# ins={ +# "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), +# "geometry": 
AssetIn(partition_mapping=needed_dataset_mapping), +# }, +# outs={ +# "plot": AssetOut(), +# }, +# partitions_def=dataset_node_partition, +# ) +# def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): +# """Plots map with log density of people.""" +# merged = geometry.merge( +# derived_table[["population_ind"]], +# left_on="geo_code", +# right_index=True, +# how="left", +# ) +# merged["log10 people"] = np.log10(merged["population_ind"]) +# merged.plot(column="log10 people", legend=True) +# md_content = markdown_from_plot(plt) +# return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py index 5187181..28e5afa 100644 --- a/python/popgetter/assets/scotland/census_geometry.py +++ b/python/popgetter/assets/scotland/census_geometry.py @@ -1,17 +1,17 @@ -from __future__ import annotations +# from __future__ import annotations -import geopandas as gpd -from dagster import asset +# import geopandas as gpd +# from dagster import asset -from popgetter.assets.scotland import download_file +# from popgetter.assets.scotland import download_file -from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir +# from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir -@asset -def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: - """Gets the shape file for OA11 resolution.""" - file_name = download_file(cache_dir, URL_SHAPEFILE) - geo = gpd.read_file(f"zip://{file_name}") - add_metadata(context, geo, "Geometry file") - return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] +# # @asset +# # def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: +# # """Gets the shape file for OA11 resolution.""" +# # file_name = download_file(cache_dir, URL_SHAPEFILE) +# # geo = gpd.read_file(f"zip://{file_name}") +# # add_metadata(context, geo, "Geometry file") +# # return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py index 7f61fa5..5efda94 100644 --- a/python/popgetter/assets/scotland/census_tables.py +++ b/python/popgetter/assets/scotland/census_tables.py @@ -1,291 +1,291 @@ -from __future__ import annotations - -import urllib.parse as urlparse -from pathlib import Path - -import geopandas as gpd -import pandas as pd -import zipfile_deflate64 as zipfile -from dagster import ( - AssetOut, - DynamicPartitionsDefinition, - MetadataValue, - SpecificPartitionsPartitionMapping, - StaticPartitionsDefinition, - asset, - multi_asset, -) - -from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources - -""" -Notes: - - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial - publication - - Reusing some bits of code from UKCensusAPI: - https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py -""" - - -PARTITIONS_DEF_NAME = "dataset_tables" -dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) - -# cache_dir = tempfile.mkdtemp() -cache_dir = "./cache" - -URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" -URL1 = "https://www.scotlandscensus.gov.uk/" -URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" -URL_LOOKUP = ( - "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" -) -URL_SHAPEFILE = 
"https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -URL_CATALOG = ( - "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" -) - -data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] -GeoCodeLookup = { - "LAD": 0, # "Council Area blk" - # MSOA (intermediate zone)? - "LSOA11": 1, # "SNS Data Zone 2011 blk" - "OA11": 2, # "Output Area blk" -} - -DATA_SOURCES = [ - { - "source": "Council Area blk", - "resolution": "LAD", - "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", - }, - { - "source": "SNS Data Zone 2011 blk", - "resolution": "LSOA11", - "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", - }, - { - "source": "Output Area blk", - "resolution": "OA11", - "url": URL2 + urlparse.quote("Output Area blk") + ".zip", - }, -] - - -# NB. Make sure no spaces in asset keys -@multi_asset( - outs={ - "oa_dz_iz_2011_lookup": AssetOut(), - "data_zone_2011_lookup": AssetOut(), - "intermediate_zone_2011_lookup": AssetOut(), - }, -) -def lookups(): - """Creates lookup dataframes.""" - Path(cache_dir).mkdir(parents=True, exist_ok=True) - lookup_path = download_file(cache_dir, URL_LOOKUP) - df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") - df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") - df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") - return df1, df2, df3 - - -def source_to_zip(source_name: str, url: str) -> Path: - """Downloads if necessary and returns the name of the locally cached zip file - of the source data (replacing spaces with _)""" - file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") - return download_file(cache_dir, url, file_name) - - -def add_metadata( - context, - df: pd.DataFrame | gpd.GeoDataFrame, - title: str | list[str], - output_name: str | None = None, -): - context.add_output_metadata( - metadata={ - "title": title, - "num_records": len(df), - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) - ), - "preview": MetadataValue.md(df.head().to_markdown()), - }, - output_name=output_name, - ) - - -@asset -def catalog_reference(context) -> pd.DataFrame: - catalog_reference = pd.read_excel( - URL_CATALOG, - sheet_name=None, - header=None, - storage_options={"User-Agent": "Mozilla/5.0"}, - )["Index"].rename( - columns={ - 0: "census_release", - 1: "table_name", - 2: "description", - 3: "population_coverage", - 4: "variable", - 5: "catalog_resolution", - 6: "year", - 7: "additional_url", - 8: "population_coverage_and_variable", - } - ) - add_metadata(context, catalog_reference, "Metadata for census tables") - return catalog_reference - - -def get_table_metadata( - catalog_reference: pd.DataFrame, table_name: str -) -> dict[str, str]: - """Returns a dict of table metadata for a given table name.""" - rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] - census_release = rows.loc[:, "census_release"].unique()[0] - description = rows.loc[:, "description"].unique()[0] - population_coverage = rows.loc[:, "population_coverage"].unique()[0] - variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) - catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] - year = int(rows.loc[:, "year"].unique()[0]) - return { - "census_release": census_release, - "description": description, - "population_coverage": population_coverage, - "variables": variables, - "catalog_resolution": catalog_resolution, - "year": 
str(year), - "human_readable_name": f"{description} ({population_coverage})", - } - - -def get_table_name(file_name: str) -> str: - return file_name.rsplit(".csv")[0] - - -@asset -def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFrame: - """Creates a catalog of the individual census tables from all data sources.""" - records = [] - for data_source in DATA_SOURCES: - resolution = data_source["resolution"] - source = data_source["source"] - url = data_source["url"] - zip_file_name = source_to_zip(source, url) - with zipfile.ZipFile(zip_file_name) as zip_ref: - for file_name in zip_ref.namelist(): - # Get table name - table_name = get_table_name(file_name) - - # Skip bulk output files and missing tables from catalog_reference - if ( - "bulk_output" in file_name.lower() - or catalog_reference.loc[:, "table_name"].ne(table_name).all() - ): - continue - - # Get table metadata - table_metadata = get_table_metadata(catalog_reference, table_name) - - # Get source release metadata if available - source_data_release = sources.get( - table_metadata["census_release"], None - ) - source_data_release_id = ( - None if source_data_release is None else source_data_release.id - ) - - # Create a record for each census table use same keys as MetricMetadata - # where possible since this makes it simpler to populate derived - # metrics downstream - record = { - "resolution": resolution, - "catalog_resolution": table_metadata["catalog_resolution"], - "source": source, - "url": url, - "file_name": Path(source) / file_name, - "table_name": table_name, - "year": table_metadata["year"], - # Use constructed name of description and coverage - "human_readable_name": table_metadata["human_readable_name"], - "source_metric_id": None, - # Use catalog_reference description - "description": table_metadata["description"], - "hxl_tag": None, - "metric_parquet_file_url": None, - "parquet_column_name": None, - "parquet_margin_of_error_column": None, - "parquet_margin_of_error_file": None, - "potential_denominator_ids": None, - "parent_metric_id": None, - # TODO: check this is not an ID but a name - "source_data_release_id": source_data_release_id, - "source_download_url": url, - # TODO: what should this be? - "source_archive_file_path": None, - "source_documentation_url": URL_CATALOG, - } - context.log.debug(record) - records.append(record) - zip_ref.extract(file_name, Path(cache_dir) / source) - - # TODO: check if required - for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): - context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) - - # Create a dynamic partition for the datasets listed in the catalog - catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) - catalog_df["partition_key"] = ( - catalog_df[["year", "resolution", "table_name"]] - .astype(str) - .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) - ) - # TODO: consider filtering here based on a set of keys to keep derived from - # config (i.e. 
backend/frontend modes) - context.instance.add_dynamic_partitions( - partitions_def_name=PARTITIONS_DEF_NAME, - # To ensure this is unique, prepend the resolution, - partition_keys=catalog_df.loc[ - catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), - "partition_key", - ].to_list(), - ) - context.add_output_metadata( - metadata={ - "num_records": len(catalog_df), - "ignored_datasets": "", - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) - ), - "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), - "preview": MetadataValue.md(catalog_df.to_markdown()), - } - ) - return catalog_df - - -def get_table(context, table_details) -> pd.DataFrame: - table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) - add_metadata(context, table_df, table_details["partition_key"].iloc[0]) - return table_df - - -@asset(partitions_def=dataset_node_partition) -def individual_census_table( - context, catalog_as_dataframe: pd.DataFrame -) -> pd.DataFrame: - """Creates individual census tables as dataframe.""" - partition_key = context.asset_partition_key_for_output() - context.log.info(partition_key) - table_details = catalog_as_dataframe.loc[ - catalog_as_dataframe["partition_key"].isin([partition_key]) - ] - context.log.info(table_details) - return get_table(context, table_details) - - -subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] -subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) -subset_partition = StaticPartitionsDefinition(subset_partition_keys) +# from __future__ import annotations + +# import urllib.parse as urlparse +# from pathlib import Path + +# import geopandas as gpd +# import pandas as pd +# import zipfile_deflate64 as zipfile +# from dagster import ( +# AssetOut, +# DynamicPartitionsDefinition, +# MetadataValue, +# SpecificPartitionsPartitionMapping, +# StaticPartitionsDefinition, +# asset, +# multi_asset, +# ) + +# from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources + +# """ +# Notes: +# - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial +# publication +# - Reusing some bits of code from UKCensusAPI: +# https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py +# """ + + +# PARTITIONS_DEF_NAME = "dataset_tables" +# dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) + +# # cache_dir = tempfile.mkdtemp() +# cache_dir = "./cache" + +# URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" +# URL1 = "https://www.scotlandscensus.gov.uk/" +# URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" +# URL_LOOKUP = ( +# "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" +# ) +# URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" +# URL_CATALOG = ( +# "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +# ) + +# data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] +# GeoCodeLookup = { +# "LAD": 0, # "Council Area blk" +# # MSOA (intermediate zone)? 
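+# # The integer values index into `data_sources` above; for example (a sketch):
+# #
+# #   data_sources[GeoCodeLookup["LAD"]]  # -> "Council Area blk"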
+# "LSOA11": 1, # "SNS Data Zone 2011 blk" +# "OA11": 2, # "Output Area blk" +# } + +# DATA_SOURCES = [ +# { +# "source": "Council Area blk", +# "resolution": "LAD", +# "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", +# }, +# { +# "source": "SNS Data Zone 2011 blk", +# "resolution": "LSOA11", +# "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", +# }, +# { +# "source": "Output Area blk", +# "resolution": "OA11", +# "url": URL2 + urlparse.quote("Output Area blk") + ".zip", +# }, +# ] + + +# # NB. Make sure no spaces in asset keys +# @multi_asset( +# outs={ +# "oa_dz_iz_2011_lookup": AssetOut(), +# "data_zone_2011_lookup": AssetOut(), +# "intermediate_zone_2011_lookup": AssetOut(), +# }, +# ) +# def lookups(): +# """Creates lookup dataframes.""" +# Path(cache_dir).mkdir(parents=True, exist_ok=True) +# lookup_path = download_file(cache_dir, URL_LOOKUP) +# df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") +# df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") +# df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") +# return df1, df2, df3 + + +# def source_to_zip(source_name: str, url: str) -> Path: +# """Downloads if necessary and returns the name of the locally cached zip file +# of the source data (replacing spaces with _)""" +# file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") +# return download_file(cache_dir, url, file_name) + + +# def add_metadata( +# context, +# df: pd.DataFrame | gpd.GeoDataFrame, +# title: str | list[str], +# output_name: str | None = None, +# ): +# context.add_output_metadata( +# metadata={ +# "title": title, +# "num_records": len(df), +# "columns": MetadataValue.md( +# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) +# ), +# "preview": MetadataValue.md(df.head().to_markdown()), +# }, +# output_name=output_name, +# ) + + +# @asset +# def catalog_reference(context) -> pd.DataFrame: +# catalog_reference = pd.read_excel( +# URL_CATALOG, +# sheet_name=None, +# header=None, +# storage_options={"User-Agent": "Mozilla/5.0"}, +# )["Index"].rename( +# columns={ +# 0: "census_release", +# 1: "table_name", +# 2: "description", +# 3: "population_coverage", +# 4: "variable", +# 5: "catalog_resolution", +# 6: "year", +# 7: "additional_url", +# 8: "population_coverage_and_variable", +# } +# ) +# add_metadata(context, catalog_reference, "Metadata for census tables") +# return catalog_reference + + +# def get_table_metadata( +# catalog_reference: pd.DataFrame, table_name: str +# ) -> dict[str, str]: +# """Returns a dict of table metadata for a given table name.""" +# rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] +# census_release = rows.loc[:, "census_release"].unique()[0] +# description = rows.loc[:, "description"].unique()[0] +# population_coverage = rows.loc[:, "population_coverage"].unique()[0] +# variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) +# catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] +# year = int(rows.loc[:, "year"].unique()[0]) +# return { +# "census_release": census_release, +# "description": description, +# "population_coverage": population_coverage, +# "variables": variables, +# "catalog_resolution": catalog_resolution, +# "year": str(year), +# "human_readable_name": f"{description} ({population_coverage})", +# } + + +# def get_table_name(file_name: str) -> str: +# return file_name.rsplit(".csv")[0] + + +# @asset +# def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) 
-> pd.DataFrame: +# """Creates a catalog of the individual census tables from all data sources.""" +# records = [] +# for data_source in DATA_SOURCES: +# resolution = data_source["resolution"] +# source = data_source["source"] +# url = data_source["url"] +# zip_file_name = source_to_zip(source, url) +# with zipfile.ZipFile(zip_file_name) as zip_ref: +# for file_name in zip_ref.namelist(): +# # Get table name +# table_name = get_table_name(file_name) + +# # Skip bulk output files and missing tables from catalog_reference +# if ( +# "bulk_output" in file_name.lower() +# or catalog_reference.loc[:, "table_name"].ne(table_name).all() +# ): +# continue + +# # Get table metadata +# table_metadata = get_table_metadata(catalog_reference, table_name) + +# # Get source release metadata if available +# source_data_release = sources.get( +# table_metadata["census_release"], None +# ) +# source_data_release_id = ( +# None if source_data_release is None else source_data_release.id +# ) + +# # Create a record for each census table use same keys as MetricMetadata +# # where possible since this makes it simpler to populate derived +# # metrics downstream +# record = { +# "resolution": resolution, +# "catalog_resolution": table_metadata["catalog_resolution"], +# "source": source, +# "url": url, +# "file_name": Path(source) / file_name, +# "table_name": table_name, +# "year": table_metadata["year"], +# # Use constructed name of description and coverage +# "human_readable_name": table_metadata["human_readable_name"], +# "source_metric_id": None, +# # Use catalog_reference description +# "description": table_metadata["description"], +# "hxl_tag": None, +# "metric_parquet_file_url": None, +# "parquet_column_name": None, +# "parquet_margin_of_error_column": None, +# "parquet_margin_of_error_file": None, +# "potential_denominator_ids": None, +# "parent_metric_id": None, +# # TODO: check this is not an ID but a name +# "source_data_release_id": source_data_release_id, +# "source_download_url": url, +# # TODO: what should this be? +# "source_archive_file_path": None, +# "source_documentation_url": URL_CATALOG, +# } +# context.log.debug(record) +# records.append(record) +# zip_ref.extract(file_name, Path(cache_dir) / source) + +# # TODO: check if required +# for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): +# context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) + +# # Create a dynamic partition for the datasets listed in the catalog +# catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) +# catalog_df["partition_key"] = ( +# catalog_df[["year", "resolution", "table_name"]] +# .astype(str) +# .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) +# ) +# # TODO: consider filtering here based on a set of keys to keep derived from +# # config (i.e. 
backend/frontend modes) +# context.instance.add_dynamic_partitions( +# partitions_def_name=PARTITIONS_DEF_NAME, +# # To ensure this is unique, prepend the resolution, +# partition_keys=catalog_df.loc[ +# catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), +# "partition_key", +# ].to_list(), +# ) +# context.add_output_metadata( +# metadata={ +# "num_records": len(catalog_df), +# "ignored_datasets": "", +# "columns": MetadataValue.md( +# "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) +# ), +# "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), +# "preview": MetadataValue.md(catalog_df.to_markdown()), +# } +# ) +# return catalog_df + + +# def get_table(context, table_details) -> pd.DataFrame: +# table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) +# add_metadata(context, table_df, table_details["partition_key"].iloc[0]) +# return table_df + + +# @asset(partitions_def=dataset_node_partition) +# def individual_census_table( +# context, catalog_as_dataframe: pd.DataFrame +# ) -> pd.DataFrame: +# """Creates individual census tables as dataframe.""" +# partition_key = context.asset_partition_key_for_output() +# context.log.info(partition_key) +# table_details = catalog_as_dataframe.loc[ +# catalog_as_dataframe["partition_key"].isin([partition_key]) +# ] +# context.log.info(table_details) +# return get_table(context, table_details) + + +# subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] +# subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) +# subset_partition = StaticPartitionsDefinition(subset_partition_keys) From 30af440d18655d34134ddbc3d531af8f8af1f86a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 21:34:19 +0100 Subject: [PATCH 28/60] Add update to use country class --- python/popgetter/assets/__init__.py | 4 +- python/popgetter/assets/scotland/__init__.py | 617 +++++++++++++++++-- 2 files changed, 570 insertions(+), 51 deletions(-) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index 55e91dc..0cbb5cf 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations -from . import be, ni, uk, us +from . 
import be, ni, scotland, uk, us -countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us]] +countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us, scotland]] __all__ = ["countries"] diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 0a96e80..a52207d 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -1,46 +1,32 @@ #!/usr/bin/python3 from __future__ import annotations +import urllib.parse as urlparse +from collections.abc import Callable +from dataclasses import dataclass from datetime import date from pathlib import Path +from typing import ClassVar +import geopandas as gpd +import matplotlib.pyplot as plt +import pandas as pd import requests +import zipfile_deflate64 as zipfile from dagster import ( - asset, + MetadataValue, ) +from icecream import ic -from popgetter.metadata import CountryMetadata, DataPublisher, SourceDataRelease - -country: CountryMetadata = CountryMetadata( - name_short_en="Scotland", - name_official="Scotland", - iso3="GBR", - iso2="GB", - iso3166_2="GB-SCT", -) - -publisher: DataPublisher = DataPublisher( - name="National Records of Scotland", - url="https://www.nrscotland.gov.uk/", - description="National Records of Scotland (NRS) is a Non-Ministerial Department of " - "the Scottish Government. Our purpose is to collect, preserve and " - "produce information about Scotland's people and history and make it " - "available to inform current and future generations.", - countries_of_interest=[country.id], +from popgetter.assets.country import Country +from popgetter.metadata import ( + CountryMetadata, + DataPublisher, + GeometryMetadata, + MetricMetadata, + SourceDataRelease, ) - - -@asset() -def country_metadata() -> CountryMetadata: - """Returns a CountryMetadata of metadata about the country.""" - return country - - -@asset() -def publisher_metadata(): - """Returns a DataPublisher of metadata about the publisher.""" - return publisher - +from popgetter.utils import add_metadata, markdown_from_plot # From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32 REQUIRED_TABLES = [ @@ -60,7 +46,7 @@ def publisher_metadata(): CENSUS_COLLECTION_DATE = date(2011, 3, 27) CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) -sources: dict[str, SourceDataRelease] = { +SOURCE_DATA_RELEASES: dict[str, SourceDataRelease] = { "3A": SourceDataRelease( name="Census 2011: Release 3A", date_published=date(2014, 2, 27), @@ -70,11 +56,12 @@ def publisher_metadata(): collection_period_end=CENSUS_COLLECTION_DATE, expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + # geography_file="TBC", + # geography_level="TBC", + geometry_metadata_id="TBC", + # countries_of_interest=[country.id], ), "3I": SourceDataRelease( name="Census 2011: Release 3I", @@ -85,11 +72,12 @@ def publisher_metadata(): collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + # geography_file="TBC", + # 
geography_level="TBC", + geometry_metadata_id="TBC", + # countries_of_interest=[country.id], ), "2A": SourceDataRelease( name="Census 2011: Release 2A", @@ -100,11 +88,12 @@ def publisher_metadata(): collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + # geography_file="TBC", + # geography_level="TBC", + geometry_metadata_id="", + # countries_of_interest=[country.id], ), "3C": SourceDataRelease( name="Census 2011: Release 3C", @@ -115,11 +104,12 @@ def publisher_metadata(): collection_period_end=date(2011, 10, 22), expect_next_update=date(2022, 1, 1), url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", - data_publisher_id=publisher.id, + data_publisher_id="TBD", description="TBC", - geography_file="TBC", - geography_level="TBC", - countries_of_interest=[country.id], + geometry_metadata_id="", + # geography_file="TBC", + # geography_level="TBC", + # countries_of_interest=[country.id], ), } @@ -143,3 +133,532 @@ def download_file( with Path(file_name).open("wb") as fp: fp.write(r.content) return file_name + + +URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" +URL1 = "https://www.scotlandscensus.gov.uk/" +URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" +URL_LOOKUP = ( + "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" +) +URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" +URL_CATALOG = ( + "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" +) + + +DATA_SOURCES = [ + { + "source": "Council Area blk", + "resolution": "LAD", + "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", + }, + { + "source": "SNS Data Zone 2011 blk", + "resolution": "LSOA11", + "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", + }, + { + "source": "Output Area blk", + "resolution": "OA11", + "url": URL2 + urlparse.quote("Output Area blk") + ".zip", + }, +] + + +@dataclass +class ScotlandGeometryLevel: + level: str + hxl_tag: str + geo_id_column: str + census_table_column: str + name_columns: dict[str, str] # keys = language codes, values = column names + url: str + lookup_url: str | None + lookup_sheet: str | None + left_on: str | None + right_on: str | None + + +SCOTLAND_GEO_LEVELS = { + "OA11": ScotlandGeometryLevel( + level="OA11", + hxl_tag="TBD", + geo_id_column="OA_CODE", + census_table_column="TODO", + # census_table_column="Census 2021 Data Zone Code", + name_columns={"en": "OA_CODE"}, # TODO + # url=URL_SHAPEFILE, + url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", + lookup_url=None, + lookup_sheet=None, + left_on=None, + right_on=None, + ) +} + + +# cache_dir = tempfile.mkdtemp() +cache_dir = "./cache" + + +@dataclass +class DerivedColumn: + hxltag: str + filter_func: Callable[[pd.DataFrame], pd.DataFrame] + output_column_name: str + human_readable_name: str + + +@dataclass +class SourceTable: + hxltag: str + geo_level: str + geo_column: str + source_column: str + + +# Config for each partition to be derived +age_code = "`Age Code`" +sex_label = "`Sex Label`" +DERIVED_COLUMNS = [ + DerivedColumn( + hxltag="#population+children+age5_17", + filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 
18"), + output_column_name="children_5_17", + human_readable_name="Children aged 5 to 17", + ), + DerivedColumn( + hxltag="#population+infants+age0_4", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), + output_column_name="infants_0_4", + human_readable_name="Infants aged 0 to 4", + ), + DerivedColumn( + hxltag="#population+children+age0_17", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), + output_column_name="children_0_17", + human_readable_name="Children aged 0 to 17", + ), + DerivedColumn( + hxltag="#population+adults+f", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_label} == 'Female'" + ), + output_column_name="adults_f", + human_readable_name="Female adults", + ), + DerivedColumn( + hxltag="#population+adults+m", + filter_func=lambda df: df.query(f"{age_code} >= 18 and {sex_label} == 'Male'"), + output_column_name="adults_m", + human_readable_name="Male adults", + ), + DerivedColumn( + hxltag="#population+adults", + filter_func=lambda df: df.query(f"{age_code} >= 18"), + output_column_name="adults", + human_readable_name="Adults", + ), + DerivedColumn( + hxltag="#population+ind", + filter_func=lambda df: df, + output_column_name="individuals", + human_readable_name="Total individuals", + ), +] + +TABLES_TO_PROCESS: list[str] = [ + "QS103SC", + "QS104SC", + "KS201SC", + "DC1117SC", + "DC2101SC", + "DC6206SC", + "LC1117SC", +] + +PARTITIONS_TO_PUBLISH: list[str] = ["2011/OA11/LC1117SC"] + + +DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = { + PARTITIONS_TO_PUBLISH[0]: DERIVED_COLUMNS, +} + + +class Scotland(Country): + key_prefix: str = "uk-scotland" + geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) + tables_to_process: list[str] | None = TABLES_TO_PROCESS + + def _catalog(self, context) -> pd.DataFrame: + """Creates a catalog of the individual census tables from all data sources.""" + + def source_to_zip(source_name: str, url: str) -> Path: + """Downloads if necessary and returns the name of the locally cached zip file + of the source data (replacing spaces with _)""" + file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") + return download_file(cache_dir, url, file_name) + + def get_table_name(file_name: str) -> str: + return file_name.rsplit(".csv")[0] + + def get_table_metadata( + catalog_reference: pd.DataFrame, table_name: str + ) -> dict[str, str]: + """Returns a dict of table metadata for a given table name.""" + rows = catalog_reference.loc[ + catalog_reference.loc[:, "table_name"].eq(table_name) + ] + census_release = rows.loc[:, "census_release"].unique()[0] + description = rows.loc[:, "description"].unique()[0] + population_coverage = rows.loc[:, "population_coverage"].unique()[0] + variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) + catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] + year = int(rows.loc[:, "year"].unique()[0]) + return { + "census_release": census_release, + "description": description, + "population_coverage": population_coverage, + "variables": variables, + "catalog_resolution": catalog_resolution, + "year": str(year), + "human_readable_name": f"{description} ({population_coverage})", + } + + # Download catalog reference + catalog_reference = pd.read_excel( + URL_CATALOG, + sheet_name=None, + header=None, + storage_options={"User-Agent": "Mozilla/5.0"}, + )["Index"].rename( + columns={ + 0: "census_release", + 1: "table_name", + 2: "description", + 3: "population_coverage", + 4: "variable", + 5: 
"catalog_resolution", + 6: "year", + 7: "additional_url", + 8: "population_coverage_and_variable", + } + ) + # Remove all keys + self.remove_all_partition_keys(context) + + records = [] + for data_source in DATA_SOURCES: + resolution = data_source["resolution"] + source = data_source["source"] + url = data_source["url"] + zip_file_name = source_to_zip(source, url) + with zipfile.ZipFile(zip_file_name) as zip_ref: + for file_name in zip_ref.namelist(): + # Get table name + table_name = get_table_name(file_name) + + # Skip bulk output files and missing tables from catalog_reference + if ( + "bulk_output" in file_name.lower() + or catalog_reference.loc[:, "table_name"].ne(table_name).all() + ): + continue + + # Get table metadata + table_metadata = get_table_metadata(catalog_reference, table_name) + + # Get source release metadata if available + source_data_release = SOURCE_DATA_RELEASES.get( + table_metadata["census_release"], None + ) + source_data_release_id = ( + None if source_data_release is None else source_data_release.id + ) + + # Create a record for each census table use same keys as MetricMetadata + # where possible since this makes it simpler to populate derived + # metrics downstream + record = { + "resolution": resolution, + "catalog_resolution": table_metadata["catalog_resolution"], + "source": source, + "url": url, + "file_name": Path(source) / file_name, + "table_name": table_name, + "year": table_metadata["year"], + # Use constructed name of description and coverage + "human_readable_name": table_metadata["human_readable_name"], + "source_metric_id": None, + # Use catalog_reference description + "description": table_metadata["description"], + "hxl_tag": None, + "metric_parquet_file_url": None, + "parquet_column_name": None, + "parquet_margin_of_error_column": None, + "parquet_margin_of_error_file": None, + "potential_denominator_ids": None, + "parent_metric_id": None, + # TODO: check this is not an ID but a name + "source_data_release_id": source_data_release_id, + "census_release": table_metadata["census_release"], + "source_download_url": url, + # TODO: what should this be? + "source_archive_file_path": None, + "source_documentation_url": URL_CATALOG, + } + context.log.debug(record) + records.append(record) + zip_ref.extract(file_name, Path(cache_dir) / source) + + # Create a dynamic partition for the datasets listed in the catalog + catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) + catalog_df["partition_key"] = ( + catalog_df[["year", "resolution", "table_name"]] + .astype(str) + .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) + ) + + # TODO: add filter for prod vs. 
dev mode
+        self.add_partition_keys(context, catalog_df["partition_key"].to_list())
+        context.add_output_metadata(
+            metadata={
+                "num_records": len(catalog_df),
+                "ignored_datasets": "",
+                "columns": MetadataValue.md(
+                    "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()])
+                ),
+                "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()),
+                "preview": MetadataValue.md(catalog_df.to_markdown()),
+            }
+        )
+        return catalog_df
+
+    def _country_metadata(self, _context) -> CountryMetadata:
+        return CountryMetadata(
+            name_short_en="Scotland",
+            name_official="Scotland",
+            iso3="GBR",
+            iso2="GB",
+            iso3166_2="GB-SCT",
+        )
+
+    def _data_publisher(
+        self, _context, country_metadata: CountryMetadata
+    ) -> DataPublisher:
+        return DataPublisher(
+            name="National Records of Scotland",
+            url="https://www.nrscotland.gov.uk/",
+            description="National Records of Scotland (NRS) is a Non-Ministerial Department of "
+            "the Scottish Government. Our purpose is to collect, preserve and "
+            "produce information about Scotland's people and history and make it "
+            "available to inform current and future generations.",
+            countries_of_interest=[country_metadata.id],
+        )
+
+    def _geometry(
+        self, context
+    ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]:
+        """Gets the shapefile for each configured geography level."""
+        geometries_to_return = []
+        for level_details in SCOTLAND_GEO_LEVELS.values():
+            # TODO: get correct values
+            geometry_metadata = GeometryMetadata(
+                validity_period_start=CENSUS_COLLECTION_DATE,
+                validity_period_end=CENSUS_COLLECTION_DATE,
+                level=level_details.level,
+                hxl_tag=level_details.hxl_tag,
+            )
+            file_name = download_file(cache_dir, level_details.url)
+            region_geometries_raw: gpd.GeoDataFrame = gpd.read_file(
+                f"zip://{file_name}"
+            )
+            if level_details.lookup_url is not None:
+                lookup = pd.read_excel(
+                    level_details.lookup_url, sheet_name=level_details.lookup_sheet
+                )
+                region_geometries_raw = region_geometries_raw.merge(
+                    lookup,
+                    left_on=level_details.left_on,
+                    right_on=level_details.right_on,
+                    how="outer",
+                )
+
+            region_geometries_raw = region_geometries_raw.dissolve(
+                by=level_details.geo_id_column
+            ).reset_index()
+
+            context.log.debug(ic(region_geometries_raw.head()))
+            region_geometries = region_geometries_raw.rename(
+                columns={level_details.geo_id_column: "GEO_ID"}
+            ).loc[:, ["geometry", "GEO_ID"]]
+            region_names = (
+                region_geometries_raw.rename(
+                    columns={
+                        level_details.geo_id_column: "GEO_ID",
+                        level_details.name_columns["en"]: "en",
+                    }
+                )
+                .loc[:, ["GEO_ID", "en"]]
+                .drop_duplicates()
+            )
+            geometries_to_return.append(
+                (geometry_metadata, region_geometries, region_names)
+            )
+
+        # Add output metadata
+        first_metadata, first_gdf, first_names = geometries_to_return[0]
+        first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID")
+        ax = first_joined_gdf.plot(column="en", legend=False)
+        ax.set_title(f"Scotland 2011 {first_metadata.level}")
+        md_plot = markdown_from_plot(plt)
+        context.add_output_metadata(
+            metadata={
+                "all_geom_levels": MetadataValue.md(
+                    ",".join(
+                        [metadata.level for metadata, _, _ in geometries_to_return]
+                    )
+                ),
+                "first_geometry_plot": MetadataValue.md(md_plot),
+                "first_names_preview": MetadataValue.md(
+                    first_names.head().to_markdown()
+                ),
+            }
+        )
+
+        return geometries_to_return
+
+    @staticmethod
+    def _get_geo_level_and_source_data_release(
+        geo_level: str, census_release: str
+    ) -> str:
+        return geo_level + "_" + census_release
+
+    def _source_data_releases(
+        self,
+        _context,
+        geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
+        data_publisher: DataPublisher,
+        # TODO: consider version without inputs so only output type specified
+        # **kwargs,
+    ) -> dict[str, SourceDataRelease]:
+        source_data_releases = {}
+        for geo_metadata, _, _ in geometry:
+            for (
+                source_data_release_id,
+                source_data_release,
+            ) in SOURCE_DATA_RELEASES.items():
+                source_data_release_new: SourceDataRelease = SourceDataRelease(
+                    name=source_data_release.name,
+                    date_published=source_data_release.date_published,
+                    reference_period_start=source_data_release.reference_period_start,
+                    reference_period_end=source_data_release.reference_period_end,
+                    collection_period_start=source_data_release.collection_period_start,
+                    collection_period_end=source_data_release.collection_period_end,
+                    expect_next_update=source_data_release.expect_next_update,
+                    url=source_data_release.url,
+                    data_publisher_id=data_publisher.id,
+                    description=source_data_release.description,
+                    geometry_metadata_id=geo_metadata.id,
+                )
+                combined_level_and_release_id = (
+                    self._get_geo_level_and_source_data_release(
+                        geo_metadata.level, source_data_release_id
+                    )
+                )
+                source_data_releases[
+                    combined_level_and_release_id
+                ] = source_data_release_new
+        return source_data_releases
+
+    @staticmethod
+    def get_table(context, table_details) -> pd.DataFrame:
+        table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0])
+        add_metadata(context, table_df, table_details["partition_key"].iloc[0])
+        return table_df
+
+    def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame:
+        """Creates an individual census table as a dataframe."""
+        partition_key = context.asset_partition_key_for_output()
+        context.log.info(partition_key)
+        table_details = catalog.loc[catalog["partition_key"].isin([partition_key])]
+        context.log.info(table_details)
+        return self.get_table(context, table_details)
+
+    # subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"]
+    # subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys)
+    # subset_partition = StaticPartitionsDefinition(subset_partition_keys)
+
+    @staticmethod
+    def census_table_metadata(
+        catalog_row: dict[str, str],
+        source_table: SourceTable,
+        source_data_releases: dict[str, SourceDataRelease],
+    ) -> MetricMetadata:
+        return MetricMetadata(
+            human_readable_name=catalog_row["human_readable_name"],
+            source_download_url=catalog_row["source_download_url"],
+            source_archive_file_path=catalog_row["source_archive_file_path"],
+            source_documentation_url=catalog_row["source_documentation_url"],
+            source_data_release_id=source_data_releases[source_table.geo_level].id,
+            # TODO - this is a placeholder
+            parent_metric_id="unknown_at_this_stage",
+            potential_denominator_ids=None,
+            parquet_margin_of_error_file=None,
+            parquet_margin_of_error_column=None,
+            parquet_column_name=source_table.source_column,
+            # TODO - this is a placeholder
+            metric_parquet_path="unknown_at_this_stage",
+            hxl_tag=source_table.hxltag,
+            description=catalog_row["description"],
+            source_metric_id=source_table.hxltag,
+        )
+
+    def _source_metric_metadata(
+        self,
+        context,
+        catalog: pd.DataFrame,
+        source_data_releases: dict[str, SourceDataRelease],
+    ) -> MetricMetadata:
+        partition_key = context.partition_key
+        catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict(
+            orient="records"
+        )[0]
+
+        geo_level = partition_key.split("/")[1]
+        source_table = SourceTable(
+            # TODO: work out how to set this programmatically
+            hxltag="TBD",
+            geo_level=geo_level,
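+            # Note: partition keys here take the form "2011/OA11/LC1117SC"
+            # (year/geo level/table name), so the geo level bound above is the
+            # middle component; a sketch:
+            #   "2011/OA11/LC1117SC".split("/")[1] == "OA11"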
geo_column=SCOTLAND_GEO_LEVELS[geo_level].geo_id_column, + # TODO: update this + source_column="Count", + ) + + return self.census_table_metadata( + catalog_row, + source_table, + source_data_releases, + ) + + def _derived_metrics( + self, + context, + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, + ) -> tuple[list[MetricMetadata], pd.DataFrame]: + ... + + +# Create assets +scotland = Scotland() +country_metadata = scotland.create_country_metadata() +data_publisher = scotland.create_data_publisher() +geometry = scotland.create_geometry() +source_data_releases = scotland.create_source_data_releases() +catalog = scotland.create_catalog() +census_tables = scotland.create_census_tables() +source_metric_metadata = scotland.create_source_metric_metadata() +derived_metrics = scotland.create_derived_metrics() +metrics = scotland.create_metrics() From 479324ce3785e4779d60ec0c2ef468e9517bc586 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 20 Jun 2024 22:12:42 +0100 Subject: [PATCH 29/60] Fix geometry --- python/popgetter/assets/scotland/__init__.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index a52207d..4c4e954 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -287,7 +287,7 @@ class SourceTable: class Scotland(Country): - key_prefix: str = "uk-scotland" + key_prefix: str = "scotland" geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) tables_to_process: list[str] | None = TABLES_TO_PROCESS @@ -377,6 +377,13 @@ def get_table_metadata( None if source_data_release is None else source_data_release.id ) + # Skip if not required + if ( + self.tables_to_process is not None + and table_name not in self.tables_to_process + ): + continue + # Create a record for each census table use same keys as MetricMetadata # where possible since this makes it simpler to populate derived # metrics downstream @@ -474,6 +481,7 @@ def _geometry( region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( f"zip://{file_name}" ) + ic(region_geometries_raw.head()) if level_details.lookup_url is not None: lookup = pd.read_excel( level_details.lookup_url, sheet_name=level_details.lookup_sheet @@ -493,11 +501,16 @@ def _geometry( region_geometries = region_geometries_raw.rename( columns={level_details.geo_id_column: "GEO_ID"} ).loc[:, ["geometry", "GEO_ID"]] + + # Note: Make copy of IDs as names for now + region_geometries_raw["GEO_ID_2"] = region_geometries_raw[ + level_details.geo_id_column + ].copy() region_names = ( region_geometries_raw.rename( columns={ level_details.geo_id_column: "GEO_ID", - level_details.name_columns["en"]: "en", + "GEO_ID_2": "en", } ) .loc[:, ["GEO_ID", "en"]] From 582f57887769536c001672ad93c121825adc5d8d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 13:28:02 +0100 Subject: [PATCH 30/60] Revise geographies with overload providing lookups --- python/popgetter/assets/scotland/__init__.py | 391 +++++++++++++++---- 1 file changed, 306 insertions(+), 85 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 4c4e954..74eca9c 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -5,6 +5,7 @@ from collections.abc import Callable from dataclasses import dataclass from datetime import date +from functools import reduce from 
pathlib import Path from typing import ClassVar @@ -15,16 +16,19 @@ import zipfile_deflate64 as zipfile from dagster import ( MetadataValue, + asset, ) from icecream import ic from popgetter.assets.country import Country +from popgetter.cloud_outputs import send_to_geometry_sensor from popgetter.metadata import ( CountryMetadata, DataPublisher, GeometryMetadata, MetricMetadata, SourceDataRelease, + metadata_to_dataframe, ) from popgetter.utils import add_metadata, markdown_from_plot @@ -150,17 +154,20 @@ def download_file( DATA_SOURCES = [ { "source": "Council Area blk", - "resolution": "LAD", + # "resolution": "LAD", + "resolution": "CouncilArea2011", "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", }, { "source": "SNS Data Zone 2011 blk", - "resolution": "LSOA11", + # "resolution": "LSOA11", + "resolution": "DataZone2011", "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", }, { "source": "Output Area blk", - "resolution": "OA11", + # "resolution": "OA11", + "resolution": "OutputArea2011", "url": URL2 + urlparse.quote("Output Area blk") + ".zip", }, ] @@ -181,20 +188,62 @@ class ScotlandGeometryLevel: SCOTLAND_GEO_LEVELS = { - "OA11": ScotlandGeometryLevel( - level="OA11", + "OutputArea2011": ScotlandGeometryLevel( + level="OutputArea2011", hxl_tag="TBD", geo_id_column="OA_CODE", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "OA_CODE"}, # TODO + name_columns={"en": "OutputArea2011Name"}, # TODO # url=URL_SHAPEFILE, url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", lookup_url=None, lookup_sheet=None, - left_on=None, - right_on=None, - ) + left_on="OA_CODE", + right_on="OutputArea2011Code", + ), + # LSOA11 + "DataZone2011": ScotlandGeometryLevel( + level="DataZone2011", + hxl_tag="TBD", + geo_id_column="DataZone", + census_table_column="TODO", + # census_table_column="Census 2021 Data Zone Code", + name_columns={"en": "Name"}, + url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", + lookup_url=None, + lookup_sheet=None, + left_on="DataZone", + right_on="DataZone2011Code", + ), + # "MSOA11": ScotlandGeometryLevel( + # level="OA11", + # hxl_tag="TBD", + # geo_id_column="OA_CODE", + # census_table_column="TODO", + # # census_table_column="Census 2021 Data Zone Code", + # name_columns={"en": "OA_CODE"}, + # # url=URL_SHAPEFILE, + # url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", + # lookup_url=None, + # lookup_sheet=None, + # left_on=None, + # right_on=None, + # ), + # LAD + "CouncilArea2011": ScotlandGeometryLevel( + level="CouncilArea2011", + hxl_tag="TBD", + geo_id_column="CouncilArea2011Code", + census_table_column="TODO", + # census_table_column="Census 2021 Data Zone Code", + name_columns={"en": "CouncilArea2011Name"}, + url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", + lookup_url=None, + lookup_sheet=None, + left_on="DataZone", + right_on="DataZone2011Code", + ), } @@ -278,7 +327,7 @@ class SourceTable: "LC1117SC", ] -PARTITIONS_TO_PUBLISH: list[str] = ["2011/OA11/LC1117SC"] +PARTITIONS_TO_PUBLISH: list[str] = ["2011/OutputArea2011/LC1117SC"] DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = { @@ -286,6 +335,10 @@ class SourceTable: } +def get_source_data_release(geo_level: str, cenesus_release: str) -> str: + return geo_level + "_" + cenesus_release + + class Scotland(Country): key_prefix: str = "scotland" geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) @@ -464,89 +517,138 @@ def 
_data_publisher( countries_of_interest=[country_metdata.id], ) - def _geometry( - self, context - ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: - """Gets the shape file for OA11 resolution.""" - geometries_to_return = [] - for level_details in SCOTLAND_GEO_LEVELS.values(): - # TODO: get correct values - geometry_metadata = GeometryMetadata( - validity_period_start=CENSUS_COLLECTION_DATE, - validity_period_end=CENSUS_COLLECTION_DATE, - level=level_details.level, - hxl_tag=level_details.hxl_tag, + def create_lookup(self): + @asset(key_prefix=self.key_prefix) + def lookup(context): + url = "https://www.nrscotland.gov.uk/files/geography/2011-census/geog-2011-cen-supp-info-oldoa-newoa-lookup.xls" + df_oa_to_council = ( + pd.read_excel(url, sheet_name="2011OA_Lookup", storage_options=HEADERS) + .iloc[:-2] + .loc[:, ["OutputArea2011Code", "CouncilArea2011Code"]] + ) + url = "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" + df_oa_to_dz_iz = pd.read_excel( + url, sheet_name="OA_DZ_IZ_2011 Lookup", storage_options=HEADERS + ) + df_dz_nm = pd.read_excel( + url, sheet_name="DataZone2011Lookup", storage_options=HEADERS + ) + df_iz_nm = pd.read_excel( + url, sheet_name="IntermediateZone2011Lookup", storage_options=HEADERS + ) + combined = ( + df_oa_to_council.merge(df_oa_to_dz_iz, on=["OutputArea2011Code"]) + .merge(df_dz_nm, on=["DataZone2011Code"]) + .merge(df_iz_nm, on=["IntermediateZone2011Code"]) ) - file_name = download_file(cache_dir, level_details.url) - region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( - f"zip://{file_name}" + combined["OutputArea2011Name"] = combined["OutputArea2011Code"].copy() + df_council_name = pd.read_excel( + "https://www.nrscotland.gov.uk/files//geography/2011-census/oa2011-to-hba2014.xls", + sheet_name="HealthBoard2014_Council2011", + storage_options=HEADERS, ) - ic(region_geometries_raw.head()) - if level_details.lookup_url is not None: - lookup = pd.read_excel( - level_details.lookup_url, sheet_name=level_details.lookup_sheet + combined = combined.merge( + df_council_name[["CouncilArea2011Code", "NRSCouncilAreaName"]], + on="CouncilArea2011Code", + ).rename(columns={"NRSCouncilAreaName": "CouncilArea2011Name"}) + context.add_output_metadata( + metadata={ + "lookup_shape": f"{combined.shape[0]} rows x {combined.shape[1]} columns", + "lookup_preview": MetadataValue.md(combined.head().to_markdown()), + }, + ) + return combined + + return lookup + + def create_geometry(self): + """ + Creates an asset providing a list of geometries, metadata and names + at different resolutions. 
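+
+        Each returned element pairs a GeometryMetadata record with a
+        GeoDataFrame of dissolved boundaries (columns "GEO_ID" and
+        "geometry") and a DataFrame of region names keyed on "GEO_ID".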
+ """ + + @send_to_geometry_sensor + @asset(key_prefix=self.key_prefix) + def geometry( + context, lookup: pd.DataFrame + ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + """List of geometries, metadata and names at different resolutions.""" + geometries_to_return = [] + for level_details in SCOTLAND_GEO_LEVELS.values(): + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=CENSUS_COLLECTION_DATE, + validity_period_end=CENSUS_COLLECTION_DATE, + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + file_name = download_file(cache_dir, level_details.url) + region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( + f"zip://{file_name}" ) - region_geometries_raw = region_geometries_raw.merge( + context.log.debug(ic(region_geometries_raw.head())) + context.log.debug(ic(region_geometries_raw.columns)) + context.log.debug(ic(lookup.columns)) + region_geometries_merge = region_geometries_raw.merge( lookup, left_on=level_details.left_on, right_on=level_details.right_on, - how="outer", ) - region_geometries_raw = region_geometries_raw.dissolve( - by=level_details.geo_id_column - ).reset_index() - - context.log.debug(ic(region_geometries_raw.head())) - region_geometries = region_geometries_raw.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] - - # Note: Make copy of IDs as names for now - region_geometries_raw["GEO_ID_2"] = region_geometries_raw[ - level_details.geo_id_column - ].copy() - region_names = ( - region_geometries_raw.rename( - columns={ - level_details.geo_id_column: "GEO_ID", - "GEO_ID_2": "en", - } + region_geometries_merge = region_geometries_merge.dissolve( + by=level_details.geo_id_column + ).reset_index() + + context.log.debug(ic(region_geometries_merge.head())) + context.log.debug(ic(region_geometries_merge.columns)) + region_geometries = region_geometries_merge.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + + region_names = ( + region_geometries_merge.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + } + | { + value: key + for key, value in level_details.name_columns.items() + } + ) + .loc[:, ["GEO_ID", *list(level_details.name_columns.keys())]] + .drop_duplicates() ) - .loc[:, ["GEO_ID", "en"]] - .drop_duplicates() - ) - geometries_to_return.append( - (geometry_metadata, region_geometries, region_names) + geometries_to_return.append( + (geometry_metadata, region_geometries, region_names) + ) + + # Add output metadata + first_metadata, first_gdf, first_names = geometries_to_return[0] + first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + ax = first_joined_gdf.plot(column="en", legend=False) + ax.set_title(f"Scotland 2011 {first_metadata.level}") + md_plot = markdown_from_plot(plt) + context.add_output_metadata( + metadata={ + "all_geom_levels": MetadataValue.md( + ",".join( + [metadata.level for metadata, _, _ in geometries_to_return] + ) + ), + "first_geometry_plot": MetadataValue.md(md_plot), + "first_names_preview": MetadataValue.md( + first_names.head().to_markdown() + ), + } ) - # Add output metadata - first_metadata, first_gdf, first_names = geometries_to_return[0] - first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") - ax = first_joined_gdf.plot(column="en", legend=False) - ax.set_title(f"Scotland 2011 {first_metadata.level}") - md_plot = markdown_from_plot(plt) - context.add_output_metadata( - metadata={ - "all_geom_levels": MetadataValue.md( - ",".join( - [metadata.level for 
metadata, _, _ in geometries_to_return] - ) - ), - "first_geometry_plot": MetadataValue.md(md_plot), - "first_names_preview": MetadataValue.md( - first_names.head().to_markdown() - ), - } - ) + return geometries_to_return - return geometries_to_return + return geometry - @staticmethod - def _get_geo_level_and_source_data_release( - geo_level: str, cenesus_release: str - ) -> str: - return geo_level + "_" + cenesus_release + def _geometry(self, context): + # Not required as geometry overridden + pass def _source_data_releases( self, @@ -575,10 +677,8 @@ def _source_data_releases( description=source_data_release.description, geometry_metadata_id=geo_metadata.id, ) - combined_level_and_release_id = ( - self._get_geo_level_and_source_data_release( - geo_metadata.level, source_data_release_id - ) + combined_level_and_release_id = get_source_data_release( + geo_metadata.level, source_data_release_id ) source_data_releases[ combined_level_and_release_id @@ -614,7 +714,11 @@ def census_table_metadata( source_download_url=catalog_row["source_download_url"], source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=source_data_releases[source_table.geo_level].id, + source_data_release_id=source_data_releases[ + get_source_data_release( + source_table.geo_level, catalog_row["census_release"] + ) + ].id, # TODO - this is a placeholder parent_metric_id="unknown_at_this_stage", potential_denominator_ids=None, @@ -662,12 +766,129 @@ def _derived_metrics( source_metric_metadata: MetricMetadata, ) -> tuple[list[MetricMetadata], pd.DataFrame]: ... + SEP = "__" + partition_key = context.partition_key + source_mmd = source_metric_metadata + parquet_file_name = ( + "".join(c for c in partition_key if c.isalnum()) + ".parquet" + ) + derived_metrics, derived_mmd = [], [] + + # If derived metrics + # try: + # metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] + # source_column = source_mmd.parquet_column_name + # for metric_spec in metric_specs: + # new_table = ( + # census_tables.pipe(metric_spec.filter_func) + # .groupby(by="GEO_ID", as_index=True) + # .sum() + # .rename(columns={source_column: metric_spec.output_column_name}) + # .filter(items=["GEO_ID", metric_spec.output_column_name]) + # ) + # derived_metrics.append(new_table) + # new_mmd = source_mmd.copy() + # new_mmd.parent_metric_id = source_mmd.source_metric_id + # new_mmd.metric_parquet_path = parquet_file_name + # new_mmd.hxl_tag = metric_spec.hxltag + # new_mmd.parquet_column_name = metric_spec.output_column_name + # new_mmd.human_readable_name = metric_spec.human_readable_name + # derived_mmd.append(new_mmd) + # except KeyError: + # # No extra derived metrics specified for this partition -- only use + # # those from pivoted data + # pass + + # Batch + def make_pivot(df: pd.DataFrame) -> pd.DataFrame: + # TODO: reshape based on Unnamed: 1 to Unnamed N + pivot_cols = [ + col + for col in census_tables.columns + if col != "Unnamed: 0" and col.startswith("Unnamed: ") + ] + pivot = df.pivot_table( + index="Unnamed: 0", columns=pivot_cols, aggfunc="sum" + ) + + # FLattent multi-index + if isinstance(pivot.columns, pd.MultiIndex): + pivot.columns = [ + SEP.join(list(map(str, col))).strip() + for col in pivot.columns.to_numpy() + ] + # Ensure columns are string + else: + pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] + + pivot.index = pivot.index.rename("GEO_ID") + + return pivot + + new_table = make_pivot(census_tables) + 
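+            # The pivoted column labels are compound strings joined with SEP
+            # (e.g. "Females__16 to 17" for a sex-by-age table, assuming that
+            # table layout); out_cols below recovers one variable name per
+            # component by splitting the table description on " by ",
+            # reversed to match the order of the compound labels.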
out_cols = [ + "".join(x for x in col.title() if not x.isspace()) + for col in source_mmd.description.split(" by ")[::-1] + ] + + for metric_col in new_table.columns: + metric_df = new_table.loc[:, metric_col].to_frame() + ic(metric_df) + derived_metrics.append(metric_df) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + + # TODO: fix automating the hxltag + key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + def gen_hxltag(kv: dict[str, str]) -> str: + out = ["#population"] + for key, value in kv.items(): + out += [ + "".join(c for c in key if c.isalnum()) + + "_" + + "".join(c for c in value if c.isalnum()) + ] + return "+".join(out) + + new_mmd.hxl_tag = gen_hxltag(key_val) + new_mmd.parquet_column_name = metric_col + # TODO: Update after fixing hxltag + new_mmd.human_readable_name = "; ".join( + [ + f"Variable: '{key}'; Value: '{value}'" + for key, value in key_val.items() + ] + ) + derived_mmd.append(new_mmd) + + joined_metrics = reduce( + lambda left, right: left.merge( + right, on="GEO_ID", how="inner", validate="one_to_one" + ), + derived_metrics, + ) + + context.add_output_metadata( + metadata={ + "metadata_preview": MetadataValue.md( + metadata_to_dataframe(derived_mmd).head().to_markdown() + ), + "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + "metrics_preview": MetadataValue.md( + joined_metrics.head().to_markdown() + ), + }, + ) + return derived_mmd, joined_metrics # Create assets scotland = Scotland() country_metadata = scotland.create_country_metadata() data_publisher = scotland.create_data_publisher() +lookup = scotland.create_lookup() geometry = scotland.create_geometry() source_data_releases = scotland.create_source_data_releases() catalog = scotland.create_catalog() From 3a3ba280c31ab819d8eef59dd08b19bc42669870 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 14:41:22 +0100 Subject: [PATCH 31/60] Add dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index e575447..9608a27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ dependencies = [ "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages "openpyxl >=3.1.3", # For reading Excel files + "xlrd >=2.0.1", # For reading Excel files ] From 2281b6f9df9336f52abfd4b201cfd84483f79cfb Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 14:41:46 +0100 Subject: [PATCH 32/60] Fix arg, add todo --- python/popgetter/assets/ni/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 19d6d3b..d13de82 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -561,7 +561,7 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: # Ensure columns are string else: pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] - out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] + out_cols = [col.replace(end, "").strip() for col in pivot_cols] return out_cols, pivot # Pivot for codes and labels @@ -575,6 +575,7 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: new_mmd = source_mmd.copy() new_mmd.parent_metric_id = source_mmd.source_metric_id new_mmd.metric_parquet_path = parquet_file_name + # 
TODO: check this
             key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True))
 
             def gen_hxltag(kv: dict[str, str]) -> str:

From e518f207200c31b0a82c1876605a3edd80876e9d Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Sat, 22 Jun 2024 15:59:04 +0100
Subject: [PATCH 33/60] Update derived metrics

---
 python/popgetter/assets/scotland/__init__.py | 105 ++++++++++++-------
 1 file changed, 69 insertions(+), 36 deletions(-)

diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py
index 74eca9c..4731032 100755
--- a/python/popgetter/assets/scotland/__init__.py
+++ b/python/popgetter/assets/scotland/__init__.py
@@ -268,50 +268,57 @@ class SourceTable:
 
 
 # Config for each partition to be derived
-age_code = "`Age Code`"
+age_code = "`Age Category`"
 sex_label = "`Sex Label`"
+infants = ["0 to 4"]
+children_5_to_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"]
+children = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"]
+adults = ["18 to 19"] + [f"{i} to {i+4}" for i in range(20, 91, 5)] + ["95 and over"]
+people = ["All people"]
 DERIVED_COLUMNS = [
     DerivedColumn(
         hxltag="#population+children+age5_17",
-        filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"),
+        filter_func=lambda df: df.query(f"{age_code} in @children_5_to_17"),
         output_column_name="children_5_17",
         human_readable_name="Children aged 5 to 17",
     ),
     DerivedColumn(
         hxltag="#population+infants+age0_4",
-        filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"),
+        filter_func=lambda df: df.query(f"{age_code} in @infants"),
         output_column_name="infants_0_4",
         human_readable_name="Infants aged 0 to 4",
     ),
     DerivedColumn(
         hxltag="#population+children+age0_17",
-        filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"),
+        filter_func=lambda df: df.query(f"{age_code} in @children"),
         output_column_name="children_0_17",
         human_readable_name="Children aged 0 to 17",
     ),
     DerivedColumn(
         hxltag="#population+adults+f",
         filter_func=lambda df: df.query(
-            f"{age_code} >= 18 and {sex_label} == 'Female'"
+            f"{age_code} in @adults and {sex_label} == 'Female'"
         ),
         output_column_name="adults_f",
         human_readable_name="Female adults",
     ),
     DerivedColumn(
         hxltag="#population+adults+m",
-        filter_func=lambda df: df.query(f"{age_code} >= 18 and {sex_label} == 'Male'"),
+        filter_func=lambda df: df.query(
+            f"{age_code} in @adults and {sex_label} == 'Male'"
+        ),
         output_column_name="adults_m",
         human_readable_name="Male adults",
     ),
     DerivedColumn(
         hxltag="#population+adults",
-        filter_func=lambda df: df.query(f"{age_code} >= 18"),
+        filter_func=lambda df: df.query(f"{age_code} in @adults"),
        output_column_name="adults",
         human_readable_name="Adults",
     ),
     DerivedColumn(
         hxltag="#population+ind",
-        filter_func=lambda df: df,
+        filter_func=lambda df: df.query(f"{age_code} in @people"),
         output_column_name="individuals",
         human_readable_name="Total individuals",
     ),
@@ -775,36 +782,50 @@ def _derived_metrics(
         derived_metrics, derived_mmd = [], []
 
         # If derived metrics
-        # try:
-        #     metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key]
-        #     source_column = source_mmd.parquet_column_name
-        #     for metric_spec in metric_specs:
-        #         new_table = (
-        #             census_tables.pipe(metric_spec.filter_func)
-        #             .groupby(by="GEO_ID", as_index=True)
-        #             .sum()
-        #             .rename(columns={source_column: metric_spec.output_column_name})
-        #             .filter(items=["GEO_ID", metric_spec.output_column_name])
-        #         )
-        #         derived_metrics.append(new_table)
-        #         new_mmd = source_mmd.copy()
-        #         new_mmd.parent_metric_id = 
source_mmd.source_metric_id - # new_mmd.metric_parquet_path = parquet_file_name - # new_mmd.hxl_tag = metric_spec.hxltag - # new_mmd.parquet_column_name = metric_spec.output_column_name - # new_mmd.human_readable_name = metric_spec.human_readable_name - # derived_mmd.append(new_mmd) - # except KeyError: - # # No extra derived metrics specified for this partition -- only use - # # those from pivoted data - # pass + try: + metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] + + def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: + df_to_reshape = df_to_reshape.rename( + columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} + ).drop(columns=["All people"]) + df_to_reshape = df_to_reshape.melt( + ["GEO_ID", "Age Category"], var_name="Sex Label", value_name="Count" + ) + df_to_reshape["Sex Label"] = df_to_reshape["Sex Label"].map( + {"Males": "Male", "Females": "Female"} + ) + return df_to_reshape + + census_tables_for_derived_metrics = reshape(census_tables) + source_column = source_mmd.parquet_column_name + for metric_spec in metric_specs: + new_table = ( + census_tables_for_derived_metrics.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + except KeyError: + # No extra derived metrics specified for this partition -- only use + # those from pivoted data + pass # Batch def make_pivot(df: pd.DataFrame) -> pd.DataFrame: # TODO: reshape based on Unnamed: 1 to Unnamed N pivot_cols = [ col - for col in census_tables.columns + for col in df.columns if col != "Unnamed: 0" and col.startswith("Unnamed: ") ] pivot = df.pivot_table( @@ -826,10 +847,22 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: return pivot new_table = make_pivot(census_tables) - out_cols = [ - "".join(x for x in col.title() if not x.isspace()) - for col in source_mmd.description.split(" by ")[::-1] - ] + + # Split for description of metrics + exceptions = { + "Age by single year": ["Age by single year"], + "National Statistics Socio-economic Classification (NS-SeC) by ethnic group by sex by age": [ + "Ethnic group", + "Sex and Age", + "National Statistics Socio-economic Classification (NS-SeC)", + ], + } + if source_mmd.description not in exceptions: + split = source_mmd.description.split(" by ")[::-1] + else: + split = exceptions[source_mmd.description] + out_cols = ["".join(x for x in col.title() if not x.isspace()) for col in split] + context.log.debug(ic(out_cols)) for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() From 24a4dd5ec2dd48b1979ae3a08a030c5e155ff07e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 17:38:15 +0100 Subject: [PATCH 34/60] Fix non-integer cases --- python/popgetter/assets/scotland/__init__.py | 24 ++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 4731032..506bca8 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ 
-863,7 +863,8 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: split = exceptions[source_mmd.description] out_cols = ["".join(x for x in col.title() if not x.isspace()) for col in split] context.log.debug(ic(out_cols)) - + ic("----") + ic(new_table.columns) for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() ic(metric_df) @@ -903,6 +904,24 @@ def gen_hxltag(kv: dict[str, str]) -> str: derived_metrics, ) + def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: + for col in maybe_non_int_df: + if maybe_non_int_df[col].dtype == "object": + maybe_non_int_df[col] = ( + maybe_non_int_df[col] + .str.replace(",", "") + .str.replace("-", "0") + .fillna("0") + .astype(int) + ) + return maybe_non_int_df + + # Fix format + joined_metrics = make_int(joined_metrics) + + # Filter out whole country Scotland + joined_metrics = joined_metrics.loc[~joined_metrics.index.isin(["S92000003"])] + context.add_output_metadata( metadata={ "metadata_preview": MetadataValue.md( @@ -928,4 +947,5 @@ def gen_hxltag(kv: dict[str, str]) -> str: census_tables = scotland.create_census_tables() source_metric_metadata = scotland.create_source_metric_metadata() derived_metrics = scotland.create_derived_metrics() -metrics = scotland.create_metrics() +# Publish all partitions +metrics = scotland.create_metrics(None) From 86ba91b6dfe591f1c7e7db710fc89b278f11ee62 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 17:45:58 +0100 Subject: [PATCH 35/60] Remove obsolete modules --- .../assets/scotland/census_derived.py | 328 ------------------ .../assets/scotland/census_geometry.py | 17 - .../assets/scotland/census_tables.py | 291 ---------------- 3 files changed, 636 deletions(-) delete mode 100644 python/popgetter/assets/scotland/census_derived.py delete mode 100644 python/popgetter/assets/scotland/census_geometry.py delete mode 100644 python/popgetter/assets/scotland/census_tables.py diff --git a/python/popgetter/assets/scotland/census_derived.py b/python/popgetter/assets/scotland/census_derived.py deleted file mode 100644 index 43277a0..0000000 --- a/python/popgetter/assets/scotland/census_derived.py +++ /dev/null @@ -1,328 +0,0 @@ -# from __future__ import annotations - -# import geopandas as gpd -# import numpy as np -# import pandas as pd -# from dagster import ( -# AssetIn, -# AssetOut, -# MaterializeResult, -# MetadataValue, -# SpecificPartitionsPartitionMapping, -# StaticPartitionsDefinition, -# asset, -# multi_asset, -# ) -# from icecream import ic -# from matplotlib import pyplot as plt - -# from popgetter.utils import markdown_from_plot - -# from ...metadata import MetricMetadata -# from .census_tables import add_metadata, dataset_node_partition - - -# def get_lc1117sc_metric( -# lc1117sc: pd.DataFrame, col: str, output_col: str, subset: list[str] -# ) -> pd.DataFrame: -# lc1117sc_transformed = lc1117sc.rename( -# columns={"Unnamed: 0": "OA11CD", "Unnamed: 1": "Age Category"} -# ) -# lc1117sc_transformed = lc1117sc_transformed.loc[ -# ~lc1117sc_transformed["OA11CD"].str.startswith("S92"), : -# ] -# return ( -# lc1117sc_transformed.loc[ -# lc1117sc_transformed["Age Category"].isin(subset), -# ["OA11CD", col], -# ] -# .groupby("OA11CD") -# .agg("sum") -# .rename(columns={col: output_col}) -# ) - - -# ALL_PEOPLE = ["All people"] -# INFANTS_AGE_0_TO_4 = ["0 to 4"] -# CHILDREN_AGE_0_TO_17 = ["0 to 4", "5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -# CHILDREN_AGE_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14", "15", "16 to 17"] -# ADULTS = [ -# "18 to 19", -# 
"20 to 24", -# "25 to 29", -# "30 to 34", -# "35 to 39", -# "40 to 44", -# "45 to 49", -# "50 to 54", -# "55 to 59", -# "60 to 64", -# "65 to 69", -# "70 to 74", -# "75 to 79", -# "80 to 84", -# "85 to 89", -# "90 to 94", -# "95 and over", -# ] - -# needed_dataset_list = [ -# { -# # Population by OA11, Period: 2011 -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "#population+oa11+2011", -# # TODO: this partition key does not have a single column for source -# "source_column": "", -# } -# ] -# needed_dataset_partions_keys: list[str] = [ -# r["partition_key"] for r in needed_dataset_list -# ] -# needed_dataset_mapping = SpecificPartitionsPartitionMapping( -# needed_dataset_partions_keys -# ) -# needed_dataset_partition = StaticPartitionsDefinition(needed_dataset_partions_keys) - -# # Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) -# # TODO: add human readable names for each column as the MetricMetadata currently receives the -# # catalog row (table) human readable name. -# _derived_columns: list[dict] = [ -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_children_age5_17", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, CHILDREN_AGE_5_TO_17 -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_infants_age0_4", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, INFANTS_AGE_0_TO_4 -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_children_age0_17", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, CHILDREN_AGE_0_TO_17 -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_adults_f", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "Females", output_col, ADULTS -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_adults_m", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "Males", output_col, ADULTS -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_adults", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, ADULTS -# ), -# }, -# { -# "partition_key": "2011/OA11/LC1117SC", -# "hxltag": "population_ind", -# "filter_func": lambda df, output_col: get_lc1117sc_metric( -# df, "All people", output_col, ALL_PEOPLE -# ), -# }, -# ] - -# derived_columns = pd.DataFrame( -# _derived_columns, columns=["partition_key", "hxltag", "filter_func"] -# ) - - -# # record = { -# # "resolution": resolution, -# # "catalog_resolution": table_metadata["catalog_resolution"], -# # "source": source, -# # "url": url, -# # "file_name": Path(source) / file_name, -# # "table_name": table_name, -# # "year": table_metadata["year"], -# # # Use constructed name of description and coverage -# # "human_readable_name": table_metadata["human_readable_name"], -# # "source_metric_id": None, -# # # Use catalog_metadata description -# # "description": table_metadata["description"], -# # "hxl_tag": None, -# # "metric_parquet_file_url": None, -# # "parquet_column_name": None, -# # "parquet_margin_of_error_column": None, -# # "parquet_margin_of_error_file": None, -# # "potential_denominator_ids": None, -# # "parent_metric_id": None, -# # # TODO: check this is not an ID but a name -# # "source_data_release_id": table_metadata["census_release"], -# # "source_download_url": url, -# # # 
TODO: what should this be? -# # "source_archive_file_path": None, -# # "source_documentation_url": URL_CATALOG_METADATA, -# # } - - -# def census_table_metadata(catalog_row: dict) -> MetricMetadata: -# return MetricMetadata( -# human_readable_name=catalog_row["human_readable_name"], -# source_download_url=catalog_row["source_download_url"], -# source_archive_file_path=catalog_row["source_archive_file_path"], -# source_documentation_url=catalog_row["source_documentation_url"], -# source_data_release_id=catalog_row["source_data_release_id"], -# # TODO - this is a placeholder -# parent_metric_id="unknown_at_this_stage", -# potential_denominator_ids=None, -# parquet_margin_of_error_file=None, -# parquet_margin_of_error_column=None, -# # TODO: currently setting to rename the derived column name equal to 'hxltag' -# # and not related to the source_column -# # parquet_column_name=catalog_row["source_column"], -# parquet_column_name=catalog_row["hxltag"], -# # TODO - this is a placeholder -# metric_parquet_file_url="unknown_at_this_stage", -# hxl_tag=catalog_row["hxltag"], -# description=catalog_row["description"], -# source_metric_id=catalog_row["hxltag"], -# ) - - -# @asset( -# ins={ -# "catalog_as_dataframe": AssetIn(partition_mapping=needed_dataset_mapping), -# }, -# ) -# def filter_needed_catalog( -# context, needed_datasets, catalog_as_dataframe: pd.DataFrame -# ) -> pd.DataFrame: -# needed_df = needed_datasets.merge( -# catalog_as_dataframe, how="inner", on="partition_key" -# ) -# add_metadata(context, needed_df, "needed_df") -# return needed_df - - -# @asset -# def needed_datasets(context) -> pd.DataFrame: -# needed_df = pd.DataFrame( -# needed_dataset_list, -# columns=["partition_key", "hxltag", "source_column", "derived_columns"], -# dtype="string", -# ) -# add_metadata(context, needed_df, "needed_datasets") -# return needed_df - - -# @multi_asset( -# ins={ -# "individual_census_table": AssetIn(partition_mapping=needed_dataset_mapping), -# "filter_needed_catalog": AssetIn(), -# }, -# outs={ -# "source_table": AssetOut(), -# "source_mmd": AssetOut(), -# }, -# partitions_def=dataset_node_partition, -# ) -# def get_enriched_tables_scotland( -# context, individual_census_table, filter_needed_catalog -# ) -> tuple[pd.DataFrame, MetricMetadata]: -# partition_keys = context.asset_partition_keys_for_input( -# input_name="individual_census_table", -# ) -# output_partition = context.asset_partition_key_for_output("source_table") -# ic(partition_keys) -# ic(len(partition_keys)) -# ic(output_partition) -# ic(type(output_partition)) -# ic(individual_census_table) -# if output_partition not in partition_keys: -# err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" -# raise ValueError(err_msg) - -# result_df = individual_census_table -# catalog_row = filter_needed_catalog[ -# filter_needed_catalog["partition_key"].eq(output_partition) -# ] -# catalog_row = catalog_row.to_dict(orient="index") -# catalog_row = catalog_row.popitem()[1] -# ic(catalog_row) -# result_mmd = census_table_metadata(catalog_row) -# ic(result_mmd) -# return result_df, result_mmd - - -# @multi_asset( -# partitions_def=dataset_node_partition, -# ins={ -# "source_table": AssetIn(partition_mapping=needed_dataset_mapping), -# "source_mmd": AssetIn(partition_mapping=needed_dataset_mapping), -# }, -# outs={"derived_table": AssetOut(), "derived_mmds": AssetOut()}, -# ) -# def transform_data( -# context, -# source_table: pd.DataFrame, -# source_mmd: MetricMetadata, -# ) -> 
tuple[pd.DataFrame, list[MetricMetadata]]: -# partition_key = context.asset_partition_key_for_output("derived_table") -# census_table = source_table.copy() -# parent_mmd = source_mmd.copy() -# # source_column = parent_mmd.parquet_column_name -# metrics = derived_columns[derived_columns["partition_key"].eq(partition_key)] -# new_series: list[pd.Series] = [] -# new_mmds: list[MetricMetadata] = [] -# for _, _, col_name, filter in metrics.itertuples(): -# # Create column -# column: pd.Series = filter(census_table, col_name) -# ic(f"col_name: {col_name}") -# new_series.append(column) - -# # Construct metadata -# new_mmd = parent_mmd.copy() -# new_mmd.parent_metric_id = parent_mmd.source_metric_id -# new_mmd.hxl_tag = col_name -# new_mmds.append(new_mmd) - -# # Merge series -# new_table: pd.DataFrame = pd.concat(new_series, axis=1) -# add_metadata( -# context, -# df=new_table, -# title=f"Derived table ({partition_key})", -# output_name="derived_table", -# ) -# return new_table, new_mmds - - -# @multi_asset( -# ins={ -# "derived_table": AssetIn(partition_mapping=needed_dataset_mapping), -# "geometry": AssetIn(partition_mapping=needed_dataset_mapping), -# }, -# outs={ -# "plot": AssetOut(), -# }, -# partitions_def=dataset_node_partition, -# ) -# def plot(derived_table: pd.DataFrame, geometry: gpd.GeoDataFrame): -# """Plots map with log density of people.""" -# merged = geometry.merge( -# derived_table[["population_ind"]], -# left_on="geo_code", -# right_index=True, -# how="left", -# ) -# merged["log10 people"] = np.log10(merged["population_ind"]) -# merged.plot(column="log10 people", legend=True) -# md_content = markdown_from_plot(plt) -# return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)}) diff --git a/python/popgetter/assets/scotland/census_geometry.py b/python/popgetter/assets/scotland/census_geometry.py deleted file mode 100644 index 28e5afa..0000000 --- a/python/popgetter/assets/scotland/census_geometry.py +++ /dev/null @@ -1,17 +0,0 @@ -# from __future__ import annotations - -# import geopandas as gpd -# from dagster import asset - -# from popgetter.assets.scotland import download_file - -# from .census_tables import URL_SHAPEFILE, add_metadata, cache_dir - - -# # @asset -# # def geometry(context, oa_dz_iz_2011_lookup) -> gpd.GeoDataFrame: -# # """Gets the shape file for OA11 resolution.""" -# # file_name = download_file(cache_dir, URL_SHAPEFILE) -# # geo = gpd.read_file(f"zip://{file_name}") -# # add_metadata(context, geo, "Geometry file") -# # return geo[geo["geo_code"].isin(oa_dz_iz_2011_lookup["OutputArea2011Code"])] diff --git a/python/popgetter/assets/scotland/census_tables.py b/python/popgetter/assets/scotland/census_tables.py deleted file mode 100644 index 5efda94..0000000 --- a/python/popgetter/assets/scotland/census_tables.py +++ /dev/null @@ -1,291 +0,0 @@ -# from __future__ import annotations - -# import urllib.parse as urlparse -# from pathlib import Path - -# import geopandas as gpd -# import pandas as pd -# import zipfile_deflate64 as zipfile -# from dagster import ( -# AssetOut, -# DynamicPartitionsDefinition, -# MetadataValue, -# SpecificPartitionsPartitionMapping, -# StaticPartitionsDefinition, -# asset, -# multi_asset, -# ) - -# from popgetter.assets.scotland import REQUIRED_TABLES_REGEX, download_file, sources - -# """ -# Notes: -# - 2011 data using UKCensusAPI, 2022 data expected soon given recent initial -# publication -# - Reusing some bits of code from UKCensusAPI: -# 
https://github.com/alan-turing-institute/UKCensusAPI/blob/master/ukcensusapi/NRScotland.py -# """ - - -# PARTITIONS_DEF_NAME = "dataset_tables" -# dataset_node_partition = DynamicPartitionsDefinition(name=PARTITIONS_DEF_NAME) - -# # cache_dir = tempfile.mkdtemp() -# cache_dir = "./cache" - -# URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" -# URL1 = "https://www.scotlandscensus.gov.uk/" -# URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" -# URL_LOOKUP = ( -# "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" -# ) -# URL_SHAPEFILE = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/infuse_oa_lyr_2011.zip" -# URL_CATALOG = ( -# "https://www.scotlandscensus.gov.uk/media/kqcmo4ge/census-table-index-2011.xlsm" -# ) - -# data_sources = ["Council Area blk", "SNS Data Zone 2011 blk", "Output Area blk"] -# GeoCodeLookup = { -# "LAD": 0, # "Council Area blk" -# # MSOA (intermediate zone)? -# "LSOA11": 1, # "SNS Data Zone 2011 blk" -# "OA11": 2, # "Output Area blk" -# } - -# DATA_SOURCES = [ -# { -# "source": "Council Area blk", -# "resolution": "LAD", -# "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", -# }, -# { -# "source": "SNS Data Zone 2011 blk", -# "resolution": "LSOA11", -# "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", -# }, -# { -# "source": "Output Area blk", -# "resolution": "OA11", -# "url": URL2 + urlparse.quote("Output Area blk") + ".zip", -# }, -# ] - - -# # NB. Make sure no spaces in asset keys -# @multi_asset( -# outs={ -# "oa_dz_iz_2011_lookup": AssetOut(), -# "data_zone_2011_lookup": AssetOut(), -# "intermediate_zone_2011_lookup": AssetOut(), -# }, -# ) -# def lookups(): -# """Creates lookup dataframes.""" -# Path(cache_dir).mkdir(parents=True, exist_ok=True) -# lookup_path = download_file(cache_dir, URL_LOOKUP) -# df1 = pd.read_excel(lookup_path, sheet_name="OA_DZ_IZ_2011 Lookup") -# df2 = pd.read_excel(lookup_path, sheet_name="DataZone2011Lookup") -# df3 = pd.read_excel(lookup_path, sheet_name="IntermediateZone2011Lookup") -# return df1, df2, df3 - - -# def source_to_zip(source_name: str, url: str) -> Path: -# """Downloads if necessary and returns the name of the locally cached zip file -# of the source data (replacing spaces with _)""" -# file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") -# return download_file(cache_dir, url, file_name) - - -# def add_metadata( -# context, -# df: pd.DataFrame | gpd.GeoDataFrame, -# title: str | list[str], -# output_name: str | None = None, -# ): -# context.add_output_metadata( -# metadata={ -# "title": title, -# "num_records": len(df), -# "columns": MetadataValue.md( -# "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) -# ), -# "preview": MetadataValue.md(df.head().to_markdown()), -# }, -# output_name=output_name, -# ) - - -# @asset -# def catalog_reference(context) -> pd.DataFrame: -# catalog_reference = pd.read_excel( -# URL_CATALOG, -# sheet_name=None, -# header=None, -# storage_options={"User-Agent": "Mozilla/5.0"}, -# )["Index"].rename( -# columns={ -# 0: "census_release", -# 1: "table_name", -# 2: "description", -# 3: "population_coverage", -# 4: "variable", -# 5: "catalog_resolution", -# 6: "year", -# 7: "additional_url", -# 8: "population_coverage_and_variable", -# } -# ) -# add_metadata(context, catalog_reference, "Metadata for census tables") -# return catalog_reference - - -# def get_table_metadata( -# catalog_reference: pd.DataFrame, table_name: str -# ) -> dict[str, str]: 
-# """Returns a dict of table metadata for a given table name.""" -# rows = catalog_reference.loc[catalog_reference.loc[:, "table_name"].eq(table_name)] -# census_release = rows.loc[:, "census_release"].unique()[0] -# description = rows.loc[:, "description"].unique()[0] -# population_coverage = rows.loc[:, "population_coverage"].unique()[0] -# variables = ", ".join(rows.loc[:, "variable"].astype(str).to_list()) -# catalog_resolution = rows.loc[:, "catalog_resolution"].unique()[0] -# year = int(rows.loc[:, "year"].unique()[0]) -# return { -# "census_release": census_release, -# "description": description, -# "population_coverage": population_coverage, -# "variables": variables, -# "catalog_resolution": catalog_resolution, -# "year": str(year), -# "human_readable_name": f"{description} ({population_coverage})", -# } - - -# def get_table_name(file_name: str) -> str: -# return file_name.rsplit(".csv")[0] - - -# @asset -# def catalog_as_dataframe(context, catalog_reference: pd.DataFrame) -> pd.DataFrame: -# """Creates a catalog of the individual census tables from all data sources.""" -# records = [] -# for data_source in DATA_SOURCES: -# resolution = data_source["resolution"] -# source = data_source["source"] -# url = data_source["url"] -# zip_file_name = source_to_zip(source, url) -# with zipfile.ZipFile(zip_file_name) as zip_ref: -# for file_name in zip_ref.namelist(): -# # Get table name -# table_name = get_table_name(file_name) - -# # Skip bulk output files and missing tables from catalog_reference -# if ( -# "bulk_output" in file_name.lower() -# or catalog_reference.loc[:, "table_name"].ne(table_name).all() -# ): -# continue - -# # Get table metadata -# table_metadata = get_table_metadata(catalog_reference, table_name) - -# # Get source release metadata if available -# source_data_release = sources.get( -# table_metadata["census_release"], None -# ) -# source_data_release_id = ( -# None if source_data_release is None else source_data_release.id -# ) - -# # Create a record for each census table use same keys as MetricMetadata -# # where possible since this makes it simpler to populate derived -# # metrics downstream -# record = { -# "resolution": resolution, -# "catalog_resolution": table_metadata["catalog_resolution"], -# "source": source, -# "url": url, -# "file_name": Path(source) / file_name, -# "table_name": table_name, -# "year": table_metadata["year"], -# # Use constructed name of description and coverage -# "human_readable_name": table_metadata["human_readable_name"], -# "source_metric_id": None, -# # Use catalog_reference description -# "description": table_metadata["description"], -# "hxl_tag": None, -# "metric_parquet_file_url": None, -# "parquet_column_name": None, -# "parquet_margin_of_error_column": None, -# "parquet_margin_of_error_file": None, -# "potential_denominator_ids": None, -# "parent_metric_id": None, -# # TODO: check this is not an ID but a name -# "source_data_release_id": source_data_release_id, -# "source_download_url": url, -# # TODO: what should this be? 
-# "source_archive_file_path": None, -# "source_documentation_url": URL_CATALOG, -# } -# context.log.debug(record) -# records.append(record) -# zip_ref.extract(file_name, Path(cache_dir) / source) - -# # TODO: check if required -# for partition in context.instance.get_dynamic_partitions(PARTITIONS_DEF_NAME): -# context.instance.delete_dynamic_partition(PARTITIONS_DEF_NAME, partition) - -# # Create a dynamic partition for the datasets listed in the catalog -# catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) -# catalog_df["partition_key"] = ( -# catalog_df[["year", "resolution", "table_name"]] -# .astype(str) -# .agg(lambda s: "/".join(s).rsplit(".")[0], axis=1) -# ) -# # TODO: consider filtering here based on a set of keys to keep derived from -# # config (i.e. backend/frontend modes) -# context.instance.add_dynamic_partitions( -# partitions_def_name=PARTITIONS_DEF_NAME, -# # To ensure this is unique, prepend the resolution, -# partition_keys=catalog_df.loc[ -# catalog_df["partition_key"].str.contains(REQUIRED_TABLES_REGEX), -# "partition_key", -# ].to_list(), -# ) -# context.add_output_metadata( -# metadata={ -# "num_records": len(catalog_df), -# "ignored_datasets": "", -# "columns": MetadataValue.md( -# "\n".join([f"- '`{col}`'" for col in catalog_df.columns.to_list()]) -# ), -# "columns_types": MetadataValue.md(catalog_df.dtypes.to_markdown()), -# "preview": MetadataValue.md(catalog_df.to_markdown()), -# } -# ) -# return catalog_df - - -# def get_table(context, table_details) -> pd.DataFrame: -# table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) -# add_metadata(context, table_df, table_details["partition_key"].iloc[0]) -# return table_df - - -# @asset(partitions_def=dataset_node_partition) -# def individual_census_table( -# context, catalog_as_dataframe: pd.DataFrame -# ) -> pd.DataFrame: -# """Creates individual census tables as dataframe.""" -# partition_key = context.asset_partition_key_for_output() -# context.log.info(partition_key) -# table_details = catalog_as_dataframe.loc[ -# catalog_as_dataframe["partition_key"].isin([partition_key]) -# ] -# context.log.info(table_details) -# return get_table(context, table_details) - - -# subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] -# subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) -# subset_partition = StaticPartitionsDefinition(subset_partition_keys) From 2149324633984df9688864e3ef399190bd3fb2b1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Sat, 22 Jun 2024 17:46:34 +0100 Subject: [PATCH 36/60] Remove obsolete code --- python/popgetter/assets/scotland/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/scotland/__init__.py index 506bca8..c6831bf 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/scotland/__init__.py @@ -706,10 +706,6 @@ def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame: context.log.info(table_details) return self.get_table(context, table_details) - # subset_partition_keys: list[str] = ["2011/OA11/LC1117SC"] - # subset_mapping = SpecificPartitionsPartitionMapping(subset_partition_keys) - # subset_partition = StaticPartitionsDefinition(subset_partition_keys) - @staticmethod def census_table_metadata( catalog_row: dict[str, str], @@ -863,7 +859,6 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: split = exceptions[source_mmd.description] out_cols = ["".join(x for x in col.title() if not x.isspace()) 
for col in split] context.log.debug(ic(out_cols)) - ic("----") ic(new_table.columns) for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() From dfbef8770b56f273b48a5ddc8ab735a2ec61ba53 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 26 Jun 2024 21:54:37 +0100 Subject: [PATCH 37/60] Rename module, add country metadata to class --- python/popgetter/assets/__init__.py | 4 ++-- .../assets/{scotland => gb_sct}/__init__.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) rename python/popgetter/assets/{scotland => gb_sct}/__init__.py (99%) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index a766e2a..b7cc89f 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,9 +1,9 @@ from __future__ import annotations -from . import bel, gb_nir, scotland, uk, us +from . import bel, gb_nir, gb_sct, uk, us countries = [ - (mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us, scotland] + (mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us, gb_sct] ] __all__ = ["countries"] diff --git a/python/popgetter/assets/scotland/__init__.py b/python/popgetter/assets/gb_sct/__init__.py similarity index 99% rename from python/popgetter/assets/scotland/__init__.py rename to python/popgetter/assets/gb_sct/__init__.py index c6831bf..b316984 100755 --- a/python/popgetter/assets/scotland/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -347,7 +347,13 @@ def get_source_data_release(geo_level: str, cenesus_release: str) -> str: class Scotland(Country): - key_prefix: str = "scotland" + country_metadata: ClassVar[CountryMetadata] = CountryMetadata( + name_short_en="Scotland", + name_official="Scotland", + iso3="GBR", + iso2="GB", + iso3166_2="GB-SCT", + ) geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys()) tables_to_process: list[str] | None = TABLES_TO_PROCESS @@ -503,13 +509,7 @@ def get_table_metadata( return catalog_df def _country_metadata(self, _context) -> CountryMetadata: - return CountryMetadata( - name_short_en="Scotland", - name_official="Scotland", - iso3="GBR", - iso2="GB", - iso3166_2="GB-SCT", - ) + return self.country_metadata def _data_publisher( self, _context, country_metdata: CountryMetadata @@ -584,6 +584,7 @@ def geometry( for level_details in SCOTLAND_GEO_LEVELS.values(): # TODO: get correct values geometry_metadata = GeometryMetadata( + country_metadata=self.country_metadata, validity_period_start=CENSUS_COLLECTION_DATE, validity_period_end=CENSUS_COLLECTION_DATE, level=level_details.level, From 450d7cf22ea2454b6a7c26cb0f582e7a59abc709 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 26 Jun 2024 21:57:39 +0100 Subject: [PATCH 38/60] Update metrics file name --- python/popgetter/assets/gb_sct/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index b316984..ef42d27 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -774,7 +774,8 @@ def _derived_metrics( partition_key = context.partition_key source_mmd = source_metric_metadata parquet_file_name = ( - "".join(c for c in partition_key if c.isalnum()) + ".parquet" + f"{self.key_prefix}/metrics/" + f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}" ) derived_metrics, derived_mmd = [], [] From ceccf58a8332be818ff23ca30bf71fbaa812270e Mon Sep 17 00:00:00 2001 From: 
Sam Greenbury Date: Thu, 27 Jun 2024 06:58:41 +0100 Subject: [PATCH 39/60] Fix loop over geometry --- python/popgetter/assets/gb_sct/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index ef42d27..9154557 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -21,7 +21,7 @@ from icecream import ic from popgetter.assets.country import Country -from popgetter.cloud_outputs import send_to_geometry_sensor +from popgetter.cloud_outputs import GeometryOutput, send_to_geometry_sensor from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -661,13 +661,13 @@ def _geometry(self, context): def _source_data_releases( self, _context, - geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], + geometry: list[GeometryOutput], data_publisher: DataPublisher, # TODO: consider version without inputs so only output type specified # **kwargs, ) -> dict[str, SourceDataRelease]: source_data_releases = {} - for geo_metadata, _, _ in geometry: + for geo in geometry: for ( source_data_release_id, source_data_release, @@ -683,10 +683,10 @@ def _source_data_releases( url=source_data_release.url, data_publisher_id=data_publisher.id, description=source_data_release.description, - geometry_metadata_id=geo_metadata.id, + geometry_metadata_id=geo.metadata.id, ) combined_level_and_release_id = get_source_data_release( - geo_metadata.level, source_data_release_id + geo.metadata.level, source_data_release_id ) source_data_releases[ combined_level_and_release_id From 2812184713327c61d0742a284b7ee13563534b38 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 07:00:24 +0100 Subject: [PATCH 40/60] Replace 'en' with 'eng' --- python/popgetter/assets/gb_sct/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 9154557..25574bf 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -194,7 +194,7 @@ class ScotlandGeometryLevel: geo_id_column="OA_CODE", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "OutputArea2011Name"}, # TODO + name_columns={"eng": "OutputArea2011Name"}, # TODO # url=URL_SHAPEFILE, url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", lookup_url=None, @@ -209,7 +209,7 @@ class ScotlandGeometryLevel: geo_id_column="DataZone", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "Name"}, + name_columns={"eng": "Name"}, url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, lookup_sheet=None, @@ -222,7 +222,7 @@ class ScotlandGeometryLevel: # geo_id_column="OA_CODE", # census_table_column="TODO", # # census_table_column="Census 2021 Data Zone Code", - # name_columns={"en": "OA_CODE"}, + # name_columns={"eng": "OA_CODE"}, # # url=URL_SHAPEFILE, # url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", # lookup_url=None, @@ -237,7 +237,7 @@ class ScotlandGeometryLevel: geo_id_column="CouncilArea2011Code", census_table_column="TODO", # census_table_column="Census 2021 Data Zone Code", - name_columns={"en": "CouncilArea2011Name"}, + name_columns={"eng": "CouncilArea2011Name"}, 
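         # Note: council-area geometries are obtained by dissolving the
         # data-zone boundaries (URL below) on CouncilArea2011Code via the
         # lookup asset, rather than from a dedicated council-area shapefile.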
url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, lookup_sheet=None, @@ -633,7 +633,7 @@ def geometry( # Add output metadata first_metadata, first_gdf, first_names = geometries_to_return[0] first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") - ax = first_joined_gdf.plot(column="en", legend=False) + ax = first_joined_gdf.plot(column="eng", legend=False) ax.set_title(f"Scotland 2011 {first_metadata.level}") md_plot = markdown_from_plot(plt) context.add_output_metadata( From 452ad096e1720ef14bc248b99ac640e221f8dd51 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 08:50:36 +0100 Subject: [PATCH 41/60] Add source_data_releases, fix geo output --- python/popgetter/assets/gb_sct/__init__.py | 265 +++++++++++++++++---- 1 file changed, 222 insertions(+), 43 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 25574bf..b3e6c31 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -50,6 +50,14 @@ CENSUS_COLLECTION_DATE = date(2011, 3, 27) CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) + +# Source releases for 2011: +# '3A','3I', '2A', '3C', '3D', '3E', '3L', '3K', '3N', +# '3B', '3J', '3M', '3G', '3H', '2C', '2B', '2D', +# Others: +# '2001 Census', +# 'nan', '75+', 'Daytime Tables', +# '1991 Census', '1992 Census', SOURCE_DATA_RELEASES: dict[str, SourceDataRelease] = { "3A": SourceDataRelease( name="Census 2011: Release 3A", @@ -61,59 +69,229 @@ expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", data_publisher_id="TBD", - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - geometry_metadata_id="TBC", - # countries_of_interest=[country.id], + description="TBD", + geometry_metadata_id="TBD", ), "3I": SourceDataRelease( name="Census 2011: Release 3I", date_published=date(2014, 9, 24), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2011, 10, 22), - collection_period_end=date(2011, 10, 22), - expect_next_update=date(2022, 1, 1), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3i", data_publisher_id="TBD", - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - geometry_metadata_id="TBC", - # countries_of_interest=[country.id], + description="TBD", + geometry_metadata_id="TBD", ), "2A": SourceDataRelease( name="Census 2011: Release 2A", date_published=date(2013, 9, 26), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2011, 10, 22), - collection_period_end=date(2011, 10, 22), - expect_next_update=date(2022, 1, 1), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2a", data_publisher_id="TBD", - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - geometry_metadata_id="", - # countries_of_interest=[country.id], + description="TBD", + 
geometry_metadata_id="TBD", ), "3C": SourceDataRelease( name="Census 2011: Release 3C", date_published=date(2014, 4, 9), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2011, 10, 22), - collection_period_end=date(2011, 10, 22), - expect_next_update=date(2022, 1, 1), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", data_publisher_id="TBD", - description="TBC", - geometry_metadata_id="", - # geography_file="TBC", - # geography_level="TBC", - # countries_of_interest=[country.id], + description="TBD", + geometry_metadata_id="TBD", + ), + "3D": SourceDataRelease( + name="Census 2011: Release 3D", + date_published=date(2014, 5, 15), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3d", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3E": SourceDataRelease( + name="Census 2011: Release 3E", + date_published=date(2014, 6, 4), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3e", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3F": SourceDataRelease( + name="Census 2011: Release 3F", + date_published=date(2014, 6, 25), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-release-3f", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3L": SourceDataRelease( + name="Census 2011: Release 3L", + date_published=date(2014, 11, 27), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3l", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3K": SourceDataRelease( + name="Census 2011: Release 3K", + date_published=date(2014, 11, 6), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3k", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3N": SourceDataRelease( + name="Census 2011: Release 3N", + date_published=date(2015, 1, 29), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + 
collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2015/census-2011-release-3n", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3B": SourceDataRelease( + name="Census 2011: Release 3B", + date_published=date(2014, 3, 19), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-detailed-characteristics-on-ethnicity-identity-language-and-religion-in-scotland-%E2%80%93-release-3b", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3J": SourceDataRelease( + name="Census 2011: Release 3J", + date_published=date(2014, 10, 16), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3j", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3M": SourceDataRelease( + name="Census 2011: Release 3M", + date_published=date(2014, 12, 18), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3m", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3G": SourceDataRelease( + name="Census 2011: Release 3G", + date_published=date(2014, 7, 23), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3g", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "3H": SourceDataRelease( + name="Census 2011: Release 3H", + date_published=date(2014, 8, 13), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3h", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "2C": SourceDataRelease( + name="Census 2011: Release 2C", + date_published=date(2013, 12, 18), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2c", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "2B": SourceDataRelease( + name="Census 2011: Release 2B", + date_published=date(2013, 11, 14), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + 
collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2013/census-2011-release-2b", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", + ), + "2D": SourceDataRelease( + name="Census 2011: Release 2D", + date_published=date(2014, 4, 9), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-releases-2d-and-3c", + data_publisher_id="TBD", + description="TBD", + geometry_metadata_id="TBD", ), } @@ -576,9 +754,7 @@ def create_geometry(self): @send_to_geometry_sensor @asset(key_prefix=self.key_prefix) - def geometry( - context, lookup: pd.DataFrame - ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: """List of geometries, metadata and names at different resolutions.""" geometries_to_return = [] for level_details in SCOTLAND_GEO_LEVELS.values(): @@ -627,11 +803,16 @@ def geometry( .drop_duplicates() ) geometries_to_return.append( - (geometry_metadata, region_geometries, region_names) + GeometryOutput( + metadata=geometry_metadata, + gdf=region_geometries, + names_df=region_names, + ) ) # Add output metadata - first_metadata, first_gdf, first_names = geometries_to_return[0] + geo: GeometryOutput = geometries_to_return[0] + first_metadata, first_gdf, first_names = geo.metadata, geo.gdf, geo.names_df first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") ax = first_joined_gdf.plot(column="eng", legend=False) ax.set_title(f"Scotland 2011 {first_metadata.level}") @@ -639,9 +820,7 @@ def geometry( context.add_output_metadata( metadata={ "all_geom_levels": MetadataValue.md( - ",".join( - [metadata.level for metadata, _, _ in geometries_to_return] - ) + ",".join([geo.metadata.level for geo in geometries_to_return]) ), "first_geometry_plot": MetadataValue.md(md_plot), "first_names_preview": MetadataValue.md( From 1d28c55d956cadd2495b6022228bf6a727d6736b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 08:59:53 +0100 Subject: [PATCH 42/60] Fix derived metric output --- python/popgetter/assets/gb_sct/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index b3e6c31..5b8fb3e 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -21,7 +21,11 @@ from icecream import ic from popgetter.assets.country import Country -from popgetter.cloud_outputs import GeometryOutput, send_to_geometry_sensor +from popgetter.cloud_outputs import ( + GeometryOutput, + MetricsOutput, + send_to_geometry_sensor, +) from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -947,7 +951,7 @@ def _derived_metrics( context, census_tables: pd.DataFrame, source_metric_metadata: MetricMetadata, - ) -> tuple[list[MetricMetadata], pd.DataFrame]: + ) -> MetricsOutput: ... 
SEP = "__" partition_key = context.partition_key @@ -1109,7 +1113,7 @@ def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: ), }, ) - return derived_mmd, joined_metrics + return MetricsOutput(metadata=derived_mmd, metrics=joined_metrics) # Create assets From 786bcd3c311a4034aa5f566bf9667d7d0f1b9731 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 09:03:13 +0100 Subject: [PATCH 43/60] Fix module name --- tests/test_be.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_be.py b/tests/test_be.py index fae975b..b08a6da 100644 --- a/tests/test_be.py +++ b/tests/test_be.py @@ -12,7 +12,7 @@ from rdflib import Graph from rdflib.namespace import DCAT -from popgetter.assets import be +from popgetter.assets import bel @pytest.fixture(scope="module") @@ -36,7 +36,7 @@ def demo_catalog() -> Graph: @pytest.fixture(scope="module") def demo_catalog_df(demo_catalog) -> pd.DataFrame: context = build_asset_context() - return be.census_tables.catalog_as_dataframe(context, demo_catalog) + return bel.census_tables.catalog_as_dataframe(context, demo_catalog) @pytest.mark.skip( @@ -46,7 +46,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors): # Test the that the row count is correctly added to the metadata context = build_asset_context() - actual_municipalities = be.census_geometry.aggregate_sectors_to_municipalities( + actual_municipalities = bel.census_geometry.aggregate_sectors_to_municipalities( context, demo_sectors ) @@ -62,7 +62,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors): @pytest.mark.skip(reason="Fix test_get_population_details_per_municipality first") def test_get_population_details_per_municipality(): with build_asset_context() as muni_context: - stat_muni = be.census_tables.get_population_details_per_municipality( + stat_muni = bel.census_tables.get_population_details_per_municipality( muni_context ) @@ -87,7 +87,7 @@ def test_pivot_population(): ) # Get the geometries - stat_muni = be.census_tables.get_population_details_per_municipality( + stat_muni = bel.census_tables.get_population_details_per_municipality( muni_context ) @@ -99,7 +99,7 @@ def test_pivot_population(): with build_asset_context() as pivot_context: # Pivot the population - pivoted = be.pivot_population(pivot_context, stat_muni) + pivoted = bel.pivot_population(pivot_context, stat_muni) expected_number_of_municipalities = 581 @@ -115,7 +115,7 @@ def test_demo_catalog(demo_catalog): actual_length = len( list( demo_catalog.objects( - subject=be.census_tables.opendata_catalog_root, + subject=bel.census_tables.opendata_catalog_root, predicate=DCAT.dataset, unique=False, ) @@ -128,7 +128,7 @@ def test_demo_catalog(demo_catalog): def test_catalog_metadata_details(demo_catalog_df): # Get the metadata for a specific dataset in the demo catalogue: # https://statbel.fgov.be/node/4151 "Population by Statistical sector" - # mmd = be.census_tables.get_mmd_from_dataset_node( + # mmd = bel.census_tables.get_mmd_from_dataset_node( # demo_catalog, dataset_node=URIRef("https://statbel.fgov.be/node/4151") # ) @@ -179,7 +179,7 @@ def test_catalog_as_dataframe(demo_catalog_df): # # Convert the demo catalog to a DataFrame # with build_asset_context() as context: - # catalog_df = be.census_tables.catalog_as_dataframe(context, demo_catalog_df) + # catalog_df = bel.census_tables.catalog_as_dataframe(context, demo_catalog_df) # # Check that the catalog has been converted to a DataFrame # assert isinstance(catalog_df, pd.DataFrame) @@ -228,7 +228,7 @@ 
def test_filter_known_failing_datasets():
         "2676",
     ]
 
-    actual_list = be.census_tables.filter_known_failing_datasets(mock_catalog)
+    actual_list = bel.census_tables.filter_known_failing_datasets(mock_catalog)
 
     assert mock_catalog != expected_list
     assert actual_list != mock_catalog

From fec9af84f3837562603514b5a9e78d03bbf40e3a Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 09:06:51 +0100
Subject: [PATCH 44/60] Fix test

---
 tests/test_metadata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index bc272b2..502f186 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -57,7 +57,7 @@ def test_source_data_release_hash():
     )
     assert (
         source_data_release.id
-        == "9ec7e234d73664339e4c1f04bfa485dbb17e204dd72dc3ffbb9cab6870475597"
+        == "4d61bfe401ba17becd02d6b3912152c135daa9ecaebc9bd45a589dc831a85217"
     )
 
     source_data_release2 = SourceDataRelease(

From 31ac586fce5ceb73e27db5409fd6586b08081006 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 10:49:28 +0100
Subject: [PATCH 45/60] Add first modifications to ensure that it runs for all
 tables

---
 python/popgetter/assets/gb_sct/__init__.py | 59 +++++++++++++---------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index 5b8fb3e..b98a6ac 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -36,19 +36,7 @@
 )
 from popgetter.utils import add_metadata, markdown_from_plot
 
-# From: https://github.com/alan-turing-institute/microsimulation/blob/37ce2843f10b83a8e7a225c801cec83b85e6e0d0/microsimulation/common.py#L32
-REQUIRED_TABLES = [
-    "QS103SC",
-    "QS104SC",
-    "KS201SC",
-    "DC1117SC",
-    "DC2101SC",
-    "DC6206SC",
-    "LC1117SC",
-]
-REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES)
 # Currently including only releases matching tables included
-REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"]
 GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf"
 CENSUS_REFERENCE_DATE = date(2011, 3, 27)
 CENSUS_COLLECTION_DATE = date(2011, 3, 27)
@@ -297,6 +285,20 @@
         description="TBD",
         geometry_metadata_id="TBD",
     ),
+    "75+": SourceDataRelease(
+        name="Census 2011: 75+",
+        # TODO: unable to find published date for 75+ release
+        date_published=date(2014, 1, 1),
+        reference_period_start=CENSUS_REFERENCE_DATE,
+        reference_period_end=CENSUS_REFERENCE_DATE,
+        collection_period_start=CENSUS_COLLECTION_DATE,
+        collection_period_end=CENSUS_COLLECTION_DATE,
+        expect_next_update=CENSUS_EXPECT_NEXT_UPDATE,
+        url="TBD",
+        data_publisher_id="TBD",
+        description="TBD",
+        geometry_metadata_id="TBD",
+    ),
 }
 
 
@@ -321,6 +323,7 @@ def download_file(
     return file_name
 
 
+# TODO: remove ones no longer used
 URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html"
 URL1 = "https://www.scotlandscensus.gov.uk/"
 URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/"
@@ -506,21 +509,23 @@ class SourceTable:
     ),
 ]
 
-TABLES_TO_PROCESS: list[str] = [
-    "QS103SC",
-    "QS104SC",
-    "KS201SC",
-    "DC1117SC",
-    "DC2101SC",
-    "DC6206SC",
-    "LC1117SC",
-]
-
-PARTITIONS_TO_PUBLISH: list[str] = ["2011/OutputArea2011/LC1117SC"]
-
+# For all available:
+TABLES_TO_PROCESS = None
+# For a subset:
+# TABLES_TO_PROCESS: list[str] = [
+#     "QS103SC",
+#     "QS104SC",
+#     "KS201SC",
+#     "DC1117SC",
+#     "DC2101SC",
+#     "DC6206SC",
+#     "LC1117SC",
+# ]
 DERIVED_COLUMN_SPECIFICATIONS: dict[str,
list[DerivedColumn]] = {
-    PARTITIONS_TO_PUBLISH[0]: DERIVED_COLUMNS,
+    "2011/OutputArea2011/LC1117SC": DERIVED_COLUMNS,
+    "2011/DataZone2011/LC1117SC": DERIVED_COLUMNS,
+    "2011/CouncilArea2011/LC1117SC": DERIVED_COLUMNS,
 }
@@ -632,6 +637,10 @@ def get_table_metadata(
         ):
             continue
 
+        # Fix case with missing data for release
+        if resolution == "CouncilArea2011" and table_name == "DC6102SC":
+            table_metadata["census_release"] = "3I"
+
         # Create a record for each census table use same keys as MetricMetadata
         # where possible since this makes it simpler to populate derived
         # metrics downstream

From 5522d170e640ca63bc64f078dc18c7a74fb69d96 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 12:38:15 +0100
Subject: [PATCH 46/60] Filter missing partition from catalog

---
 python/popgetter/assets/gb_sct/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index b98a6ac..e2aded9 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -637,6 +637,11 @@ def get_table_metadata(
         ):
             continue
 
+        # Remove failing case (no data in census table):
+        # "2011/DataZone2011/QS421SC"
+        if table_name == "QS421SC" and resolution == "DataZone2011":
+            continue
+
         # Fix case with missing data for release
         if resolution == "CouncilArea2011" and table_name == "DC6102SC":
             table_metadata["census_release"] = "3I"

From 33832b31cbd391a1d90a1bbebe88084b88b7ccf3 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 27 Jun 2024 18:02:51 +0100
Subject: [PATCH 47/60] Add option to allow ok return from derived_metrics if
 partition fails

---
 python/popgetter/assets/gb_sct/__init__.py | 95 ++++++++++++++--------
 1 file changed, 60 insertions(+), 35 deletions(-)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index e2aded9..da4f3dd 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -543,6 +543,7 @@ class Scotland(Country):
     )
     geo_levels: ClassVar[list[str]] = list(SCOTLAND_GEO_LEVELS.keys())
     tables_to_process: list[str] | None = TABLES_TO_PROCESS
+    allow_missing_derived_metrics: ClassVar[bool] = True
 
     def _catalog(self, context) -> pd.DataFrame:
         """Creates a catalog of the individual census tables from all data sources."""
@@ -1051,6 +1052,19 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame:
                 "Sex and Age",
                 "National Statistics Socio-economic Classification (NS-SeC)",
             ],
+            # 2011/CouncilArea2011/DC1104SC
+            "Residence type by sex by age": ["Residence type and Sex", "Age"],
+            # 2011/CouncilArea2011/DC1106SC
+            "Schoolchildren and full-time students living away from home during term time by sex by age": [
+                "Schoolchildren and full-time students living away from home during term time and Sex",
+                "Age",
+            ],
+            # 2011/CouncilArea2011/DC1112SC
+            "Dependent children by household type by sex by age": [
+                "Dependent children by household type",
+                "Sex",
+                "Age",
+            ],
         }
         if source_mmd.description not in exceptions:
             split = source_mmd.description.split(" by ")[::-1]
@@ -1059,44 +1073,55 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame:
         out_cols = [
             "".join(x for x in col.title() if not x.isspace()) for col in split
         ]
         context.log.debug(ic(out_cols))
         ic(new_table.columns)
-        for metric_col in new_table.columns:
-            metric_df = new_table.loc[:, metric_col].to_frame()
-            ic(metric_df)
-            derived_metrics.append(metric_df)
-            new_mmd = source_mmd.copy()
-            new_mmd.parent_metric_id
= source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - - # TODO: fix automating the hxltag - key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) - - def gen_hxltag(kv: dict[str, str]) -> str: - out = ["#population"] - for key, value in kv.items(): - out += [ - "".join(c for c in key if c.isalnum()) - + "_" - + "".join(c for c in value if c.isalnum()) + try: + for metric_col in new_table.columns: + metric_df = new_table.loc[:, metric_col].to_frame() + ic(metric_df) + derived_metrics.append(metric_df) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + + # TODO: fix automating the hxltag + key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + def gen_hxltag(kv: dict[str, str]) -> str: + out = ["#population"] + for key, value in kv.items(): + out += [ + "".join(c for c in key if c.isalnum()) + + "_" + + "".join(c for c in value if c.isalnum()) + ] + return "+".join(out) + + new_mmd.hxl_tag = gen_hxltag(key_val) + new_mmd.parquet_column_name = metric_col + context.log.debug(ic(key_val)) + # TODO: Update after fixing hxltag + new_mmd.human_readable_name = "; ".join( + [ + f"Variable: '{key}'; Value: '{value}'" + for key, value in key_val.items() ] - return "+".join(out) - - new_mmd.hxl_tag = gen_hxltag(key_val) - new_mmd.parquet_column_name = metric_col - # TODO: Update after fixing hxltag - new_mmd.human_readable_name = "; ".join( - [ - f"Variable: '{key}'; Value: '{value}'" - for key, value in key_val.items() - ] + ) + derived_mmd.append(new_mmd) + + joined_metrics = reduce( + lambda left, right: left.merge( + right, on="GEO_ID", how="inner", validate="one_to_one" + ), + derived_metrics, ) - derived_mmd.append(new_mmd) - joined_metrics = reduce( - lambda left, right: left.merge( - right, on="GEO_ID", how="inner", validate="one_to_one" - ), - derived_metrics, - ) + except Exception as err: + err_msg = ( + f"Failed to automatically derive levels and description for " + f"'{partition_key}', error:\n{err}" + ) + context.log.error(err_msg) + if self.allow_missing_derived_metrics: + return MetricsOutput(metadata=[], metrics=pd.DataFrame()) def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: for col in maybe_non_int_df: From 9b2485686127f5e50f3d408b1e00fc17e7576c56 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 27 Jun 2024 21:00:28 +0100 Subject: [PATCH 48/60] Create try/except to optionally allow derived metrics with failures --- python/popgetter/assets/gb_sct/__init__.py | 277 +++++++++++---------- 1 file changed, 143 insertions(+), 134 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index da4f3dd..dfea3d5 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -967,113 +967,119 @@ def _derived_metrics( census_tables: pd.DataFrame, source_metric_metadata: MetricMetadata, ) -> MetricsOutput: - ... 
- SEP = "__" - partition_key = context.partition_key - source_mmd = source_metric_metadata - parquet_file_name = ( - f"{self.key_prefix}/metrics/" - f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}" - ) - derived_metrics, derived_mmd = [], [] - - # If derived metrics try: - metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] - - def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: - df_to_reshape = df_to_reshape.rename( - columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} - ).drop(columns=["All people"]) - df_to_reshape = df_to_reshape.melt( - ["GEO_ID", "Age Category"], var_name="Sex Label", value_name="Count" - ) - df_to_reshape["Sex Label"] = df_to_reshape["Sex Label"].map( - {"Males": "Male", "Females": "Female"} - ) - return df_to_reshape - - census_tables_for_derived_metrics = reshape(census_tables) - source_column = source_mmd.parquet_column_name - for metric_spec in metric_specs: - new_table = ( - census_tables_for_derived_metrics.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) - .sum() - .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) - ) - derived_metrics.append(new_table) - new_mmd = source_mmd.copy() - new_mmd.parent_metric_id = source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - new_mmd.hxl_tag = metric_spec.hxltag - new_mmd.parquet_column_name = metric_spec.output_column_name - new_mmd.human_readable_name = metric_spec.human_readable_name - derived_mmd.append(new_mmd) - except KeyError: - # No extra derived metrics specified for this partition -- only use - # those from pivoted data - pass - - # Batch - def make_pivot(df: pd.DataFrame) -> pd.DataFrame: - # TODO: reshape based on Unnamed: 1 to Unnamed N - pivot_cols = [ - col - for col in df.columns - if col != "Unnamed: 0" and col.startswith("Unnamed: ") - ] - pivot = df.pivot_table( - index="Unnamed: 0", columns=pivot_cols, aggfunc="sum" + SEP = "__" + partition_key = context.partition_key + source_mmd = source_metric_metadata + parquet_file_name = ( + f"{self.key_prefix}/metrics/" + f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}" ) - - # FLattent multi-index - if isinstance(pivot.columns, pd.MultiIndex): - pivot.columns = [ - SEP.join(list(map(str, col))).strip() - for col in pivot.columns.to_numpy() + derived_metrics, derived_mmd = [], [] + + # If derived metrics + try: + metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key] + + def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: + df_to_reshape = df_to_reshape.rename( + columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} + ).drop(columns=["All people"]) + df_to_reshape = df_to_reshape.melt( + ["GEO_ID", "Age Category"], + var_name="Sex Label", + value_name="Count", + ) + df_to_reshape["Sex Label"] = df_to_reshape["Sex Label"].map( + {"Males": "Male", "Females": "Female"} + ) + return df_to_reshape + + census_tables_for_derived_metrics = reshape(census_tables) + source_column = source_mmd.parquet_column_name + for metric_spec in metric_specs: + new_table = ( + census_tables_for_derived_metrics.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = 
parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + except KeyError: + # No extra derived metrics specified for this partition -- only use + # those from pivoted data + pass + + # Batch + def make_pivot(df: pd.DataFrame) -> pd.DataFrame: + # TODO: reshape based on Unnamed: 1 to Unnamed N + pivot_cols = [ + col + for col in df.columns + if col != "Unnamed: 0" and col.startswith("Unnamed: ") ] - # Ensure columns are string + pivot = df.pivot_table( + index="Unnamed: 0", columns=pivot_cols, aggfunc="sum" + ) + + # FLattent multi-index + if isinstance(pivot.columns, pd.MultiIndex): + pivot.columns = [ + SEP.join(list(map(str, col))).strip() + for col in pivot.columns.to_numpy() + ] + # Ensure columns are string + else: + pivot.columns = [ + str(col).strip() for col in pivot.columns.to_numpy() + ] + + pivot.index = pivot.index.rename("GEO_ID") + + return pivot + + new_table = make_pivot(census_tables) + + # Split for description of metrics + exceptions = { + "Age by single year": ["Age by single year"], + "National Statistics Socio-economic Classification (NS-SeC) by ethnic group by sex by age": [ + "Ethnic group", + "Sex and Age", + "National Statistics Socio-economic Classification (NS-SeC)", + ], + # 2011/CouncilArea2011/DC1104SC + "Residence type by sex by age": ["Residence type and Sex", "Age"], + # 2011/CouncilArea2011/DC1106SC + "Schoolchildren and full-time students living away from home during term time by sex by age": [ + "Schoolchildren and full-time students living away from home during term time and Sex", + "Age", + ], + # 2011/CouncilArea2011/DC1112SC + "Dependent children by household type by sex by age": [ + "Dependent children by household type", + "Sex", + "Age", + ], + } + if source_mmd.description not in exceptions: + split = source_mmd.description.split(" by ")[::-1] else: - pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] - - pivot.index = pivot.index.rename("GEO_ID") - - return pivot - - new_table = make_pivot(census_tables) - - # Split for description of metrics - exceptions = { - "Age by single year": ["Age by single year"], - "National Statistics Socio-economic Classification (NS-SeC) by ethnic group by sex by age": [ - "Ethnic group", - "Sex and Age", - "National Statistics Socio-economic Classification (NS-SeC)", - ], - # 2011/CouncilArea2011/DC1104SC - "Residence type by sex by age": ["Residence type and Sex", "Age"], - # 2011/CouncilArea2011/DC1106SC - "Schoolchildren and full-time students living away from home during term time by sex by age": [ - "Schoolchildren and full-time students living away from home during term time and Sex", - "Age", - ], - # 2011/CouncilArea2011/DC1112SC - "Dependent children by household type by sex by age": [ - "Dependent children by household type", - "Sex", - "Age", - ], - } - if source_mmd.description not in exceptions: - split = source_mmd.description.split(" by ")[::-1] - else: - split = exceptions[source_mmd.description] - out_cols = ["".join(x for x in col.title() if not x.isspace()) for col in split] - context.log.debug(ic(out_cols)) - ic(new_table.columns) - try: + split = exceptions[source_mmd.description] + out_cols = [ + "".join(x for x in col.title() if not x.isspace()) for col in split + ] + context.log.debug(ic(out_cols)) + ic(new_table.columns) + for metric_col in new_table.columns: metric_df = new_table.loc[:, metric_col].to_frame() 
ic(metric_df) @@ -1114,44 +1120,47 @@ def gen_hxltag(kv: dict[str, str]) -> str: derived_metrics, ) + def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: + for col in maybe_non_int_df: + if maybe_non_int_df[col].dtype == "object": + maybe_non_int_df[col] = ( + maybe_non_int_df[col] + .str.replace(",", "") + .str.replace("-", "0") + .fillna("0") + .astype(int) + ) + return maybe_non_int_df + + # Fix format + joined_metrics = make_int(joined_metrics) + + # Filter out whole country Scotland + joined_metrics = joined_metrics.loc[ + ~joined_metrics.index.isin(["S92000003"]) + ] + + context.add_output_metadata( + metadata={ + "metadata_preview": MetadataValue.md( + metadata_to_dataframe(derived_mmd).head().to_markdown() + ), + "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + "metrics_preview": MetadataValue.md( + joined_metrics.head().to_markdown() + ), + }, + ) + except Exception as err: err_msg = ( - f"Failed to automatically derive levels and description for " - f"'{partition_key}', error:\n{err}" + f"Failed to automatically derive metrics for '{partition_key}' with " + f"error: {err}" ) context.log.error(err_msg) if self.allow_missing_derived_metrics: return MetricsOutput(metadata=[], metrics=pd.DataFrame()) - def make_int(maybe_non_int_df: pd.DataFrame) -> pd.DataFrame: - for col in maybe_non_int_df: - if maybe_non_int_df[col].dtype == "object": - maybe_non_int_df[col] = ( - maybe_non_int_df[col] - .str.replace(",", "") - .str.replace("-", "0") - .fillna("0") - .astype(int) - ) - return maybe_non_int_df - - # Fix format - joined_metrics = make_int(joined_metrics) - - # Filter out whole country Scotland - joined_metrics = joined_metrics.loc[~joined_metrics.index.isin(["S92000003"])] - - context.add_output_metadata( - metadata={ - "metadata_preview": MetadataValue.md( - metadata_to_dataframe(derived_mmd).head().to_markdown() - ), - "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", - "metrics_preview": MetadataValue.md( - joined_metrics.head().to_markdown() - ), - }, - ) return MetricsOutput(metadata=derived_mmd, metrics=joined_metrics) From 6ebdf7287369b4419378036be7bbe6bbadf4615a Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 18:32:56 +0100 Subject: [PATCH 49/60] Replace GEO_ID with COL enum --- python/popgetter/assets/gb_sct/__init__.py | 33 +++++++++++++--------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index dfea3d5..31c9cd9 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -10,7 +10,6 @@ from typing import ClassVar import geopandas as gpd -import matplotlib.pyplot as plt import pandas as pd import requests import zipfile_deflate64 as zipfile @@ -27,6 +26,7 @@ send_to_geometry_sensor, ) from popgetter.metadata import ( + COL, CountryMetadata, DataPublisher, GeometryMetadata, @@ -805,20 +805,22 @@ def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: context.log.debug(ic(region_geometries_merge.head())) context.log.debug(ic(region_geometries_merge.columns)) region_geometries = region_geometries_merge.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] + columns={level_details.geo_id_column: COL.GEO_ID.value} + ).loc[:, ["geometry", COL.GEO_ID.value]] region_names = ( region_geometries_merge.rename( columns={ - level_details.geo_id_column: "GEO_ID", + 
level_details.geo_id_column: COL.GEO_ID.value, } | { value: key for key, value in level_details.name_columns.items() } ) - .loc[:, ["GEO_ID", *list(level_details.name_columns.keys())]] + .loc[ + :, [COL.GEO_ID.value, *list(level_details.name_columns.keys())] + ] .drop_duplicates() ) geometries_to_return.append( @@ -832,10 +834,10 @@ def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: # Add output metadata geo: GeometryOutput = geometries_to_return[0] first_metadata, first_gdf, first_names = geo.metadata, geo.gdf, geo.names_df - first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + first_joined_gdf = first_gdf.merge(first_names, on=COL.GEO_ID.value) ax = first_joined_gdf.plot(column="eng", legend=False) ax.set_title(f"Scotland 2011 {first_metadata.level}") - md_plot = markdown_from_plot(plt) + md_plot = markdown_from_plot() context.add_output_metadata( metadata={ "all_geom_levels": MetadataValue.md( @@ -983,10 +985,13 @@ def _derived_metrics( def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: df_to_reshape = df_to_reshape.rename( - columns={"Unnamed: 0": "GEO_ID", "Unnamed: 1": "Age Category"} + columns={ + "Unnamed: 0": COL.GEO_ID.value, + "Unnamed: 1": "Age Category", + } ).drop(columns=["All people"]) df_to_reshape = df_to_reshape.melt( - ["GEO_ID", "Age Category"], + [COL.GEO_ID.value, "Age Category"], var_name="Sex Label", value_name="Count", ) @@ -1000,10 +1005,12 @@ def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: for metric_spec in metric_specs: new_table = ( census_tables_for_derived_metrics.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) + .groupby(by=COL.GEO_ID.value, as_index=True) .sum() .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) + .filter( + items=[COL.GEO_ID.value, metric_spec.output_column_name] + ) ) derived_metrics.append(new_table) new_mmd = source_mmd.copy() @@ -1042,7 +1049,7 @@ def make_pivot(df: pd.DataFrame) -> pd.DataFrame: str(col).strip() for col in pivot.columns.to_numpy() ] - pivot.index = pivot.index.rename("GEO_ID") + pivot.index = pivot.index.rename(COL.GEO_ID.value) return pivot @@ -1115,7 +1122,7 @@ def gen_hxltag(kv: dict[str, str]) -> str: joined_metrics = reduce( lambda left, right: left.merge( - right, on="GEO_ID", how="inner", validate="one_to_one" + right, on=COL.GEO_ID.value, how="inner", validate="one_to_one" ), derived_metrics, ) From d62bfc80992e6e664ae90b42bd4aacf198cd6b5d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 18:38:23 +0100 Subject: [PATCH 50/60] Use tempfile.mkdtemp() for cache_dir --- python/popgetter/assets/gb_sct/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 31c9cd9..a65c47e 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 from __future__ import annotations +import tempfile import urllib.parse as urlparse from collections.abc import Callable from dataclasses import dataclass @@ -432,8 +433,8 @@ class ScotlandGeometryLevel: } -# cache_dir = tempfile.mkdtemp() -cache_dir = "./cache" +# Use temporary directory for `cache_dir`` +cache_dir = tempfile.mkdtemp() @dataclass From 070c8175bd8b96ce3283757013164fffc3d847ca Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 20:42:39 +0100 Subject: [PATCH 51/60] Add utils, rename static variables 
upper case --- python/popgetter/assets/gb_sct/__init__.py | 70 ++++++++-------------- python/popgetter/assets/gb_sct/utils.py | 24 ++++++++ 2 files changed, 49 insertions(+), 45 deletions(-) create mode 100644 python/popgetter/assets/gb_sct/utils.py diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index a65c47e..845bbf7 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -12,7 +12,6 @@ import geopandas as gpd import pandas as pd -import requests import zipfile_deflate64 as zipfile from dagster import ( MetadataValue, @@ -21,6 +20,7 @@ from icecream import ic from popgetter.assets.country import Country +from popgetter.assets.gb_sct.utils import HEADERS, download_file from popgetter.cloud_outputs import ( GeometryOutput, MetricsOutput, @@ -303,27 +303,6 @@ } -# Move to tests -HEADERS = { - "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" -} - - -def download_file( - cache_dir: str, - url: str, - file_name: Path | None = None, - headers: dict[str, str] = HEADERS, -) -> Path: - """Downloads file checking first if exists in cache, returning file name.""" - file_name = Path(cache_dir) / url.split("/")[-1] if file_name is None else file_name - if not Path(file_name).exists(): - r = requests.get(url, allow_redirects=True, headers=headers) - with Path(file_name).open("wb") as fp: - fp.write(r.content) - return file_name - - # TODO: remove ones no longer used URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" URL1 = "https://www.scotlandscensus.gov.uk/" @@ -433,8 +412,8 @@ class ScotlandGeometryLevel: } -# Use temporary directory for `cache_dir`` -cache_dir = tempfile.mkdtemp() +# Use temporary directory +CACHE_DIR = tempfile.mkdtemp() @dataclass @@ -454,36 +433,36 @@ class SourceTable: # Config for each partition to be derived -age_code = "`Age Category`" -sex_label = "`Sex Label`" -infants = ["0 to 4"] -children_5_to_17 = ["5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] -children = ["0 to 4", "5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] -adults = ["18 to 19"] + [f"{i} to {i+4}" for i in range(20, 91, 5)] + ["95 and over"] -people = ["All people"] +AGE_CODE = "`Age Category`" +SEX_LABEL = "`Sex Label`" +INFANTS = ["0 to 4"] +CHILDREN_5_TO_17 = ["5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] +CHILDREN = ["0 to 4", "5 to 9", "10 to 11", "12 to 14" "15", "16 to 17"] +ADULTS = ["18 to 19"] + [f"{i} to {i+4}" for i in range(20, 91, 5)] + ["95 and over"] +PEOPLE = ["All people"] DERIVED_COLUMNS = [ DerivedColumn( hxltag="#population+children+age5_17", - filter_func=lambda df: df.query(f"{age_code} in @children_5_to_17"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @children_5_to_17"), output_column_name="children_5_17", human_readable_name="Children aged 5 to 17", ), DerivedColumn( hxltag="#population+infants+age0_4", - filter_func=lambda df: df.query(f"{age_code} in @infants"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @infants"), output_column_name="infants_0_4", human_readable_name="Infants aged 0 to 4", ), DerivedColumn( hxltag="#population+children+age0_17", - filter_func=lambda df: df.query(f"{age_code} in @children"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @children"), output_column_name="children_0_17", human_readable_name="Children aged 0 to 17", ), DerivedColumn( hxltag="#population+adults+f", filter_func=lambda df: df.query( - f"{age_code} in @adults and {sex_label} == 
'Female'" + f"{AGE_CODE} in @adults and {SEX_LABEL} == 'Female'" ), output_column_name="adults_f", human_readable_name="Female adults", @@ -491,27 +470,25 @@ class SourceTable: DerivedColumn( hxltag="#population+adults+m", filter_func=lambda df: df.query( - f"{age_code} in @adults and {sex_label} == 'Male'" + f"{AGE_CODE} in @adults and {SEX_LABEL} == 'Male'" ), output_column_name="adults_m", human_readable_name="Male adults", ), DerivedColumn( hxltag="#population+adults", - filter_func=lambda df: df.query(f"{age_code} in @adults"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @adults"), output_column_name="adults", human_readable_name="Adults", ), DerivedColumn( hxltag="#population+ind", - filter_func=lambda df: df.query(f"{age_code} in @people"), + filter_func=lambda df: df.query(f"{AGE_CODE} in @people"), output_column_name="individuals", human_readable_name="Total individuals", ), ] -# For all available: -TABLES_TO_PROCESS = None # For a subset: # TABLES_TO_PROCESS: list[str] = [ # "QS103SC", @@ -523,6 +500,9 @@ class SourceTable: # "LC1117SC", # ] +# For all available: +TABLES_TO_PROCESS = None + DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = { "2011/OutputArea2011/LC1117SC": DERIVED_COLUMNS, "2011/DataZone2011/LC1117SC": DERIVED_COLUMNS, @@ -552,8 +532,8 @@ def _catalog(self, context) -> pd.DataFrame: def source_to_zip(source_name: str, url: str) -> Path: """Downloads if necessary and returns the name of the locally cached zip file of the source data (replacing spaces with _)""" - file_name = Path(cache_dir) / (source_name.replace(" ", "_") + ".zip") - return download_file(cache_dir, url, file_name) + file_name = Path(CACHE_DIR) / (source_name.replace(" ", "_") + ".zip") + return download_file(CACHE_DIR, url, file_name) def get_table_name(file_name: str) -> str: return file_name.rsplit(".csv")[0] @@ -681,7 +661,7 @@ def get_table_metadata( } context.log.debug(record) records.append(record) - zip_ref.extract(file_name, Path(cache_dir) / source) + zip_ref.extract(file_name, Path(CACHE_DIR) / source) # Create a dynamic partition for the datasets listed in the catalog catalog_df: pd.DataFrame = pd.DataFrame.from_records(records) @@ -786,7 +766,7 @@ def geometry(context, lookup: pd.DataFrame) -> list[GeometryOutput]: level=level_details.level, hxl_tag=level_details.hxl_tag, ) - file_name = download_file(cache_dir, level_details.url) + file_name = download_file(CACHE_DIR, level_details.url) region_geometries_raw: gpd.GeoDataFrame = gpd.read_file( f"zip://{file_name}" ) @@ -896,7 +876,7 @@ def _source_data_releases( @staticmethod def get_table(context, table_details) -> pd.DataFrame: - table_df = pd.read_csv(Path(cache_dir) / table_details["file_name"].iloc[0]) + table_df = pd.read_csv(Path(CACHE_DIR) / table_details["file_name"].iloc[0]) add_metadata(context, table_df, table_details["partition_key"].iloc[0]) return table_df diff --git a/python/popgetter/assets/gb_sct/utils.py b/python/popgetter/assets/gb_sct/utils.py new file mode 100644 index 0000000..671d17c --- /dev/null +++ b/python/popgetter/assets/gb_sct/utils.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from pathlib import Path + +import requests + +HEADERS = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0" +} + + +def download_file( + cache_dir: str, + url: str, + file_name: Path | None = None, + headers: dict[str, str] = HEADERS, +) -> Path: + """Downloads file checking first if exists in cache, returning file name.""" + file_name = Path(cache_dir) 
/ url.split("/")[-1] if file_name is None else file_name + if not Path(file_name).exists(): + r = requests.get(url, allow_redirects=True, headers=headers) + with Path(file_name).open("wb") as fp: + fp.write(r.content) + return file_name From e15ddff65d6412f7660ba8f1137a6ee9ad237761 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 21:28:21 +0100 Subject: [PATCH 52/60] Revert tempfile --- python/popgetter/assets/gb_sct/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 845bbf7..c0bc9dd 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -1,7 +1,6 @@ #!/usr/bin/python3 from __future__ import annotations -import tempfile import urllib.parse as urlparse from collections.abc import Callable from dataclasses import dataclass @@ -412,8 +411,10 @@ class ScotlandGeometryLevel: } +# TODO: identify better tempfile option # Use temporary directory -CACHE_DIR = tempfile.mkdtemp() +# CACHE_DIR = tempfile.mkdtemp() +CACHE_DIR = "./cache" @dataclass From d561c406d1c61fa6c09afd5eddad431d1974ce39 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 2 Jul 2024 21:46:57 +0100 Subject: [PATCH 53/60] Ensure CACHE_DIR made --- python/popgetter/assets/gb_sct/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index c0bc9dd..88c0c70 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -527,6 +527,12 @@ class Scotland(Country): tables_to_process: list[str] | None = TABLES_TO_PROCESS allow_missing_derived_metrics: ClassVar[bool] = True + def __init__(self): + super().__init__() + + # Make temp directory + Path(CACHE_DIR).mkdir(parents=True) + def _catalog(self, context) -> pd.DataFrame: """Creates a catalog of the individual census tables from all data sources.""" From 27514ecc88591d8fa90e4516912a1c075c35220e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 14:59:41 +0100 Subject: [PATCH 54/60] Update deps --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index db220c7..550ef4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,8 +47,7 @@ dependencies = [ "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. 
"icecream >=2.1.3", # General debugging tool "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names - "openpyxl", - "zipfile-deflate64", + "zipfile-deflate64 >= 0.2.0", # Required for handling zipped files in Scotland DAG "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages "openpyxl >=3.1.3", # For reading Excel files From de1422eab4bc25ce57e4768fae360c2ec9b1dc11 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:03:21 +0100 Subject: [PATCH 55/60] Remove URL constants --- python/popgetter/assets/gb_sct/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 88c0c70..5784514 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -304,8 +304,6 @@ # TODO: remove ones no longer used URL = "https://www.scotlandscensus.gov.uk/ods-web/download/getDownloadFile.html" -URL1 = "https://www.scotlandscensus.gov.uk/" -URL2 = "https://nrscensusprodumb.blob.core.windows.net/downloads/" URL_LOOKUP = ( "https://www.nrscotland.gov.uk/files//geography/2011-census/OA_DZ_IZ_2011.xlsx" ) @@ -320,19 +318,23 @@ "source": "Council Area blk", # "resolution": "LAD", "resolution": "CouncilArea2011", - "url": URL1 + "/media/hjmd0oqr/council-area-blk.zip", + "url": "https://www.scotlandscensus.gov.uk/media/hjmd0oqr/council-area-blk.zip", }, { "source": "SNS Data Zone 2011 blk", # "resolution": "LSOA11", "resolution": "DataZone2011", - "url": URL2 + urlparse.quote("SNS Data Zone 2011 blk") + ".zip", + "url": "https://nrscensusprodumb.blob.core.windows.net/downloads/" + + urlparse.quote("SNS Data Zone 2011 blk") + + ".zip", }, { "source": "Output Area blk", # "resolution": "OA11", "resolution": "OutputArea2011", - "url": URL2 + urlparse.quote("Output Area blk") + ".zip", + "url": "https://nrscensusprodumb.blob.core.windows.net/downloads/" + + urlparse.quote("Output Area blk") + + ".zip", }, ] From e0c86d37eae728474b743b27d092c8efcc3eb8e6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:04:27 +0100 Subject: [PATCH 56/60] Rename variable --- python/popgetter/assets/gb_sct/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 5784514..8ad4b35 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -313,7 +313,7 @@ ) -DATA_SOURCES = [ +CENSUS_TABLE_DATA_SOURCES = [ { "source": "Council Area blk", # "resolution": "LAD", @@ -593,7 +593,7 @@ def get_table_metadata( self.remove_all_partition_keys(context) records = [] - for data_source in DATA_SOURCES: + for data_source in CENSUS_TABLE_DATA_SOURCES: resolution = data_source["resolution"] source = data_source["source"] url = data_source["url"] From b7baea2d0fbd17b49038a5f03fdeafa63113eb29 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:11:17 +0100 Subject: [PATCH 57/60] Change minimum version requirement --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 550ef4b..693bbc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "rdflib >=7.0.0", # Required to parse BEL TTL Metadata catalogue. 
"icecream >=2.1.3", # General debugging tool "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names - "zipfile-deflate64 >= 0.2.0", # Required for handling zipped files in Scotland DAG + "zipfile-deflate64 >= 0.1.0", # Required for handling zipped files in Scotland DAG "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages "openpyxl >=3.1.3", # For reading Excel files From 068920a8ffa122d7a99c04c53729fe44a70c9b51 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:54:50 +0100 Subject: [PATCH 58/60] Fix deprecated warning --- python/popgetter/assets/gb_sct/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index 8ad4b35..d02c56e 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -1003,7 +1003,7 @@ def reshape(df_to_reshape: pd.DataFrame) -> pd.DataFrame: ) ) derived_metrics.append(new_table) - new_mmd = source_mmd.copy() + new_mmd = source_mmd.model_copy(deep=True) new_mmd.parent_metric_id = source_mmd.source_metric_id new_mmd.metric_parquet_path = parquet_file_name new_mmd.hxl_tag = metric_spec.hxltag From 8dacb88f7b5fefd7af0bbf23dcd8d6adc047e552 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 10 Jul 2024 15:59:27 +0100 Subject: [PATCH 59/60] Remove obsolete class field --- python/popgetter/assets/gb_sct/__init__.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py index d02c56e..092cc78 100755 --- a/python/popgetter/assets/gb_sct/__init__.py +++ b/python/popgetter/assets/gb_sct/__init__.py @@ -344,7 +344,6 @@ class ScotlandGeometryLevel: level: str hxl_tag: str geo_id_column: str - census_table_column: str name_columns: dict[str, str] # keys = language codes, values = column names url: str lookup_url: str | None @@ -358,10 +357,7 @@ class ScotlandGeometryLevel: level="OutputArea2011", hxl_tag="TBD", geo_id_column="OA_CODE", - census_table_column="TODO", - # census_table_column="Census 2021 Data Zone Code", - name_columns={"eng": "OutputArea2011Name"}, # TODO - # url=URL_SHAPEFILE, + name_columns={"eng": "OutputArea2011Name"}, url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", lookup_url=None, lookup_sheet=None, @@ -373,8 +369,6 @@ class ScotlandGeometryLevel: level="DataZone2011", hxl_tag="TBD", geo_id_column="DataZone", - census_table_column="TODO", - # census_table_column="Census 2021 Data Zone Code", name_columns={"eng": "Name"}, url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, @@ -386,8 +380,6 @@ class ScotlandGeometryLevel: # level="OA11", # hxl_tag="TBD", # geo_id_column="OA_CODE", - # census_table_column="TODO", - # # census_table_column="Census 2021 Data Zone Code", # name_columns={"eng": "OA_CODE"}, # # url=URL_SHAPEFILE, # url="https://www.nrscotland.gov.uk/files/geography/output-area-2011-eor.zip", @@ -401,8 +393,6 @@ class ScotlandGeometryLevel: level="CouncilArea2011", hxl_tag="TBD", geo_id_column="CouncilArea2011Code", - census_table_column="TODO", - # census_table_column="Census 2021 Data Zone Code", name_columns={"eng": "CouncilArea2011Name"}, url="https://maps.gov.scot/ATOM/shapefiles/SG_DataZoneBdry_2011.zip", lookup_url=None, From ad3e787d4adac9baa87a44ba49e04e2d56c78233 
Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 10 Jul 2024 16:00:44 +0100
Subject: [PATCH 60/60] Allow cache dir to already exist

---
 python/popgetter/assets/gb_sct/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/popgetter/assets/gb_sct/__init__.py b/python/popgetter/assets/gb_sct/__init__.py
index 092cc78..a72fc24 100755
--- a/python/popgetter/assets/gb_sct/__init__.py
+++ b/python/popgetter/assets/gb_sct/__init__.py
@@ -523,7 +523,7 @@ def __init__(self):
         super().__init__()
 
         # Make temp directory
-        Path(CACHE_DIR).mkdir(parents=True)
+        Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
     def _catalog(self, context) -> pd.DataFrame:
        """Creates a catalog of the individual census tables from all data sources."""
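
Editor's worked examples for the series above. First, the pivot-and-flatten step that patches 47 and 48 wrap in a try/except is easier to see on a toy table. This is a minimal, self-contained sketch of the same technique with invented data; the real make_pivot additionally runs inside _derived_metrics and is not reproduced here.

import pandas as pd

# Minimal sketch (toy data, assumed shapes) of the make_pivot technique:
# pivot on the "Unnamed: N" columns, then flatten the MultiIndex with SEP.
SEP = "__"
df = pd.DataFrame(
    {
        "Unnamed: 0": ["S00000001", "S00000001", "S00000002", "S00000002"],
        "Unnamed: 1": ["Male", "Female", "Male", "Female"],
        "0 to 4": [10, 12, 7, 9],
        "5 to 9": [11, 8, 6, 10],
    }
)
pivot_cols = [
    col for col in df.columns if col != "Unnamed: 0" and col.startswith("Unnamed: ")
]
pivot = df.pivot_table(index="Unnamed: 0", columns=pivot_cols, aggfunc="sum")
if isinstance(pivot.columns, pd.MultiIndex):
    # Each flattened column becomes one derived metric downstream
    pivot.columns = [
        SEP.join(map(str, col)).strip() for col in pivot.columns.to_numpy()
    ]
pivot.index = pivot.index.rename("GEO_ID")
print(pivot.columns.tolist())
# ['0 to 4__Female', '0 to 4__Male', '5 to 9__Female', '5 to 9__Male']

The SEP-joined column names are what the series later recycles into the hxl_tag and human-readable name for each derived metric.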
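
The make_int cleanup that follows the pivot deserves a note of its own: the published tables encode counts as strings with thousands separators and use "-" as a zero placeholder. A sketch of the same coercion on invented data:

import pandas as pd

# Sketch of the make_int coercion: object columns with "1,234"-style counts
# and "-" placeholders become plain integers; numeric columns pass through.
df = pd.DataFrame({"adults": ["1,234", "-", None], "children": [5, 6, 7]})
for col in df:
    if df[col].dtype == "object":
        df[col] = (
            df[col].str.replace(",", "").str.replace("-", "0").fillna("0").astype(int)
        )
print(df.dtypes)  # both columns now hold integers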
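
The DerivedColumn entries rely on pandas.query with backtick-quoted column names and @-references to module-level lists. One caveat on PATCH 51: @name is resolved in the caller's scope at call time, so after the lists are renamed to upper case, query strings that still read, e.g., @children_5_to_17 appear to point at names that no longer exist and would raise an UndefinedVariableError when the lambda runs. The sketch below (invented data, a consistently named constant) shows the intended filter-and-aggregate pattern:

import pandas as pd

# Sketch of one derived metric: filter the reshaped long-format table with a
# query (backticks allow spaces in column names), then sum counts per GEO_ID.
AGE_CODE = "`Age Category`"
INFANTS = ["0 to 4"]
long_df = pd.DataFrame(
    {
        "GEO_ID": ["S00000001", "S00000001", "S00000002", "S00000002"],
        "Age Category": ["0 to 4", "5 to 9", "0 to 4", "5 to 9"],
        "Sex Label": ["Male", "Male", "Female", "Female"],
        "Count": [10, 11, 9, 10],
    }
)
infants_0_4 = (
    long_df.query(f"{AGE_CODE} in @INFANTS")  # @INFANTS matches the constant above
    .groupby("GEO_ID", as_index=True)
    .sum(numeric_only=True)
    .rename(columns={"Count": "infants_0_4"})
)
print(infants_0_4)  # one row per GEO_ID: S00000001 -> 10, S00000002 -> 9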
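
To exercise the caching helper that PATCH 51 moves into popgetter.assets.gb_sct.utils: the first call downloads and writes the file, and repeat calls short-circuit on the existing path. A usage sketch, assuming the Output Area source URL from CENSUS_TABLE_DATA_SOURCES and a throwaway temp directory rather than the DAG's CACHE_DIR:

import tempfile
from pathlib import Path

from popgetter.assets.gb_sct.utils import download_file

cache = tempfile.mkdtemp()  # illustrative only, not the DAG's CACHE_DIR
url = (
    "https://nrscensusprodumb.blob.core.windows.net/downloads/"
    "Output%20Area%20blk.zip"
)
first = download_file(cache, url)   # downloads and writes into the cache
second = download_file(cache, url)  # cache hit: no request, same path returned
assert first == second and Path(first).exists()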
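
Finally, the deprecation fix in PATCH 58 is the pydantic v2 rename of BaseModel.copy() to model_copy(); deep=True matters because each derived metadata record is mutated after copying. A minimal sketch with a stand-in model (not popgetter's actual MetricMetadata class):

from pydantic import BaseModel

class MetricMeta(BaseModel):  # stand-in model for illustration only
    hxl_tag: str
    parquet_column_name: str

src = MetricMeta(hxl_tag="#population", parquet_column_name="All people")
new = src.model_copy(deep=True)
new.parquet_column_name = "infants_0_4"
assert src.parquet_column_name == "All people"  # source metadata is untouched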