Skip to content

Commit

Permalink
Merge pull request #77 from worldbank/refactor/metadata
Browse files Browse the repository at this point in the history
Update STAC Metadata with Proposal
  • Loading branch information
andresfchamorro authored Oct 25, 2024
2 parents 7232c49 + cd2f843 commit 234aa3f
Show file tree
Hide file tree
Showing 19 changed files with 2,410 additions and 1,054 deletions.
Binary file added docs/images/create_stac_workflow.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
74 changes: 74 additions & 0 deletions notebooks/data-processing/population.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "local.parquet",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompute\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpc\u001b[39;00m\n\u001b[1;32m 5\u001b[0m parquet_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlocal.parquet\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 6\u001b[0m table \u001b[38;5;241m=\u001b[39m \u001b[43mpq\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparquet_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m male_columns \u001b[38;5;241m=\u001b[39m [col \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m table\u001b[38;5;241m.\u001b[39mcolumn_names \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_m_\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m col]\n\u001b[1;32m 9\u001b[0m female_columns \u001b[38;5;241m=\u001b[39m [col \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m table\u001b[38;5;241m.\u001b[39mcolumn_names \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_f_\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m col]\n",
"File \u001b[0;32m/opt/anaconda3/envs/spacestats/lib/python3.11/site-packages/pyarrow/parquet/core.py:1793\u001b[0m, in \u001b[0;36mread_table\u001b[0;34m(source, columns, use_threads, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, page_checksum_verification)\u001b[0m\n\u001b[1;32m 1787\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 1788\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPassing \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124muse_legacy_dataset\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m is deprecated as of pyarrow 15.0.0 \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1789\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mand will be removed in a future version.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 1790\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m, stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m 1792\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1793\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mParquetDataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1794\u001b[0m \u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1795\u001b[0m \u001b[43m \u001b[49m\u001b[43mschema\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mschema\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1796\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1797\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartitioning\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartitioning\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1798\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmemory_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1799\u001b[0m \u001b[43m \u001b[49m\u001b[43mread_dictionary\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mread_dictionary\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1800\u001b[0m \u001b[43m \u001b[49m\u001b[43mbuffer_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbuffer_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1801\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1802\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_prefixes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_prefixes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1803\u001b[0m \u001b[43m \u001b[49m\u001b[43mpre_buffer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpre_buffer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1804\u001b[0m \u001b[43m \u001b[49m\u001b[43mcoerce_int96_timestamp_unit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcoerce_int96_timestamp_unit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1805\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecryption_properties\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecryption_properties\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1806\u001b[0m \u001b[43m \u001b[49m\u001b[43mthrift_string_size_limit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mthrift_string_size_limit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1807\u001b[0m \u001b[43m \u001b[49m\u001b[43mthrift_container_size_limit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mthrift_container_size_limit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1808\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_checksum_verification\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_checksum_verification\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1809\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1810\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[1;32m 1811\u001b[0m \u001b[38;5;66;03m# fall back on ParquetFile for simple cases when pyarrow.dataset\u001b[39;00m\n\u001b[1;32m 1812\u001b[0m \u001b[38;5;66;03m# module is not available\u001b[39;00m\n\u001b[1;32m 1813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m filters \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m/opt/anaconda3/envs/spacestats/lib/python3.11/site-packages/pyarrow/parquet/core.py:1371\u001b[0m, in \u001b[0;36mParquetDataset.__init__\u001b[0;34m(self, path_or_paths, filesystem, schema, filters, read_dictionary, memory_map, buffer_size, partitioning, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, page_checksum_verification, use_legacy_dataset)\u001b[0m\n\u001b[1;32m 1367\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m partitioning \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhive\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 1368\u001b[0m partitioning \u001b[38;5;241m=\u001b[39m ds\u001b[38;5;241m.\u001b[39mHivePartitioning\u001b[38;5;241m.\u001b[39mdiscover(\n\u001b[1;32m 1369\u001b[0m infer_dictionary\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m-> 1371\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset \u001b[38;5;241m=\u001b[39m \u001b[43mds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_paths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1372\u001b[0m \u001b[43m \u001b[49m\u001b[43mschema\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mschema\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mformat\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparquet_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1373\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartitioning\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartitioning\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1374\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_prefixes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_prefixes\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/anaconda3/envs/spacestats/lib/python3.11/site-packages/pyarrow/dataset.py:794\u001b[0m, in \u001b[0;36mdataset\u001b[0;34m(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes)\u001b[0m\n\u001b[1;32m 783\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\n\u001b[1;32m 784\u001b[0m schema\u001b[38;5;241m=\u001b[39mschema,\n\u001b[1;32m 785\u001b[0m filesystem\u001b[38;5;241m=\u001b[39mfilesystem,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 790\u001b[0m selector_ignore_prefixes\u001b[38;5;241m=\u001b[39mignore_prefixes\n\u001b[1;32m 791\u001b[0m )\n\u001b[1;32m 793\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _is_path_like(source):\n\u001b[0;32m--> 794\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_filesystem_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 795\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(source, (\u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mlist\u001b[39m)):\n\u001b[1;32m 796\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mall\u001b[39m(_is_path_like(elem) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(elem, FileInfo) \u001b[38;5;28;01mfor\u001b[39;00m elem \u001b[38;5;129;01min\u001b[39;00m source):\n",
"File \u001b[0;32m/opt/anaconda3/envs/spacestats/lib/python3.11/site-packages/pyarrow/dataset.py:476\u001b[0m, in \u001b[0;36m_filesystem_dataset\u001b[0;34m(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes)\u001b[0m\n\u001b[1;32m 474\u001b[0m fs, paths_or_selector \u001b[38;5;241m=\u001b[39m _ensure_multiple_sources(source, filesystem)\n\u001b[1;32m 475\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 476\u001b[0m fs, paths_or_selector \u001b[38;5;241m=\u001b[39m \u001b[43m_ensure_single_source\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 478\u001b[0m options \u001b[38;5;241m=\u001b[39m FileSystemFactoryOptions(\n\u001b[1;32m 479\u001b[0m partitioning\u001b[38;5;241m=\u001b[39mpartitioning,\n\u001b[1;32m 480\u001b[0m partition_base_dir\u001b[38;5;241m=\u001b[39mpartition_base_dir,\n\u001b[1;32m 481\u001b[0m exclude_invalid_files\u001b[38;5;241m=\u001b[39mexclude_invalid_files,\n\u001b[1;32m 482\u001b[0m selector_ignore_prefixes\u001b[38;5;241m=\u001b[39mselector_ignore_prefixes\n\u001b[1;32m 483\u001b[0m )\n\u001b[1;32m 484\u001b[0m factory \u001b[38;5;241m=\u001b[39m FileSystemDatasetFactory(fs, paths_or_selector, \u001b[38;5;28mformat\u001b[39m, options)\n",
"File \u001b[0;32m/opt/anaconda3/envs/spacestats/lib/python3.11/site-packages/pyarrow/dataset.py:441\u001b[0m, in \u001b[0;36m_ensure_single_source\u001b[0;34m(path, filesystem)\u001b[0m\n\u001b[1;32m 439\u001b[0m paths_or_selector \u001b[38;5;241m=\u001b[39m [path]\n\u001b[1;32m 440\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 441\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(path)\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m filesystem, paths_or_selector\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: local.parquet"
]
}
],
"source": [
"import pyarrow as pa\n",
"import pyarrow.parquet as pq\n",
"import pyarrow.compute as pc\n",
"\n",
"parquet_file = \"../../local.parquet\"\n",
"table = pq.read_table(parquet_file)\n",
"\n",
"male_columns = [col for col in table.column_names if \"_m_\" in col]\n",
"female_columns = [col for col in table.column_names if \"_f_\" in col]\n",
"\n",
"def sum_column_with_filter(column):\n",
" non_negative_column = pc.if_else(pc.less(column, 0), pa.array([0] * len(column)), column)\n",
" return pc.sum(non_negative_column)\n",
"\n",
"sum_pop_f_2020 = sum(sum_column_with_filter(table.column(col)) for col in female_columns)\n",
"sum_pop_m_2020 = sum(sum_column_with_filter(table.column(col)) for col in male_columns)\n",
"sum_pop_2020 = sum_pop_f_2020 + sum_pop_m_2020\n",
"\n",
"new_table = table.append_column(\"sum_pop_f_2020\", sum_pop_f_2020)\n",
"new_table = new_table.append_column(\"sum_pop_m_2020\", sum_pop_m_2020)\n",
"new_table = new_table.append_column(\"sum_pop_2020\", sum_pop_2020)\n",
"\n",
"updated_parquet_file = \"updated_local.parquet\"\n",
"pq.write_table(new_table, updated_parquet_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "spacestats",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
9 changes: 9 additions & 0 deletions space2stats_api/src/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,10 @@
## space2stats

### Generating STAC files
- Navigate to the METADATA sub-directory and run the following commands in order:
1. get_types.py
2. create_stac.py
- Note that the get types function is reading in a parquet file from the following directory: space2stats_api/src/local.parquet
- Here is a workflow diagram of the STAC metadata creation:

![Create Stac](../../docs/images/create_stac_workflow.png)
Loading

0 comments on commit 234aa3f

Please sign in to comment.