diff --git a/README.md b/README.md index 301e646..acae258 100644 --- a/README.md +++ b/README.md @@ -16,31 +16,54 @@ pip install "git+https://github.com/worldbank/DECAT_Space2Stats.git#subdirectory - Setup the database: -``` -docker-compose up -d -``` + ``` + docker-compose up -d + ``` - Create a `db.env` file: -```.env -PGHOST=localhost -PGPORT=5439 -PGDATABASE=postgis -PGUSER=username -PGPASSWORD=password -PGTABLENAME=space2stats -``` + ```.env + PGHOST=localhost + PGPORT=5439 + PGDATABASE=postgis + PGUSER=username + PGPASSWORD=password + PGTABLENAME=space2stats + ``` -- Load our dataset into the database +- Ingest the dataset into the database: -``` -./postgres/download_parquet.sh -./load_to_prod.sh -``` + Use the space2stats-ingest CLI to download the Parquet file from S3 and load it into your database. You’ll also need the STAC metadata file to validate the Parquet schema during ingestion. + + To download the Parquet file from S3 and load it into the database: + ``` + poetry run space2stats-ingest download-and-load \ + "s3:///space2stats.parquet" \ + "postgresql://username:password@localhost:5439/postgis" \ + "/space2stats.json" \ + --parquet-file "local.parquet" + ``` + +Alternatively, you can download the Parquet file and load it into the database separately: + +- Download the Parquet file: + + ``` + poetry run space2stats-ingest download \ + "s3:///space2stats.parquet" \ + --local-path "local.parquet" + ``` + +- Load the Parquet file into the database: -> You can get started with a subset of data for NYC with `./load_nyc_sample.sh` which requires changing your `db.env` value for `PGTABLENAME` to `space2stats_nyc_sample`. + ``` + poetry run space2stats-ingest load \ + "postgresql://username:password@localhost:5439/postgis" \ + "/space2stats.json" \ + --parquet-file "local.parquet" + ``` -- Access your data using the Space2stats API! See the [example notebook](notebooks/space2stats_api_demo.ipynb). +- Finally, access your data using the Space2stats API! See the [example notebook](notebooks/space2stats_api_demo.ipynb). ## Usage diff --git a/docs/acceptance/db.md b/docs/acceptance/db.md index 6ad13f4..d05d2e5 100644 --- a/docs/acceptance/db.md +++ b/docs/acceptance/db.md @@ -13,6 +13,8 @@ The input data is stored in Parquet format on AWS S3, located in the file `space - `hex_id` - `{variable_name}_{aggregation_method[sum, mean, etc.]}_{year}` +In addition to the Parquet file, a corresponding STAC metadata file is required to ensure that the data structure in the Parquet file matches the metadata specification. The STAC metadata file describes the columns present in the Parquet file and is used to perform schema validation before loading the data into the database. + ### Database Setup You can use a local database for this acceptance test by running: @@ -40,6 +42,12 @@ PGTABLENAME=space2stats ### CLI Usage: +You can use the CLI tool for data ingestion, which includes validation of the Parquet file against the STAC metadata file to ensure consistency between the data structure and the metadata. + +#### Ingestion Process + +The ingestion process now includes an additional parameter for specifying the STAC metadata file, which ensures that the Parquet file schema matches the metadata. This validation step ensures that there are no extra columns in either the Parquet file or the metadata, providing a 1:1 correspondence between them. + You can use the CLI tool for data ingestion. First, ensure you have the required dependencies installed via Poetry: ```bash @@ -49,19 +57,27 @@ poetry install To download the Parquet file from S3 and load it into the database, run the following command: ```bash -poetry run space2stats-ingest download-and-load "s3://yourbucket/space2stats_updated.parquet" "postgresql://postgres:password@localhost:5432/postgis" +poetry run space2stats-ingest download-and-load \ + "s3:///space2stats.parquet" \ + "postgresql://username:password@localhost:5439/postgres" \ + "/space2stats.json" \ + --parquet-file "local.parquet" ``` Alternatively, you can run the `download` and `load` commands separately: 1. **Download the Parquet file**: ```bash - poetry run space2stats-ingest download "s3://yourbucket/space2stats_updated.parquet" --local-path "local.parquet" + poetry run space2stats-ingest download "s3:///space2stats.parquet" --local-path "local.parquet" ``` 2. **Load the Parquet file into the database**: ```bash - poetry run space2stats-ingest load "postgresql://postgres:password@localhost:5432/postgis" --parquet-file "local.parquet" + poetry run space2stats-ingest download-and-load \ + "s3:///space2stats.parquet" \ + "postgresql://username:password@localhost:5439/postgres" \ + "/space2stats.json" \ + --parquet-file "local.parquet" ``` ### Database Configuration diff --git a/space2stats_api/src/poetry.lock b/space2stats_api/src/poetry.lock index 01078c5..ee06e2a 100644 --- a/space2stats_api/src/poetry.lock +++ b/space2stats_api/src/poetry.lock @@ -123,17 +123,17 @@ testing = ["boto3-stubs[s3] (>=1.35.11)", "coverage", "moto (>=5.0.13)", "pytest [[package]] name = "boto3" -version = "1.35.26" +version = "1.35.31" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.35.26-py3-none-any.whl", hash = "sha256:c31db992655db233d98762612690cfe60723c9e1503b5709aad92c1c564877bb"}, - {file = "boto3-1.35.26.tar.gz", hash = "sha256:b04087afd3570ba540fd293823c77270ec675672af23da9396bd5988a3f8128b"}, + {file = "boto3-1.35.31-py3-none-any.whl", hash = "sha256:2e9af74d10d8af7610a8d8468d2914961f116912a024fce17351825260385a52"}, + {file = "boto3-1.35.31.tar.gz", hash = "sha256:8c593af260c4ea3eb6f079c09908f94494ca2222aa4e40a7ff490fab1cee8b39"}, ] [package.dependencies] -botocore = ">=1.35.26,<1.36.0" +botocore = ">=1.35.31,<1.36.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -142,13 +142,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.26" +version = "1.35.31" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.35.26-py3-none-any.whl", hash = "sha256:0b9dee5e4a3314e251e103585837506b17fcc7485c3c8adb61a9a913f46da1e7"}, - {file = "botocore-1.35.26.tar.gz", hash = "sha256:19efc3a22c9df77960712b4e203f912486f8bcd3794bff0fd7b2a0f5f1d5712d"}, + {file = "botocore-1.35.31-py3-none-any.whl", hash = "sha256:4cee814875bc78656aef4011d3d6b2231e96f53ea3661ee428201afb579d5c31"}, + {file = "botocore-1.35.31.tar.gz", hash = "sha256:f7bfa910cf2cbcc8c2307c1cf7b93495d614c2d699883417893e0a337fe4eb63"}, ] [package.dependencies] @@ -769,13 +769,13 @@ test = ["flake8", "pylint", "pytest", "pytest-cov"] [[package]] name = "httpcore" -version = "1.0.5" +version = "1.0.6" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" files = [ - {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"}, - {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"}, + {file = "httpcore-1.0.6-py3-none-any.whl", hash = "sha256:27b59625743b85577a8c0e10e55b50b5368a4f2cfe8cc7bcfa9cf00829c2682f"}, + {file = "httpcore-1.0.6.tar.gz", hash = "sha256:73f6dbd6eb8c21bbf7ef8efad555481853f5f6acdeaff1edb0694289269ee17f"}, ] [package.dependencies] @@ -786,7 +786,7 @@ h11 = ">=0.13,<0.15" asyncio = ["anyio (>=4.0,<5.0)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] -trio = ["trio (>=0.22.0,<0.26.0)"] +trio = ["trio (>=0.22.0,<1.0)"] [[package]] name = "httpx" @@ -901,13 +901,13 @@ files = [ [[package]] name = "mangum" -version = "0.18.0" +version = "0.19.0" description = "AWS Lambda support for ASGI applications" optional = false python-versions = ">=3.7" files = [ - {file = "mangum-0.18.0-py3-none-any.whl", hash = "sha256:cf71aa89bd655d465ece62c9c336be6b537eb297cfc72f7ea463aabc5eda2e6a"}, - {file = "mangum-0.18.0.tar.gz", hash = "sha256:50179395012d9ae3df30484c16280decf0c56fb4c76bd6e1979b9b60135f254f"}, + {file = "mangum-0.19.0-py3-none-any.whl", hash = "sha256:e500b35f495d5e68ac98bc97334896d6101523f2ee2c57ba6a61893b65266e59"}, + {file = "mangum-0.19.0.tar.gz", hash = "sha256:e388e7c491b7b67970f8234e46fd4a7b21ff87785848f418de08148f71cf0bd6"}, ] [package.dependencies] @@ -1033,13 +1033,13 @@ psutil = {version = ">=4.0.0", markers = "sys_platform != \"cygwin\""} [[package]] name = "moto" -version = "5.0.15" +version = "5.0.16" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.15-py2.py3-none-any.whl", hash = "sha256:fa1e92ffb55dbfb9fa92a2115a88c32481b75aa3fbd24075d1f29af2f9becffa"}, - {file = "moto-5.0.15.tar.gz", hash = "sha256:57aa8c2af417cc64a0ddfe63e5bcd1ada90f5079b73cdd1f74c4e9fb30a1a7e6"}, + {file = "moto-5.0.16-py2.py3-none-any.whl", hash = "sha256:4ce1f34830307f7b3d553d77a7ef26066ab3b70006203d4226b048c9d11a3be4"}, + {file = "moto-5.0.16.tar.gz", hash = "sha256:f4afb176a964cd7a70da9bc5e053d43109614ce3cab26044bcbb53610435dff4"}, ] [package.dependencies] @@ -1403,24 +1403,24 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "psycopg" -version = "3.2.2" +version = "3.2.3" description = "PostgreSQL database adapter for Python" optional = false python-versions = ">=3.8" files = [ - {file = "psycopg-3.2.2-py3-none-any.whl", hash = "sha256:babf565d459d8f72fb65da5e211dd0b58a52c51e4e1fa9cadecff42d6b7619b2"}, - {file = "psycopg-3.2.2.tar.gz", hash = "sha256:8bad2e497ce22d556dac1464738cb948f8d6bab450d965cf1d8a8effd52412e0"}, + {file = "psycopg-3.2.3-py3-none-any.whl", hash = "sha256:644d3973fe26908c73d4be746074f6e5224b03c1101d302d9a53bf565ad64907"}, + {file = "psycopg-3.2.3.tar.gz", hash = "sha256:a5764f67c27bec8bfac85764d23c534af2c27b893550377e37ce59c12aac47a2"}, ] [package.dependencies] -psycopg-binary = {version = "3.2.2", optional = true, markers = "implementation_name != \"pypy\" and extra == \"binary\""} +psycopg-binary = {version = "3.2.3", optional = true, markers = "implementation_name != \"pypy\" and extra == \"binary\""} psycopg-pool = {version = "*", optional = true, markers = "extra == \"pool\""} typing-extensions = {version = ">=4.6", markers = "python_version < \"3.13\""} tzdata = {version = "*", markers = "sys_platform == \"win32\""} [package.extras] -binary = ["psycopg-binary (==3.2.2)"] -c = ["psycopg-c (==3.2.2)"] +binary = ["psycopg-binary (==3.2.3)"] +c = ["psycopg-c (==3.2.3)"] dev = ["ast-comments (>=1.1.2)", "black (>=24.1.0)", "codespell (>=2.2)", "dnspython (>=2.1)", "flake8 (>=4.0)", "mypy (>=1.11)", "types-setuptools (>=57.4)", "wheel (>=0.37)"] docs = ["Sphinx (>=5.0)", "furo (==2022.6.21)", "sphinx-autobuild (>=2021.3.14)", "sphinx-autodoc-typehints (>=1.12)"] pool = ["psycopg-pool"] @@ -1428,75 +1428,75 @@ test = ["anyio (>=4.0)", "mypy (>=1.11)", "pproxy (>=2.7)", "pytest (>=6.2.5)", [[package]] name = "psycopg-binary" -version = "3.2.2" +version = "3.2.3" description = "PostgreSQL database adapter for Python -- C optimisation distribution" optional = false python-versions = ">=3.8" files = [ - {file = "psycopg_binary-3.2.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:8eacbf58d4f8d7bc82e0a60476afa2622b5a58f639a3cc2710e3e37b72aff3cb"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d07e62476ee8c54853b2b8cfdf3858a574218103b4cd213211f64326c7812437"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c22e615ee0ecfc6687bb8a39a4ed9d6bac030b5e72ac15e7324fd6e48979af71"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec29c7ec136263628e3f09a53e51d0a4b1ad765a6e45135707bfa848b39113f9"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:035753f80cbbf6aceca6386f53e139df70c7aca057b0592711047b5a8cfef8bb"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9ee99336151ff7c30682f2ef9cb1174d235bc1471322faabba97f9db1398167"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a60674dff4a4194e88312b463fb84ac80924c2b9e25d0e0460f3176bf1af4a6b"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3c701507a49340de422d77a6ce95918a0019990bbf27daec35aa40050c6eadb6"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1b3c5a04eaf8866e399315cff2e810260cce10b797437a9f49fd71b5f4b94d0a"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0ad9c09de4c262f516ae6891d042a4325649b18efa39dd82bbe0f7bc95c37bfb"}, - {file = "psycopg_binary-3.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:bf1d3582185cb43ecc27403bee2f5405b7a45ccaab46c8508d9a9327341574fc"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:554d208757129d34fa47b7c890f9ef922f754e99c6b089cb3a209aa0fe282682"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:71dc3cc10d1fd7d26a3079d0a5b4a8e8ad0d7b89a702ceb7605a52e4395be122"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a86f578d63f2e1fdf87c9adaed4ff23d7919bda8791cf1380fa4cf3a857ccb8b"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a4eb737682c02a602a12aa85a492608066f77793dab681b1c4e885fedc160b1"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e120a576e74e4e612c48f4b021e322e320ca102534d78a0ca4db2ffd058ae8d"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:849d518e7d4c6186e1e48ea2ac2671912edf7e732fffe6f01dfed61cf0245de4"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8ee2b19152bcec8f356f989c31768702be5f139b4d51094273c4a9ddc8c55380"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:00273dd011892e8216fcef76b42f775ddaa6348664a7fffae2a27c9557f45bfa"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4bcb489615d7e56d1de42937e6a0fc13f766505729afdb54c2947a52db295220"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:06963f88916a177df95aaed27101af0989ba206654743b1a0e050b9d8e734686"}, - {file = "psycopg_binary-3.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:ed1ad836a0c21890c7f84e73c7ef1ed0950e0e4b0d8e49b609b6fd9c13f2ca21"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:0dd314229885a81f9497875295d8788e651b78945627540f1e78ed71595e614a"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:989acbe2f552769cdb780346cea32d86e7c117044238d5172ac10b025fe47194"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:566b1c530898590f0ac9d949cf94351c08d73c89f8800c74c0a63ffd89a383c8"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68d03efab7e2830a0df3aa4c29a708930e3f6b9fd98774ff9c4fd1f33deafecc"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e1f013bfb744023df23750fde51edcb606def8328473361db3c192c392c6060"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a06136aab55a2de7dd4e2555badae276846827cfb023e6ba1b22f7a7b88e3f1b"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:020c5154be144a1440cf87eae012b9004fb414ae4b9e7b1b9fb808fe39e96e83"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:ef341c556aeaa43a2729b07b04e20bfffdcf3d96c4a96e728ca94fe4ce632d8c"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:66de2dd7d37bf66eb234ca9d907f5cd8caca43ff8d8a50dd5c15844d1cf0390c"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2eb6f8f410dbbb71b8c633f283b8588b63bee0a7321f00ab76e9c800c593f732"}, - {file = "psycopg_binary-3.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:b45553c6b614d02e1486585980afdfd18f0000aac668e2e87c6e32da1adb051a"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:1ee891287c2da57e7fee31fbe2fbcdf57125768133d811b02e9523d5a052eb28"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5e95e4a8076ac7611e571623e1113fa84fd48c0459601969ffbf534d7aa236e7"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6269d79a3d7d76b6fcf0fafae8444da00e83777a6c68c43851351a571ad37155"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6dd5d21a298c3c53af20ced8da4ae4cd038c6fe88c80842a8888fa3660b2094"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4cf64e41e238620f05aad862f06bc8424f8f320d8075f1499bd85a225d18bd57"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c482c3236ded54add31136a91d5223b233ec301f297fa2db79747404222dca6"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0718be095cefdad712542169d16fa58b3bd9200a3de1b0217ae761cdec1cf569"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:fb303b03c243a9041e1873b596e246f7caaf01710b312fafa65b1db5cd77dd6f"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:705da5bc4364bd7529473225fca02b795653bc5bd824dbe43e1df0b1a40fe691"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:05406b96139912574571b1c56bb023839a9146cf4b57c4548f36251dd5909fa1"}, - {file = "psycopg_binary-3.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:7c357cf87e8d7612cfe781225be7669f35038a765d1b53ec9605f6c5aef9ee85"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:059aa5e8fa119de328b4cb02ee80775443763b25682a02dd7d026b8d4f565834"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05a50f94e1e4fa37a0074b09263b83b0aa038c3c72068a61f1ad61ea449ef9d5"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:951507b3d77a64c907afe893e01e09b41051fd7e27e9462f450fb8bb64bc22b0"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ec4986c4ac2503e865acd3943d179531c3bbfa5a1c8ee81fcfccb551dad645f"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b32b0e838841d5b109d32fc706b8bc64e50c161fee3f1371ccf696e5598bc49"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:fdc74a83348477b28bea9e7b391c9fc189b480fe3cd0e46bb989514410b64d60"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9efe0ca78be4a573b4b81226904c711cfadc4783d64bfdf58a3394da7c1a1354"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:51f56ae2898acaa33623adad96ddc5acbb5e2f72f2fc020065c8be05c0e01dce"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:43b209be0424e8abece428a884cb711f504e3526dfbcb0bf51529907a55eda15"}, - {file = "psycopg_binary-3.2.2-cp38-cp38-win_amd64.whl", hash = "sha256:d3c147eea9f3950a34133dc187e8d3534e54ff4a178a4ebd8993b2c97e123200"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:6c7b6a8d4e1b77cdb50192b61235b33fc2f1d28c67627fc93a1d43e9130dd479"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e234edc4bb746d8ac3daae8753ee38eaa7af2ee333a1d35ce6b02a02874aed18"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f12640ba92c538b3b64a199a918d3bb0cc0d7f7123c6ba93cb065e1a2d049f0"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8937dc548621b336b0d8383a3470fb7192b42a108c760a152282909867bf5b26"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4afbb97d64cd8078edec859b07859a18ef3de7261a3a873ba52f32548373ae92"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c432710bdf8ccfdd75b0bc9cdf1fd21ff394363e4daec099c667f3c5f1721e2b"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:366cc4e194f7feb4e3038d6775fd4b69835e7d923972aee5baec986de972abd6"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b286ed65a891928bd457ffa0cd5fec09b9b5208bfd096d087e45369f07c5cb85"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9fee41c99312002e5d1f7462b1954aefed44c6efe5f021c3eac311640c16f6b7"}, - {file = "psycopg_binary-3.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:87cceaf07760a04023596f9ca1d4e929d38ae8d778161cb3e8d27a0f990dd264"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:965455eac8547f32b3181d5ec9ad8b9be500c10fe06193543efaaebe3e4ce70c"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:71adcc8bc80a65b776510bc39992edf942ace35b153ed7a9c6c573a6849ce308"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f73adc05452fb85e7a12ed3f69c81540a8875960739082e6ea5e28c373a30774"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8630943143c6d6ca9aefc88bbe5e76c90553f4e1a3b2dc339e67dc34aa86f7e"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bffb61e198a91f712cc3d7f2d176a697cb05b284b2ad150fb8edb308eba9002"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc4fa2240c9fceddaa815a58f29212826fafe43ce80ff666d38c4a03fb036955"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:192a5f8496e6e1243fdd9ac20e117e667c0712f148c5f9343483b84435854c78"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:64dc6e9ec64f592f19dc01a784e87267a64a743d34f68488924251253da3c818"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:79498df398970abcee3d326edd1d4655de7d77aa9aecd578154f8af35ce7bbd2"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:949551752930d5e478817e0b49956350d866b26578ced0042a61967e3fcccdea"}, + {file = "psycopg_binary-3.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:80a2337e2dfb26950894c8301358961430a0304f7bfe729d34cc036474e9c9b1"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6d8f2144e0d5808c2e2aed40fbebe13869cd00c2ae745aca4b3b16a435edb056"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:94253be2b57ef2fea7ffe08996067aabf56a1eb9648342c9e3bad9e10c46e045"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fda0162b0dbfa5eaed6cdc708179fa27e148cb8490c7d62e5cf30713909658ea"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c0419cdad8c70eaeb3116bb28e7b42d546f91baf5179d7556f230d40942dc78"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:74fbf5dd3ef09beafd3557631e282f00f8af4e7a78fbfce8ab06d9cd5a789aae"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d784f614e4d53050cbe8abf2ae9d1aaacf8ed31ce57b42ce3bf2a48a66c3a5c"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4e76ce2475ed4885fe13b8254058be710ec0de74ebd8ef8224cf44a9a3358e5f"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5938b257b04c851c2d1e6cb2f8c18318f06017f35be9a5fe761ee1e2e344dfb7"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:257c4aea6f70a9aef39b2a77d0658a41bf05c243e2bf41895eb02220ac6306f3"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:06b5cc915e57621eebf2393f4173793ed7e3387295f07fed93ed3fb6a6ccf585"}, + {file = "psycopg_binary-3.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:09baa041856b35598d335b1a74e19a49da8500acedf78164600694c0ba8ce21b"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:48f8ca6ee8939bab760225b2ab82934d54330eec10afe4394a92d3f2a0c37dd6"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5361ea13c241d4f0ec3f95e0bf976c15e2e451e9cc7ef2e5ccfc9d170b197a40"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb987f14af7da7c24f803111dbc7392f5070fd350146af3345103f76ea82e339"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0463a11b1cace5a6aeffaf167920707b912b8986a9c7920341c75e3686277920"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8b7be9a6c06518967b641fb15032b1ed682fd3b0443f64078899c61034a0bca6"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64a607e630d9f4b2797f641884e52b9f8e239d35943f51bef817a384ec1678fe"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fa33ead69ed133210d96af0c63448b1385df48b9c0247eda735c5896b9e6dbbf"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:1f8b0d0e99d8e19923e6e07379fa00570be5182c201a8c0b5aaa9a4d4a4ea20b"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:709447bd7203b0b2debab1acec23123eb80b386f6c29e7604a5d4326a11e5bd6"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5e37d5027e297a627da3551a1e962316d0f88ee4ada74c768f6c9234e26346d9"}, + {file = "psycopg_binary-3.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:261f0031ee6074765096a19b27ed0f75498a8338c3dcd7f4f0d831e38adf12d1"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:41fdec0182efac66b27478ac15ef54c9ebcecf0e26ed467eb7d6f262a913318b"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:07d019a786eb020c0f984691aa1b994cb79430061065a694cf6f94056c603d26"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c57615791a337378fe5381143259a6c432cdcbb1d3e6428bfb7ce59fff3fb5c"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8eb9a4e394926b93ad919cad1b0a918e9b4c846609e8c1cfb6b743683f64da0"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5905729668ef1418bd36fbe876322dcb0f90b46811bba96d505af89e6fbdce2f"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd65774ed7d65101b314808b6893e1a75b7664f680c3ef18d2e5c84d570fa393"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:700679c02f9348a0d0a2adcd33a0275717cd0d0aee9d4482b47d935023629505"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:96334bb64d054e36fed346c50c4190bad9d7c586376204f50bede21a913bf942"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:9099e443d4cc24ac6872e6a05f93205ba1a231b1a8917317b07c9ef2b955f1f4"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1985ab05e9abebfbdf3163a16ebb37fbc5d49aff2bf5b3d7375ff0920bbb54cd"}, + {file = "psycopg_binary-3.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:e90352d7b610b4693fad0feea48549d4315d10f1eba5605421c92bb834e90170"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69320f05de8cdf4077ecd7fefdec223890eea232af0d58f2530cbda2871244a0"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4926ea5c46da30bec4a85907aa3f7e4ea6313145b2aa9469fdb861798daf1502"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c64c4cd0d50d5b2288ab1bcb26c7126c772bbdebdfadcd77225a77df01c4a57e"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05a1bdce30356e70a05428928717765f4a9229999421013f41338d9680d03a63"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ad357e426b0ea5c3043b8ec905546fa44b734bf11d33b3da3959f6e4447d350"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:967b47a0fd237aa17c2748fdb7425015c394a6fb57cdad1562e46a6eb070f96d"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:71db8896b942770ed7ab4efa59b22eee5203be2dfdee3c5258d60e57605d688c"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2773f850a778575dd7158a6dd072f7925b67f3ba305e2003538e8831fec77a1d"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aeddf7b3b3f6e24ccf7d0edfe2d94094ea76b40e831c16eff5230e040ce3b76b"}, + {file = "psycopg_binary-3.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:824c867a38521d61d62b60aca7db7ca013a2b479e428a0db47d25d8ca5067410"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:9994f7db390c17fc2bd4c09dca722fd792ff8a49bb3bdace0c50a83f22f1767d"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1303bf8347d6be7ad26d1362af2c38b3a90b8293e8d56244296488ee8591058e"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:842da42a63ecb32612bb7f5b9e9f8617eab9bc23bd58679a441f4150fcc51c96"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2bb342a01c76f38a12432848e6013c57eb630103e7556cf79b705b53814c3949"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd40af959173ea0d087b6b232b855cfeaa6738f47cb2a0fd10a7f4fa8b74293f"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9b60b465773a52c7d4705b0a751f7f1cdccf81dd12aee3b921b31a6e76b07b0e"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fc6d87a1c44df8d493ef44988a3ded751e284e02cdf785f746c2d357e99782a6"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:f0b018e37608c3bfc6039a1dc4eb461e89334465a19916be0153c757a78ea426"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2a29f5294b0b6360bfda69653697eff70aaf2908f58d1073b0acd6f6ab5b5a4f"}, + {file = "psycopg_binary-3.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:e56b1fd529e5dde2d1452a7d72907b37ed1b4f07fdced5d8fb1e963acfff6749"}, ] [[package]] @@ -1979,18 +1979,19 @@ tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asy [[package]] name = "rich" -version = "13.8.1" +version = "13.9.1" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" files = [ - {file = "rich-13.8.1-py3-none-any.whl", hash = "sha256:1760a3c0848469b97b558fc61c85233e3dafb69c7a071b4d60c38099d3cd4c06"}, - {file = "rich-13.8.1.tar.gz", hash = "sha256:8260cda28e3db6bf04d2d1ef4dbc03ba80a824c88b0e7668a0f23126a424844a"}, + {file = "rich-13.9.1-py3-none-any.whl", hash = "sha256:b340e739f30aa58921dc477b8adaa9ecdb7cecc217be01d93730ee1bc8aa83be"}, + {file = "rich-13.9.1.tar.gz", hash = "sha256:097cffdf85db1babe30cc7deba5ab3a29e1b9885047dab24c57e9a7f8a9c1466"}, ] [package.dependencies] markdown-it-py = ">=2.2.0" pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] @@ -2262,13 +2263,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "uvicorn" -version = "0.30.6" +version = "0.31.0" description = "The lightning-fast ASGI server." optional = false python-versions = ">=3.8" files = [ - {file = "uvicorn-0.30.6-py3-none-any.whl", hash = "sha256:65fd46fe3fda5bdc1b03b94eb634923ff18cd35b2f084813ea79d1f103f711b5"}, - {file = "uvicorn-0.30.6.tar.gz", hash = "sha256:4b15decdda1e72be08209e860a1e10e92439ad5b97cf44cc945fcbee66fc5788"}, + {file = "uvicorn-0.31.0-py3-none-any.whl", hash = "sha256:cac7be4dd4d891c363cd942160a7b02e69150dcbc7a36be04d5f4af4b17c8ced"}, + {file = "uvicorn-0.31.0.tar.gz", hash = "sha256:13bc21373d103859f68fe739608e2eb054a816dea79189bc3ca08ea89a275906"}, ] [package.dependencies] @@ -2281,13 +2282,13 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", [[package]] name = "virtualenv" -version = "20.26.5" +version = "20.26.6" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.5-py3-none-any.whl", hash = "sha256:4f3ac17b81fba3ce3bd6f4ead2749a72da5929c01774948e243db9ba41df4ff6"}, - {file = "virtualenv-20.26.5.tar.gz", hash = "sha256:ce489cac131aa58f4b25e321d6d186171f78e6cb13fafbf32a840cee67733ff4"}, + {file = "virtualenv-20.26.6-py3-none-any.whl", hash = "sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2"}, + {file = "virtualenv-20.26.6.tar.gz", hash = "sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48"}, ] [package.dependencies] diff --git a/space2stats_api/src/space2stats_ingest/cli.py b/space2stats_api/src/space2stats_ingest/cli.py index 5704e36..de23b9c 100644 --- a/space2stats_api/src/space2stats_ingest/cli.py +++ b/space2stats_api/src/space2stats_ingest/cli.py @@ -12,8 +12,12 @@ def handle_errors(func): def wrapper(*args, **kwargs): try: return func(*args, **kwargs) - except Exception as e: + except ValueError as e: typer.echo(f"An error occurred: {e}", err=True) + raise typer.Exit(code=1) # Ensure non-zero exit code on error + except Exception as e: + typer.echo(f"An unexpected error occurred: {e}", err=True) + raise typer.Exit(code=1) # General non-zero exit for any exception return wrapper @@ -33,14 +37,15 @@ def download(s3_path: str, local_path: str = typer.Option("local.parquet")): @handle_errors def load( connection_string: str, + stac_metadata_file: str, # Add the STAC metadata file path as an argument parquet_file: str = typer.Option("local.parquet"), chunksize: int = 64_000, ): """ - Load a Parquet file into a PostgreSQL database. + Load a Parquet file into a PostgreSQL database after verifying columns with the STAC metadata. """ typer.echo(f"Loading data into PostgreSQL database from {parquet_file}") - load_parquet_to_db(parquet_file, connection_string, chunksize) + load_parquet_to_db(parquet_file, connection_string, stac_metadata_file, chunksize) typer.echo("Data loaded successfully to PostgreSQL!") @@ -49,11 +54,12 @@ def load( def download_and_load( s3_path: str, connection_string: str, + stac_metadata_file: str, # Add the STAC metadata file path as an argument parquet_file: str = typer.Option("local.parquet"), chunksize: int = 64_000, ): """ - Download a Parquet file from S3 and load it into a PostgreSQL database. + Download a Parquet file from S3, verify columns with the STAC metadata, and load it into a PostgreSQL database. """ download( s3_path=s3_path, @@ -62,5 +68,6 @@ def download_and_load( load( parquet_file=parquet_file, connection_string=connection_string, + stac_metadata_file=stac_metadata_file, # Ensure this is passed along chunksize=chunksize, ) diff --git a/space2stats_api/src/space2stats_ingest/main.py b/space2stats_api/src/space2stats_ingest/main.py index 717cbff..2c8fe4f 100644 --- a/space2stats_api/src/space2stats_ingest/main.py +++ b/space2stats_api/src/space2stats_ingest/main.py @@ -1,3 +1,6 @@ +import json +import tempfile + import adbc_driver_postgresql.dbapi as pg import boto3 import pyarrow.parquet as pq @@ -6,6 +9,83 @@ TABLE_NAME = "space2stats" +def read_parquet_file(file_path: str): + """ + Reads a Parquet file either from a local path or an S3 path. + + Args: + file_path (str): Path to the Parquet file, either local or S3. + + Returns: + pyarrow.Table: Parquet table object. + """ + if file_path.startswith("s3://"): + # Read from S3 + s3 = boto3.client("s3") + bucket, key = file_path[5:].split("/", 1) + with tempfile.NamedTemporaryFile() as tmp_file: + s3.download_file(bucket, key, tmp_file.name) + table = pq.read_table(tmp_file.name) + else: + # Read from local path + table = pq.read_table(file_path) + + return table + + +def read_stac_metadata_file(file_path: str): + """ + Reads a STAC metadata file either from a local path or an S3 path. + + Args: + file_path (str): Path to the STAC metadata file, either local or S3. + + Returns: + dict: Parsed JSON content of the STAC metadata. + """ + if file_path.startswith("s3://"): + s3 = boto3.client("s3") + bucket, key = file_path[5:].split("/", 1) + with tempfile.NamedTemporaryFile() as tmp_file: + s3.download_file(bucket, key, tmp_file.name) + with open(tmp_file.name, "r") as f: + stac_metadata = json.load(f) + else: + with open(file_path, "r") as f: + stac_metadata = json.load(f) + + return stac_metadata + + +def verify_columns(parquet_file: str, stac_metadata_file: str) -> bool: + """ + Verifies that the Parquet file columns match the STAC item metadata columns. + + Args: + parquet_file (str): Path to the Parquet file. + stac_metadata_file (str): Path to the STAC item metadata JSON file. + + Returns: + bool: True if the columns match, False otherwise. + """ + parquet_table = read_parquet_file(parquet_file) + parquet_columns = set(parquet_table.column_names) + + stac_metadata = read_stac_metadata_file(stac_metadata_file) + stac_columns = { + column["name"] for column in stac_metadata["properties"]["table:columns"] + } + + if parquet_columns != stac_columns: + extra_in_parquet = parquet_columns - stac_columns + extra_in_stac = stac_columns - parquet_columns + raise ValueError( + f"Column mismatch: Extra in Parquet: {extra_in_parquet}, Extra in STAC: {extra_in_stac}" + ) + + return True + + def download_parquet_from_s3(s3_path: str, local_path: str): """ Downloads a Parquet file from an S3 bucket and saves it locally. @@ -21,16 +101,15 @@ def download_parquet_from_s3(s3_path: str, local_path: str): def load_parquet_to_db( - parquet_file: str, connection_string: str, chunksize: int = 64_000 + parquet_file: str, + connection_string: str, + stac_metadata_file: str, + chunksize: int = 64_000, ): - """ - Loads a local Parquet file into a PostgreSQL database in chunks with a progress bar. + # Verify column consistency between Parquet file and STAC metadata + if not verify_columns(parquet_file, stac_metadata_file): + raise ValueError("Column mismatch between Parquet file and STAC metadata") - Args: - parquet_file (str): Path to the Parquet file. - connection_string (str): SQLAlchemy-compatible connection string to the PostgreSQL database. - chunksize (int): Number of rows to process in each chunk. - """ table = pq.read_table(parquet_file) with ( pg.connect(connection_string) as conn, @@ -41,6 +120,5 @@ def load_parquet_to_db( for batch in table.to_batches(max_chunksize=chunksize): count = cur.adbc_ingest(TABLE_NAME, batch, mode="append") pbar.update(count) - break cur.execute("CREATE INDEX ON space2stats (hex_id);") conn.commit() diff --git a/space2stats_api/src/tests/test_ingest.py b/space2stats_api/src/tests/test_ingest.py index fba85af..fc91e2f 100644 --- a/space2stats_api/src/tests/test_ingest.py +++ b/space2stats_api/src/tests/test_ingest.py @@ -19,9 +19,10 @@ def test_download_parquet_from_s3(s3_mock): assert os.path.exists(parquet_file) -def test_load_parquet_to_db(database): +def test_load_parquet_to_db(database, tmpdir): connection_string = f"postgresql://{database.user}:{database.password}@{database.host}:{database.port}/{database.dbname}" - parquet_file = "local.parquet" + parquet_file = tmpdir.join("local.parquet") + stac_metadata_file = tmpdir.join("stac_metadata.json") data = { "hex_id": ["hex_1", "hex_2"], @@ -31,7 +32,21 @@ def test_load_parquet_to_db(database): table = pa.table(data) pq.write_table(table, parquet_file) - load_parquet_to_db(parquet_file, connection_string) + with open(stac_metadata_file, "w") as f: + f.write(""" + { + "type": "Feature", + "properties": { + "table:columns": [ + {"name": "hex_id", "type": "string"}, + {"name": "sum_pop_2020", "type": "int64"}, + {"name": "sum_pop_f_10_2020", "type": "int64"} + ] + } + } + """) + + load_parquet_to_db(str(parquet_file), connection_string, str(stac_metadata_file)) with psycopg.connect(connection_string) as conn: with conn.cursor() as cur: diff --git a/space2stats_api/src/tests/test_ingest_cli.py b/space2stats_api/src/tests/test_ingest_cli.py index 1274433..f05f684 100644 --- a/space2stats_api/src/tests/test_ingest_cli.py +++ b/space2stats_api/src/tests/test_ingest_cli.py @@ -1,11 +1,18 @@ import os +import pyarrow as pa +import pyarrow.parquet as pq from space2stats_ingest.cli import app from typer.testing import CliRunner runner = CliRunner() +def create_mock_parquet_file(parquet_file, columns): + table = pa.Table.from_pydict({name: [1.0, 2.0, 3.0] for name, _ in columns}) + pq.write_table(table, parquet_file) + + def test_download_command(tmpdir, s3_mock): s3_path = "s3://mybucket/myfile.parquet" parquet_file = tmpdir.join("local.parquet") @@ -28,12 +35,34 @@ def test_download_command(tmpdir, s3_mock): def test_load_command(tmpdir, database): connection_string = f"postgresql://{database.user}:{database.password}@{database.host}:{database.port}/{database.dbname}" parquet_file = tmpdir.join("local.parquet") + stac_metadata_file = tmpdir.join("stac_metadata.json") + + create_mock_parquet_file( + parquet_file, [("hex_id", pa.string()), ("mock_column", pa.float64())] + ) - with open(parquet_file, "wb") as f: - f.write(b"mock_parquet_data") + with open(stac_metadata_file, "w") as f: + f.write(""" + { + "type": "Feature", + "properties": { + "table:columns": [ + {"name": "hex_id", "type": "string"}, + {"name": "mock_column", "type": "float64"} + ] + } + } + """) result = runner.invoke( - app, ["load", connection_string, "--parquet-file", str(parquet_file)] + app, + [ + "load", + connection_string, + str(stac_metadata_file), + "--parquet-file", + str(parquet_file), + ], ) print(result.output) @@ -41,21 +70,74 @@ def test_load_command(tmpdir, database): assert "Loading data into PostgreSQL" in result.stdout +def test_load_command_column_mismatch(tmpdir, database): + connection_string = f"postgresql://{database.user}:{database.password}@{database.host}:{database.port}/{database.dbname}" + parquet_file = tmpdir.join("local.parquet") + stac_metadata_file = tmpdir.join("stac_metadata.json") + + create_mock_parquet_file(parquet_file, [("different_column", pa.float64())]) + + with open(stac_metadata_file, "w") as f: + f.write(""" + { + "type": "Feature", + "properties": { + "table:columns": [ + {"name": "mock_column", "type": "float64"} + ] + } + } + """) + + result = runner.invoke( + app, + [ + "load", + connection_string, + str(stac_metadata_file), + "--parquet-file", + str(parquet_file), + ], + ) + print(result.output) + + assert result.exit_code != 0 + assert "Column mismatch" in result.stdout + + def test_download_and_load_command(tmpdir, database, s3_mock): s3_path = "s3://mybucket/myfile.parquet" parquet_file = tmpdir.join("local.parquet") + stac_metadata_file = tmpdir.join("stac_metadata.json") connection_string = f"postgresql://{database.user}:{database.password}@{database.host}:{database.port}/{database.dbname}" - s3_mock.put_object( - Bucket="mybucket", Key="myfile.parquet", Body=b"mock_parquet_data" + create_mock_parquet_file( + parquet_file, [("hex_id", pa.string()), ("mock_column", pa.float64())] ) + with open(parquet_file, "rb") as f: + s3_mock.put_object(Bucket="mybucket", Key="myfile.parquet", Body=f.read()) + + with open(stac_metadata_file, "w") as f: + f.write(""" + { + "type": "Feature", + "properties": { + "table:columns": [ + {"name": "hex_id", "type": "string"}, + {"name": "mock_column", "type": "float64"} + ] + } + } + """) + result = runner.invoke( app, [ "download-and-load", s3_path, connection_string, + str(stac_metadata_file), "--parquet-file", str(parquet_file), ],