From d3a9b8bf5e81a92de96fd9e278924e88ef364d4b Mon Sep 17 00:00:00 2001 From: Zac Deziel Date: Fri, 6 Sep 2024 08:01:14 -0700 Subject: [PATCH] Define acceptance test for database deliverable (#41) * Define acceptance test for database deliverable * Updated based on review * Add remote database url. Modify attribute naming convention. Remove chunking scripts. * Convert environment variables from DB_ to PG --- .github/workflows/ci.yml | 12 ++-- README.md | 14 ++--- docs/acceptance/db.md | 65 +++++++++++++++++++++ postgres/chunk_parquet.py | 18 ------ postgres/download_parquet.sh | 10 ++-- postgres/load_nyc_sample.sh | 8 +-- postgres/load_parquet_chunks.sh | 55 ----------------- postgres/load_to_prod.sh | 6 +- space2stats_api/cdk/settings.py | 12 ++-- space2stats_api/src/space2stats/main.py | 4 +- space2stats_api/src/space2stats/settings.py | 18 +++--- space2stats_api/src/tests/test_api.py | 12 ++-- 12 files changed, 113 insertions(+), 121 deletions(-) create mode 100644 docs/acceptance/db.md delete mode 100755 postgres/load_parquet_chunks.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a06f7b..76722d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,10 +35,10 @@ jobs: run: | poetry run pytest -s -vv env: - DB_HOST: localhost - DB_PORT: 5432 - DB_NAME: mydatabase - DB_USER: myuser - DB_PASSWORD: mypassword - DB_TABLE_NAME: space2stats + PGHOST: localhost + PGPORT: 5432 + PGDATABASE: mydatabase + PGUSER: myuser + PGPASSWORD: mypassword + PGTABLENAME: space2stats S3_BUCKET_NAME: test-bucket \ No newline at end of file diff --git a/README.md b/README.md index c7e0844..ccf6735 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,12 @@ docker-compose up -d - Create a `db.env` file: ```.env -DB_HOST=localhost -DB_PORT=5439 -DB_NAME=postgis -DB_USER=username -DB_PASSWORD=password -DB_TABLE_NAME=space2stats +PGHOST=localhost +PGPORT=5439 +PGDATABASE=postgis +PGUSER=username +PGPASSWORD=password +PGTABLENAME=space2stats ``` 
- Load our dataset into the database @@ -26,7 +26,7 @@ python postgres/chunk_parquet.py ./postgres/load_parquet_chunks.sh ``` -> You can get started with a subset of data for NYC with `./load_nyc_sample.sh` which requires changing your `db.env` value for `DB_TABLE_NAME` to `space2stats_nyc_sample`. +> You can get started with a subset of data for NYC with `./load_nyc_sample.sh` which requires changing your `db.env` value for `PGTABLENAME` to `space2stats_nyc_sample`. - Access your data using the Space2statS API! See the [example notebook](notebooks/space2stats_api_demo.ipynb). diff --git a/docs/acceptance/db.md b/docs/acceptance/db.md new file mode 100644 index 0000000..c980f0c --- /dev/null +++ b/docs/acceptance/db.md @@ -0,0 +1,65 @@ +## Database Deliverable Acceptance Test + +### Description of Deliverable + +This deliverable includes the implementation of an ETL process, the design of a database schema, the selection of a format for raw data storage, infrastructure as code, database configuration, and accompanying documentation. + +The acceptance test below provides steps to verify that the deliverable meets our agreed-upon specifications. + +### Input Data + +The input data is stored in Parquet format on AWS S3 (object storage), specifically in the file `space2stats.parquet`. Any additional fields must be appended to this file. The Parquet file is tabular with the following columns: +- `hex_id` +- `{variable_name}_{aggregation_method[sum, mean, etc.]}_{year}` + +### Database Setup + +You can use a local database for this acceptance test by running the following command in the root directory: + +```bash +docker-compose up +``` + +Alternatively, you can connect to a remote database, such as the Tembo database (`reluctantly-simple-spoonbill.data-1.use1.tembo.io`) used for production. 
+ +### Data Ingestion + +Set the database environment variables in `db.env`: + +```bash +PGHOST=localhost +PGPORT=5432 +PGDATABASE=postgis +PGUSER=postgres +PGPASSWORD=password +PGTABLENAME=space2stats +``` + +> Note: If using the `docker-compose` approach, the above configuration is suitable. + +To ingest data, run the following script: + +```bash +chmod +x postgres/load_to_prod.sh +./postgres/load_to_prod.sh +``` + +### Database Configuration + +Once connected to your database via `psql` or another PostgreSQL client (e.g., `pgAdmin`): + +- Create an index on the `space2stats` table: + +```sql +CREATE INDEX idx_hex_id ON space2stats (hex_id); +``` + +### Testing the Database Table + +You can run sample queries to verify data is accessible in the database. Our primary access patterns involve filtering by specific hex identifiers and returning specified fields. Here are some example queries: + +```sql +SELECT * FROM space2stats LIMIT 100; +SELECT * FROM space2stats WHERE hex_id = '86beabd8fffffff'; +SELECT sum_pop_2020 FROM space2stats WHERE hex_id IN ('86beabd8fffffff', '86beabdb7ffffff', '86beac01fffffff'); +``` \ No newline at end of file diff --git a/postgres/chunk_parquet.py b/postgres/chunk_parquet.py index 30db620..e69de29 100644 --- a/postgres/chunk_parquet.py +++ b/postgres/chunk_parquet.py @@ -1,18 +0,0 @@ -import os - -import pandas as pd - -chunk_dir = "parquet_chunks" -df = pd.read_parquet("space2stats_updated.parquet") -chunk_size = 100000 # Number of rows per chunk - -if not os.path.exists(chunk_dir): - os.mkdir(chunk_dir) - -for i in range(0, len(df), chunk_size): - chunk = df.iloc[i : i + chunk_size] - chunk.to_parquet( - os.path.join(chunk_dir, f"space2stats_part_{i // chunk_size}.parquet") - ) - -print("Parquet file split into smaller chunks.") diff --git a/postgres/download_parquet.sh b/postgres/download_parquet.sh index 434f269..6c2bc5f 100644 --- a/postgres/download_parquet.sh +++ b/postgres/download_parquet.sh @@ -9,11 +9,11 @@ 
PARQUET_FILE="Space2Stats/parquet/GLOBAL/combined_population.parquet" LOCAL_PARQUET_FILE="space2stats.parquet" # PostgreSQL configuration -DB_HOST="${MY_DOCKER_IP:-127.0.0.1}" -DB_PORT=5439 -DB_NAME="postgis" -DB_USER="username" -DB_PASSWORD="password" +PGHOST="${MY_DOCKER_IP:-127.0.0.1}" +PGPORT=5439 +PGDATABASE="postgis" +PGUSER="username" +PGPASSWORD="password" # Download Parquet file from S3 echo "Downloading Parquet file from S3..." diff --git a/postgres/load_nyc_sample.sh b/postgres/load_nyc_sample.sh index 276956c..78c5613 100755 --- a/postgres/load_nyc_sample.sh +++ b/postgres/load_nyc_sample.sh @@ -6,7 +6,7 @@ if [ -f db.env ]; then fi # Check if required environment variables are set -if [ -z "$DB_HOST" ] || [ -z "$DB_PORT" ] || [ -z "$DB_NAME" ] || [ -z "$DB_USER" ] || [ -z "$DB_PASSWORD" ]; then +if [ -z "$PGHOST" ] || [ -z "$PGPORT" ] || [ -z "$PGDATABASE" ] || [ -z "$PGUSER" ] || [ -z "$PGPASSWORD" ]; then echo "One or more required environment variables are missing." exit 1 fi @@ -18,21 +18,21 @@ PARQUET_FILE="nyc_sample.parquet" TABLE_NAME="space2stats_nyc_sample" # Check if the table exists -TABLE_EXISTS=$(psql -h $DB_HOST -p $DB_PORT -d $DB_NAME -U $DB_USER -tAc "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema='public' AND table_name='$TABLE_NAME');") +TABLE_EXISTS=$(psql -h $PGHOST -p $PGPORT -d $PGDATABASE -U $PGUSER -tAc "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema='public' AND table_name='$TABLE_NAME');") echo "Importing $PARQUET_FILE..." 
if [ "$TABLE_EXISTS" = "t" ]; then # Table exists, append data ogr2ogr -f "PostgreSQL" \ - PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ + PG:"host=$PGHOST port=$PGPORT dbname=$PGDATABASE user=$PGUSER password=$PGPASSWORD" \ "$PARQUET_FILE" \ -nln $TABLE_NAME \ -append else # Table does not exist, create table and import data ogr2ogr -f "PostgreSQL" \ - PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ + PG:"host=$PGHOST port=$PGPORT dbname=$PGDATABASE user=$PGUSER password=$PGPASSWORD" \ "$PARQUET_FILE" \ -nln $TABLE_NAME diff --git a/postgres/load_parquet_chunks.sh b/postgres/load_parquet_chunks.sh deleted file mode 100755 index 8f79d8b..0000000 --- a/postgres/load_parquet_chunks.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - - -# Load environment variables from db.env file -if [ -f db.env ]; then - export $(cat db.env | grep -v '#' | awk '/=/ {print $1}') -fi - -# Check if required environment variables are set -if [ -z "$DB_HOST" ] || [ -z "$DB_PORT" ] || [ -z "$DB_NAME" ] || [ -z "$DB_USER" ] || [ -z "$DB_PASSWORD" ]; then - echo "One or more required environment variables are missing." - exit 1 -fi - -# Directory containing the Parquet chunks -CHUNKS_DIR="parquet_chunks" - -# Name of the target table -TABLE_NAME="space2stats" - -# Flag to check if the table exists -TABLE_EXISTS=$(psql -h $DB_HOST -p $DB_PORT -d $DB_NAME -U $DB_USER -tAc "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema='public' AND table_name='$TABLE_NAME');") - -# Loop through each Parquet file in the chunks directory -for PARQUET_FILE in "$CHUNKS_DIR"/*.parquet; -do - echo "Importing $PARQUET_FILE..." 
- - if [ "$TABLE_EXISTS" = "t" ]; then - # Table exists, append data - ogr2ogr -f "PostgreSQL" \ - PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ - "$PARQUET_FILE" \ - -nln $TABLE_NAME \ - -append - else - # Table does not exist, create table and import data - ogr2ogr -f "PostgreSQL" \ - PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ - "$PARQUET_FILE" \ - -nln $TABLE_NAME \ - -lco SPATIAL_INDEX=NONE - - TABLE_EXISTS="t" - fi - - if [ $? -ne 0 ]; then - echo "Failed to import $PARQUET_FILE" - exit 1 - fi - - echo "Successfully imported $PARQUET_FILE" -done - -echo "All Parquet chunks have been imported." \ No newline at end of file diff --git a/postgres/load_to_prod.sh b/postgres/load_to_prod.sh index 3149e7b..64c40dd 100755 --- a/postgres/load_to_prod.sh +++ b/postgres/load_to_prod.sh @@ -7,7 +7,7 @@ if [ -f db.env ]; then fi # Check if required environment variables are set -if [ -z "$DB_HOST" ] || [ -z "$DB_PORT" ] || [ -z "$DB_NAME" ] || [ -z "$DB_USER" ] || [ -z "$DB_PASSWORD" ]; then +if [ -z "$PGHOST" ] || [ -z "$PGPORT" ] || [ -z "$PGDATABASE" ] || [ -z "$PGUSER" ] || [ -z "$PGPASSWORD" ]; then echo "One or more required environment variables are missing." 
exit 1 fi @@ -17,12 +17,12 @@ CHUNKS_DIR="parquet_chunks" # Name of the target table TABLE_NAME="space2stats" -PARQUET_FILE=space2stats_updated.parquet +PARQUET_FILE=space2stats.parquet echo "Starting" ogr2ogr -progress -f "PostgreSQL" \ - PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ + PG:"host=$PGHOST port=$PGPORT dbname=$PGDATABASE user=$PGUSER password=$PGPASSWORD" \ "$PARQUET_FILE" \ -nln $TABLE_NAME \ -append \ diff --git a/space2stats_api/cdk/settings.py b/space2stats_api/cdk/settings.py index 706189c..5934527 100644 --- a/space2stats_api/cdk/settings.py +++ b/space2stats_api/cdk/settings.py @@ -2,12 +2,12 @@ class AppSettings(BaseSettings): - DB_HOST: str - DB_PORT: str - DB_NAME: str - DB_USER: str - DB_PASSWORD: str - DB_TABLE_NAME: str + PGHOST: str + PGPORT: str + PGDATABASE: str + PGUSER: str + PGPASSWORD: str + PGTABLENAME: str class DeploymentSettings(BaseSettings): diff --git a/space2stats_api/src/space2stats/main.py b/space2stats_api/src/space2stats/main.py index 2141c9d..433f3c3 100644 --- a/space2stats_api/src/space2stats/main.py +++ b/space2stats_api/src/space2stats/main.py @@ -30,7 +30,7 @@ def _get_summaries(fields: List[str], h3_ids: List[str], conn: Connection): FROM {1} WHERE hex_id = ANY (%s) """ - ).format(pg.sql.SQL(", ").join(cols), pg.sql.Identifier(settings.DB_TABLE_NAME)) + ).format(pg.sql.SQL(", ").join(cols), pg.sql.Identifier(settings.PGTABLENAME)) # Convert h3_ids to a list to ensure compatibility with psycopg h3_ids = list(h3_ids) @@ -101,7 +101,7 @@ def get_available_fields(conn: Connection) -> List[str]: cur.execute( sql_query, [ - settings.DB_TABLE_NAME, + settings.PGTABLENAME, ], ) columns = [row[0] for row in cur.fetchall() if row[0] != "hex_id"] diff --git a/space2stats_api/src/space2stats/settings.py b/space2stats_api/src/space2stats/settings.py index e9eccf3..4aff4c8 100644 --- a/space2stats_api/src/space2stats/settings.py +++ b/space2stats_api/src/space2stats/settings.py @@ -2,12 
+2,12 @@ class Settings(BaseSettings): - DB_HOST: str - DB_PORT: int - DB_NAME: str - DB_USER: str - DB_PASSWORD: str - DB_TABLE_NAME: str + PGHOST: str + PGPORT: int + PGDATABASE: str + PGUSER: str + PGPASSWORD: str + PGTABLENAME: str # Bucket for large responses S3_BUCKET_NAME: str @@ -27,9 +27,9 @@ class Settings(BaseSettings): @property def DB_CONNECTION_STRING(self) -> str: - host_port = f"host={self.DB_HOST} port={self.DB_PORT}" - db_user = f"dbname={self.DB_NAME} user={self.DB_USER}" - return f"{host_port} {db_user} password={self.DB_PASSWORD}" + host_port = f"host={self.PGHOST} port={self.PGPORT}" + db_user = f"dbname={self.PGDATABASE} user={self.PGUSER}" + return f"{host_port} {db_user} password={self.PGPASSWORD}" model_config = { "env_file": "local_db.env", diff --git a/space2stats_api/src/tests/test_api.py b/space2stats_api/src/tests/test_api.py index 4a06f38..00a18f0 100644 --- a/space2stats_api/src/tests/test_api.py +++ b/space2stats_api/src/tests/test_api.py @@ -38,12 +38,12 @@ def database(postgresql_proc): @pytest.fixture(autouse=True) def client(monkeypatch, database, test_bucket): - monkeypatch.setenv("DB_HOST", database.host) - monkeypatch.setenv("DB_PORT", str(database.port)) - monkeypatch.setenv("DB_NAME", database.dbname) - monkeypatch.setenv("DB_USER", database.user) - monkeypatch.setenv("DB_PASSWORD", database.password) - monkeypatch.setenv("DB_TABLE_NAME", "space2stats") + monkeypatch.setenv("PGHOST", database.host) + monkeypatch.setenv("PGPORT", str(database.port)) + monkeypatch.setenv("PGDATABASE", database.dbname) + monkeypatch.setenv("PGUSER", database.user) + monkeypatch.setenv("PGPASSWORD", database.password) + monkeypatch.setenv("PGTABLENAME", "space2stats") from space2stats.app import app