Feature/deploy (#25)
* Setup database ingestion

* Update path to requirements in CI

* Refactor project structure for cdk deployment

* Update tests for new deployment structure

* Update timeout of lambda function

* Add custom domain for API

Separate settings into deployment and application.

* Transition to API Gateway v2

* Update path to requirements in CI workflow

* Update memory setting for lambda to improve performance

* Add pytest dependency installation to CI workflow

* Update pattern for env files
zacdezgeo authored Aug 14, 2024
1 parent 0a96ba1 commit 6fdcb39
Showing 27 changed files with 237 additions and 140 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -26,8 +26,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r space2stats_api/requirements.txt
pip install pre-commit
pip install -r space2stats_api/src/requirements.txt
pip install pre-commit pytest
- name: Set PYTHONPATH
run: echo "PYTHONPATH=$(pwd)/space2stats_api" >> $GITHUB_ENV
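To reproduce the CI install step locally, a sketch is below; the paths mirror the workflow above, while running `pytest` from the repository root and the test location are assumptions, since they are not shown in this hunk.

```sh
# Sketch of the CI install step run locally, from the repository root.
python -m pip install --upgrade pip
pip install -r space2stats_api/src/requirements.txt
pip install pre-commit pytest
# PYTHONPATH mirrors the workflow's "Set PYTHONPATH" step; the pytest invocation is an assumption.
PYTHONPATH="$(pwd)/space2stats_api" pytest
```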
6 changes: 5 additions & 1 deletion .gitignore
@@ -100,4 +100,8 @@ db.env
# data
*.parquet
*.duckdb
.pgdata
.pgdata
space2stats_api/space2stats_env
*.env
cdk.out
lambda_layer
127 changes: 38 additions & 89 deletions notebooks/space2stats_api_demo.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -21,18 +21,18 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"BASE_URL = \"http://localhost:8000\"\n",
"BASE_URL = \"https://space2stats.ds.io\"\n",
"FIELDS_ENDPOINT = f\"{BASE_URL}/fields\"\n",
"SUMMARY_ENDPOINT = f\"{BASE_URL}/summary\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -54,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -115,15 +115,15 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Define the Request Payload\n",
"request_payload = {\n",
" \"aoi\": aoi,\n",
" \"spatial_join_method\": \"centroid\",\n",
" \"fields\": [\"sum_pop_2020\", \"sum_pop_f_2020\", \"sum_pop_m_2020\"], \n",
" \"fields\": [\"sum_pop_2020\"], \n",
" \"geometry\": \"point\"\n",
"}\n",
"\n",
@@ -138,7 +138,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -165,135 +165,98 @@
" <th>hex_id</th>\n",
" <th>geometry</th>\n",
" <th>sum_pop_2020</th>\n",
" <th>sum_pop_f_2020</th>\n",
" <th>sum_pop_m_2020</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>866a4a48fffffff</td>\n",
" <td>POINT (36.31771 2.23633)</td>\n",
" <td>POINT (35.76352 2.99589)</td>\n",
" <td>399.860905</td>\n",
" <td>189.675539</td>\n",
" <td>210.185366</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>866a4a497ffffff</td>\n",
" <td>POINT (40.18159 0.05763)</td>\n",
" <td>POINT (40.58048 -3.79365)</td>\n",
" <td>582.555159</td>\n",
" <td>276.337255</td>\n",
" <td>306.217904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>866a4a49fffffff</td>\n",
" <td>POINT (38.59096 0.13944)</td>\n",
" <td>POINT (41.10421 3.37873)</td>\n",
" <td>749.911237</td>\n",
" <td>355.723245</td>\n",
" <td>394.187992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>866a4a4d7ffffff</td>\n",
" <td>POINT (35.07124 0.80971)</td>\n",
" <td>POINT (37.26153 3.74581)</td>\n",
" <td>863.888290</td>\n",
" <td>418.309236</td>\n",
" <td>445.579054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>866a5820fffffff</td>\n",
" <td>POINT (37.4356 3.35699)</td>\n",
" <td>POINT (40.01148 1.53124)</td>\n",
" <td>525.085147</td>\n",
" <td>249.076134</td>\n",
" <td>276.009012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16212</th>\n",
" <td>867b5dd77ffffff</td>\n",
" <td>POINT (39.15438 -1.51437)</td>\n",
" <td>POINT (34.94474 1.24558)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16213</th>\n",
" <td>867b5dd87ffffff</td>\n",
" <td>POINT (35.80252 0.90823)</td>\n",
" <td>POINT (40.95343 -1.83280)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16214</th>\n",
" <td>867b5dd8fffffff</td>\n",
" <td>POINT (37.93845 0.83454)</td>\n",
" <td>POINT (35.20290 -0.29666)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16215</th>\n",
" <td>867b5dd9fffffff</td>\n",
" <td>POINT (38.65824 -2.60028)</td>\n",
" <td>POINT (41.28333 -1.08552)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16216</th>\n",
" <td>867b5ddafffffff</td>\n",
" <td>POINT (36.6641 2.37083)</td>\n",
" <td>POINT (36.63048 1.35038)</td>\n",
" <td>-36.000000</td>\n",
" <td>-18.000000</td>\n",
" <td>-18.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>16217 rows × 5 columns</p>\n",
"<p>16217 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" hex_id geometry sum_pop_2020 \\\n",
"0 866a4a48fffffff POINT (36.31771 2.23633) 399.860905 \n",
"1 866a4a497ffffff POINT (40.18159 0.05763) 582.555159 \n",
"2 866a4a49fffffff POINT (38.59096 0.13944) 749.911237 \n",
"3 866a4a4d7ffffff POINT (35.07124 0.80971) 863.888290 \n",
"4 866a5820fffffff POINT (37.4356 3.35699) 525.085147 \n",
"... ... ... ... \n",
"16212 867b5dd77ffffff POINT (39.15438 -1.51437) -36.000000 \n",
"16213 867b5dd87ffffff POINT (35.80252 0.90823) -36.000000 \n",
"16214 867b5dd8fffffff POINT (37.93845 0.83454) -36.000000 \n",
"16215 867b5dd9fffffff POINT (38.65824 -2.60028) -36.000000 \n",
"16216 867b5ddafffffff POINT (36.6641 2.37083) -36.000000 \n",
"\n",
" sum_pop_f_2020 sum_pop_m_2020 \n",
"0 189.675539 210.185366 \n",
"1 276.337255 306.217904 \n",
"2 355.723245 394.187992 \n",
"3 418.309236 445.579054 \n",
"4 249.076134 276.009012 \n",
"... ... ... \n",
"16212 -18.000000 -18.000000 \n",
"16213 -18.000000 -18.000000 \n",
"16214 -18.000000 -18.000000 \n",
"16215 -18.000000 -18.000000 \n",
"16216 -18.000000 -18.000000 \n",
" hex_id geometry sum_pop_2020\n",
"0 866a4a48fffffff POINT (35.76352 2.99589) 399.860905\n",
"1 866a4a497ffffff POINT (40.58048 -3.79365) 582.555159\n",
"2 866a4a49fffffff POINT (41.10421 3.37873) 749.911237\n",
"3 866a4a4d7ffffff POINT (37.26153 3.74581) 863.888290\n",
"4 866a5820fffffff POINT (40.01148 1.53124) 525.085147\n",
"... ... ... ...\n",
"16212 867b5dd77ffffff POINT (34.94474 1.24558) -36.000000\n",
"16213 867b5dd87ffffff POINT (40.95343 -1.83280) -36.000000\n",
"16214 867b5dd8fffffff POINT (35.20290 -0.29666) -36.000000\n",
"16215 867b5dd9fffffff POINT (41.28333 -1.08552) -36.000000\n",
"16216 867b5ddafffffff POINT (36.63048 1.35038) -36.000000\n",
"\n",
"[16217 rows x 5 columns]"
"[16217 rows x 3 columns]"
]
},
"execution_count": 17,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -306,24 +269,24 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f9d4a524b5bf4d1a950f1cb8cc8d5b54",
"model_id": "00bbfea95ae440d3a73ebb161e3142ab",
"version_major": 2,
"version_minor": 1
"version_minor": 0
},
"text/plain": [
"Map(layers=[ScatterplotLayer(get_fill_color=<pyarrow.lib.FixedSizeListArray object at 0x13966dde0>\n",
"Map(layers=[ScatterplotLayer(get_fill_color=<pyarrow.lib.FixedSizeListArray object at 0x1631ef160>\n",
"[\n",
" [\n",
" 2…"
]
},
"execution_count": 18,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -338,20 +301,6 @@
"m = Map(layer)\n",
"m\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
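For reference, a minimal standalone sketch of the `/summary` request exercised in the notebook above. The endpoint, payload keys, and field name come from the notebook diff; the placeholder AOI polygon and the plain-`requests` handling are assumptions, since the notebook's AOI cell is collapsed in this view.

```python
# Minimal sketch of the notebook's /summary request; the AOI below is a placeholder polygon.
import requests

BASE_URL = "https://space2stats.ds.io"
SUMMARY_ENDPOINT = f"{BASE_URL}/summary"

# Placeholder GeoJSON Feature; the notebook defines its own AOI in a collapsed cell.
aoi = {
    "type": "Feature",
    "properties": {},
    "geometry": {
        "type": "Polygon",
        "coordinates": [[[33.9, -4.7], [41.9, -4.7], [41.9, 5.5], [33.9, 5.5], [33.9, -4.7]]],
    },
}

request_payload = {
    "aoi": aoi,
    "spatial_join_method": "centroid",
    "fields": ["sum_pop_2020"],
    "geometry": "point",
}

response = requests.post(SUMMARY_ENDPOINT, json=request_payload)
response.raise_for_status()
summaries = response.json()  # assumed: a list of records with hex_id, geometry, sum_pop_2020
print(len(summaries))
```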
7 changes: 7 additions & 0 deletions postgres/deploy.md
@@ -0,0 +1,7 @@
## Deployment Notes

- Create database instance
- Update configuration in `db.env`
- Ingest the Parquet file with `load_to_prod.sh` (may require `chmod +x load_to_prod.sh`)
- Create an index on `hex_id`, which is critical for query performance: `CREATE INDEX idx_hex_id ON space2stats (hex_id)`
- Test with the [example notebook](notebooks/space2stats_api_demo.ipynb)
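For the `db.env` configuration step above, a hypothetical file might look like the sketch below. The variable names are the ones the loader scripts check for; every value is a placeholder.

```sh
# Hypothetical db.env; variable names match what load_to_prod.sh expects, values are placeholders.
DB_HOST=your-db-host.example.com
DB_PORT=5432
DB_NAME=postgis
DB_USER=postgres
DB_PASSWORD=change-me
```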
16 changes: 10 additions & 6 deletions postgres/load_nyc_sample.sh
@@ -1,11 +1,15 @@
#!/bin/bash

# Database connection details
DB_HOST="localhost"
DB_PORT="5439"
DB_NAME="postgis"
DB_USER="username"
DB_PASSWORD="password"
# Load environment variables from db.env file
if [ -f db.env ]; then
export $(cat db.env | grep -v '#' | awk '/=/ {print $1}')
fi

# Check if required environment variables are set
if [ -z "$DB_HOST" ] || [ -z "$DB_PORT" ] || [ -z "$DB_NAME" ] || [ -z "$DB_USER" ] || [ -z "$DB_PASSWORD" ]; then
echo "One or more required environment variables are missing."
exit 1
fi

# Path to the sample Parquet file
PARQUET_FILE="nyc_sample.parquet"
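A note on the `db.env` loading pattern used in these scripts: `export $(cat db.env | grep -v '#' | awk '/=/ {print $1}')` only works when every line is a bare `KEY=value` with no spaces or quoting. A more forgiving alternative, sketched here as an assumption rather than what this commit uses, is to let the shell source the file directly:

```sh
# Alternative sketch: source db.env with auto-export enabled.
# Assumes db.env contains only comments and KEY=value assignments (quote values containing spaces).
set -a
. ./db.env
set +a
```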
17 changes: 11 additions & 6 deletions postgres/load_parquet_chunks.sh
@@ -1,11 +1,16 @@
#!/bin/bash

# Database connection details
DB_HOST="localhost"
DB_PORT="5439"
DB_NAME="postgis"
DB_USER="username"
DB_PASSWORD="password"

# Load environment variables from db.env file
if [ -f db.env ]; then
export $(cat db.env | grep -v '#' | awk '/=/ {print $1}')
fi

# Check if required environment variables are set
if [ -z "$DB_HOST" ] || [ -z "$DB_PORT" ] || [ -z "$DB_NAME" ] || [ -z "$DB_USER" ] || [ -z "$DB_PASSWORD" ]; then
echo "One or more required environment variables are missing."
exit 1
fi

# Directory containing the Parquet chunks
CHUNKS_DIR="parquet_chunks"
30 changes: 30 additions & 0 deletions postgres/load_to_prod.sh
@@ -0,0 +1,30 @@
#!/bin/bash


# Load environment variables from db.env file
if [ -f db.env ]; then
export $(cat db.env | grep -v '#' | awk '/=/ {print $1}')
fi

# Check if required environment variables are set
if [ -z "$DB_HOST" ] || [ -z "$DB_PORT" ] || [ -z "$DB_NAME" ] || [ -z "$DB_USER" ] || [ -z "$DB_PASSWORD" ]; then
echo "One or more required environment variables are missing."
exit 1
fi

# Directory containing the Parquet chunks
CHUNKS_DIR="parquet_chunks"

# Name of the target table
TABLE_NAME="space2stats"
PARQUET_FILE=space2stats_updated.parquet

echo "Starting"

ogr2ogr -progress -f "PostgreSQL" \
PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \
"$PARQUET_FILE" \
-nln $TABLE_NAME \
-append \
-lco SPATIAL_INDEX=NONE
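
After the load finishes, a quick sanity check might look like the sketch below (the `psql` invocation is an assumption and not part of this commit); the `CREATE INDEX` from the deployment notes above would be the natural next step.

```sh
# Hypothetical post-load check; connection values come from db.env, psql must be installed.
PGPASSWORD="$DB_PASSWORD" psql "host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER" \
  -c "SELECT COUNT(*) FROM space2stats;"
```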
