Adding S3 Select tutorial and small fixes #748

Merged (4 commits on Jun 14, 2021)
2 changes: 1 addition & 1 deletion .github/workflows/static-checking.yml
@@ -29,7 +29,7 @@ jobs:
python -m pip install --upgrade pip
pip install -U -r requirements-dev.txt
- name: mypy check
-run: mypy awswrangler
+run: yes y | mypy --install-types awswrangler
- name: Flake8 Lint
run: flake8 .
- name: Pylint Lint
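Note on this change: mypy 0.900 stopped bundling third-party type stubs, so the newer mypy pinned below in requirements-dev.txt prompts to install any missing stub packages during the check; piping yes y into the command auto-confirms that prompt in CI. As an aside (not part of this PR), later mypy releases (0.910+) add a --non-interactive flag, so the same behavior could be had with mypy --install-types --non-interactive awswrangler.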
2 changes: 1 addition & 1 deletion README.md
@@ -16,7 +16,7 @@ Easy integration with Athena, Glue, Redshift, Timestream, QuickSight, Chime, Clo
[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
[![Coverage](https://img.shields.io/badge/coverage-91%25-brightgreen.svg)](https://pypi.org/project/awswrangler/)
![Static Checking](https://github.com/awslabs/aws-data-wrangler/workflows/Static%20Checking/badge.svg?branch=main)
-![Build Status](https://codebuild.us-east-1.amazonaws.com/badges?uuid=eyJlbmNyeXB0ZWREYXRhIjoiaHFXUk91Wks2MGFRMSsxM1R3ZFVjVGg3d0dCT05YTVpoL0VRakRieG43Y3dhdytYZjZtdFVBdG5Sek44anlweDFkM2Z2TWJibVRCRVB5TjlWSnhTdzRBPSIsIml2UGFyYW1ldGVyU3BlYyI6IjUzUllpN295VTUxeFNPQWQiLCJtYXRlcmlhbFNldFNlcmlhbCI6MX0%3D&branch=main)
+![Build Status](https://codebuild.us-east-1.amazonaws.com/badges?uuid=eyJlbmNyeXB0ZWREYXRhIjoiL05FNGxiZCtNT05ibGUrbzY5TzJxaFlOcnFrUFlyNjhWRm5tTmg1bXJXRkdnYUFySzgycEUvMTBBbWxEUzZ2eUpOdjVpcmNQV2hsNkRzQTZtTTVwSjF3PSIsIml2UGFyYW1ldGVyU3BlYyI6IkQ1RVkxWjg5YloyaTJOcVgiLCJtYXRlcmlhbFNldFNlcmlhbCI6MX0%3D&branch=main)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/?badge=latest)

| Source | Downloads | Installation Command |
6 changes: 3 additions & 3 deletions requirements-dev.txt
@@ -1,9 +1,9 @@
wheel==0.36.2
isort==5.8.0
-black==21.5b2
+black==21.6b0
pylint==2.8.3
flake8==3.9.2
-mypy==0.812
+mypy==0.902
pydocstyle==6.1.1
doc8==0.8.1
tox==3.23.1
@@ -20,7 +20,7 @@ sphinx_bootstrap_theme==0.7.1
nbsphinx==0.8.6
nbsphinx-link==1.3.0
IPython~=7.16
-moto==2.0.8
+moto==2.0.9
jupyterlab==3.0.16
s3fs==0.4.2 # keep it at 0.4.2
python-Levenshtein==0.12.2
10 changes: 5 additions & 5 deletions tests/test_athena_parquet.py
@@ -97,15 +97,15 @@ def test_parquet_catalog_duplicated(path, glue_table, glue_database):
)


-def test_parquet_catalog_casting(path, glue_database):
+def test_parquet_catalog_casting(path, glue_database, glue_table):
wr.s3.to_parquet(
df=get_df_cast(),
path=path,
index=False,
dataset=True,
mode="overwrite",
database=glue_database,
table="__test_parquet_catalog_casting",
table=glue_table,
dtype={
"iint8": "tinyint",
"iint16": "smallint",
@@ -127,14 +127,14 @@ def test_parquet_catalog_casting(path, glue_database):
df = wr.s3.read_parquet(path=path)
assert df.shape == (3, 16)
ensure_data_types(df=df, has_list=False)
df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=glue_database, ctas_approach=True)
df = wr.athena.read_sql_table(table=glue_table, database=glue_database, ctas_approach=True)
assert df.shape == (3, 16)
ensure_data_types(df=df, has_list=False)
df = wr.athena.read_sql_table(table="__test_parquet_catalog_casting", database=glue_database, ctas_approach=False)
df = wr.athena.read_sql_table(table=glue_table, database=glue_database, ctas_approach=False)
assert df.shape == (3, 16)
ensure_data_types(df=df, has_list=False)
wr.s3.delete_objects(path=path)
-assert wr.catalog.delete_table_if_exists(database=glue_database, table="__test_parquet_catalog_casting") is True
+assert wr.catalog.delete_table_if_exists(database=glue_database, table=glue_table) is True


def test_parquet_catalog_casting_to_string_with_null(path, glue_table, glue_database):
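For context, swapping the hardcoded "__test_parquet_catalog_casting" name for the glue_table fixture gives each test run a unique, automatically cleaned-up table. A minimal sketch of what such a fixture could look like, assuming a companion glue_database fixture; the project's real fixture lives in its test conftest and may differ:

import uuid

import pytest

import awswrangler as wr


@pytest.fixture(scope="function")
def glue_table(glue_database):
    # Hypothetical sketch: yield a unique table name per test, then drop the
    # Glue table afterwards even if the test already deleted it.
    name = f"tbl_{uuid.uuid4().hex}"
    yield name
    wr.catalog.delete_table_if_exists(database=glue_database, table=name)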
227 changes: 227 additions & 0 deletions tutorials/029 - S3 Select.ipynb
@@ -0,0 +1,227 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "pythonjvsc74a57bd0e4beff3b9c91951bd870e0e6d1ba9dfdd106cfe45c6f3d0f8d31550063fd3386",
"display_name": "Python 3.7.7 64-bit ('.env': venv)"
},
"metadata": {
"interpreter": {
"hash": "e4beff3b9c91951bd870e0e6d1ba9dfdd106cfe45c6f3d0f8d31550063fd3386"
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
"[![AWS Data Wrangler](_static/logo.png \"AWS Data Wrangler\")](https://github.com/awslabs/aws-data-wrangler)"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"# 29 - S3 Select"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"AWS Data Wrangler supports [Amazon S3 Select](https://aws.amazon.com/blogs/aws/s3-glacier-select/), enabling applications to use SQL statements in order to query and filter the contents of a single S3 object. It works on objects stored in CSV, JSON or Apache Parquet, including compressed and large files of several TBs.\n",
"\n",
"With S3 Select, the query workload is delegated to Amazon S3, leading to lower latency and cost, and to higher performance (up to 400% improvement). This is in comparison with other Wrangler operations such as `read_parquet` where the S3 object is downloaded and filtered on the client-side.\n",
"\n",
"This feature has a number of limitations however, and should be used for specific scenarios only:\n",
"* It operates on a single S3 object\n",
"* The maximum length of a record in the input or result is 1 MB\n",
"* The maximum uncompressed row group size is 256 MB (Parquet only)\n",
"* It can only emit nested data in JSON format\n",
"* Certain SQL operations are not supported (e.g. ORDER BY)\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"## Read full CSV file"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" dispatching_base_num pickup_datetime dropoff_datetime PULocationID \\\n",
"0 B00009 2019-09-01 00:35:00 2019-09-01 00:59:00 264 \n",
"1 B00009 2019-09-01 00:48:00 2019-09-01 01:09:00 264 \n",
"2 B00014 2019-09-01 00:16:18 2019-09-02 00:35:37 264 \n",
"3 B00014 2019-09-01 00:55:03 2019-09-01 01:09:35 264 \n",
"4 B00014 2019-09-01 00:13:08 2019-09-02 01:12:31 264 \n",
"\n",
" DOLocationID SR_Flag \n",
"0 264 \n",
"1 264 \n",
"2 264 \n",
"3 264 \n",
"4 264 "
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>dispatching_base_num</th>\n <th>pickup_datetime</th>\n <th>dropoff_datetime</th>\n <th>PULocationID</th>\n <th>DOLocationID</th>\n <th>SR_Flag</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B00009</td>\n <td>2019-09-01 00:35:00</td>\n <td>2019-09-01 00:59:00</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>1</th>\n <td>B00009</td>\n <td>2019-09-01 00:48:00</td>\n <td>2019-09-01 01:09:00</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>2</th>\n <td>B00014</td>\n <td>2019-09-01 00:16:18</td>\n <td>2019-09-02 00:35:37</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>3</th>\n <td>B00014</td>\n <td>2019-09-01 00:55:03</td>\n <td>2019-09-01 01:09:35</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n <tr>\n <th>4</th>\n <td>B00014</td>\n <td>2019-09-01 00:13:08</td>\n <td>2019-09-02 01:12:31</td>\n <td>264</td>\n <td>264</td>\n <td></td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 1
}
],
"source": [
"import awswrangler as wr\n",
"\n",
"df = wr.s3.select_query(\n",
" sql=\"SELECT * FROM s3object\",\n",
" path=\"s3://nyc-tlc/trip data/fhv_tripdata_2019-09.csv\", # 58 MB\n",
" input_serialization=\"CSV\",\n",
" input_serialization_params={\n",
" \"FileHeaderInfo\": \"Use\",\n",
" \"RecordDelimiter\": \"\\r\\n\",\n",
" },\n",
" use_threads=True,\n",
")\n",
"df.head()"
]
},
{
"source": [
"## Filter JSON file"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" family_name contact_details name \\\n",
"0 Biden [{'type': 'twitter', 'value': 'joebiden'}] Joseph Biden, Jr. \n",
"\n",
" links gender \\\n",
"0 [{'note': 'Wikipedia (ace)', 'url': 'https://a... male \n",
"\n",
" image \\\n",
"0 https://theunitedstates.io/images/congress/ori... \n",
"\n",
" identifiers \\\n",
"0 [{'scheme': 'bioguide', 'identifier': 'B000444... \n",
"\n",
" other_names sort_name \\\n",
"0 [{'note': 'alternate', 'name': 'Joe Biden'}, {... Biden, Joseph \n",
"\n",
" images given_name birth_date \\\n",
"0 [{'url': 'https://theunitedstates.io/images/co... Joseph 1942-11-20 \n",
"\n",
" id \n",
"0 64239edf-8e06-4d2d-acc0-33d96bc79774 "
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>family_name</th>\n <th>contact_details</th>\n <th>name</th>\n <th>links</th>\n <th>gender</th>\n <th>image</th>\n <th>identifiers</th>\n <th>other_names</th>\n <th>sort_name</th>\n <th>images</th>\n <th>given_name</th>\n <th>birth_date</th>\n <th>id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Biden</td>\n <td>[{'type': 'twitter', 'value': 'joebiden'}]</td>\n <td>Joseph Biden, Jr.</td>\n <td>[{'note': 'Wikipedia (ace)', 'url': 'https://a...</td>\n <td>male</td>\n <td>https://theunitedstates.io/images/congress/ori...</td>\n <td>[{'scheme': 'bioguide', 'identifier': 'B000444...</td>\n <td>[{'note': 'alternate', 'name': 'Joe Biden'}, {...</td>\n <td>Biden, Joseph</td>\n <td>[{'url': 'https://theunitedstates.io/images/co...</td>\n <td>Joseph</td>\n <td>1942-11-20</td>\n <td>64239edf-8e06-4d2d-acc0-33d96bc79774</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 2
}
],
"source": [
"wr.s3.select_query(\n",
" sql=\"SELECT * FROM s3object[*] s where s.\\\"family_name\\\" = \\'Biden\\'\",\n",
" path=\"s3://awsglue-datasets/examples/us-legislators/all/persons.json\",\n",
" input_serialization=\"JSON\",\n",
" input_serialization_params={\n",
" \"Type\": \"Document\",\n",
" },\n",
")"
]
},
{
"source": [
"## Read Snappy compressed Parquet"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" marketplace customer_id review_id product_id product_parent \\\n",
"0 US 52670295 RGPOFKORD8RTU B0002CZPPG 867256265 \n",
"1 US 29964102 R2U8X8V5KPB4J3 B00H5BMF00 373287760 \n",
"2 US 25173351 R15XV3LXUMLTXL B00PG40CO4 137115061 \n",
"3 US 12516181 R3G6G7H8TX4H0T B0002CZPPG 867256265 \n",
"4 US 38355314 R2NJ7WNBU16YTQ B00B2TFSO6 89375983 \n",
"\n",
" star_rating helpful_votes total_votes vine verified_purchase \\\n",
"0 5 105 107 N N \n",
"1 5 0 0 N Y \n",
"2 5 0 0 N Y \n",
"3 5 6 6 N N \n",
"4 5 0 0 N Y \n",
"\n",
" review_headline review_body \\\n",
"0 Excellent Gift Idea I wonder if the other reviewer actually read t... \n",
"1 Five Stars convenience is the name of the game. \n",
"2 Birthday Gift This gift card was handled with accuracy in de... \n",
"3 Love 'em. Gotta love these iTunes Prepaid Card thingys. ... \n",
"4 Five Stars perfect \n",
"\n",
" review_date year \n",
"0 2005-02-08 2005 \n",
"1 2015-05-03 2015 \n",
"2 2015-05-03 2015 \n",
"3 2005-10-15 2005 \n",
"4 2015-05-03 2015 "
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>marketplace</th>\n <th>customer_id</th>\n <th>review_id</th>\n <th>product_id</th>\n <th>product_parent</th>\n <th>star_rating</th>\n <th>helpful_votes</th>\n <th>total_votes</th>\n <th>vine</th>\n <th>verified_purchase</th>\n <th>review_headline</th>\n <th>review_body</th>\n <th>review_date</th>\n <th>year</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>US</td>\n <td>52670295</td>\n <td>RGPOFKORD8RTU</td>\n <td>B0002CZPPG</td>\n <td>867256265</td>\n <td>5</td>\n <td>105</td>\n <td>107</td>\n <td>N</td>\n <td>N</td>\n <td>Excellent Gift Idea</td>\n <td>I wonder if the other reviewer actually read t...</td>\n <td>2005-02-08</td>\n <td>2005</td>\n </tr>\n <tr>\n <th>1</th>\n <td>US</td>\n <td>29964102</td>\n <td>R2U8X8V5KPB4J3</td>\n <td>B00H5BMF00</td>\n <td>373287760</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>N</td>\n <td>Y</td>\n <td>Five Stars</td>\n <td>convenience is the name of the game.</td>\n <td>2015-05-03</td>\n <td>2015</td>\n </tr>\n <tr>\n <th>2</th>\n <td>US</td>\n <td>25173351</td>\n <td>R15XV3LXUMLTXL</td>\n <td>B00PG40CO4</td>\n <td>137115061</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>N</td>\n <td>Y</td>\n <td>Birthday Gift</td>\n <td>This gift card was handled with accuracy in de...</td>\n <td>2015-05-03</td>\n <td>2015</td>\n </tr>\n <tr>\n <th>3</th>\n <td>US</td>\n <td>12516181</td>\n <td>R3G6G7H8TX4H0T</td>\n <td>B0002CZPPG</td>\n <td>867256265</td>\n <td>5</td>\n <td>6</td>\n <td>6</td>\n <td>N</td>\n <td>N</td>\n <td>Love 'em.</td>\n <td>Gotta love these iTunes Prepaid Card thingys. ...</td>\n <td>2005-10-15</td>\n <td>2005</td>\n </tr>\n <tr>\n <th>4</th>\n <td>US</td>\n <td>38355314</td>\n <td>R2NJ7WNBU16YTQ</td>\n <td>B00B2TFSO6</td>\n <td>89375983</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>N</td>\n <td>Y</td>\n <td>Five Stars</td>\n <td>perfect</td>\n <td>2015-05-03</td>\n <td>2015</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"df = wr.s3.select_query(\n",
" sql=\"SELECT * FROM s3object s where s.\\\"star_rating\\\" >= 5\",\n",
" path=\"s3://amazon-reviews-pds/parquet/product_category=Gift_Card/part-00000-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet\",\n",
" input_serialization=\"Parquet\",\n",
" input_serialization_params={},\n",
" use_threads=True,\n",
")\n",
"df.loc[:, df.columns != \"product_title\"].head()"
]
}
]
}
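One more variation not covered by the notebook cells above: S3 Select also accepts newline-delimited JSON by setting the serialization Type to "Lines" instead of "Document". A short sketch under that assumption; the bucket and key below are placeholders, not objects from the tutorial:

import awswrangler as wr

# Hypothetical JSON Lines object: with "Type": "Lines", S3 Select treats each
# line of the file as a separate JSON record.
df = wr.s3.select_query(
    sql='SELECT s."family_name" FROM s3object s',
    path="s3://my-bucket/data/people.jsonl",
    input_serialization="JSON",
    input_serialization_params={"Type": "Lines"},
)
print(df.head())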
2 changes: 1 addition & 1 deletion validate.sh
@@ -3,7 +3,7 @@ set -ex

isort --check .
black --check .
-mypy awswrangler
+yes y | mypy --install-types awswrangler
flake8 .
pylint -j 0 awswrangler
pydocstyle awswrangler/ --convention=numpy