Bumping version to 0.2.5
igorborgest committed Jan 15, 2020
1 parent eb06a51 commit 34dbdd7
Showing 7 changed files with 130 additions and 73 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,8 +1,8 @@
![AWS Data Wrangler](docs/source/_static/logo.png?raw=true "AWS Data Wrangler")

> Utility belt to handle data on AWS.
> DataFrames on AWS.
[![Release](https://img.shields.io/badge/release-0.2.2-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Release](https://img.shields.io/badge/release-0.2.5-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)
4 changes: 2 additions & 2 deletions awswrangler/__version__.py
@@ -1,4 +1,4 @@
__title__ = "awswrangler"
__description__ = "Utility belt to handle data on AWS."
__version__ = "0.2.2"
__description__ = "DataFrames on AWS."
__version__ = "0.2.5"
__license__ = "Apache License 2.0"
24 changes: 18 additions & 6 deletions awswrangler/pandas.py
@@ -1247,7 +1247,6 @@ def to_redshift(
generated_conn = True

try:

if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
num_partitions: int = 1
else:
@@ -1558,7 +1557,7 @@ def read_sql_redshift(self,
:param sql: SQL Query
:param iam_role: AWS IAM role with the related permissions
:param connection: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
:param connection: Glue connection name (str) OR a PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
:param temp_s3_path: AWS S3 path to write temporary data (e.g. s3://...) (Default uses the Athena's results bucket)
:param procs_cpu_bound: Number of cores used for CPU bound tasks
"""
@@ -1574,21 +1573,34 @@
logger.debug(f"temp_s3_path: {temp_s3_path}")
self._session.s3.delete_objects(path=temp_s3_path)
paths: Optional[List[str]] = None

generated_conn: bool = False
if type(connection) == str:
logger.debug("Glue connection (str) provided.")
connection = self._session.glue.get_connection(name=connection)
generated_conn = True

try:
paths = self._session.redshift.to_parquet(sql=sql,
path=temp_s3_path,
iam_role=iam_role,
connection=connection)
logger.debug(f"paths: {paths}")
df: pd.DataFrame = self.read_parquet(path=paths, procs_cpu_bound=procs_cpu_bound) # type: ignore
self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"]) # type: ignore
return df
except Exception as e:
except Exception as ex:
connection.rollback()
if paths is not None:
self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"])
else:
self._session.s3.delete_objects(path=temp_s3_path)
raise e
if generated_conn is True:
connection.close()
raise ex

if generated_conn is True:
connection.close()
self._session.s3.delete_listed_objects(objects_paths=paths + [temp_s3_path + "/manifest"]) # type: ignore
return df

def to_aurora(self,
dataframe: pd.DataFrame,
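For context on the pandas change above: the connection argument of Pandas.read_sql_redshift (and to_redshift) now accepts either a Glue connection name string or a PEP 249 connection. A minimal usage sketch follows; the bucket, IAM role ARN, and Session construction are placeholders and assumptions for illustration, not part of this commit.

import awswrangler

session = awswrangler.Session()

# Passing the Glue connection name as a plain string: the session resolves it
# through Glue, opens the connection itself, rolls back on failure, and closes
# the generated connection in either case.
df = session.pandas.read_sql_redshift(
    sql="select * from public.test",
    iam_role="arn:aws:iam::123456789012:role/MyRedshiftRole",  # placeholder role ARN
    connection="aws-data-wrangler-redshift",                   # Glue connection name, as used in the tests
    temp_s3_path="s3://my-bucket/temp/",                       # placeholder bucket
)

A PEP 249 connection created with Redshift.generate_connection() is still accepted; in that case the caller keeps ownership of the connection and remains responsible for closing it.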
133 changes: 74 additions & 59 deletions awswrangler/spark.py
@@ -71,7 +71,7 @@ def to_redshift(
:param dataframe: Pandas Dataframe
:param path: S3 path to write temporary files (E.g. s3://BUCKET_NAME/ANY_NAME/)
:param connection: A PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
:param connection: Glue connection name (str) OR a PEP 249 compatible connection (Can be generated with Redshift.generate_connection())
:param schema: The Redshift Schema for the table
:param table: The name of the desired Redshift table
:param iam_role: AWS IAM role with the related permissions
@@ -93,68 +93,83 @@ def to_redshift(
dataframe.cache()
num_rows: int = dataframe.count()
logger.info(f"Number of rows: {num_rows}")
num_partitions: int
if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
num_partitions = 1
else:
num_slices: int = self._session.redshift.get_number_of_slices(redshift_conn=connection)
logger.debug(f"Number of slices on Redshift: {num_slices}")
num_partitions = num_slices
while num_partitions < min_num_partitions:
num_partitions += num_slices
logger.debug(f"Number of partitions calculated: {num_partitions}")
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
session_primitives = self._session.primitives
par_col_name: str = "aws_data_wrangler_internal_partition_id"

@pandas_udf(returnType="objects_paths string", functionType=PandasUDFType.GROUPED_MAP)
def write(pandas_dataframe: pd.DataFrame) -> pd.DataFrame:
# Exporting ARROW_PRE_0_15_IPC_FORMAT environment variable for
# a temporary workaround while waiting for Apache Arrow updates
# https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
generated_conn: bool = False
if type(connection) == str:
logger.debug("Glue connection (str) provided.")
connection = self._session.glue.get_connection(name=connection)
generated_conn = True

del pandas_dataframe[par_col_name]
paths: List[str] = session_primitives.session.pandas.to_parquet(dataframe=pandas_dataframe,
path=path,
preserve_index=False,
mode="append",
procs_cpu_bound=1,
procs_io_bound=1,
cast_columns=casts)
return pd.DataFrame.from_dict({"objects_paths": paths})
try:
num_partitions: int
if num_rows < MIN_NUMBER_OF_ROWS_TO_DISTRIBUTE:
num_partitions = 1
else:
num_slices: int = self._session.redshift.get_number_of_slices(redshift_conn=connection)
logger.debug(f"Number of slices on Redshift: {num_slices}")
num_partitions = num_slices
while num_partitions < min_num_partitions:
num_partitions += num_slices
logger.debug(f"Number of partitions calculated: {num_partitions}")
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
session_primitives = self._session.primitives
par_col_name: str = "aws_data_wrangler_internal_partition_id"

df_objects_paths: DataFrame = dataframe.repartition(numPartitions=num_partitions) # type: ignore
df_objects_paths: DataFrame = df_objects_paths.withColumn(par_col_name, spark_partition_id()) # type: ignore
df_objects_paths: DataFrame = df_objects_paths.groupby(par_col_name).apply(write) # type: ignore
@pandas_udf(returnType="objects_paths string", functionType=PandasUDFType.GROUPED_MAP)
def write(pandas_dataframe: pd.DataFrame) -> pd.DataFrame:
# Exporting ARROW_PRE_0_15_IPC_FORMAT environment variable for
# a temporary workaround while waiting for Apache Arrow updates
# https://stackoverflow.com/questions/58273063/pandasudf-and-pyarrow-0-15-0
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

objects_paths: List[str] = list(df_objects_paths.toPandas()["objects_paths"])
dataframe.unpersist()
num_files_returned: int = len(objects_paths)
if num_files_returned != num_partitions:
raise MissingBatchDetected(f"{num_files_returned} files returned. {num_partitions} expected.")
logger.debug(f"List of objects returned: {objects_paths}")
logger.debug(f"Number of objects returned from UDF: {num_files_returned}")
manifest_path: str = f"{path}manifest.json"
self._session.redshift.write_load_manifest(manifest_path=manifest_path,
objects_paths=objects_paths,
procs_io_bound=self._procs_io_bound)
self._session.redshift.load_table(dataframe=dataframe,
dataframe_type="spark",
manifest_path=manifest_path,
schema_name=schema,
table_name=table,
redshift_conn=connection,
preserve_index=False,
num_files=num_partitions,
iam_role=iam_role,
diststyle=diststyle,
distkey=distkey,
sortstyle=sortstyle,
sortkey=sortkey,
mode=mode,
cast_columns=casts)
self._session.s3.delete_objects(path=path, procs_io_bound=self._procs_io_bound)
del pandas_dataframe[par_col_name]
paths: List[str] = session_primitives.session.pandas.to_parquet(dataframe=pandas_dataframe,
path=path,
preserve_index=False,
mode="append",
procs_cpu_bound=1,
procs_io_bound=1,
cast_columns=casts)
return pd.DataFrame.from_dict({"objects_paths": paths})

df_objects_paths: DataFrame = dataframe.repartition(numPartitions=num_partitions) # type: ignore
df_objects_paths = df_objects_paths.withColumn(par_col_name, spark_partition_id()) # type: ignore
df_objects_paths = df_objects_paths.groupby(par_col_name).apply(write) # type: ignore

objects_paths: List[str] = list(df_objects_paths.toPandas()["objects_paths"])
dataframe.unpersist()
num_files_returned: int = len(objects_paths)
if num_files_returned != num_partitions:
raise MissingBatchDetected(f"{num_files_returned} files returned. {num_partitions} expected.")
logger.debug(f"List of objects returned: {objects_paths}")
logger.debug(f"Number of objects returned from UDF: {num_files_returned}")
manifest_path: str = f"{path}manifest.json"
self._session.redshift.write_load_manifest(manifest_path=manifest_path,
objects_paths=objects_paths,
procs_io_bound=self._procs_io_bound)
self._session.redshift.load_table(dataframe=dataframe,
dataframe_type="spark",
manifest_path=manifest_path,
schema_name=schema,
table_name=table,
redshift_conn=connection,
preserve_index=False,
num_files=num_partitions,
iam_role=iam_role,
diststyle=diststyle,
distkey=distkey,
sortstyle=sortstyle,
sortkey=sortkey,
mode=mode,
cast_columns=casts)
self._session.s3.delete_objects(path=path, procs_io_bound=self._procs_io_bound)
except Exception as ex:
connection.rollback()
if generated_conn is True:
connection.close()
raise ex
if generated_conn is True:
connection.close()

def create_glue_table(self,
database,
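The Spark path accepts the same two forms for connection, as the updated test further below exercises. A rough sketch of the new calling style, assuming a Session wired to an existing SparkSession; the spark_session argument and all resource names here are illustrative assumptions, not defined by this commit.

import awswrangler
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
session = awswrangler.Session(spark_session=spark)  # assumed constructor argument for Spark support

# A tiny DataFrame just to make the sketch self-contained.
spark_df = spark.createDataFrame([(1, "foo"), (2, "bar")], ["id", "name"])

session.spark.to_redshift(
    dataframe=spark_df,
    path="s3://my-bucket/redshift-staging/",   # temporary Parquet staging area (placeholder bucket)
    connection="aws-data-wrangler-redshift",   # Glue connection name instead of a PEP 249 object
    schema="public",
    table="test",
    iam_role="arn:aws:iam::123456789012:role/MyRedshiftRole",  # placeholder role ARN
)

With the restructuring above, a failure inside the write rolls the transaction back, and a connection that was generated from a Glue name is closed; a caller-supplied connection is left open for the caller to manage.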
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -8,7 +8,7 @@
:alt: alternate text
:figclass: align-center

*Utility belt to handle data on AWS.*
*DataFrames on AWS.*

`Read the Tutorials <https://github.com/awslabs/aws-data-wrangler/tree/master/tutorials>`_: `Catalog & Metadata <https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/catalog_and_metadata.ipynb>`_ | `Athena Nested <https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/athena_nested.ipynb>`_ | `S3 Write Modes <https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/s3_write_modes.ipynb>`_

4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,8 +1,8 @@
numpy~=1.18.1
pandas~=0.25.3
pyarrow~=0.15.1
botocore~=1.14.1
boto3~=1.11.1
botocore~=1.14.2
boto3~=1.11.2
s3fs~=0.4.0
tenacity~=6.0.0
pg8000~=1.13.2
32 changes: 31 additions & 1 deletion testing/test_awswrangler/test_redshift.py
@@ -347,7 +347,7 @@ def test_to_redshift_spark_bool(session, bucket, redshift_parameters):
session.spark.to_redshift(
dataframe=dataframe,
path=f"s3://{bucket}/redshift-load-bool/",
connection=con,
connection="aws-data-wrangler-redshift",
schema="public",
table="test",
iam_role=redshift_parameters.get("RedshiftRole"),
@@ -722,3 +722,33 @@ def test_to_redshift_pandas_upsert(session, bucket, redshift_parameters):

wr.s3.delete_objects(path=f"s3://{bucket}/")
con.close()


@pytest.mark.parametrize("sample_name", ["micro", "small", "nano"])
def test_read_sql_redshift_pandas_glue_conn(session, bucket, redshift_parameters, sample_name):
if sample_name == "micro":
dates = ["date"]
elif sample_name == "small":
dates = ["date"]
else:
dates = ["date", "time"]
df = pd.read_csv(f"data_samples/{sample_name}.csv", parse_dates=dates, infer_datetime_format=True)
df["date"] = df["date"].dt.date
path = f"s3://{bucket}/test_read_sql_redshift_pandas_glue_conn/"
session.pandas.to_redshift(
dataframe=df,
path=path,
schema="public",
table="test",
connection="aws-data-wrangler-redshift",
iam_role=redshift_parameters.get("RedshiftRole"),
mode="overwrite",
preserve_index=True,
)
path2 = f"s3://{bucket}/test_read_sql_redshift_pandas_glue_conn2/"
df2 = session.pandas.read_sql_redshift(sql="select * from public.test",
iam_role=redshift_parameters.get("RedshiftRole"),
connection="aws-data-wrangler-redshift",
temp_s3_path=path2)
assert len(df.index) == len(df2.index)
assert len(df.columns) + 1 == len(df2.columns)
