diff --git a/examples/dask/hello_world/business_logic.py b/examples/dask/hello_world/business_logic.py index b4f559c0d..8255669a3 120000 --- a/examples/dask/hello_world/business_logic.py +++ b/examples/dask/hello_world/business_logic.py @@ -1 +1 @@ -../../hello_world/my_functions.py \ No newline at end of file +../../hello_world/my_functions.py diff --git a/examples/polars/materialization/my_script.py b/examples/polars/materialization/my_script.py index ea7aa52be..b532ebf26 100644 --- a/examples/polars/materialization/my_script.py +++ b/examples/polars/materialization/my_script.py @@ -67,6 +67,14 @@ file="./df.avro", combine=df_builder, ), + # materialize the dataframe to a spreadsheet file + to.spreadsheet( + dependencies=output_columns, + id="df_to_spreadsheet", + workbook="./df.xlsx", + worksheet="Sheet1", + combine=df_builder, + ), # materialize the dataframe to a database to.database( dependencies=output_columns, @@ -93,6 +101,7 @@ "df_to_feather_build_result", "df_to_json_build_result", "df_to_avro_build_result", + "df_to_spreadsheet_build_result", "df_to_database_build_result", ], # because combine is used, we can get that result here. 
inputs=initial_columns, @@ -102,4 +111,5 @@ print(additional_outputs["df_to_feather_build_result"]) print(additional_outputs["df_to_json_build_result"]) print(additional_outputs["df_to_avro_build_result"]) +print(additional_outputs["df_to_spreadsheet_build_result"]) print(additional_outputs["df_to_database_build_result"]) diff --git a/examples/polars/materialization/notebook.ipynb b/examples/polars/materialization/notebook.ipynb index 8ef172237..5150c8f50 100644 --- a/examples/polars/materialization/notebook.ipynb +++ b/examples/polars/materialization/notebook.ipynb @@ -86,7 +86,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Overwriting spend_calculations.py\n" + "Writing spend_calculations.py\n" ] } ], @@ -225,442 +225,496 @@ "\n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "cluster__legend\n", - "\n", - "Legend\n", + "\n", + "Legend\n", "\n", - "\n", + "\n", "\n", - "df_to_json_build_result\n", - "\n", - "df_to_json_build_result\n", - "DataFrame\n", + "spend_mean\n", + "\n", + "spend_mean\n", + "float\n", "\n", - "\n", - "\n", - "df_to_json\n", - "\n", - "\n", - "df_to_json\n", - "PolarsJSONWriter\n", + "\n", + "\n", + "spend_zero_mean\n", + "\n", + "spend_zero_mean\n", + "Series\n", "\n", - "\n", - "\n", - "df_to_json_build_result->df_to_json\n", - "\n", - "\n", + "\n", + "\n", + "spend_mean->spend_zero_mean\n", + "\n", + "\n", "\n", - "\n", + "\n", "\n", - "avg_3wk_spend\n", - "\n", - "avg_3wk_spend\n", - "Series\n", + "df_to_feather\n", + "\n", + "\n", + "df_to_feather\n", + "PolarsFeatherWriter\n", "\n", - "\n", - "\n", - "avg_3wk_spend->df_to_json_build_result\n", - "\n", - "\n", + "\n", + "\n", + "avg_3wk_spend\n", + "\n", + "avg_3wk_spend\n", + "Series\n", "\n", - "\n", + "\n", "\n", - "df_to_feather_build_result\n", - "\n", - "df_to_feather_build_result\n", - "DataFrame\n", + "df_to_json_build_result\n", + "\n", + "df_to_json_build_result\n", + "DataFrame\n", "\n", - "\n", - "\n", - 
"avg_3wk_spend->df_to_feather_build_result\n", - "\n", - "\n", + "\n", + "\n", + "avg_3wk_spend->df_to_json_build_result\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "df_to_database_build_result\n", - "\n", - "df_to_database_build_result\n", - "DataFrame\n", + "\n", + "df_to_database_build_result\n", + "DataFrame\n", "\n", "\n", - "\n", + "\n", "avg_3wk_spend->df_to_database_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "df_to_avro_build_result\n", - "\n", - "df_to_avro_build_result\n", - "DataFrame\n", + "\n", + "\n", + "df_to_feather_build_result\n", + "\n", + "df_to_feather_build_result\n", + "DataFrame\n", "\n", - "\n", - "\n", - "avg_3wk_spend->df_to_avro_build_result\n", - "\n", - "\n", + "\n", + "\n", + "avg_3wk_spend->df_to_feather_build_result\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "df_to_parquet_build_result\n", - "\n", - "df_to_parquet_build_result\n", - "DataFrame\n", + "\n", + "df_to_parquet_build_result\n", + "DataFrame\n", "\n", "\n", - "\n", + "\n", "avg_3wk_spend->df_to_parquet_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "spend_zero_mean\n", - "\n", - "spend_zero_mean\n", - "Series\n", + "\n", + "\n", + "df_to_spreadsheet_build_result\n", + "\n", + "df_to_spreadsheet_build_result\n", + "DataFrame\n", "\n", - "\n", - "\n", - "spend_zero_mean_unit_variance\n", - "\n", - "spend_zero_mean_unit_variance\n", - "Series\n", + "\n", + "\n", + "avg_3wk_spend->df_to_spreadsheet_build_result\n", + "\n", + "\n", "\n", - "\n", - "\n", - "spend_zero_mean->spend_zero_mean_unit_variance\n", - "\n", - "\n", + "\n", + "\n", + "df_to_avro_build_result\n", + "\n", + "df_to_avro_build_result\n", + "DataFrame\n", "\n", - "\n", - "\n", - "df_to_feather\n", - "\n", - "\n", - "df_to_feather\n", - "PolarsFeatherWriter\n", + "\n", + "\n", + "avg_3wk_spend->df_to_avro_build_result\n", + "\n", + "\n", "\n", - "\n", - "\n", - "df_to_feather_build_result->df_to_feather\n", - "\n", - "\n", + "\n", + "\n", + 
"df_to_json\n", + "\n", + "\n", + "df_to_json\n", + "PolarsJSONWriter\n", + "\n", + "\n", + "\n", + "df_to_json_build_result->df_to_json\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "df_to_parquet\n", + "\n", + "\n", + "df_to_parquet\n", + "PolarsParquetWriter\n", "\n", "\n", - "\n", + "\n", "df_to_database\n", - "\n", - "\n", - "df_to_database\n", - "PolarsDatabaseWriter\n", + "\n", + "\n", + "df_to_database\n", + "PolarsDatabaseWriter\n", "\n", "\n", - "\n", + "\n", "df_to_database_build_result->df_to_database\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", + "\n", + "\n", + "df_to_feather_build_result->df_to_feather\n", + "\n", + "\n", + "\n", + "\n", "\n", + "df_to_spreadsheet\n", + "\n", + "\n", + "df_to_spreadsheet\n", + "PolarsSpreadsheetWriter\n", + "\n", + "\n", + "\n", + "df_to_avro\n", + "\n", + "\n", + "df_to_avro\n", + "PolarsAvroWriter\n", + "\n", + "\n", + "\n", + "df_to_parquet_build_result->df_to_parquet\n", + "\n", + "\n", + "\n", + "\n", + "\n", "spend_std_dev\n", - "\n", - "spend_std_dev\n", - "float\n", + "\n", + "spend_std_dev\n", + "float\n", + "\n", + "\n", + "\n", + "spend_zero_mean_unit_variance\n", + "\n", + "spend_zero_mean_unit_variance\n", + "Series\n", "\n", "\n", - "\n", + "\n", "spend_std_dev->spend_zero_mean_unit_variance\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "df_to_avro\n", - "\n", - "\n", - "df_to_avro\n", - "PolarsAvroWriter\n", + "\n", + "\n", + "df_to_spreadsheet_build_result->df_to_spreadsheet\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "df_to_avro_build_result->df_to_avro\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "df_to_parquet\n", - "\n", - "\n", - "df_to_parquet\n", - "PolarsParquetWriter\n", - "\n", - "\n", - "\n", - "df_to_parquet_build_result->df_to_parquet\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "spend_zero_mean_unit_variance->df_to_json_build_result\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "spend_zero_mean_unit_variance->df_to_feather_build_result\n", - 
"\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "spend_zero_mean_unit_variance->df_to_database_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "spend_zero_mean_unit_variance->df_to_avro_build_result\n", - "\n", - "\n", + "\n", + "\n", + "spend_zero_mean_unit_variance->df_to_feather_build_result\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "spend_zero_mean_unit_variance->df_to_parquet_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "spend_mean\n", - "\n", - "spend_mean\n", - "float\n", + "\n", + "\n", + "spend_zero_mean_unit_variance->df_to_spreadsheet_build_result\n", + "\n", + "\n", "\n", - "\n", - "\n", - "spend_mean->spend_zero_mean\n", - "\n", - "\n", + "\n", + "\n", + "spend_zero_mean_unit_variance->df_to_avro_build_result\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "spend_zero_mean->spend_zero_mean_unit_variance\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "spend_per_signup\n", - "\n", - "spend_per_signup\n", - "Series\n", + "\n", + "spend_per_signup\n", + "Series\n", "\n", "\n", - "\n", + "\n", "spend_per_signup->df_to_json_build_result\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "spend_per_signup->df_to_feather_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "spend_per_signup->df_to_database_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "spend_per_signup->df_to_avro_build_result\n", - "\n", - "\n", + "\n", + "\n", + "spend_per_signup->df_to_feather_build_result\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "spend_per_signup->df_to_parquet_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_df_to_json_build_result_inputs\n", - "\n", - "spend\n", - "Series\n", - "signups\n", - "Series\n", + "\n", + "\n", + "spend_per_signup->df_to_spreadsheet_build_result\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_df_to_json_build_result_inputs->df_to_json_build_result\n", - "\n", - "\n", + "\n", + "\n", + 
"spend_per_signup->df_to_avro_build_result\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_spend_mean_inputs\n", + "\n", + "spend\n", + "Series\n", + "\n", + "\n", + "\n", + "_spend_mean_inputs->spend_mean\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "_avg_3wk_spend_inputs\n", - "\n", - "spend\n", - "Series\n", + "\n", + "spend\n", + "Series\n", "\n", "\n", - "\n", + "\n", "_avg_3wk_spend_inputs->avg_3wk_spend\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_spend_zero_mean_inputs\n", - "\n", - "spend\n", - "Series\n", + "\n", + "\n", + "_df_to_json_build_result_inputs\n", + "\n", + "signups\n", + "Series\n", + "spend\n", + "Series\n", "\n", - "\n", + "\n", "\n", - "_spend_zero_mean_inputs->spend_zero_mean\n", - "\n", - "\n", + "_df_to_json_build_result_inputs->df_to_json_build_result\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_df_to_feather_build_result_inputs\n", + "\n", + "\n", + "_df_to_database_build_result_inputs\n", "\n", - "spend\n", + "signups\n", "Series\n", - "signups\n", + "spend\n", "Series\n", "\n", + "\n", + "\n", + "_df_to_database_build_result_inputs->df_to_database_build_result\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "_df_to_feather_build_result_inputs\n", + "\n", + "signups\n", + "Series\n", + "spend\n", + "Series\n", + "\n", "\n", - "\n", + "\n", "_df_to_feather_build_result_inputs->df_to_feather_build_result\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_df_to_database_build_result_inputs\n", - "\n", - "spend\n", - "Series\n", - "signups\n", - "Series\n", + "\n", + "\n", + "_df_to_parquet_build_result_inputs\n", + "\n", + "signups\n", + "Series\n", + "spend\n", + "Series\n", "\n", - "\n", - "\n", - "_df_to_database_build_result_inputs->df_to_database_build_result\n", - "\n", - "\n", + "\n", + "\n", + "_df_to_parquet_build_result_inputs->df_to_parquet_build_result\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "_spend_std_dev_inputs\n", - "\n", - "spend\n", - "Series\n", + "\n", + "spend\n", + "Series\n", 
"\n", "\n", - "\n", + "\n", "_spend_std_dev_inputs->spend_std_dev\n", - "\n", - "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_df_to_avro_build_result_inputs\n", + "\n", + "\n", + "_df_to_spreadsheet_build_result_inputs\n", "\n", - "spend\n", + "signups\n", "Series\n", - "signups\n", + "spend\n", "Series\n", "\n", - "\n", - "\n", - "_df_to_avro_build_result_inputs->df_to_avro_build_result\n", - "\n", - "\n", + "\n", + "\n", + "_df_to_spreadsheet_build_result_inputs->df_to_spreadsheet_build_result\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_df_to_parquet_build_result_inputs\n", + "\n", + "\n", + "_df_to_avro_build_result_inputs\n", "\n", - "spend\n", + "signups\n", "Series\n", - "signups\n", + "spend\n", "Series\n", "\n", - "\n", - "\n", - "_df_to_parquet_build_result_inputs->df_to_parquet_build_result\n", - "\n", - "\n", + "\n", + "\n", + "_df_to_avro_build_result_inputs->df_to_avro_build_result\n", + "\n", + "\n", "\n", - "\n", - "\n", - "_spend_mean_inputs\n", - "\n", - "spend\n", - "Series\n", + "\n", + "\n", + "_spend_zero_mean_inputs\n", + "\n", + "spend\n", + "Series\n", "\n", - "\n", - "\n", - "_spend_mean_inputs->spend_mean\n", - "\n", - "\n", + "\n", + "\n", + "_spend_zero_mean_inputs->spend_zero_mean\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "_spend_per_signup_inputs\n", - "\n", - "spend\n", - "Series\n", - "signups\n", - "Series\n", + "\n", + "signups\n", + "Series\n", + "spend\n", + "Series\n", "\n", "\n", - "\n", + "\n", "_spend_per_signup_inputs->spend_per_signup\n", - "\n", - "\n", + "\n", + "\n", "\n", "\n", - "\n", + "\n", "input\n", - "\n", - "input\n", + "\n", + "input\n", "\n", "\n", - "\n", + "\n", "function\n", - "\n", - "function\n", + "\n", + "function\n", "\n", "\n", - "\n", + "\n", "output\n", - "\n", - "output\n", + "\n", + "output\n", "\n", "\n", - "\n", + "\n", "materializer\n", - "\n", - "\n", - "materializer\n", + "\n", + "\n", + "materializer\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 9, @@ 
-707,6 +761,14 @@ " if_table_exists=\"append\",\n", " combine=df_builder,\n", " ),\n", + " #materialize the dataframe to a spreadsheet file\n", + " to.spreadsheet(\n", + " dependencies=output_columns,\n", + " id=\"df_to_spreadsheet\",\n", + " workbook=\"./df.xlsx\",\n", + " worksheet=\"Sheet1\",\n", + " combine=df_builder,\n", + " ),\n", "]\n", "# Visualize what is happening\n", "dr.visualize_materialization(\n", @@ -736,6 +798,7 @@ " \"df_to_json_build_result\",\n", " \"df_to_avro_build_result\",\n", " \"df_to_database_build_result\",\n", + " \"df_to_spreadsheet_build_result\",\n", " ], # because combine is used, we can get that result here.\n", " inputs=initial_columns,\n", ")" @@ -756,8 +819,8 @@ "text/plain": [ "{'df_to_parquet': {'file_metadata': {'size': 1609,\n", " 'path': './df.parquet',\n", - " 'last_modified': 1711021595.6833296,\n", - " 'timestamp': 1711001795.684963,\n", + " 'last_modified': 1711363433.0819426,\n", + " 'timestamp': 1711343633.081942,\n", " 'scheme': '',\n", " 'notes': ''},\n", " 'dataframe_metadata': {'rows': 6,\n", @@ -770,8 +833,8 @@ " 'datatypes': ['Int64', 'Int64', 'Float64', 'Float64', 'Float64']}},\n", " 'df_to_feather': {'file_metadata': {'size': 1696,\n", " 'path': './df.feather',\n", - " 'last_modified': 1711021595.684963,\n", - " 'timestamp': 1711001795.684963,\n", + " 'last_modified': 1711363433.0829415,\n", + " 'timestamp': 1711343633.082941,\n", " 'scheme': '',\n", " 'notes': ''},\n", " 'dataframe_metadata': {'rows': 6,\n", @@ -784,8 +847,8 @@ " 'datatypes': ['Int64', 'Int64', 'Float64', 'Float64', 'Float64']}},\n", " 'df_to_json': {'file_metadata': {'size': 657,\n", " 'path': './df.json',\n", - " 'last_modified': 1711021595.684963,\n", - " 'timestamp': 1711001795.684963,\n", + " 'last_modified': 1711363433.0839424,\n", + " 'timestamp': 1711343633.083942,\n", " 'scheme': '',\n", " 'notes': ''},\n", " 'dataframe_metadata': {'rows': 6,\n", @@ -798,8 +861,8 @@ " 'datatypes': ['Int64', 'Int64', 'Float64', 'Float64', 
'Float64']}},\n", " 'df_to_avro': {'file_metadata': {'size': 517,\n", " 'path': './df.avro',\n", - " 'last_modified': 1711021595.684963,\n", - " 'timestamp': 1711001795.684963,\n", + " 'last_modified': 1711363433.0849397,\n", + " 'timestamp': 1711343633.084939,\n", " 'scheme': '',\n", " 'notes': ''},\n", " 'dataframe_metadata': {'rows': 6,\n", @@ -812,8 +875,8 @@ " 'datatypes': ['Int64', 'Int64', 'Float64', 'Float64', 'Float64']}},\n", " 'df_to_database': {'file_metadata': {'size': None,\n", " 'path': 'test',\n", - " 'last_modified': 1711021595.9625595,\n", - " 'timestamp': 1711001795.962559,\n", + " 'last_modified': 1711363433.3330674,\n", + " 'timestamp': 1711343633.333067,\n", " 'scheme': '',\n", " 'notes': 'File metadata is unsupported for scheme: or path: test does not exist.'},\n", " 'dataframe_metadata': {'rows': 6,\n", @@ -823,6 +886,20 @@ " 'avg_3wk_spend',\n", " 'spend_per_signup',\n", " 'spend_zero_mean_unit_variance'],\n", + " 'datatypes': ['Int64', 'Int64', 'Float64', 'Float64', 'Float64']}},\n", + " 'df_to_spreadsheet': {'file_metadata': {'size': 6503,\n", + " 'path': './df.xlsx',\n", + " 'last_modified': 1711363433.3686314,\n", + " 'timestamp': 1711343633.369631,\n", + " 'scheme': '',\n", + " 'notes': ''},\n", + " 'dataframe_metadata': {'rows': 6,\n", + " 'columns': 5,\n", + " 'column_names': ['spend',\n", + " 'signups',\n", + " 'avg_3wk_spend',\n", + " 'spend_per_signup',\n", + " 'spend_zero_mean_unit_variance'],\n", " 'datatypes': ['Int64', 'Int64', 'Float64', 'Float64', 'Float64']}}}" ] }, @@ -1063,6 +1140,55 @@ "additional_outputs[\"df_to_database_build_result\"]" ] }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 5)
spendsignupsavg_3wk_spendspend_per_signupspend_zero_mean_unit_variance
i64i64f64f64f64
101null10.0-1.064405
1010null1.0-1.064405
205013.3333330.4-0.483821
4010023.3333330.40.677349
4020033.3333330.20.677349
5040043.3333330.1251.257934
" + ], + "text/plain": [ + "shape: (6, 5)\n", + "┌───────┬─────────┬───────────────┬──────────────────┬───────────────────────────────┐\n", + "│ spend ┆ signups ┆ avg_3wk_spend ┆ spend_per_signup ┆ spend_zero_mean_unit_variance │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════╪═════════╪═══════════════╪══════════════════╪═══════════════════════════════╡\n", + "│ 10 ┆ 1 ┆ null ┆ 10.0 ┆ -1.064405 │\n", + "│ 10 ┆ 10 ┆ null ┆ 1.0 ┆ -1.064405 │\n", + "│ 20 ┆ 50 ┆ 13.333333 ┆ 0.4 ┆ -0.483821 │\n", + "│ 40 ┆ 100 ┆ 23.333333 ┆ 0.4 ┆ 0.677349 │\n", + "│ 40 ┆ 200 ┆ 33.333333 ┆ 0.2 ┆ 0.677349 │\n", + "│ 50 ┆ 400 ┆ 43.333333 ┆ 0.125 ┆ 1.257934 │\n", + "└───────┴─────────┴───────────────┴──────────────────┴───────────────────────────────┘" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "additional_outputs[\"df_to_spreadsheet_build_result\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, diff --git a/hamilton/plugins/polars_extensions.py b/hamilton/plugins/polars_extensions.py index 49d87fbc6..448ac1c54 100644 --- a/hamilton/plugins/polars_extensions.py +++ b/hamilton/plugins/polars_extensions.py @@ -17,6 +17,22 @@ Union, ) +try: + from xlsxwriter.workbook import Workbook +except ImportError: + Workbook = Type + + +from polars.type_aliases import ( + ColumnFormatDict, + ColumnTotalsDefinition, + ColumnWidthsDefinition, + ConditionalFormatDict, + OneOrMoreDataTypes, + RowTotalsDefinition, + SelectorType, +) + try: import polars as pl from polars import PolarsDataType @@ -561,6 +577,148 @@ def name(cls) -> str: return "json" +@dataclasses.dataclass +class PolarsSpreadsheetReader(DataLoader): + """ + Class specifically to handle loading Spreadsheet files with Polars. 
+ Should map to https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_excel.html + """ + + source: Union[str, Path, IOBase, bytes] + # kwargs: + sheet_id: Union[int, Sequence[int], None] = None + sheet_name: Union[str, List[str], Tuple[str], None] = None + engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "odf", "xlrd"] = "xlsx2csv" + engine_options: Union[Dict[str, Any], None] = None + read_options: Union[Dict[str, Any], None] = None + schema_overrides: Union[Dict[str, Any], None] = None + raise_if_empty: bool = True + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [DATAFRAME_TYPE] + + def _get_loading_kwargs(self): + kwargs = {} + if self.sheet_id is not None: + kwargs["sheet_id"] = self.sheet_id + if self.sheet_name is not None: + kwargs["sheet_name"] = self.sheet_name + if self.engine is not None: + kwargs["engine"] = self.engine + if self.engine_options is not None: + kwargs["engine_options"] = self.engine_options + if self.read_options is not None: + kwargs["read_options"] = self.read_options + if self.schema_overrides is not None: + kwargs["schema_overrides"] = self.schema_overrides + if self.raise_if_empty is not None: + kwargs["raise_if_empty"] = self.raise_if_empty + return kwargs + + def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]: + df = pl.read_excel(self.source, **self._get_loading_kwargs()) + metadata = utils.get_file_metadata(self.source) + return df, metadata + + @classmethod + def name(cls) -> str: + return "spreadsheet" + + +@dataclasses.dataclass +class PolarsSpreadsheetWriter(DataSaver): + """ + Class specifically to handle saving Spreadsheet files with Polars. 
+ Should map to https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_excel.html + """ + + workbook: Union[Workbook, BytesIO, Path, str] + worksheet: Union[str, None] = None + # kwargs: + position: Union[Tuple[int, int], str] = "A1" + table_style: Union[str, Dict[str, Any], None] = None + table_name: Union[str, None] = None + column_formats: Union[ColumnFormatDict, None] = None + dtype_formats: Union[Dict[OneOrMoreDataTypes, str], None] = None + conditional_formats: Union[ConditionalFormatDict, None] = None + header_format: Union[Dict[str, Any], None] = None + column_totals: Union[ColumnTotalsDefinition, None] = None + column_widths: Union[ColumnWidthsDefinition, None] = None + row_totals: Union[RowTotalsDefinition, None] = None + row_heights: Union[Dict[Union[int, Tuple[int, ...]], int], int, None] = None + sparklines: Union[Dict[str, Union[Sequence[str], Dict[str, Any]]], None] = None + formulas: Union[Dict[str, Union[str, Dict[str, str]]], None] = None + float_precision: int = 3 + include_header: bool = True + autofilter: bool = True + autofit: bool = False + hidden_columns: Union[Sequence[str], SelectorType, None] = None + hide_gridlines: Union[bool, None] = None + sheet_zoom: Union[int, None] = None + freeze_panes: Union[ + str, Tuple[int, int], Tuple[str, int, int], Tuple[int, int, int, int], None + ] = None + + @classmethod + def applicable_types(cls) -> Collection[Type]: + return [DATAFRAME_TYPE] + + def _get_saving_kwargs(self): + kwargs = {} + if self.position is not None: + kwargs["position"] = self.position + if self.table_style is not None: + kwargs["table_style"] = self.table_style + if self.table_name is not None: + kwargs["table_name"] = self.table_name + if self.column_formats is not None: + kwargs["column_formats"] = self.column_formats + if self.dtype_formats is not None: + kwargs["dtype_formats"] = self.dtype_formats + if self.conditional_formats is not None: + kwargs["conditional_formats"] = self.conditional_formats + if 
self.header_format is not None: + kwargs["header_format"] = self.header_format + if self.column_totals is not None: + kwargs["column_totals"] = self.column_totals + if self.column_widths is not None: + kwargs["column_widths"] = self.column_widths + if self.row_totals is not None: + kwargs["row_totals"] = self.row_totals + if self.row_heights is not None: + kwargs["row_heights"] = self.row_heights + if self.sparklines is not None: + kwargs["sparklines"] = self.sparklines + if self.formulas is not None: + kwargs["formulas"] = self.formulas + if self.float_precision is not None: + kwargs["float_precision"] = self.float_precision + if self.include_header is not None: + kwargs["include_header"] = self.include_header + if self.autofilter is not None: + kwargs["autofilter"] = self.autofilter + if self.autofit is not None: + kwargs["autofit"] = self.autofit + if self.hidden_columns is not None: + kwargs["hidden_columns"] = self.hidden_columns + if self.hide_gridlines is not None: + kwargs["hide_gridlines"] = self.hide_gridlines + if self.sheet_zoom is not None: + kwargs["sheet_zoom"] = self.sheet_zoom + if self.freeze_panes is not None: + kwargs["freeze_panes"] = self.freeze_panes + return kwargs + + def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]: + data.write_excel(self.workbook, self.worksheet, **self._get_saving_kwargs()) + return utils.get_file_and_dataframe_metadata(self.workbook, data) + + @classmethod + def name(cls) -> str: + return "spreadsheet" + + @dataclasses.dataclass class PolarsDatabaseReader(DataLoader): """ @@ -659,6 +817,8 @@ def register_data_loaders(): PolarsJSONWriter, PolarsDatabaseReader, PolarsDatabaseWriter, + PolarsSpreadsheetReader, + PolarsSpreadsheetWriter, ]: registry.register_adapter(loader) diff --git a/requirements-test.txt b/requirements-test.txt index e66ec4387..bee9a7545 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -22,3 +22,5 @@ sqlalchemy==1.4.49; python_version == '3.7.*' sqlalchemy; python_version >= 
'3.8' typer xgboost +xlsx2csv # for excel data loader +xlsxwriter # Excel export requires 'xlsxwriter' diff --git a/tests/plugins/test_polars_extensions.py b/tests/plugins/test_polars_extensions.py index 113b97a4d..2e719efbc 100644 --- a/tests/plugins/test_polars_extensions.py +++ b/tests/plugins/test_polars_extensions.py @@ -17,6 +17,8 @@ PolarsJSONWriter, PolarsParquetReader, PolarsParquetWriter, + PolarsSpreadsheetReader, + PolarsSpreadsheetWriter, ) @@ -142,3 +144,27 @@ def test_polars_database(df: pl.DataFrame, tmp_path: pathlib.Path) -> None: assert "batch_size" not in kwargs2 assert df2.shape == (2, 2) assert df.frame_equal(df2) + + +def test_polars_spreadsheet(df: pl.DataFrame, tmp_path: pathlib.Path) -> None: + file_path = tmp_path / "test.xlsx" + writer = PolarsSpreadsheetWriter(workbook=file_path, worksheet="test_load_from_data_sheet") + write_kwargs = writer._get_saving_kwargs() + metadata = writer.save_data(df) + + reader = PolarsSpreadsheetReader(source=file_path, sheet_name="test_load_from_data_sheet") + read_kwargs = reader._get_loading_kwargs() + df2, _ = reader.load_data(pl.DataFrame) + + assert PolarsSpreadsheetWriter.applicable_types() == [pl.DataFrame] + assert PolarsSpreadsheetReader.applicable_types() == [pl.DataFrame] + assert file_path.exists() + assert metadata["file_metadata"]["path"] == str(file_path) + assert df.shape == (2, 2) + assert metadata["dataframe_metadata"]["column_names"] == ["a", "b"] + assert metadata["dataframe_metadata"]["datatypes"] == ["Int64", "Int64"] + assert df.frame_equal(df2) + assert "include_header" in write_kwargs + assert write_kwargs["include_header"] is True + assert "raise_if_empty" in read_kwargs + assert read_kwargs["raise_if_empty"] is True