def _xlsx_table_columns(rows, has_header: bool) -> dict:
    """
    Pivot row-wise cell values into a mapping of column name to values.

    :param rows: Iterable of rows, each row an iterable of cell values.
    :param has_header: If `True`, the first row supplies the column names;
        otherwise columns are labelled `C0`, `C1`, ... by position.
    :returns: A dictionary mapping each column name to a tuple of
        that column's values.
    """
    rows = iter(rows)
    if has_header:
        column_names = next(rows)
        return dict(zip(column_names, zip(*rows)))
    return {f"C{num}": val for num, val in enumerate(zip(*rows))}


def xlsx_table(
    path: str,
    sheetname: str,
    table: Union[str, list, tuple] = None,
) -> Union[pd.DataFrame, dict]:
    """
    Returns a DataFrame of values in a table in the Excel file.
    This applies to an Excel file, where the data range is explicitly
    specified as a Microsoft Excel table.

    If there is a single table in the sheet, or a string is provided
    as an argument to the `table` parameter, a pandas DataFrame is returned;
    if there is more than one table in the sheet,
    and the `table` argument is `None`, or a list/tuple of names,
    a dictionary of DataFrames is returned, where the keys of the dictionary
    are the table names.

    Note that a list/tuple holding exactly one table name also yields
    a single DataFrame, since only one table is extracted.

    Tables without a header row get positional column names:
    `C0`, `C1`, and so on.

    Example:

    ```python

    filename = "excel_table.xlsx"

    # single table
    jn.xlsx_table(filename, sheetname='Tables', table = 'dCategory')

        CategoryID       Category
    0           1       Beginner
    1           2       Advanced
    2           3      Freestyle
    3           4    Competition
    4           5  Long Distance

    # multiple tables:
    jn.xlsx_table(filename, sheetname = 'Tables', table = ['dCategory', 'dSupplier'])

    {'dCategory':    CategoryID       Category
    0           1       Beginner
    1           2       Advanced
    2           3      Freestyle
    3           4    Competition
    4           5  Long Distance,
    'dSupplier':   SupplierID             Supplier        City State                         E-mail
    0         GB       Gel Boomerangs     Oakland    CA          gel@gel-boomerang.com
    1         CO  Colorado Boomerangs    Gunnison    CO  Pollock@coloradoboomerang.com
    2         CC        Channel Craft    Richland    WA                    Dino@CC.com
    3         DB        Darnell Booms  Burlington    VT            Darnell@Darnell.com}
    ```

    :param path: Path to the Excel File.
    :param sheetname: Name of the sheet from which the tables
        are to be extracted.
    :param table: Name of a table, or list of tables in the sheet.
    :returns: A pandas DataFrame, or a dictionary of DataFrames,
        if there are multiple arguments for the `table` parameter,
        or the argument to `table` is `None`.
    :raises ValueError: If there are no tables in the sheet,
        or if a requested table name is not a table in the sheet.

    """  # noqa : E501

    try:
        from openpyxl import load_workbook
    except ImportError:
        import_message(
            submodule="io",
            package="openpyxl",
            conda_channel="conda-forge",
            pip_install=True,
        )
        # NOTE(review): `import_message` lives in `.utils` and is not
        # visible here. If it only reports the missing dependency,
        # execution would otherwise continue and fail with a confusing
        # NameError on `load_workbook`; re-raise the real ImportError.
        raise
    # The workbook is opened before `sheetname` is validated, so a bad
    # path surfaces first; this ordering is kept for existing callers.
    wb = load_workbook(filename=path, read_only=False, keep_links=False)
    check("sheetname", sheetname, [str])
    ws = wb[sheetname]

    contents = ws.tables
    if not contents:
        raise ValueError(f"There is no table in `{sheetname}` sheet.")

    if isinstance(table, str):
        table = [table]
    if table is not None:
        check("table", table, [list, tuple])
        for entry in table:
            if entry not in contents:
                raise ValueError(
                    f"{entry} is not a table in the {sheetname} sheet."
                )
        # Every entry was just looked up in `contents`, so all entries are
        # hashable; a set avoids rescanning the list for each sheet table.
        requested = set(table)
        data = (
            (key, value)
            for key, value in contents.items()
            if key in requested
        )
    else:
        data = contents.items()

    frame = {}
    for key, value in data:
        # `value` is used to index the worksheet, giving the table's cells
        # row by row; only the cell values are kept.
        content = ((cell.value for cell in row) for row in ws[value])
        frame[key] = _xlsx_table_columns(
            content, has_header=contents[key].headerRowCount == 1
        )

    if len(frame) == 1:
        _, frame = frame.popitem()
        return pd.DataFrame(frame)
    return {key: pd.DataFrame(value) for key, value in frame.items()}
def test_table_name():
    """
    Raise error if `table` is not None,
    and the table name cannot be found.
    """
    with pytest.raises(ValueError):
        io.xlsx_table(filename, "Tables", table="fake")


def test_table_str():
    """Test output for single table."""
    # `result` is what the function under test produced; `expected` is the
    # reference frame built independently with `pd.read_excel`.
    result = io.xlsx_table(filename, "Tables", "dSupplier")
    expected = (
        pd.read_excel(
            filename, engine="openpyxl", sheet_name="Tables", usecols="N:R"
        )
        .rename(columns={"SupplierID.1": "SupplierID"})
        .dropna()
    )
    assert_frame_equal(result, expected)


def test_table_no_header():
    """Test output for single table, without header."""
    result = io.xlsx_table(no_headers, "Tables", "dSalesReps")
    expected = pd.read_excel(
        no_headers,
        engine="openpyxl",
        sheet_name="Tables",
        usecols="A:C",
        names=["C0", "C1", "C2"],
    )
    assert_frame_equal(result, expected)


def test_tables():
    """Test output for multiple tables passed as a tuple of names."""
    result = io.xlsx_table(filename, "Tables", ("dSalesReps", "dSupplier"))
    expected = {
        "dSalesReps": pd.read_excel(
            filename, engine="openpyxl", sheet_name="Tables", usecols="A:C"
        ),
        "dSupplier": pd.read_excel(
            filename, engine="openpyxl", sheet_name="Tables", usecols="N:R"
        )
        .rename(columns={"SupplierID.1": "SupplierID"})
        .dropna(),
    }
    # Compare the key sets first: looping over `result` alone would let a
    # table that is missing from the output go unnoticed.
    assert set(result) == set(expected)
    for key, value in result.items():
        assert_frame_equal(value, expected[key])


def test_tables_None():
    """Test output for all tables in the sheet, when `table` is None."""
    result = io.xlsx_table(filename, "Tables")
    expected = {
        "dSalesReps": pd.read_excel(
            filename, engine="openpyxl", sheet_name="Tables", usecols="A:C"
        ),
        "dSupplier": pd.read_excel(
            filename, engine="openpyxl", sheet_name="Tables", usecols="N:R"
        )
        .rename(columns={"SupplierID.1": "SupplierID"})
        .dropna(),
        "dProduct": pd.read_excel(
            filename, engine="openpyxl", sheet_name="Tables", usecols="E:I"
        )
        .dropna()
        .astype({"ProductID": int, "CategoryID": int}),
        "dCategory": pd.read_excel(
            filename, engine="openpyxl", sheet_name="Tables", usecols="K:L"
        )
        .rename(columns={"CategoryID.1": "CategoryID"})
        .dropna()
        .astype({"CategoryID": int}),
    }
    # NOTE(review): assumes the "Tables" sheet holds exactly these four
    # tables - confirm against the fixture workbook.
    assert set(result) == set(expected)
    for key, value in result.items():
        assert_frame_equal(value, expected[key])