Skip to content

Commit

Permalink
added pandas table reader along with test (#804)
Browse files Browse the repository at this point in the history
Adds a Pandas table reader DataLoader.

* added pandas table reader along with test

* fix pre-commit

* remove int from sequence

* Fix for Python 3.8

* Fix Build Iterable
  • Loading branch information
swapdewalkar authored Apr 22, 2024
1 parent d89b03e commit b5982fd
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 0 deletions.
83 changes: 83 additions & 0 deletions hamilton/plugins/pandas_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1533,6 +1533,88 @@ def name(cls) -> str:
return "excel"


@dataclasses.dataclass
class PandasTableReader(DataLoader):
    """Class for loading/reading table files with Pandas.
    Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_table.html

    Every field other than ``filepath_or_buffer`` is forwarded verbatim as a
    keyword argument to ``pandas.read_table``; the defaults below are this
    class's own defaults and are always passed explicitly.

    NOTE(review): several forwarded keywords (e.g. ``delim_whitespace``,
    ``verbose``, ``keep_date_col``, ``infer_datetime_format``, ``date_parser``)
    appear to be deprecated in newer pandas releases -- confirm against the
    pandas versions this plugin supports.
    """

    # Source to read from; passed positionally to pandas.read_table.
    filepath_or_buffer: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    # NOTE(review): sep=None is passed explicitly; pandas documents that this
    # makes the python engine sniff the delimiter -- confirm this is intended.
    sep: Union[str, None] = None
    delimiter: Optional[str] = None
    header: Union[int, Sequence, str, None] = "infer"
    names: Optional[Sequence] = None
    index_col: Union[int, str, Sequence, None] = None
    usecols: Union[Sequence, None] = None
    dtype: Union[Dtype, Dict[Hashable, Dtype], None] = None
    engine: Optional[Literal["c", "python", "pyarrow"]] = None
    converters: Optional[Dict[Hashable, Callable]] = None
    true_values: Optional[Iterable] = None
    false_values: Optional[Iterable] = None
    skipinitialspace: bool = False
    skiprows: Optional[Union[List[int], int, List[Callable]]] = None
    skipfooter: int = 0
    nrows: Optional[int] = None
    na_values: Optional[Union[Hashable, Iterable, Dict[Hashable, Iterable]]] = None
    keep_default_na: bool = True
    na_filter: bool = True
    verbose: bool = False
    skip_blank_lines: bool = True
    parse_dates: Union[List[Union[int, str]], Dict[str, List[Union[int, str]]], bool] = False
    infer_datetime_format: bool = False
    keep_date_col: bool = False
    date_parser: Optional[Callable] = None
    # A single format string, or a mapping of column -> format string.
    date_format: Optional[Union[str, Dict[Hashable, str]]] = None
    dayfirst: bool = False
    cache_dates: bool = True
    iterator: bool = False
    chunksize: Optional[int] = None
    compression: Union[str, Dict] = "infer"
    thousands: Optional[str] = None
    decimal: str = "."
    lineterminator: Optional[str] = None
    quotechar: Optional[str] = '"'
    quoting: int = 0
    doublequote: bool = True
    escapechar: Optional[str] = None
    comment: Optional[str] = None
    encoding: Optional[str] = None
    encoding_errors: Optional[str] = "strict"
    dialect: Optional[str] = None
    on_bad_lines: Union[Literal["error", "warn", "skip"], Callable] = "error"
    delim_whitespace: bool = False
    low_memory: bool = True
    memory_map: bool = False
    float_precision: Optional[Literal["high", "legacy", "round_trip"]] = None
    storage_options: Optional[Dict] = None
    dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable"

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """Return the types this loader can produce (the pandas DataFrame type)."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self) -> Dict[str, Any]:
        """Collect the keyword arguments to forward to ``pandas.read_table``.

        :return: Mapping of every dataclass field except ``filepath_or_buffer``
            to its current value, in field-declaration order.
        """
        # Read the fields shallowly instead of using dataclasses.asdict():
        # asdict() deep-copies every value, which raises TypeError for an open
        # BufferedReader and needlessly duplicates a BytesIO payload, even
        # though filepath_or_buffer is sent separately and never forwarded.
        return {
            field.name: getattr(self, field.name)
            for field in dataclasses.fields(self)
            if field.name != "filepath_or_buffer"
        }

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Read the table and return it together with its metadata.

        :param type_: Requested type; not consulted by this loader.
        :return: Tuple of the object returned by ``pandas.read_table`` and the
            metadata dict built by ``utils.get_file_and_dataframe_metadata``.
        """
        # Loads the data and returns the df and metadata of the table
        df = pd.read_table(self.filepath_or_buffer, **self._get_loading_kwargs())
        metadata = utils.get_file_and_dataframe_metadata(self.filepath_or_buffer, df)
        return df, metadata

    @classmethod
    def name(cls) -> str:
        """Return the key under which this loader is registered."""
        return "table"


@dataclasses.dataclass
class PandasSPSSReader(DataLoader):
"""Class for loading/reading spss files with Pandas.
Expand Down Expand Up @@ -1595,6 +1677,7 @@ def register_data_loaders():
PandasORCReader,
PandasExcelWriter,
PandasExcelReader,
PandasTableReader,
PandasSPSSReader,
]:
registry.register_adapter(loader)
Expand Down
19 changes: 19 additions & 0 deletions tests/plugins/test_pandas_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
PandasSqlWriter,
PandasStataReader,
PandasStataWriter,
PandasTableReader,
PandasXmlReader,
PandasXmlWriter,
)
Expand Down Expand Up @@ -277,6 +278,24 @@ def test_pandas_excel_reader(tmp_path: pathlib.Path) -> None:
]


def test_pandas_table_reader(tmp_path: pathlib.Path) -> None:
    """Load the CSV fixture through PandasTableReader and check frame and metadata."""
    expected_columns = [
        "firstName",
        "lastName",
        "age",
        "department",
        "email",
    ]
    fixture_path = "tests/resources/data/test_load_from_data.csv"

    table_reader = PandasTableReader(filepath_or_buffer=fixture_path)
    loaded_frame, loaded_metadata = table_reader.load_data(pd.DataFrame)

    # The loader must advertise DataFrame as its only applicable type.
    assert PandasTableReader.applicable_types() == [pd.DataFrame]
    # Spot-check one cell and the overall shape of the parsed fixture.
    assert loaded_frame.loc[0, "firstName"] == "John"
    assert loaded_frame.shape == (3, 5)
    # Metadata should list the columns in file order.
    assert loaded_metadata["dataframe_metadata"]["column_names"] == expected_columns


def test_pandas_spss_reader(tmp_path: pathlib.Path) -> None:
import pyreadstat

Expand Down

0 comments on commit b5982fd

Please sign in to comment.