Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

added pandas table reader along with test #804

Merged
merged 6 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions hamilton/plugins/pandas_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1533,6 +1533,88 @@ def name(cls) -> str:
return "excel"


@dataclasses.dataclass
class PandasTableReader(DataLoader):
    """Class for loading/reading table files with Pandas.
    Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_table.html

    ``filepath_or_buffer`` is the only required field. Every other field carries
    the name of a ``pandas.read_table`` keyword argument and is forwarded to it
    unchanged by ``load_data``. All fields are forwarded on every call, including
    the ones left at the defaults declared here, so these defaults (rather than
    the defaults in pandas' own signature) are what ``pandas.read_table`` receives.
    """

    filepath_or_buffer: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    sep: Union[str, None] = None
    delimiter: Optional[str] = None
    header: Union[int, Sequence, str, None] = "infer"
    names: Optional[Sequence] = None
    index_col: Union[int, str, Sequence, None] = None
    usecols: Union[Sequence, None] = None
    dtype: Union[Dtype, Dict[Hashable, Dtype], None] = None
    engine: Optional[Literal["c", "python", "pyarrow"]] = None
    converters: Optional[Dict[Hashable, Callable]] = None
    true_values: Optional[Iterable] = None
    false_values: Optional[Iterable] = None
    skipinitialspace: bool = False
    # pandas documents skiprows as an int, a list of ints, or a single callable.
    skiprows: Optional[Union[List[int], int, Callable]] = None
    skipfooter: int = 0
    nrows: Optional[int] = None
    na_values: Optional[Union[Hashable, Iterable, Dict[Hashable, Iterable]]] = None
    keep_default_na: bool = True
    na_filter: bool = True
    verbose: bool = False
    skip_blank_lines: bool = True
    parse_dates: Union[List[Union[int, str]], Dict[str, List[Union[int, str]]], bool] = False
    infer_datetime_format: bool = False
    keep_date_col: bool = False
    date_parser: Optional[Callable] = None
    # pandas documents date_format as a format string or a mapping of column -> format.
    date_format: Optional[Union[str, Dict[Hashable, str]]] = None
    dayfirst: bool = False
    cache_dates: bool = True
    iterator: bool = False
    chunksize: Optional[int] = None
    compression: Union[str, Dict] = "infer"
    thousands: Optional[str] = None
    decimal: str = "."
    lineterminator: Optional[str] = None
    quotechar: Optional[str] = '"'
    quoting: int = 0
    doublequote: bool = True
    escapechar: Optional[str] = None
    comment: Optional[str] = None
    encoding: Optional[str] = None
    encoding_errors: Optional[str] = "strict"
    dialect: Optional[str] = None
    on_bad_lines: Union[Literal["error", "warn", "skip"], Callable] = "error"
    delim_whitespace: bool = False
    low_memory: bool = True
    memory_map: bool = False
    float_precision: Optional[Literal["high", "legacy", "round_trip"]] = None
    storage_options: Optional[Dict] = None
    dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable"

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """Return the types this loader can produce: the pandas dataframe type."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self) -> Dict[str, Any]:
        """Collect every field except ``filepath_or_buffer`` as keyword arguments.

        ``filepath_or_buffer`` is left out because ``load_data`` passes it to
        ``pandas.read_table`` positionally.

        The values are gathered with a shallow walk over ``dataclasses.fields``
        instead of ``dataclasses.asdict``. ``asdict`` deep-copies every field
        value (including ``filepath_or_buffer`` before it could be dropped), which
        raises for objects that cannot be copied, such as an open
        ``BufferedReader``, and duplicates large ``converters``/``na_values``
        containers for no benefit.

        :return: mapping of ``pandas.read_table`` keyword name to the field value.
        """
        return {
            field.name: getattr(self, field.name)
            for field in dataclasses.fields(self)
            if field.name != "filepath_or_buffer"
        }

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Read the table with ``pandas.read_table`` and gather its metadata.

        :param type_: requested type; not consulted by this loader.
        :return: tuple of the value returned by ``pandas.read_table`` and the
            metadata built by ``utils.get_file_and_dataframe_metadata``.
        """
        # Loads the data and returns the df and metadata of the table
        df = pd.read_table(self.filepath_or_buffer, **self._get_loading_kwargs())
        metadata = utils.get_file_and_dataframe_metadata(self.filepath_or_buffer, df)
        return df, metadata

    @classmethod
    def name(cls) -> str:
        """Return the key under which this loader is registered: ``"table"``."""
        return "table"


@dataclasses.dataclass
class PandasSPSSReader(DataLoader):
"""Class for loading/reading spss files with Pandas.
Expand Down Expand Up @@ -1595,6 +1677,7 @@ def register_data_loaders():
PandasORCReader,
PandasExcelWriter,
PandasExcelReader,
PandasTableReader,
PandasSPSSReader,
]:
registry.register_adapter(loader)
Expand Down
19 changes: 19 additions & 0 deletions tests/plugins/test_pandas_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
PandasSqlWriter,
PandasStataReader,
PandasStataWriter,
PandasTableReader,
PandasXmlReader,
PandasXmlWriter,
)
Expand Down Expand Up @@ -277,6 +278,24 @@ def test_pandas_excel_reader(tmp_path: pathlib.Path) -> None:
]


def test_pandas_table_reader(tmp_path: pathlib.Path) -> None:
    """PandasTableReader loads the shared CSV fixture and reports its columns."""
    expected_columns = [
        "firstName",
        "lastName",
        "age",
        "department",
        "email",
    ]

    loader = PandasTableReader(
        filepath_or_buffer="tests/resources/data/test_load_from_data.csv"
    )
    frame, meta = loader.load_data(pd.DataFrame)

    # The loader advertises pandas dataframes as its only applicable type.
    assert PandasTableReader.applicable_types() == [pd.DataFrame]
    # Spot-check one cell, then the overall shape of the loaded fixture.
    assert frame.loc[0, "firstName"] == "John"
    assert frame.shape == (3, 5)
    # The metadata must list the dataframe's column names in order.
    assert meta["dataframe_metadata"]["column_names"] == expected_columns


def test_pandas_spss_reader(tmp_path: pathlib.Path) -> None:
import pyreadstat

Expand Down