diff --git a/docs/README.md b/docs/README.md index fb1b4ed..4b8813b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,10 +3,9 @@ # sk-transformers ***A collection of various pandas & scikit-learn compatible transformers for all kinds of preprocessing and feature engineering steps*** 🛠 -[![testing](https://github.com/chrislemke/sk-transformers/actions/workflows/testing.yml/badge.svg?branch=main)](https://github.com/chrislemke/sk-transformers/actions/workflows/testing.yml) -[![Codacy Badge](https://app.codacy.com/project/badge/Grade/44093b8f6b28454fb8e0336ccb83cdc5)](https://www.codacy.com/gh/chrislemke/sk-transformers/dashboard?utm_source=github.com&utm_medium=referral&utm_content=chrislemke/sk-transformers&utm_campaign=Badge_Grade) +[![ChecksAndTesting](https://github.com/chrislemke/sk-transformers/actions/workflows/checks-testing.yml/badge.svg)](https://github.com/chrislemke/sk-transformers/actions/workflows/checks-testing.yml) [![codecov](https://codecov.io/github/chrislemke/sk-transformers/branch/main/graph/badge.svg?token=LJLXQXX6M8)](https://codecov.io/github/chrislemke/sk-transformers) -[![deploy package](https://github.com/chrislemke/sk-transformers/actions/workflows/deploy-package.yml/badge.svg)](https://github.com/chrislemke/sk-transformers/actions/workflows/deploy-package.yml) +[![Release](https://github.com/chrislemke/sk-transformers/actions/workflows/release.yml/badge.svg)](https://github.com/chrislemke/sk-transformers/actions/workflows/release.yml) [![pypi](https://img.shields.io/pypi/v/sk-transformers)](https://pypi.org/project/sk-transformers/) [![python version](https://img.shields.io/pypi/pyversions/sk-transformers?logo=python&logoColor=yellow)](https://www.python.org/) [![downloads](https://img.shields.io/pypi/dm/sk-transformers)](https://pypistats.org/packages/sk-transformers) @@ -50,6 +49,7 @@ poetry install ## Available transformers | Module | Transformer | Description | | ------ | ----------- | ----------- | +|[`Datetime 
transformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/datetime_transformer/)|[`DateColumnsTransformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/datetime_transformer/#sk_transformers.datetime_transformer.DateColumnsTransformer)|Splits a date column into multiple columns.| |[`Datetime transformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/datetime_transformer/)|[`DurationCalculatorTransformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/datetime_transformer/#sk_transformers.datetime_transformer.DurationCalculatorTransformer)|Calculates the duration between two given dates.| |[`Deep transformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/deep_transformer/)|[`ToVecTransformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/deep_transformer/#sk_transformers.deep_transformer.ToVecTransformer)|This transformer trains an [FT-Transformer](https://paperswithcode.com/method/ft-transformer) using the [pytorch-widedeep package](https://github.com/jrzaurin/pytorch-widedeep) and extracts the embeddings from its embedding layer.| |[`Encoder transformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/encoder_transformer/)|[`MeanEncoderTransformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/encoder_transformer/#sk_transformers.encoder_transformer.MeanEncoderTransformer)|Scikit-learn API for the [feature-engine MeanEncoder](https://feature-engine.readthedocs.io/en/latest/api_doc/encoding/MeanEncoder.html).| diff --git a/examples/playground.ipynb b/examples/playground.ipynb index 6c02dd0..54379e9 100644 --- a/examples/playground.ipynb +++ b/examples/playground.ipynb @@ -39,6 +39,30 @@ "## [Datetime transformer](https://chrislemke.github.io/sk-transformers/API-reference/transformer/datetime_transformer/)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### [`DateColumnsTransformer`](https://chrislemke.github.io/sk-transformers/API-reference/transformer/datetime_transformer/#sk_transformers.datetime_transformer.DateColumnsTransformer)\n", + "\n", + "Splits a date column into multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sk_transformers import DateColumnsTransformer\n", + "\n", + "X = pd.DataFrame({\"foo\": [\"2021-01-01\", \"2022-02-02\", \"2023-03-03\"]})\n", + "transformer = DateColumnsTransformer([\"foo\"])\n", + "transformer.fit_transform(X)" + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/poetry.lock b/poetry.lock index b25be98..8dd6054 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3361,14 +3361,14 @@ transformers = ["spacy-transformers (>=1.1.2,<1.3.0)"] [[package]] name = "spacy-legacy" -version = "3.0.11" +version = "3.0.12" description = "Legacy registered functions for spaCy backwards compatibility" category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "spacy-legacy-3.0.11.tar.gz", hash = "sha256:bfaef03c377c323a3089b1885518e0dad489597b07a80af2499750b24fdbf54b"}, - {file = "spacy_legacy-3.0.11-py2.py3-none-any.whl", hash = "sha256:7b2a72bfe8e135c5885ecf22db946daa352c7a24639aaeda10a76d4c1c66196f"}, + {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, + {file = "spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f"}, ] [[package]] diff --git a/requirements.txt b/requirements.txt index ae297b7..a4b7ea7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1205,9 +1205,9 @@ smart-open==6.3.0 ; python_version >= "3.8" and python_version < "3.11" \ smmap==5.0.0 ; python_version >= "3.8" and python_version < "3.11" \ 
def _iso_week_of_year(dates: pd.Series) -> pd.Series:
    """Return the ISO 8601 week number for every value of a datetime series.

    Stand-in for `Series.dt.weekofyear`, which was deprecated in pandas 1.1
    and removed in pandas 2.0.

    Args:
        dates (pandas.Series): Series with a datetime dtype.

    Returns:
        pandas.Series: Week numbers as `int64`, or as `float64` with `NaN`
            where the date is `NaT`.
    """
    week = dates.dt.isocalendar().week
    # `isocalendar()` yields the nullable `UInt32` dtype. Cast back to the plain
    # NumPy dtypes the removed accessor produced so downstream code is unaffected.
    return week.astype("float64" if week.hasnans else "int64")


# Maps each supported date element to the function computing it from a datetime
# series. The insertion order defines the order of the generated columns.
_DATE_ELEMENT_EXTRACTORS = {
    "year": lambda dates: dates.dt.year,
    "month": lambda dates: dates.dt.month,
    "day": lambda dates: dates.dt.day,
    "day_of_week": lambda dates: dates.dt.dayofweek,
    "day_of_year": lambda dates: dates.dt.dayofyear,
    "week_of_year": _iso_week_of_year,
    "quarter": lambda dates: dates.dt.quarter,
    "is_leap_year": lambda dates: dates.dt.is_leap_year,
    "is_month_start": lambda dates: dates.dt.is_month_start,
    "is_month_end": lambda dates: dates.dt.is_month_end,
    "is_quarter_start": lambda dates: dates.dt.is_quarter_start,
    "is_quarter_end": lambda dates: dates.dt.is_quarter_end,
    "is_year_start": lambda dates: dates.dt.is_year_start,
    "is_year_end": lambda dates: dates.dt.is_year_end,
    # pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    "is_weekend": lambda dates: dates.dt.dayofweek.isin([5, 6]),
}


class DateColumnsTransformer(BaseTransformer):
    """Splits a date column into multiple columns.

    Every column listed in `features` is parsed with `pandas.to_datetime`
    (the parsed values replace the original ones) and, for each requested
    date element, a new column named `<column>_<element>` is added.

    Example:
    ```python
    import pandas as pd
    from sk_transformers import DateColumnsTransformer

    X = pd.DataFrame({"foo": ["2021-01-01", "2022-02-02", "2023-03-03"]})
    transformer = DateColumnsTransformer(["foo"])
    transformer.fit_transform(X)
    ```
    ```
              foo  foo_year  ...  foo_is_year_end  foo_is_weekend
    0  2021-01-01      2021  ...            False           False
    1  2022-02-02      2022  ...            False           False
    2  2023-03-03      2023  ...            False           False
    ```

    Args:
        features (List[str]): List of columns to transform.
        date_format (str): Date format. Defaults to `%Y-%m-%d`.
        errors (str): How to handle errors in `pd.to_datetime`. Defaults to `raise`.
            available values: `ignore`, `raise`, `coerce`.
            If `raise`, then invalid parsing will raise an exception.
            If `coerce`, then invalid parsing will be set as `NaT`.
            If `ignore`, then invalid parsing will return the input. Note that
            the date elements are read through the `.dt` accessor, which needs
            datetime values, so unparsed input is expected to fail afterwards.
        date_elements (List[str]): List of date elements to extract. Supported
            values are `year`, `month`, `day`, `day_of_week`, `day_of_year`,
            `week_of_year`, `quarter`, `is_leap_year`, `is_month_start`,
            `is_month_end`, `is_quarter_start`, `is_quarter_end`,
            `is_year_start`, `is_year_end` and `is_weekend`. Defaults to all
            of them. Unknown names are ignored.
    """

    # The list default is kept on purpose: it is stored untouched and never
    # mutated, which is what scikit-learn expects from `__init__` parameters.
    def __init__(  # pylint: disable=dangerous-default-value
        self,
        features: List[str],
        date_format: str = "%Y-%m-%d",
        errors: str = "raise",
        date_elements: List[str] = [
            "year",
            "month",
            "day",
            "day_of_week",
            "day_of_year",
            "week_of_year",
            "quarter",
            "is_leap_year",
            "is_month_start",
            "is_month_end",
            "is_quarter_start",
            "is_quarter_end",
            "is_year_start",
            "is_year_end",
            "is_weekend",
        ],
    ) -> None:
        super().__init__()
        self.features = features
        self.date_format = date_format
        self.date_elements = date_elements
        self.errors = errors

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transforms columns from the provided dataframe.

        Args:
            X (pandas.DataFrame): Dataframe with columns to transform.

        Returns:
            pandas.DataFrame: Dataframe with transformed columns.
        """

        X = check_ready_to_transform(self, X, self.features)

        for column in self.features:
            X[column] = pd.to_datetime(
                X[column], format=self.date_format, errors=self.errors
            )
            # Walk the table, not `date_elements`, so the column order is
            # always the same regardless of how the caller ordered the names.
            for element, extract in _DATE_ELEMENT_EXTRACTORS.items():
                if element in self.date_elements:
                    X[f"{column}_{element}"] = extract(X[column])
        return X
@pytest.fixture()
def tiny_date_df() -> pd.DataFrame:
    """Three ISO-formatted date strings in a single column named `a`."""
    dates = ["2021-01-01", "2022-02-02", "2023-03-03"]
    return pd.DataFrame({"a": dates})


def test_date_columns_transformer_in_pipeline(tiny_date_df):
    """Default `DateColumnsTransformer` inside a pipeline yields all 15 elements."""
    default_elements = [
        "year",
        "month",
        "day",
        "day_of_week",
        "day_of_year",
        "week_of_year",
        "quarter",
        "is_leap_year",
        "is_month_start",
        "is_month_end",
        "is_quarter_start",
        "is_quarter_end",
        "is_year_start",
        "is_year_end",
        "is_weekend",
    ]
    # The last two dates set none of the eight boolean flags.
    no_flags = [False] * 8
    rows = [
        # 2021-01-01 is a Friday in ISO week 53 and starts a month, quarter and year.
        [2021, 1, 1, 4, 1, 53, 1, False, True, False, True, False, True, False, False],
        [2022, 2, 2, 2, 33, 5, 1] + no_flags,
        [2023, 3, 3, 4, 62, 9, 1] + no_flags,
    ]
    expected = np.array(rows, dtype=object)

    pipeline = make_pipeline(DateColumnsTransformer(["a"]))
    transformed = pipeline.fit_transform(tiny_date_df)
    # Only the derived columns are compared; the parsed source column is dropped.
    derived = transformed.drop("a", axis=1)

    assert np.array_equal(derived.to_numpy(), expected)

    step_name, step = pipeline.steps[0]
    assert step_name == "datecolumnstransformer"
    assert step.features == ["a"]
    assert step.date_format == "%Y-%m-%d"
    assert step.date_elements == default_elements