diff --git a/poetry.lock b/poetry.lock index 0373d1b..ec829d1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2422,14 +2422,14 @@ test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeo [[package]] name = "setuptools" -version = "67.6.1" +version = "67.7.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "setuptools-67.6.1-py3-none-any.whl", hash = "sha256:e728ca814a823bf7bf60162daf9db95b93d532948c4c0bea762ce62f60189078"}, - {file = "setuptools-67.6.1.tar.gz", hash = "sha256:257de92a9d50a60b8e22abfcbb771571fde0dbf3ec234463212027a4eeecbe9a"}, + {file = "setuptools-67.7.0-py3-none-any.whl", hash = "sha256:888be97fde8cc3afd60f7784e678fa29ee13c4e5362daa7104a93bba33646c50"}, + {file = "setuptools-67.7.0.tar.gz", hash = "sha256:b7e53a01c6c654d26d2999ee033d8c6125e5fa55f03b7b193f937ae7ac999f22"}, ] [package.extras] diff --git a/src/sk_transformers/string_transformer.py b/src/sk_transformers/string_transformer.py index b9e3135..113631c 100644 --- a/src/sk_transformers/string_transformer.py +++ b/src/sk_transformers/string_transformer.py @@ -155,45 +155,32 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: expr = [ pl.col("username") - .apply(EmailTransformer.__num_of_digits) + .str.count_match(r"\d") + .cast(pl.Int64) .alias(f"{column}_num_of_digits"), pl.col("username") - .apply(EmailTransformer.__num_of_letters) + .str.count_match(r"[a-zA-Z]") + .cast(pl.Int64) .alias(f"{column}_num_of_letters"), pl.col("username") - .apply(EmailTransformer.__num_of_special_characters) + .str.count_match(r"[^A-Za-z0-9]") + .cast(pl.Int64) .alias(f"{column}_num_of_special_chars"), pl.col("username") .apply(EmailTransformer.__num_of_repeated_characters) .alias(f"{column}_num_of_repeated_chars"), - pl.col("username") - .apply(EmailTransformer.__num_of_words) + (pl.col("username").str.count_match(r"[.\-_]") + 1) + .cast(pl.Int64) .alias(f"{column}_num_of_words"), ] X = X.with_columns(expr).drop(column).rename({"username": column}) return X.to_pandas() - @staticmethod - def __num_of_digits(string: str) -> int: - return sum(map(str.isdigit, string)) - - @staticmethod - def __num_of_letters(string: str) -> int: - return sum(map(str.isalpha, string)) - - @staticmethod - def __num_of_special_characters(string: str) -> int: - return len(re.findall(r"[^A-Za-z0-9]", string)) - @staticmethod def __num_of_repeated_characters(string: str) -> int: return max(len("".join(g)) for _, g in itertools.groupby(string)) - @staticmethod - def __num_of_words(string: str) -> int: - return len(re.findall(r"[.\-_]", string)) + 1 - class StringSimilarityTransformer(BaseTransformer): """Calculates the similarity between two strings using the `gestalt pattern @@ -240,29 +227,26 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = check_ready_to_transform(self, X, list(self.features), return_polars=True) return X.with_columns( - pl.struct([self.features[0], self.features[1]]) + pl.struct( + [ + pl.col(self.features[0]).str.strip().str.to_lowercase(), + pl.col(self.features[1]).str.strip().str.to_lowercase(), + ] + ) .apply( - lambda x: StringSimilarityTransformer.__similar( - StringSimilarityTransformer.__normalize_string(x[self.features[0]]), - StringSimilarityTransformer.__normalize_string(x[self.features[1]]), - ) + lambda x: SequenceMatcher( + None, + unicodedata.normalize("NFKD", x[self.features[0]]) + .encode("utf8", "strict") + .decode("utf8"), + unicodedata.normalize("NFKD", x[self.features[1]]) + .encode("utf8", "strict") + .decode("utf8"), + ).ratio() ) .alias(f"{self.features[0]}_{self.features[1]}_similarity") ).to_pandas() - @staticmethod - def __similar(a: str, b: str) -> float: - return SequenceMatcher(None, a, b).ratio() - - @staticmethod - def __normalize_string(string: str) -> str: - string = str(string).strip().lower() - return ( - unicodedata.normalize("NFKD", string) - .encode("utf8", "strict") - .decode("utf8") - ) - class PhoneTransformer(BaseTransformer): """Transforms a phone number into multiple features.