Skip to content

Commit

Permalink
refactor: replace static methods in some transformers (#110)
Browse files Browse the repository at this point in the history
  • Loading branch information
premsrii authored Apr 21, 2023
1 parent 48c6977 commit fa6215d
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 42 deletions.
6 changes: 3 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

62 changes: 23 additions & 39 deletions src/sk_transformers/string_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,45 +155,32 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

expr = [
pl.col("username")
.apply(EmailTransformer.__num_of_digits)
.str.count_match(r"\d")
.cast(pl.Int64)
.alias(f"{column}_num_of_digits"),
pl.col("username")
.apply(EmailTransformer.__num_of_letters)
.str.count_match(r"[a-zA-Z]")
.cast(pl.Int64)
.alias(f"{column}_num_of_letters"),
pl.col("username")
.apply(EmailTransformer.__num_of_special_characters)
.str.count_match(r"[^A-Za-z0-9]")
.cast(pl.Int64)
.alias(f"{column}_num_of_special_chars"),
pl.col("username")
.apply(EmailTransformer.__num_of_repeated_characters)
.alias(f"{column}_num_of_repeated_chars"),
pl.col("username")
.apply(EmailTransformer.__num_of_words)
(pl.col("username").str.count_match(r"[.\-_]") + 1)
.cast(pl.Int64)
.alias(f"{column}_num_of_words"),
]

X = X.with_columns(expr).drop(column).rename({"username": column})
return X.to_pandas()

@staticmethod
def __num_of_digits(string: str) -> int:
return sum(map(str.isdigit, string))

@staticmethod
def __num_of_letters(string: str) -> int:
return sum(map(str.isalpha, string))

@staticmethod
def __num_of_special_characters(string: str) -> int:
return len(re.findall(r"[^A-Za-z0-9]", string))

@staticmethod
def __num_of_repeated_characters(string: str) -> int:
    """Return the length of the longest run of identical adjacent characters.

    Args:
        string: Text to scan.

    Returns:
        The size of the largest group of consecutive equal characters,
        e.g. ``3`` for ``"aabbbc"``. An empty string yields ``0``.
    """
    # `groupby` without a key groups consecutive equal characters; each group
    # is counted lazily rather than joined into a temporary string.
    # `default=0` avoids the ValueError `max()` raises on an empty input.
    return max(
        (sum(1 for _ in run) for _, run in itertools.groupby(string)),
        default=0,
    )

@staticmethod
def __num_of_words(string: str) -> int:
return len(re.findall(r"[.\-_]", string)) + 1


class StringSimilarityTransformer(BaseTransformer):
"""Calculates the similarity between two strings using the `gestalt pattern
Expand Down Expand Up @@ -240,29 +227,26 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
X = check_ready_to_transform(self, X, list(self.features), return_polars=True)

return X.with_columns(
pl.struct([self.features[0], self.features[1]])
pl.struct(
[
pl.col(self.features[0]).str.strip().str.to_lowercase(),
pl.col(self.features[1]).str.strip().str.to_lowercase(),
]
)
.apply(
lambda x: StringSimilarityTransformer.__similar(
StringSimilarityTransformer.__normalize_string(x[self.features[0]]),
StringSimilarityTransformer.__normalize_string(x[self.features[1]]),
)
lambda x: SequenceMatcher(
None,
unicodedata.normalize("NFKD", x[self.features[0]])
.encode("utf8", "strict")
.decode("utf8"),
unicodedata.normalize("NFKD", x[self.features[1]])
.encode("utf8", "strict")
.decode("utf8"),
).ratio()
)
.alias(f"{self.features[0]}_{self.features[1]}_similarity")
).to_pandas()

@staticmethod
def __similar(a: str, b: str) -> float:
return SequenceMatcher(None, a, b).ratio()

@staticmethod
def __normalize_string(string: str) -> str:
string = str(string).strip().lower()
return (
unicodedata.normalize("NFKD", string)
.encode("utf8", "strict")
.decode("utf8")
)


class PhoneTransformer(BaseTransformer):
"""Transforms a phone number into multiple features.
Expand Down

0 comments on commit fa6215d

Please sign in to comment.