-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #13 from jrasband-dev/dev-0.5.0
Dev 0.5.0
- Loading branch information
Showing
8 changed files
with
310 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
dist | ||
__pycache__ | ||
polars_extensions.egg-info | ||
venv | ||
.venv | ||
.codegpt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
a,c | ||
apple,appl | ||
banana,BANANA | ||
cherry,cherr | ||
date,etad | ||
elderberry,elderberrys | ||
fig,FIG | ||
grape,gr@pe | ||
honeydew,ywendeyoh | ||
kiwi,KIW | ||
lemon,lemons | ||
mangoes are Tangy,mango are Tangy | ||
it was the best of times,it was the worst of times | ||
of times it was the best,it was the worst of times |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
from .io import * | ||
from .name import * | ||
from .numeric import * | ||
from .string import * | ||
|
||
__all__ = [ | ||
"NameExtensionNameSpace", | ||
"NumericExtensionNamespace", | ||
"StringExtensionNamespace", | ||
"write_schema", | ||
"read_schema", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import polars as pl | ||
|
||
@pl.api.register_dataframe_namespace("str_ext")
class StringExtensionNamespace:
    """String extensions for Polars DataFrames, exposed as ``df.str_ext``."""

    def __init__(self, df: pl.DataFrame):
        # The frame this namespace instance wraps; never reassigned.
        self._df = df

    def f1_string_similarity(self, col_a: str, col_b: str) -> pl.DataFrame:
        """
        Score the character-level similarity of two string columns.

        For each row the score is ``2 * common / (len(a) + len(b))``, where
        ``common`` is the size of the multiset intersection of the two
        strings' characters (so repeated characters are counted as often as
        they appear in both). Comparison is case-insensitive and ignores
        character order. Identical strings (including two empty strings)
        score 1.0.

        Parameters:
            col_a (str): The name of the first column to compare.
            col_b (str): The name of the second column to compare.

        Returns:
            pl.DataFrame: A new DataFrame with the scores added as a
            ``Float64`` column named ``f1_score``. Rows where either input
            is null get a null score. The wrapped DataFrame is not modified.
        """
        # Local imports keep this block self-contained.
        from collections import Counter
        from typing import Optional

        def similarity(
            row_str_a: Optional[str], row_str_b: Optional[str]
        ) -> Optional[float]:
            # A null on either side has no meaningful score; propagate null
            # instead of raising AttributeError on ``None.lower()``.
            if row_str_a is None or row_str_b is None:
                return None

            # Normalize both strings (case-insensitive comparison)
            row_str_a = row_str_a.lower()
            row_str_b = row_str_b.lower()

            # Identical strings score 1.0; this also covers two empty
            # strings, so the denominator below is always positive.
            if row_str_a == row_str_b:
                return 1.0

            # Counter ``&`` keeps the minimum count per character, i.e. the
            # multiset intersection, which accounts for repeated characters.
            common_chars = sum((Counter(row_str_a) & Counter(row_str_b)).values())
            total_chars = len(row_str_a) + len(row_str_b)
            return (2 * common_chars) / total_chars

        # Apply the similarity function row-by-row
        similarity_scores = [
            similarity(row_a, row_b)
            for row_a, row_b in zip(
                self._df[col_a].to_list(), self._df[col_b].to_list()
            )
        ]

        # Return a new frame rather than rebinding ``self._df`` so the
        # namespace keeps wrapping the frame it was created for. The explicit
        # dtype keeps the column Float64 even when the input is empty or
        # every score is null.
        return self._df.with_columns(
            pl.Series("f1_score", similarity_scores, dtype=pl.Float64)
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.