Skip to content

Commit

Permalink
Merge pull request #13 from jrasband-dev/dev-0.5.0
Browse files Browse the repository at this point in the history
Dev 0.5.0
  • Loading branch information
jrasband-dev authored Jan 17, 2025
2 parents cd4a87f + 3a965aa commit e351ee3
Show file tree
Hide file tree
Showing 8 changed files with 310 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
dist
__pycache__
polars_extensions.egg-info
venv
.venv
.codegpt
14 changes: 14 additions & 0 deletions datasets/string_sim.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
a,c
apple,appl
banana,BANANA
cherry,cherr
date,etad
elderberry,elderberrys
fig,FIG
grape,gr@pe
honeydew,ywendeyoh
kiwi,KIW
lemon,lemons
mangoes are Tangy,mango are Tangy
it was the best of times,it was the worst of times
of times it was the best,it was the worst of times
2 changes: 2 additions & 0 deletions polars_extensions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from .io import *
from .name import *
from .numeric import *
from .string import *

# Names exported by `from polars_extensions import *`.
# NOTE(review): "NameExtensionNameSpace" is spelled with a capital "S" in
# "NameSpace" while the other two entries use "Namespace" -- confirm it
# matches the class name actually defined in the `name` module.
__all__ = [
"NameExtensionNameSpace",
"NumericExtensionNamespace",
"StringExtensionNamespace",
"write_schema",
"read_schema",
]
20 changes: 11 additions & 9 deletions polars_extensions/io.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import json
from typing import Union
import ast

import polars as pl


def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
"Saves a Polars schema a JSON file"
if isinstance(schema, pl.DataFrame):
Expand All @@ -17,12 +14,17 @@ def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
json.dump(schema_dict, f)
return


def read_schema(file: str):
    """Opens a JSON Schema file and returns a Polars Schema object

    Parameters:
    file (str): Path of a JSON file holding an object that maps each
        column name to the name of a Polars type (e.g. "Int64").
    Returns:
    pl.Schema: A schema built from the column name -> type mapping.
    Raises:
    ValueError: If a type name in the file is not an attribute of the
        `polars` module (or is not a string at all).
    """
    # The context manager closes the file even if JSON decoding fails.
    with open(file, "r", encoding="utf-8") as f:
        schema = json.load(f)

    schema_dict = {}
    for k, v in schema.items():
        try:
            # Resolve the stored name by attribute lookup on the polars
            # module; nothing read from the file is ever evaluated as code.
            schema_dict[k] = getattr(pl, v)
        except (AttributeError, TypeError) as err:
            # TypeError covers a non-string JSON value (number, list, ...),
            # which getattr rejects before it even looks the name up.
            raise ValueError(f"Invalid type {v} for column {k}") from err

    return pl.Schema(schema_dict)
58 changes: 58 additions & 0 deletions polars_extensions/string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import polars as pl

@pl.api.register_dataframe_namespace("str_ext")
class StringExtensionNamespace:
    """String Extensions for the Polars Library"""

    def __init__(self, df: pl.DataFrame):
        self._df = df

    def f1_string_similarity(self, col_a: str, col_b: str) -> pl.DataFrame:
        """
        Calculates a similarity score between two columns of strings based on common characters,
        accounting for repeated characters.

        The comparison is case-insensitive. For each row the score is
        2 * (number of shared characters) / (len(a) + len(b)), where a
        character occurring several times is counted as shared as many times
        as it occurs in both strings. Identical strings score 1.0. Rows where
        either value is null get a null score.

        Parameters:
        col_a (str): The name of the first column to compare.
        col_b (str): The name of the second column to compare.
        Returns:
        pl.DataFrame: A DataFrame with the similarity scores as a new
            "f1_score" column.
        """
        # Imported locally so the block does not depend on a module-level import.
        from collections import Counter

        def similarity(row_str_a, row_str_b):
            # A null on either side has no meaningful score; propagate null
            # instead of failing on `None.lower()`.
            if row_str_a is None or row_str_b is None:
                return None

            # Normalize both strings (case-insensitive comparison)
            row_str_a = row_str_a.lower()
            row_str_b = row_str_b.lower()

            # Identical strings (including two empty strings) score 1.0.
            # After this check at least one string is non-empty, so the
            # denominator below can never be zero.
            if row_str_a == row_str_b:
                return 1.0

            # Multiset intersection: each character counts min(occurrences
            # in a, occurrences in b) times -- the same total the pairwise
            # match-and-remove loop produces, without the quadratic scans.
            shared = Counter(row_str_a) & Counter(row_str_b)
            common_chars = sum(shared.values())
            total_chars = len(row_str_a) + len(row_str_b)
            return (2 * common_chars) / total_chars

        # Apply the similarity function row-by-row
        similarity_scores = [
            similarity(row_a, row_b)
            for row_a, row_b in zip(
                self._df[col_a].to_list(), self._df[col_b].to_list()
            )
        ]

        # Return a new frame rather than rebinding self._df, so this
        # namespace object keeps wrapping the frame it was created for.
        # The explicit dtype keeps the column Float64 even when every score
        # is null or the frame is empty.
        return self._df.with_columns(
            pl.Series("f1_score", similarity_scores, dtype=pl.Float64)
        )
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "polars-extensions"
version = "0.4.0"
version = "0.5.0"
description = "The Library of Polars Extensions"
readme = "README.md"
authors = [
Expand Down
Loading

0 comments on commit e351ee3

Please sign in to comment.