Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev 0.5.0 #13

Merged
merged 6 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
dist
__pycache__
polars_extensions.egg-info
venv
.venv
.codegpt
14 changes: 14 additions & 0 deletions datasets/string_sim.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
a,c
apple,appl
banana,BANANA
cherry,cherr
date,etad
elderberry,elderberrys
fig,FIG
grape,gr@pe
honeydew,ywendeyoh
kiwi,KIW
lemon,lemons
mangoes are Tangy,mango are Tangy
it was the best of times,it was the worst of times
of times it was the best,it was the worst of times
2 changes: 2 additions & 0 deletions polars_extensions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from .io import *
from .name import *
from .numeric import *
from .string import *

__all__ = [
"NameExtensionNameSpace",
"NumericExtensionNamespace",
"StringExtensionNamespace",
"write_schema",
"read_schema",
]
20 changes: 11 additions & 9 deletions polars_extensions/io.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import json
from typing import Union
import ast

import polars as pl


def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
"Saves a Polars schema a JSON file"
if isinstance(schema, pl.DataFrame):
Expand All @@ -17,12 +14,17 @@ def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
json.dump(schema_dict, f)
return


def read_schema(file: str):
    """Open a JSON schema file and return a Polars Schema object.

    The file is expected to hold a JSON object mapping each column name to
    the name of an attribute on the ``polars`` module (for example
    ``"Int64"``); each name is resolved with ``getattr(pl, name)``.

    Parameters:
        file (str): Path of the JSON schema file to read.

    Returns:
        pl.Schema: Schema built from the resolved column types.

    Raises:
        ValueError: If a type name cannot be resolved on the ``polars``
            module (unknown name, or a JSON value that is not a string).
    """
    # The context manager closes the file even if JSON parsing fails.
    with open(file, "r", encoding="utf-8") as f:
        schema = json.load(f)

    schema_dict = {}
    for k, v in schema.items():
        try:
            # Attribute lookup only: no expression from the file is evaluated.
            schema_dict[k] = getattr(pl, v)
        except (AttributeError, TypeError) as err:
            # TypeError covers non-string JSON values (numbers, lists, null).
            raise ValueError(f"Invalid type {v} for column {k}") from err

    return pl.Schema(schema_dict)
58 changes: 58 additions & 0 deletions polars_extensions/string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import polars as pl

@pl.api.register_dataframe_namespace("str_ext")
class StringExtensionNamespace:
    """String extensions for Polars DataFrames, registered as ``str_ext``."""

    def __init__(self, df: pl.DataFrame):
        self._df = df

    def f1_string_similarity(self, col_a: str, col_b: str) -> pl.DataFrame:
        """
        Calculates a similarity score between two columns of strings based on
        common characters, accounting for repeated characters.

        The comparison is case-insensitive and ignores character order. For
        each row the score is ``2 * common / (len(a) + len(b))``, where
        ``common`` is the size of the multiset intersection of the two
        strings' characters. Identical strings (after lower-casing) score 1.0.
        If either value in a row is null, the score for that row is null.

        Parameters:
            col_a (str): The name of the first column to compare.
            col_b (str): The name of the second column to compare.

        Returns:
            pl.DataFrame: A new DataFrame with the similarity scores added as
            a Float64 column named "f1_score". The frame wrapped by this
            namespace is left untouched.
        """
        # Local import keeps the block self-contained without touching the
        # module's import section.
        from collections import Counter

        def similarity(row_str_a, row_str_b):
            # Null cells cannot be compared; propagate a null score instead
            # of failing on ``None.lower()``.
            if row_str_a is None or row_str_b is None:
                return None

            # Normalize both strings (case-insensitive comparison)
            row_str_a = row_str_a.lower()
            row_str_b = row_str_b.lower()

            # Identical strings (including two empty strings) score 1.0
            if row_str_a == row_str_b:
                return 1.0

            # Multiset intersection: each character counts as many times as
            # it appears in both strings.
            shared = Counter(row_str_a) & Counter(row_str_b)
            common_chars = sum(shared.values())
            total_chars = len(row_str_a) + len(row_str_b)
            return (2 * common_chars) / total_chars if total_chars > 0 else 0.0

        # Apply the similarity function row-by-row
        similarity_scores = [
            similarity(row_a, row_b)
            for row_a, row_b in zip(self._df[col_a], self._df[col_b])
        ]

        # An explicit dtype keeps the column Float64 even when the frame is
        # empty or every score is null.
        return self._df.with_columns(
            pl.Series("f1_score", similarity_scores, dtype=pl.Float64)
        )
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "polars-extensions"
version = "0.4.0"
version = "0.5.0"
description = "The Library of Polars Extensions"
readme = "README.md"
authors = [
Expand Down
Loading