Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

swarm - implemented UniquenessMeasurement #757

Merged
merged 1 commit into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from typing import Any, Dict, List, Literal, Union
import pandas as pd
from swarmauri.measurements.base.MeasurementBase import MeasurementBase


class UniquenessMeasurement(MeasurementBase):
    """
    Measurement for evaluating the uniqueness of values in a dataset.

    Uniqueness is calculated as the percentage of distinct values relative to
    the total number of values.

    Attributes:
        type (Literal['UniquenessMeasurement']): Type identifier for the measurement
        unit (str): Unit of measurement (percentage)
        value (float): Stores the calculated uniqueness score
    """

    type: Literal["UniquenessMeasurement"] = "UniquenessMeasurement"
    unit: str = "%"  # Percentage as the unit of measurement

    def calculate_uniqueness(self, data: Union[pd.DataFrame, List, Dict]) -> float:
        """
        Calculate the uniqueness score for the supported data types.

        * DataFrame: the per-column distinct counts are summed and divided by
          the total number of cells. ``Series.nunique`` skips missing values
          by default, so NaN cells add to the total but not to the distinct
          count.
        * List: distinct items divided by the number of items.
        * Dict: distinct *values* divided by the number of entries (keys are
          ignored).

        Args:
            data: Input data which can be a pandas DataFrame, List, or Dictionary

        Returns:
            float: Uniqueness score as a percentage (0-100); 0.0 for empty input

        Raises:
            ValueError: If the input data type is not supported
        """
        if isinstance(data, pd.DataFrame):
            if data.empty:
                return 0.0
            distinct_count = sum(data[col].nunique() for col in data.columns)
            return (distinct_count / data.size) * 100

        if isinstance(data, list):
            items = data
        elif isinstance(data, dict):
            items = list(data.values())
        else:
            raise ValueError(
                "Unsupported data type. Please provide DataFrame, List, or Dict."
            )

        if not items:
            return 0.0

        # Items may be unhashable (lists, dicts, ...), so they are compared by
        # a textual key. repr() is used rather than str() so that values of
        # different types that merely print alike -- 1 vs "1", None vs "None"
        # -- are still counted as distinct.
        distinct_count = len({repr(item) for item in items})
        return (distinct_count / len(items)) * 100

    def call(
        self,
        data: Union[pd.DataFrame, List, Dict],
        kwargs: Union[Dict[str, Any], None] = None,
    ) -> float:
        """
        Calculate, store and return the uniqueness score for the provided data.

        The score is also written to ``self.value``.

        Args:
            data: Input data to evaluate uniqueness
            kwargs: Additional parameters (reserved for future use; currently ignored)

        Returns:
            float: Uniqueness score as a percentage (0-100)

        Raises:
            ValueError: If the input data type is not supported
        """
        self.value = self.calculate_uniqueness(data)
        return self.value

    def get_column_uniqueness(self, df: pd.DataFrame) -> Dict[str, float]:
        """
        Calculate uniqueness scores for individual columns in a DataFrame.

        Args:
            df: Input DataFrame

        Returns:
            Dict[str, float]: Dictionary mapping column names to their
            uniqueness scores. A DataFrame with no rows yields 0.0 for every
            column, matching the empty-input result of
            ``calculate_uniqueness``.

        Raises:
            ValueError: If input is not a pandas DataFrame
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame")

        row_count = len(df)
        if row_count == 0:
            # Avoid dividing by zero when the frame has columns but no rows.
            return {column: 0.0 for column in df.columns}

        return {
            column: (df[column].nunique() / row_count * 100) for column in df.columns
        }
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pytest
import pandas as pd
from swarmauri.measurements.concrete.UniquenessMeasurement import (
UniquenessMeasurement as Measurement,
)


@pytest.mark.unit
def test_ubc_resource():
    """The measurement reports "Measurement" as its resource."""
    instance = Measurement(unit="%")
    assert instance.resource == "Measurement"


@pytest.mark.unit
def test_ubc_type():
    """The ``type`` field carries the "UniquenessMeasurement" identifier."""
    assert Measurement(unit="%").type == "UniquenessMeasurement"


@pytest.mark.unit
def test_serialization():
    """A JSON dump/validate round trip keeps the same ``id``."""
    source = Measurement(unit="%", value=75.0)
    source_id = source.id
    restored = Measurement.model_validate_json(source.model_dump_json())
    assert source_id == restored.id


@pytest.mark.unit
def test_measurement_value():
    """``call`` returns the list uniqueness score and stores it on ``value``."""
    # 4 distinct letters among 6 items -> 66.67 %
    samples = ["A", "B", "A", "C", "B", "D"]
    instance = Measurement(unit="%")

    score = instance.call(samples)

    # approx() guards against float representation noise
    assert score == pytest.approx(66.66666666666667, rel=1e-9)
    assert instance.value == pytest.approx(66.66666666666667, rel=1e-9)


@pytest.mark.unit
def test_measurement_unit():
    """Running a calculation leaves the unit as "%"."""
    instance = Measurement(unit="%")
    instance.call(["A", "B", "A", "C"])
    assert instance.unit == "%"


@pytest.mark.unit
def test_dataframe_uniqueness():
    """A DataFrame with 3 distinct values per 4-row column scores 75 %."""
    frame = pd.DataFrame({"col1": [1, 2, 2, 3], "col2": ["A", "A", "B", "C"]})
    score = Measurement(unit="%").call(frame)
    assert score == pytest.approx(75.0)


@pytest.mark.unit
def test_column_uniqueness():
    """Each column is scored on its own: 3 distinct of 4 rows -> 75 %."""
    frame = pd.DataFrame({"col1": [1, 2, 2, 3], "col2": ["A", "A", "B", "C"]})
    per_column = Measurement(unit="%").get_column_uniqueness(frame)
    for name in ("col1", "col2"):
        assert per_column[name] == pytest.approx(75.0)


@pytest.mark.unit
def test_empty_input():
    """Empty list, dict and DataFrame inputs all score 0.0."""
    instance = Measurement(unit="%")
    for empty in ([], {}, pd.DataFrame()):
        assert instance.call(empty) == 0.0


@pytest.mark.unit
def test_dict_uniqueness():
    """Dict input is scored on its values: 3 distinct of 4 -> 75 %."""
    mapping = {"a": 1, "b": 2, "c": 1, "d": 3}
    score = Measurement(unit="%").call(mapping)
    assert score == pytest.approx(75.0)


@pytest.mark.unit
def test_invalid_input():
    """An unsupported input type (here an int) raises ValueError."""
    instance = Measurement(unit="%")
    unsupported = 42
    with pytest.raises(ValueError):
        instance.call(unsupported)