diff --git a/pkgs/swarmauri/swarmauri/measurements/concrete/UniquenessMeasurement.py b/pkgs/swarmauri/swarmauri/measurements/concrete/UniquenessMeasurement.py
new file mode 100644
index 000000000..f807d86fd
--- /dev/null
+++ b/pkgs/swarmauri/swarmauri/measurements/concrete/UniquenessMeasurement.py
@@ -0,0 +1,105 @@
+from typing import Any, Dict, List, Literal, Optional, Union
+import pandas as pd
+from swarmauri.measurements.base.MeasurementBase import MeasurementBase
+
+
+class UniquenessMeasurement(MeasurementBase):
+    """
+    Measurement for evaluating the uniqueness of values in a dataset.
+    Uniqueness is calculated as the percentage of distinct values relative to the total number of values.
+
+    Attributes:
+        type (Literal['UniquenessMeasurement']): Type identifier for the measurement
+        unit (str): Unit of measurement (percentage)
+        value (float): Stores the calculated uniqueness score
+    """
+
+    type: Literal["UniquenessMeasurement"] = "UniquenessMeasurement"
+    unit: str = "%"  # Percentage as the unit of measurement
+
+    def calculate_uniqueness(self, data: Union[pd.DataFrame, List, Dict]) -> float:
+        """
+        Calculates the uniqueness score for different data types.
+
+        Args:
+            data: Input data which can be a pandas DataFrame, List, or Dictionary
+
+        Returns:
+            float: Uniqueness score as a percentage (0-100); 0.0 for empty input
+
+        Raises:
+            ValueError: If the input data type is not supported
+        """
+        if isinstance(data, pd.DataFrame):
+            if data.empty:
+                return 0.0
+            # Distinct values are counted per column and summed, then divided
+            # by the total cell count. nunique() skips missing values by
+            # default, so NaN cells count toward the total but never as unique.
+            total_values = data.size
+            unique_values = sum(data[col].nunique() for col in data.columns)
+            return (unique_values / total_values) * 100
+
+        if isinstance(data, list):
+            values = data
+        elif isinstance(data, dict):
+            # Only the dictionary's values are compared; keys are ignored.
+            values = list(data.values())
+        else:
+            raise ValueError(
+                "Unsupported data type. Please provide DataFrame, List, or Dict."
+            )
+
+        if not values:
+            return 0.0
+        # Compare string forms so unhashable items (lists, dicts, ...) can be
+        # placed in a set; items with identical str() output count as equal.
+        unique_values = len({str(item) for item in values})
+        return (unique_values / len(values)) * 100
+
+    def call(
+        self,
+        data: Union[pd.DataFrame, List, Dict],
+        kwargs: Optional[Dict[str, Any]] = None,
+    ) -> float:
+        """
+        Calculates and returns the uniqueness score for the provided data.
+
+        Args:
+            data: Input data to evaluate uniqueness
+            kwargs: Additional parameters (reserved for future use)
+
+        Returns:
+            float: Uniqueness score as a percentage (0-100)
+
+        Raises:
+            ValueError: If the input data type is not supported
+        """
+        self.value = self.calculate_uniqueness(data)
+        return self.value
+
+    def get_column_uniqueness(self, df: pd.DataFrame) -> Dict[str, float]:
+        """
+        Calculate uniqueness scores for individual columns in a DataFrame.
+
+        Args:
+            df: Input DataFrame
+
+        Returns:
+            Dict[str, float]: Dictionary mapping column names to their uniqueness scores.
+                Every column scores 0.0 when the DataFrame has no rows.
+
+        Raises:
+            ValueError: If input is not a pandas DataFrame
+        """
+        if not isinstance(df, pd.DataFrame):
+            raise ValueError("Input must be a pandas DataFrame")
+
+        row_count = len(df)
+        if row_count == 0:
+            # Avoid dividing by zero; mirrors calculate_uniqueness on empty input.
+            return {column: 0.0 for column in df.columns}
+
+        return {
+            column: (df[column].nunique() / row_count * 100) for column in df.columns
+        }
diff --git a/pkgs/swarmauri/tests/unit/measurements/UniquenessMeasurement_unit_test.py b/pkgs/swarmauri/tests/unit/measurements/UniquenessMeasurement_unit_test.py
new file mode 100644
index 000000000..690002cc8
--- /dev/null
+++ b/pkgs/swarmauri/tests/unit/measurements/UniquenessMeasurement_unit_test.py
@@ -0,0 +1,90 @@
+import pytest
+import pandas as pd
+from swarmauri.measurements.concrete.UniquenessMeasurement import (
+    UniquenessMeasurement as Measurement,
+)
+
+
+@pytest.mark.unit
+def test_ubc_resource():
+    assert Measurement(unit="%").resource == "Measurement"
+
+
+@pytest.mark.unit
+def test_ubc_type():
+    measurement = Measurement(unit="%")
+    assert measurement.type == "UniquenessMeasurement"
+
+
+@pytest.mark.unit
+def test_serialization():
+    measurement = Measurement(unit="%", value=75.0)
+    assert (
+        measurement.id
+        == Measurement.model_validate_json(measurement.model_dump_json()).id
+    )
+
+
+@pytest.mark.unit
+def test_measurement_value():
+    measurement = Measurement(unit="%")
+    test_data = ["A", "B", "A", "C", "B", "D"]  # 4 unique values out of 6 total
+    result = measurement.call(test_data)
+    assert result == pytest.approx(
+        66.66666666666667, rel=1e-9
+    )  # Using approx for float comparison
+    assert measurement.value == pytest.approx(66.66666666666667, rel=1e-9)
+
+
+@pytest.mark.unit
+def test_measurement_unit():
+    measurement = Measurement(unit="%")
+    test_data = ["A", "B", "A", "C"]
+    measurement.call(test_data)
+    assert measurement.unit == "%"
+
+
+@pytest.mark.unit
+def test_dataframe_uniqueness():
+    measurement = Measurement(unit="%")
+    df = pd.DataFrame({"col1": [1, 2, 2, 3], "col2": ["A", "A", "B", "C"]})
+    result = measurement.call(df)
+    assert result == pytest.approx(75.0)
+
+
+@pytest.mark.unit
+def test_column_uniqueness():
+    measurement = Measurement(unit="%")
+    df = pd.DataFrame({"col1": [1, 2, 2, 3], "col2": ["A", "A", "B", "C"]})
+    column_uniqueness = measurement.get_column_uniqueness(df)
+    assert column_uniqueness["col1"] == pytest.approx(75.0)
+    assert column_uniqueness["col2"] == pytest.approx(75.0)
+
+
+@pytest.mark.unit
+def test_column_uniqueness_without_rows():
+    measurement = Measurement(unit="%")
+    df = pd.DataFrame({"col1": [], "col2": []})
+    assert measurement.get_column_uniqueness(df) == {"col1": 0.0, "col2": 0.0}
+
+
+@pytest.mark.unit
+def test_empty_input():
+    measurement = Measurement(unit="%")
+    assert measurement.call([]) == 0.0
+    assert measurement.call({}) == 0.0
+    assert measurement.call(pd.DataFrame()) == 0.0
+
+
+@pytest.mark.unit
+def test_dict_uniqueness():
+    measurement = Measurement(unit="%")
+    test_dict = {"a": 1, "b": 2, "c": 1, "d": 3}  # 3 unique values out of 4
+    assert measurement.call(test_dict) == pytest.approx(75.0)
+
+
+@pytest.mark.unit
+def test_invalid_input():
+    measurement = Measurement(unit="%")
+    with pytest.raises(ValueError):
+        measurement.call(42)  # Invalid input type