diff --git a/.gitignore b/.gitignore index 6769e21..ef951ea 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +weightfactors-env/ # C extensions *.so diff --git a/README.md b/README.md index 03e443e..cd8fb86 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Continuous Integration](https://github.com/markteffect/weightfactors/actions/workflows/ci.yml/badge.svg) -![Python](https://img.shields.io/badge/Python-3.9%20|%203.10-blue) +![Python](https://img.shields.io/badge/Python-3.9+-blue) # **Weight Factors** Calculate weight factors for survey data to approximate a representative sample diff --git a/tests/test_generalized_raking.py b/tests/test_generalized_raking.py index aae3389..ec93fc7 100644 --- a/tests/test_generalized_raking.py +++ b/tests/test_generalized_raking.py @@ -38,6 +38,13 @@ def test_invalid_input(): data = pd.DataFrame({"Gender": ["Male", "Male", "Female"]}) raker = GeneralizedRaker({"Gender": {"Male": 0.51, "Female": 0.5}}) raker.rake(data) + with pytest.raises( + KeyError, + match="There are observations for a value in 'Gender' that has not been mapped to a population target", + ): + data = pd.DataFrame({"Gender": ["Male", "Male", "Female", "Other"]}) + raker = GeneralizedRaker({"Gender": {"Male": 0.51, "Female": 0.49}}) + raker.rake(data) def test_generalized_raking_no_convergence(): diff --git a/weightfactors/raking/generalized_raker.py b/weightfactors/raking/generalized_raker.py index cc0bf66..21f0736 100644 --- a/weightfactors/raking/generalized_raker.py +++ b/weightfactors/raking/generalized_raker.py @@ -22,7 +22,7 @@ class GeneralizedRaker: Whether to raise an error when the weight factors are extreme according to `cutoffs`, else we raise a warning. Default is False. cutoffs: Dict[str, float], optional - What we consider extreme weight factors. 'lo' is the lower bound (defaults to 0.25) + When weights are considered to be extreme. 'lo' is the lower bound (defaults to 0.25) and 'hi' is the upper bound (defaults to 4). If `raise_on_extreme` we raise an error if any weight exceeds the cutoffs, otherwise we clip the extremes to the cutoffs exclusion_column: str, optional @@ -118,10 +118,17 @@ def validate_input(self, data: pd.DataFrame) -> None: # Make sure all keys are present in the dataset if key not in data.columns: raise KeyError(f"There is no column {key} in the provided dataset") - # Make sure there are no missing values in the questions used for calculating weights + # Make sure there are no missing values in the columns used for calculating weights if data[key].isna().any(axis=None): raise ValueError(f"Column {key} contains missing values") + # Make sure all unique values in the target columns have been mapped + # It is impossible to set values with observations to a weight of 0 + if len(data[key].unique()) != len(value): + raise KeyError( + f"There are observations for a value in '{key}' that has not been mapped to a population target" + ) # Make sure we have at least 1 observation for each category + # It is impossible to set values without observations to a weight larger than 1 for k, _ in value.items(): if k not in data[key].unique(): raise KeyError(f"There are no observations for {k} in column {key}") @@ -193,11 +200,11 @@ def rake( data: pd.DataFrame The survey dataset max_steps: int - Maximum number of iterations + The maximum number of iterations to try and reach convergence tolerance: float - Maximum tolerance for loss, we claim success if the loss is lower than this + Maximum tolerance for loss, convergence is reached if the loss is smaller than this value early_stopping: int - Maximum number of iterations without improvement in loss before we call quits + Maximum number of iterations without improvement in loss Raises: WeightsConvergenceError if the algorithm did not converge before `max_steps`