From 2af117b9470f8615b910648327caa08d63f2549f Mon Sep 17 00:00:00 2001 From: DvGils Date: Mon, 6 May 2024 09:20:20 +0200 Subject: [PATCH 1/5] chore: improve docstrings --- weightfactors/raking/generalized_raker.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/weightfactors/raking/generalized_raker.py b/weightfactors/raking/generalized_raker.py index cc0bf66..46f30af 100644 --- a/weightfactors/raking/generalized_raker.py +++ b/weightfactors/raking/generalized_raker.py @@ -22,7 +22,7 @@ class GeneralizedRaker: Whether to raise an error when the weight factors are extreme according to `cutoffs`, else we raise a warning. Default is False. cutoffs: Dict[str, float], optional - What we consider extreme weight factors. 'lo' is the lower bound (defaults to 0.25) + When weights are considered to be extreme. 'lo' is the lower bound (defaults to 0.25) and 'hi' is the upper bound (defaults to 4). If `raise_on_extreme` we raise an error if any weight exceeds the cutoffs, otherwise we clip the extremes to the cutoffs exclusion_column: str, optional @@ -193,11 +193,11 @@ def rake( data: pd.DataFrame The survey dataset max_steps: int - Maximum number of iterations + The maximum number of iterations to try and reach convergence tolerance: float - Maximum tolerance for loss, we claim success if the loss is lower than this + Maximum tolerance for loss, convergence is reached if the loss is smaller than this value early_stopping: int - Maximum number of iterations without improvement in loss before we call quits + Maximum number of iterations without improvement in loss Raises: WeightsConvergenceError if the algorithm did not converge before `max_steps` From 00bce214230737a37240aaefa283639c577c02a4 Mon Sep 17 00:00:00 2001 From: DvGils Date: Mon, 6 May 2024 09:31:03 +0200 Subject: [PATCH 2/5] chore: add validation input for observations that have not been mapped to a population target --- tests/test_generalized_raking.py | 4 ++++ weightfactors/raking/generalized_raker.py | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_generalized_raking.py b/tests/test_generalized_raking.py index aae3389..ba3c5c9 100644 --- a/tests/test_generalized_raking.py +++ b/tests/test_generalized_raking.py @@ -38,6 +38,10 @@ def test_invalid_input(): data = pd.DataFrame({"Gender": ["Male", "Male", "Female"]}) raker = GeneralizedRaker({"Gender": {"Male": 0.51, "Female": 0.5}}) raker.rake(data) + with pytest.raises(KeyError, match="There are observations for a value in 'Gender' that has not been mapped to a population target"): + data = pd.DataFrame({"Gender": ["Male", "Male", "Female", "Other"]}) + raker = GeneralizedRaker({"Gender": {"Male": 0.51, "Female": 0.49}}) + raker.rake(data) def test_generalized_raking_no_convergence(): diff --git a/weightfactors/raking/generalized_raker.py b/weightfactors/raking/generalized_raker.py index 46f30af..267ed50 100644 --- a/weightfactors/raking/generalized_raker.py +++ b/weightfactors/raking/generalized_raker.py @@ -118,10 +118,15 @@ def validate_input(self, data: pd.DataFrame) -> None: # Make sure all keys are present in the dataset if key not in data.columns: raise KeyError(f"There is no column {key} in the provided dataset") - # Make sure there are no missing values in the questions used for calculating weights + # Make sure there are no missing values in the columns used for calculating weights if data[key].isna().any(axis=None): raise ValueError(f"Column {key} contains missing values") + # Make sure all unique values in the target columns have been mapped + # It is impossible to set values with observations to a weight of 0 + if len(data[key].unique()) != len(value): + raise KeyError(f"There are observations for a value in '{key}' that has not been mapped to a population target") # Make sure we have at least 1 observation for each category + # It is impossible to set values without observations to a weight larger than 1 for k, _ in value.items(): if k not in data[key].unique(): raise KeyError(f"There are no observations for {k} in column {key}") From b7a06b02f3112d35e63cf26e254f5ab945cbae7f Mon Sep 17 00:00:00 2001 From: DvGils Date: Mon, 6 May 2024 09:31:13 +0200 Subject: [PATCH 3/5] chore: update `.gitignore` --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6769e21..ef951ea 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +weightfactors-env/ # C extensions *.so From 36fd49649fbdb4c82a0dec07bfe2cc2bb11767ca Mon Sep 17 00:00:00 2001 From: DvGils Date: Mon, 6 May 2024 09:36:26 +0200 Subject: [PATCH 4/5] chore: update `README` --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 03e443e..cd8fb86 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Continuous Integration](https://github.com/markteffect/weightfactors/actions/workflows/ci.yml/badge.svg) -![Python](https://img.shields.io/badge/Python-3.9%20|%203.10-blue) +![Python](https://img.shields.io/badge/Python-3.9+-blue) # **Weight Factors** Calculate weight factors for survey data to approximate a representative sample From 1418971d44a294602e39922a75e11e6501819ff2 Mon Sep 17 00:00:00 2001 From: DvGils Date: Mon, 6 May 2024 09:38:06 +0200 Subject: [PATCH 5/5] fix[black]: code formatting --- tests/test_generalized_raking.py | 5 ++++- weightfactors/raking/generalized_raker.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_generalized_raking.py b/tests/test_generalized_raking.py index ba3c5c9..ec93fc7 100644 --- a/tests/test_generalized_raking.py +++ b/tests/test_generalized_raking.py @@ -38,7 +38,10 @@ def test_invalid_input(): data = pd.DataFrame({"Gender": ["Male", "Male", "Female"]}) raker = GeneralizedRaker({"Gender": {"Male": 0.51, "Female": 0.5}}) raker.rake(data) - with pytest.raises(KeyError, match="There are observations for a value in 'Gender' that has not been mapped to a population target"): + with pytest.raises( + KeyError, + match="There are observations for a value in 'Gender' that has not been mapped to a population target", + ): data = pd.DataFrame({"Gender": ["Male", "Male", "Female", "Other"]}) raker = GeneralizedRaker({"Gender": {"Male": 0.51, "Female": 0.49}}) raker.rake(data) diff --git a/weightfactors/raking/generalized_raker.py b/weightfactors/raking/generalized_raker.py index 267ed50..21f0736 100644 --- a/weightfactors/raking/generalized_raker.py +++ b/weightfactors/raking/generalized_raker.py @@ -124,7 +124,9 @@ def validate_input(self, data: pd.DataFrame) -> None: # Make sure all unique values in the target columns have been mapped # It is impossible to set values with observations to a weight of 0 if len(data[key].unique()) != len(value): - raise KeyError(f"There are observations for a value in '{key}' that has not been mapped to a population target") + raise KeyError( + f"There are observations for a value in '{key}' that has not been mapped to a population target" + ) # Make sure we have at least 1 observation for each category # It is impossible to set values without observations to a weight larger than 1 for k, _ in value.items():