Refactor AssessPy for consistency, stability (#24)

* Add type stubs to dev deps * Refactor all assesspy formulas * Add full test suite for metrics * Add and clean testing data/samples * Update doc refs for data sets * Update metric docs types * Cleanup doc groupings * Bump package version * Add metrics _met docs * Update all CI functions * Don't export check_inputs * Update outlier functions * Cleanup test fixtures * Refactor outlier functions * Rename expect output vars * Add outlier function tests * Add option to disable lt 0 check for check_inputs * Update outliers docs heading * Remove scipy dependency * Refactor sales chasing functions * Fixup outlier functions * Finalize sales chasing rewrite * Add tests for all code paths * Update README * Update docs * Lint with ruff * Add Python 3.13 support * Update tests and types * Add Python 3.8 support * Add more python versions, update docstrings * Revert .python-versions setup * Remove pandas stubs dep * Set 3.9 to min python version * Fix indexing bug for MKI * Add more tests of MKI index issues * Update example ratio study notebook * Fix rst table format * Set notebook execution timeout * Update doc references * Fix minor doc issues * Remove 3.8 from tox env list * Bump release date * Remove unnecessary typing * Deduplicate PRD code * Scope test to session * Join check_inputs messages with newlines * Fix ruff errors * Scope seed fixture to each test
ccao-data · Nov 25, 2024 · d611de4 · d611de4
1 parent 7dfcba7
commit d611de4
Show file tree

Hide file tree

Showing 39 changed files with 1,285 additions and 1,298 deletions.
diff --git a/.github/workflows/python-package.yaml b/.github/workflows/python-package.yaml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - name: Checkout

diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - name: Checkout
@@ -35,7 +35,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install dependencies
-        run: uv pip install .[test]
+        run: uv pip install .[dev]
 
       - name: Run pytest
         run: |

diff --git a/CITATION.cff b/CITATION.cff
@@ -2,6 +2,6 @@ message: "If you use this software, please cite it as below."
 authors:
 - family-names: "Cook County Assessor's Office"
 title: "AssessPy"
-version: 1.2.0
-date-released: 2022-11-14
+version: 2.0.0
+date-released: 2024-11-25
 url: https://github.com/ccao-data/assesspy
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 [![test-coverage](https://github.com/ccao-data/assesspy/actions/workflows/test-coverage.yaml/badge.svg)](https://github.com/ccao-data/assesspy/actions/workflows/test-coverage.yaml)
 [![pre-commit](https://github.com/ccao-data/assesspy/actions/workflows/pre-commit.yaml/badge.svg)](https://github.com/ccao-data/assesspy/actions/workflows/pre-commit.yaml)
 
-Assesspy is a software package for Python developed by the Cook County Assessor’s (CCAO)
+AssessPy is a software package for Python developed by the Cook County Assessor’s (CCAO)
 Data Department. It contains many of the functions necessary to perform a standard
 [sales ratio study](https://www.iaao.org/media/standards/Standard_on_Ratio_Studies.pdf).
 

diff --git a/assesspy/__init__.py b/assesspy/__init__.py
@@ -1,9 +1,11 @@
 from .ci import (
     boot_ci,
     cod_ci,
+    prb_ci,
     prd_ci,
 )
-from .formulas import (
+from .load_data import ccao_sample, quintos_sample
+from .metrics import (
     cod,
     cod_met,
     ki,
@@ -14,11 +16,5 @@
     prd,
     prd_met,
 )
-from .load_data import ratios_sample
-from .outliers import (
-    iqr_outlier,
-    is_outlier,
-    quantile_outlier,
-)
-from .sales_chasing import detect_chasing
-from .utils import check_inputs
+from .outliers import is_outlier
+from .sales_chasing import is_sales_chased
diff --git a/assesspy/ci.py b/assesspy/ci.py
@@ -1,38 +1,46 @@
-# Import necessary libraries
+from typing import Union
+
 import pandas as pd
-from pandas.api.types import is_numeric_dtype
 
-from .formulas import cod, prd
+from .metrics import _calculate_prb, cod, prd
 from .utils import check_inputs
 
 
-def boot_ci(fun, nboot=100, alpha=0.05, **kwargs):
+def boot_ci(
+    fun,
+    estimate: Union[list[int], list[float], pd.Series],
+    sale_price: Union[list[int], list[float], pd.Series],
+    nboot: int = 1000,
+    alpha: float = 0.05,
+) -> tuple[float, float]:
     """
     Calculate the non-parametric bootstrap confidence interval
-    for a given numeric input and a chosen function.
+    for a given set of numeric values and a chosen function.
 
     :param fun:
-        Function to bootstrap. Must return a single value.
+        Function to bootstrap. Must return a single float value.
+    :param estimate:
+        A list or ``pd.Series`` of estimated values.
+        Must be the same length as ``sale_price``.
+    :param sale_price:
+        A list or ``pd.Series`` of sale prices.
+        Must be the same length as ``estimate``.
     :param nboot:
-        Default 100. Number of iterations to use to estimate
+        Default 1000. Number of iterations to use to estimate
         the output statistic confidence interval.
     :param alpha:
-        Default 0.05. Numeric value indicating the confidence
+        Default 0.05. Float value indicating the confidence
         interval to return. 0.05 will return the 95% confidence interval.
-    :param kwargs:
-        Arguments passed on to ``fun``.
     :type fun: function
+    :type estimate: Array-like numeric values
+    :type sale_price: Array-like numeric values
     :type nboot: int
     :type alpha: float
-    :type kwargs: numeric
-
-    .. note::
-       Input function should require 1 argument or be ``assesspy.prd()``.
 
     :return:
-        A two-long list of floats containing the bootstrapped confidence
-        interval of the input vector(s).
-    :rtype: list[float]
+        A tuple of floats containing the bootstrapped confidence
+        interval of the input values.
+    :rtype: tuple[float, float]
 
     :Example:
 
@@ -43,69 +51,77 @@ def boot_ci(fun, nboot=100, alpha=0.05, **kwargs):
 
         ap.boot_ci(
             ap.prd,
-            assessed = ap.ratios_sample().assessed,
-            sale_price = ap.ratios_sample().sale_price,
-            nboot = 100
-            )
+            estimate = ap.ccao_sample().estimate,
+            sale_price = ap.ccao_sample().sale_price,
+            nboot = 1000
+        )
     """
+    if nboot <= 0:
+        raise ValueError("'nboot' must be a positive integer greater than 0.")
+    check_inputs(estimate, sale_price)
+    df = pd.DataFrame({"estimate": estimate, "sale_price": sale_price})
+    n: int = df.size
 
-    # Make sure prd is passed arguments in correct order
-    if fun.__name__ == "prd" and set(["assessed", "sale_price"]).issubset(
-        kwargs.keys()
-    ):
-        kwargs = (kwargs["assessed"], kwargs["sale_price"])
-    elif fun.__name__ == "prd" and not set(
-        ["assessed", "sale_price"]
-    ).issubset(kwargs.keys()):
-        raise Exception(
-            "PRD function expects argurments 'assessed' and 'sale_price'."
-        )
-    else:
-        kwargs = tuple(kwargs.values())
+    # Take a random sample of input, with the same number of rows as input,
+    # with replacement
+    ests = pd.Series(index=range(nboot), dtype=float)
+    for i in range(nboot):
+        sample = df.sample(n=n, replace=True)
+        ests[i] = fun(sample.iloc[:, 0], sample.iloc[:, 1])
 
-    check_inputs(kwargs)  # Input checking and error handling
+    ci = (ests.quantile(alpha / 2), ests.quantile(1 - alpha / 2))
 
-    num_kwargs = len(kwargs)
-    kwargs = pd.DataFrame(kwargs).T
-    n = len(kwargs)
+    return ci
 
-    # Check that the input function returns a numeric vector
-    out = (
-        fun(kwargs.iloc[:, 0])
-        if num_kwargs < 2
-        else fun(kwargs.iloc[:, 0], kwargs.iloc[:, 1])
+
+def cod_ci(
+    estimate: Union[list[int], list[float], pd.Series],
+    sale_price: Union[list[int], list[float], pd.Series],
+    nboot: int = 1000,
+    alpha: float = 0.05,
+) -> tuple[float, float]:
+    """
+    Calculate the non-parametric bootstrap confidence interval for COD.
+
+    See also:
+        :func:`boot_ci`
+    """
+    return boot_ci(
+        cod, estimate=estimate, sale_price=sale_price, nboot=nboot, alpha=alpha
     )
-    if not is_numeric_dtype(out):
-        raise Exception("Input function outputs non-numeric datatype.")
 
-    ests = []
 
-    # Take a random sample of input, with the same number of rows as input,
-    # with replacement.
-    for i in list(range(1, nboot)):
-        sample = kwargs.sample(n=n, replace=True)
-        if fun.__name__ == "cod" or num_kwargs == 1:
-            ests.append(fun(sample.iloc[:, 0]))
-        elif fun.__name__ == "prd":
-            ests.append(fun(sample.iloc[:, 0], sample.iloc[:, 1]))
-        else:
-            raise Exception(
-                "Input function should require 1 argument or be assesspy.prd."
-            )
-
-    ests = pd.Series(ests)
-
-    ci = [ests.quantile(alpha / 2), ests.quantile(1 - alpha / 2)]
+def prd_ci(
+    estimate: Union[list[int], list[float], pd.Series],
+    sale_price: Union[list[int], list[float], pd.Series],
+    nboot: int = 1000,
+    alpha: float = 0.05,
+) -> tuple[float, float]:
+    """
+    Calculate the non-parametric bootstrap confidence interval for PRD.
 
-    return ci
+    See also:
+        :func:`boot_ci`
+    """
+    return boot_ci(
+        prd, estimate=estimate, sale_price=sale_price, nboot=nboot, alpha=alpha
+    )
 
 
-# Formula specific bootstrapping functions
-def cod_ci(ratio, nboot=100, alpha=0.05):
-    return boot_ci(cod, ratio=ratio, nboot=nboot, alpha=alpha)
+def prb_ci(
+    estimate: Union[list[int], list[float], pd.Series],
+    sale_price: Union[list[int], list[float], pd.Series],
+    nboot: int = 1000,
+    alpha: float = 0.05,
+) -> tuple[float, float]:
+    """
+    Calculate the closed-form confidence interval for PRB. Unlike COD and PRD,
+    this does not use bootstrapping.
 
+    See also:
+        :func:`boot_ci`
+    """
+    prb_model = _calculate_prb(estimate, sale_price)
+    prb_ci = prb_model.conf_int(alpha=alpha)[0].tolist()
 
-def prd_ci(assessed, sale_price, nboot=100, alpha=0.05):
-    return boot_ci(
-        prd, assessed=assessed, sale_price=sale_price, nboot=nboot, alpha=alpha
-    )
+    return prb_ci[0], prb_ci[1]
diff --git a/assesspy/data/ccao_sample.parquet b/assesspy/data/ccao_sample.parquet
diff --git a/assesspy/data/mki_ki.csv b/assesspy/data/mki_ki.csv
diff --git a/assesspy/data/quintos_sample.csv b/assesspy/data/quintos_sample.csv
@@ -0,0 +1,31 @@
+estimate,sale_price
+37299,32900
+40166,36000
+56317,54000
+66184,64500
+69487,68000
+71515,70000
+75338,74000
+81036,80000
+85673,84900
+85021,89000
+90046,94250
+94089,99000
+100227,105900
+103157,109000
+108290,115000
+117099,124500
+115347,129900
+119678,135000
+131631,149000
+137321,155800
+143974,163500
+153572,175000
+148457,179000
+153488,185600
+165040,199900
+176940,215000
+192959,235000
+180046,250000
+200240,279000
+211445,295000
diff --git a/assesspy/data/ratios_sample.parquet b/assesspy/data/ratios_sample.parquet