Skip to content

Commit

Permalink
Cleanup/lint with ruff
Browse files Browse the repository at this point in the history
  • Loading branch information
dfsnow committed Nov 7, 2024
1 parent f478d7a commit 9af8f19
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 91 deletions.
8 changes: 4 additions & 4 deletions assesspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@
from .formulas import (
cod,
cod_met,
ki,
mki,
mki_met,
prb,
prb_met,
prd,
prd_met,
ki,
mki,
mki_met,
)
from .load_data import ratios_sample
from .outliers import (
iqr_outlier,
is_outlier,
quantile_outlier,
)
from .load_data import ratios_sample
from .sales_chasing import detect_chasing
from .utils import check_inputs
10 changes: 6 additions & 4 deletions assesspy/ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ def boot_ci(fun, nboot=100, alpha=0.05, **kwargs):
kwargs.keys()
):
kwargs = (kwargs["assessed"], kwargs["sale_price"])
elif fun.__name__ == "prd" and not set(["assessed", "sale_price"]).issubset(
kwargs.keys()
):
raise Exception("PRD function expects arguments 'assessed' and 'sale_price'.")
elif fun.__name__ == "prd" and not set(
["assessed", "sale_price"]
).issubset(kwargs.keys()):
raise Exception(
    "PRD function expects arguments 'assessed' and 'sale_price'."
)
else:
kwargs = tuple(kwargs.values())

Expand Down
5 changes: 4 additions & 1 deletion assesspy/formulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,10 @@ def prb(assessed, sale_price, round=None):
prb_ci = prb_model.conf_int(alpha=0.05)[0].tolist()

if round is not None:
out = {"prb": np.round(prb_val, round), "95% ci": np.round(prb_ci, round)}
out = {
"prb": np.round(prb_val, round),
"95% ci": np.round(prb_ci, round),
}

else:
out = {"prb": prb_val, "95% ci": prb_ci}
Expand Down
4 changes: 3 additions & 1 deletion assesspy/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,7 @@ def ratios_sample():
:rtype: DataFrame
"""

stream = pkg_resources.resource_stream(__name__, "data/ratios_sample.parquet")
stream = pkg_resources.resource_stream(
__name__, "data/ratios_sample.parquet"
)
return pd.read_parquet(stream)
4 changes: 3 additions & 1 deletion assesspy/outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ def is_outlier(x, method="iqr", probs=[0.05, 0.95]):
ap.is_outlier(ap.ratios_sample().ratio)
"""

out = {"iqr": iqr_outlier(x), "quantile": quantile_outlier(x, probs)}.get(method)
out = {"iqr": iqr_outlier(x), "quantile": quantile_outlier(x, probs)}.get(
method
)

# Warn about removing data from small samples, as it can severely distort
# ratio study outcomes
Expand Down
4 changes: 3 additions & 1 deletion assesspy/sales_chasing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def detect_chasing_cdf(ratio, bounds=[0.98, 1.02], cdf_gap=0.03):
# Check if the largest difference is greater than the threshold and make
# sure it's within the specified boundaries
diff_loc = sorted_ratio[np.argmax(diffs)]
out = (max(diffs) > cdf_gap) & ((diff_loc > bounds[0]) & (diff_loc < bounds[1]))
out = (max(diffs) > cdf_gap) & (
(diff_loc > bounds[0]) & (diff_loc < bounds[1])
)

return out

Expand Down
4 changes: 3 additions & 1 deletion assesspy/tests/test_formulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,9 @@ def test_prb_met(self): # Standard met function
gini_data_sale.append(first_column)
gini_data_assessed.append(second_column)

gini_data_assessed = [int(value.replace('"', "")) for value in gini_data_assessed]
gini_data_assessed = [
int(value.replace('"', "")) for value in gini_data_assessed
]
gini_data_sale = [int(value.replace('"', "")) for value in gini_data_sale]

mki_out = assesspy.mki(gini_data_assessed, gini_data_sale)
Expand Down
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys

from sphinx_pyproject import SphinxConfig

sys.path.append(os.path.abspath("../.."))
Expand Down
201 changes: 123 additions & 78 deletions docs/source/notebooks/example-ratio-study.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,18 @@
"outputs": [],
"source": [
"%%capture\n",
"import json\n",
"import pandas as pd\n",
"\n",
"# Load 100k rows of 2020 residential (major class 2) assessment data\n",
"assessments = pd.read_json(\n",
" \"https://datacatalog.cookcountyil.gov/resource/uzyt-m557.json\" +\n",
" \"?$where=starts_with(class,'2')&tax_year=2020&$limit=100000\"\n",
" \"https://datacatalog.cookcountyil.gov/resource/uzyt-m557.json\"\n",
" + \"?$where=starts_with(class,'2')&tax_year=2020&$limit=100000\"\n",
")\n",
"\n",
"# Load 100k rows of 2020 sales data\n",
"sales = pd.read_json(\n",
" \"https://datacatalog.cookcountyil.gov/resource/wvhk-k5uv.json\" +\n",
" \"?$where=sale_price>10000&year=2020&$limit=100000\"\n",
" \"https://datacatalog.cookcountyil.gov/resource/wvhk-k5uv.json\"\n",
" + \"?$where=sale_price>10000&year=2020&$limit=100000\"\n",
")\n",
"\n",
"# read_json removes leading zeroes, add them back\n",
Expand Down Expand Up @@ -139,12 +138,12 @@
" assessments.rename(columns={\"tax_year\": \"year\"}),\n",
" id_vars=[\"pin\", \"year\", \"township_name\"],\n",
" value_vars=[\"mailed_tot\", \"certified_tot\", \"board_tot\"],\n",
" var_name = \"stage\",\n",
" value_name='assessed'\n",
" var_name=\"stage\",\n",
" value_name=\"assessed\",\n",
" ),\n",
" sales[[\"pin\", \"year\", \"sale_price\", \"is_multisale\"]],\n",
" on = [\"pin\", \"year\"],\n",
" how = \"inner\"\n",
" on=[\"pin\", \"year\"],\n",
" how=\"inner\",\n",
")\n",
"\n",
"# Remove multisales, then calculate the ratio for each property\n",
Expand Down Expand Up @@ -482,33 +481,53 @@
}
],
"source": [
"import warnings\n",
"\n",
"import numpy as np\n",
"\n",
"import assesspy as ap\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# For each town and stage, calculate COD, PRD, and PRB, and their respective\n",
"# confidence intervals then arrange by town name and stage of assessment\n",
"town_stats = combined[combined.assessed > 0].copy(deep=True)\n",
"town_stats['stage'] = town_stats.stage.astype('category').cat.reorder_categories(['mailed_tot', 'certified_tot', 'board_tot'])\n",
"town_stats[\"stage\"] = town_stats.stage.astype(\n",
" \"category\"\n",
").cat.reorder_categories([\"mailed_tot\", \"certified_tot\", \"board_tot\"])\n",
"town_stats = town_stats.groupby([\"township_name\", \"stage\"]).apply(\n",
" lambda x: pd.Series({\n",
" 'n':np.size(x['pin']),\n",
" 'cod':np.round(ap.cod(ratio = x['ratio']), 2),\n",
" 'cod_ci':np.round(ap.cod_ci(ratio = x['ratio']), 2),\n",
" 'prd':np.round(ap.prd(x['assessed'], x['sale_price']), 2),\n",
" 'prd_ci':np.round(ap.prd_ci(x['assessed'], x['sale_price']), 2),\n",
" 'prb':ap.prb(x['assessed'], x['sale_price'], 3)\n",
" })\n",
" lambda x: pd.Series(\n",
" {\n",
" \"n\": np.size(x[\"pin\"]),\n",
" \"cod\": np.round(ap.cod(ratio=x[\"ratio\"]), 2),\n",
" \"cod_ci\": np.round(ap.cod_ci(ratio=x[\"ratio\"]), 2),\n",
" \"prd\": np.round(ap.prd(x[\"assessed\"], x[\"sale_price\"]), 2),\n",
" \"prd_ci\": np.round(ap.prd_ci(x[\"assessed\"], x[\"sale_price\"]), 2),\n",
" \"prb\": ap.prb(x[\"assessed\"], x[\"sale_price\"], 3),\n",
" }\n",
" )\n",
")\n",
"\n",
"town_stats['prb_ci'] = town_stats.prb.str['95% ci']\n",
"town_stats['prb'] = town_stats.prb.str['prb']\n",
"town_stats['cod_met'] = town_stats.cod.apply(ap.cod_met)\n",
"town_stats['prd_met'] = town_stats.prd.apply(ap.prd_met)\n",
"town_stats['prb_met'] = town_stats.prb.apply(ap.prb_met)\n",
"town_stats = town_stats[['n', 'cod', 'cod_ci', 'cod_met', 'prd', 'prd_ci', 'prd_met', 'prb', 'prb_ci', 'prb_met']]\n",
"town_stats = town_stats[town_stats['n'] >= 70]\n",
"town_stats[\"prb_ci\"] = town_stats.prb.str[\"95% ci\"]\n",
"town_stats[\"prb\"] = town_stats.prb.str[\"prb\"]\n",
"town_stats[\"cod_met\"] = town_stats.cod.apply(ap.cod_met)\n",
"town_stats[\"prd_met\"] = town_stats.prd.apply(ap.prd_met)\n",
"town_stats[\"prb_met\"] = town_stats.prb.apply(ap.prb_met)\n",
"town_stats = town_stats[\n",
" [\n",
" \"n\",\n",
" \"cod\",\n",
" \"cod_ci\",\n",
" \"cod_met\",\n",
" \"prd\",\n",
" \"prd_ci\",\n",
" \"prd_met\",\n",
" \"prb\",\n",
" \"prb_ci\",\n",
" \"prb_met\",\n",
" ]\n",
"]\n",
"town_stats = town_stats[town_stats[\"n\"] >= 70]\n",
"\n",
"town_stats"
]
Expand Down Expand Up @@ -621,12 +640,16 @@
}
],
"source": [
"deciles = np.linspace(.1, .9, 9).round(1)\n",
"\n",
"median_ratios = pd.DataFrame(deciles, columns = ['Decile'])\n",
"median_ratios['Decile'] = (median_ratios.Decile * 100).astype(int).astype(str) + '%'\n",
"median_ratios['Sale Price'] = np.quantile(combined.sale_price, deciles)\n",
"median_ratios['Sale Price'] = median_ratios['Sale Price'].apply(lambda x: \"${:,.0f}\".format(x))\n",
"deciles = np.linspace(0.1, 0.9, 9).round(1)\n",
"\n",
"median_ratios = pd.DataFrame(deciles, columns=[\"Decile\"])\n",
"median_ratios[\"Decile\"] = (median_ratios.Decile * 100).astype(int).astype(\n",
" str\n",
") + \"%\"\n",
"median_ratios[\"Sale Price\"] = np.quantile(combined.sale_price, deciles)\n",
"median_ratios[\"Sale Price\"] = median_ratios[\"Sale Price\"].apply(\n",
" lambda x: \"${:,.0f}\".format(x)\n",
")\n",
"\n",
"median_ratios"
]
Expand Down Expand Up @@ -656,32 +679,39 @@
],
"source": [
"import matplotlib.pyplot as plt\n",
"plt.style.use('default')\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"plt.style.use(\"default\")\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"graph_data = combined\n",
"graph_data['rank'] = pd.qcut(graph_data['sale_price'], 10, labels=False)\n",
"graph_data['rank'] = graph_data['rank'] + 1\n",
"graph_data['decile'] = pd.qcut(graph_data['sale_price'] / 1000, 10, precision=0)\n",
"graph_data['decile'] = graph_data['decile'].astype(str).str.replace('(', '\\$')\n",
"graph_data['decile'] = graph_data['decile'].str.replace(', ', ' - \\$')\n",
"graph_data['decile'] = graph_data['decile'].str.replace('.0', 'K', regex=False)\n",
"graph_data['decile'] = graph_data['decile'].str.replace(']', '')\n",
"graph_data['decile'] = graph_data['decile'].str.replace(' - \\$9050K', '+', regex=False)\n",
"graph_data = graph_data.groupby(['rank', 'decile']).apply(\n",
" lambda x: pd.Series({\n",
" 'Median Sales Ratio':np.median(x['ratio']),\n",
" })\n",
" )\n",
"graph_data[\"rank\"] = pd.qcut(graph_data[\"sale_price\"], 10, labels=False)\n",
"graph_data[\"rank\"] = graph_data[\"rank\"] + 1\n",
"graph_data[\"decile\"] = pd.qcut(\n",
" graph_data[\"sale_price\"] / 1000, 10, precision=0\n",
")\n",
"graph_data[\"decile\"] = graph_data[\"decile\"].astype(str).str.replace(\"(\", \"\\$\")\n",
"graph_data[\"decile\"] = graph_data[\"decile\"].str.replace(\", \", \" - \\$\")\n",
"graph_data[\"decile\"] = graph_data[\"decile\"].str.replace(\".0\", \"K\", regex=False)\n",
"graph_data[\"decile\"] = graph_data[\"decile\"].str.replace(\"]\", \"\")\n",
"graph_data[\"decile\"] = graph_data[\"decile\"].str.replace(\n",
" \" - \\$9050K\", \"+\", regex=False\n",
")\n",
"graph_data = graph_data.groupby([\"rank\", \"decile\"]).apply(\n",
" lambda x: pd.Series(\n",
" {\n",
" \"Median Sales Ratio\": np.median(x[\"ratio\"]),\n",
" }\n",
" )\n",
")\n",
"\n",
"graph_data = graph_data.reset_index()\n",
"plt.scatter(graph_data['decile'], graph_data['Median Sales Ratio'])\n",
"plt.scatter(graph_data[\"decile\"], graph_data[\"Median Sales Ratio\"])\n",
"plt.xticks(rotation=45)\n",
"plt.xlabel('Decile')\n",
"plt.ylabel('Ratio')\n",
"plt.xlabel(\"Decile\")\n",
"plt.ylabel(\"Ratio\")\n",
"plt.suptitle(\"Median Sale Ratios: Open Data Sample\", fontsize=14)\n",
"plt.title(\"By decile of sale price in 2020\")\n",
"plt.gca().set_yticklabels([f'{x:.0%}' for x in plt.gca().get_yticks()])\n",
"plt.gca().set_yticklabels([f\"{x:.0%}\" for x in plt.gca().get_yticks()])\n",
"plt.grid()\n",
"plt.show()"
]
Expand Down Expand Up @@ -731,14 +761,17 @@
"# Plot to view discontinuity\n",
"ecdf_normal = ECDF(normal_ratios)\n",
"ecdf_chased = ECDF(chased_ratios)\n",
"plt.plot(ecdf_normal.x, ecdf_normal.y, color = 'blue')\n",
"plt.plot(ecdf_chased.x, ecdf_chased.y, color = 'red')\n",
"plt.xlabel('Ratio')\n",
"plt.ylabel('F(x)')\n",
"plt.plot(ecdf_normal.x, ecdf_normal.y, color=\"blue\")\n",
"plt.plot(ecdf_chased.x, ecdf_chased.y, color=\"red\")\n",
"plt.xlabel(\"Ratio\")\n",
"plt.ylabel(\"F(x)\")\n",
"plt.grid()\n",
"plt.show()\n",
"\n",
"{'Blue Chased?': ap.detect_chasing(normal_ratios), 'Red Chased?': ap.detect_chasing(chased_ratios)}"
"{\n",
" \"Blue Chased?\": ap.detect_chasing(normal_ratios),\n",
" \"Red Chased?\": ap.detect_chasing(chased_ratios),\n",
"}"
]
},
{
Expand Down Expand Up @@ -779,34 +812,46 @@
],
"source": [
"# Combine sale price and assessed value, calculate cumulative sums\n",
"gini_data = combined[['sale_price', 'assessed']].sort_values(by='sale_price')\n",
"\n",
"sale_price = gini_data['sale_price']\n",
"assessed = gini_data['assessed']\n",
"\n",
"lorenz_data_price = pd.DataFrame({\n",
" 'pct': np.concatenate(([0], np.cumsum(sale_price) / np.sum(sale_price))),\n",
" 'cum_pct': np.concatenate(([0], np.arange(1, len(sale_price) + 1) / len(sale_price)))\n",
"})\n",
"gini_data = combined[[\"sale_price\", \"assessed\"]].sort_values(by=\"sale_price\")\n",
"\n",
"sale_price = gini_data[\"sale_price\"]\n",
"assessed = gini_data[\"assessed\"]\n",
"\n",
"lorenz_data_price = pd.DataFrame(\n",
" {\n",
" \"pct\": np.concatenate(\n",
" ([0], np.cumsum(sale_price) / np.sum(sale_price))\n",
" ),\n",
" \"cum_pct\": np.concatenate(\n",
" ([0], np.arange(1, len(sale_price) + 1) / len(sale_price))\n",
" ),\n",
" }\n",
")\n",
"\n",
"lorenz_data_assessed = pd.DataFrame({\n",
" 'pct': np.concatenate(([0], np.cumsum(assessed) / np.sum(assessed))),\n",
" 'cum_pct': np.concatenate(([0], np.arange(1, len(assessed) + 1) / len(assessed)))\n",
"})\n",
"lorenz_data_assessed = pd.DataFrame(\n",
" {\n",
" \"pct\": np.concatenate(([0], np.cumsum(assessed) / np.sum(assessed))),\n",
" \"cum_pct\": np.concatenate(\n",
" ([0], np.arange(1, len(assessed) + 1) / len(assessed))\n",
" ),\n",
" }\n",
")\n",
"\n",
"# Plot Lorenz curves\n",
"fig, ax = plt.subplots()\n",
"\n",
"ax.plot(lorenz_data_price['cum_pct'], lorenz_data_price['pct'], color='blue')\n",
"ax.plot(lorenz_data_assessed['cum_pct'], lorenz_data_assessed['pct'], color='red')\n",
"ax.plot([0, 1], [0, 1], linestyle='dashed', color='green')\n",
"ax.plot(lorenz_data_price[\"cum_pct\"], lorenz_data_price[\"pct\"], color=\"blue\")\n",
"ax.plot(\n",
" lorenz_data_assessed[\"cum_pct\"], lorenz_data_assessed[\"pct\"], color=\"red\"\n",
")\n",
"ax.plot([0, 1], [0, 1], linestyle=\"dashed\", color=\"green\")\n",
"\n",
"ax.text(0.785, 0.1, 'Sale Price', color='blue', va='center')\n",
"ax.text(.9, 0.15, 'Assessed Price', color='red', ha='center', va='center')\n",
"ax.text(0.785, 0.1, \"Sale Price\", color=\"blue\", va=\"center\")\n",
"ax.text(0.9, 0.15, \"Assessed Price\", color=\"red\", ha=\"center\", va=\"center\")\n",
"\n",
"ax.set_title('Lorenz Curve for Sale and Assessed Values')\n",
"ax.set_xlabel('Percent of Properties')\n",
"ax.set_ylabel('Percent of Value')\n",
"ax.set_title(\"Lorenz Curve for Sale and Assessed Values\")\n",
"ax.set_xlabel(\"Percent of Properties\")\n",
"ax.set_ylabel(\"Percent of Value\")\n",
"\n",
"plt.show()"
]
Expand Down

0 comments on commit 9af8f19

Please sign in to comment.