diff --git a/docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb b/docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb
index d434df4a5..484e3995c 100644
--- a/docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb
+++ b/docs/source/notebooks/clv/dev/beta_geo_beta_binom.ipynb
@@ -2,46 +2,5899 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 45,
   "id": "5e06e043-4631-47ae-a658-a9a928ff15e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
+    "import nutpie\n",
    "import pandas as pd\n",
    "from lifetimes import BetaGeoBetaBinomFitter\n",
-    "from lifetimes.datasets import load_donations, load_cdnow_summary_data_with_monetary_value\n",
    "\n",
    "import pymc as pm\n",
-    "from pymc_marketing.clv.distributions import BetaGeoBetaBinom"
+    "from pymc_marketing.clv import BetaGeoBetaBinomModel\n",
+    "from pymc_marketing.prior import Prior"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
-   "id": "7b7431fc-1bad-41b6-8ec4-ec843a81709d",
+   "execution_count": 28,
+   "id": "fe652337-a582-4cce-bae0-a104316a0a4a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\"https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/bgbb_donations.csv\")\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "fae987d6-d412-4099-a9d1-bc985b92287d",
   "metadata": {},
   "outputs": [
    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Sampling: [alpha, beta, beta_geo_beta_binom, delta, gamma]\n"
-     ]
+     "data": {
+      "text/plain": [
+       "BG/BB\n",
+       "            alpha ~ HalfFlat()\n",
+       "             beta ~ HalfFlat()\n",
+       "            gamma ~ HalfFlat()\n",
+       "            delta ~ HalfFlat()\n",
+       "recency_frequency ~ BetaGeoBetaBinom(alpha, beta, gamma, delta, )"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_config = {\n",
+    "    \"alpha_prior\": Prior(\"HalfFlat\"),\n",
+    "    \"beta_prior\": Prior(\"HalfFlat\"),\n",
+    "    \"gamma_prior\": Prior(\"HalfFlat\"),\n",
+    "    \"delta_prior\": Prior(\"HalfFlat\"),\n",
+    "}\n",
+    "\n",
+    "model = BetaGeoBetaBinomModel(data=data, model_config=model_config)\n",
+    "model.build_model()\n",
+    "model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "bc154487-8d4c-4015-9fe1-9bed0ac433fb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/svg+xml": [
+       "[graphviz SVG omitted: alpha, beta, gamma, delta ~ HalfFlat -> recency_frequency ~ BetaGeoBetaBinom; customer_id (11104) x obs_var (2)]"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pm.model_to_graphviz(model.model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": 
"8bd37568-78cf-49c7-9fe8-5b2640e37077", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1021d77cf42f4b69aed5461ad9c7ba90", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { "text/html": [ "\n", - "
\n", - "
\n", - "
arviz.InferenceData
\n", - "
\n", - "
    \n", - " \n", + "
    \n", + "
    \n", + "
    arviz.InferenceData
    \n", + "
    \n", + "
      \n", + " \n", + "
    • \n", + " \n", + " \n", + "
      \n", + "
      \n", + "
        \n", + "
        \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
        <xarray.Dataset> Size: 48B\n",
        +       "Dimensions:  (chain: 1, draw: 1)\n",
        +       "Coordinates:\n",
        +       "  * chain    (chain) int64 8B 0\n",
        +       "  * draw     (draw) int64 8B 0\n",
        +       "Data variables:\n",
        +       "    alpha    (chain, draw) float64 8B 1.204\n",
        +       "    beta     (chain, draw) float64 8B 0.7497\n",
        +       "    delta    (chain, draw) float64 8B 2.784\n",
        +       "    gamma    (chain, draw) float64 8B 0.6568\n",
        +       "Attributes:\n",
        +       "    created_at:                 2024-09-12T21:32:31.746938+00:00\n",
        +       "    arviz_version:              0.18.0\n",
        +       "    inference_library:          pymc\n",
        +       "    inference_library_version:  5.13.0

        \n", + "
      \n", + "
      \n", + "
    • \n", + " \n", + "
    • \n", + " \n", + " \n", + "
      \n", + "
      \n", + "
        \n", + "
        \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
        <xarray.Dataset> Size: 267kB\n",
        +       "Dimensions:            (customer_id: 11104, obs_var: 2)\n",
        +       "Coordinates:\n",
        +       "  * customer_id        (customer_id) int64 89kB 0 1 2 3 ... 11101 11102 11103\n",
        +       "  * obs_var            (obs_var) <U9 72B 'recency' 'frequency'\n",
        +       "Data variables:\n",
        +       "    recency_frequency  (customer_id, obs_var) float64 178kB 0.0 0.0 ... 6.0 6.0\n",
        +       "Attributes:\n",
        +       "    created_at:                 2024-09-12T21:32:31.749978+00:00\n",
        +       "    arviz_version:              0.18.0\n",
        +       "    inference_library:          pymc\n",
        +       "    inference_library_version:  5.13.0

        \n", + "
      \n", + "
      \n", + "
    • \n", + " \n", + "
    • \n", + " \n", + " \n", + "
      \n", + "
      \n", + "
        \n", + "
        \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
        <xarray.Dataset> Size: 444kB\n",
        +       "Dimensions:      (index: 11104)\n",
        +       "Coordinates:\n",
        +       "  * index        (index) int64 89kB 0 1 2 3 4 ... 11099 11100 11101 11102 11103\n",
        +       "Data variables:\n",
        +       "    customer_id  (index) int64 89kB 0 1 2 3 4 ... 11099 11100 11101 11102 11103\n",
        +       "    frequency    (index) int64 89kB 0 0 0 0 0 0 0 0 0 0 ... 6 6 6 6 6 6 6 6 6 6\n",
        +       "    recency      (index) int64 89kB 0 0 0 0 0 0 0 0 0 0 ... 6 6 6 6 6 6 6 6 6 6\n",
        +       "    T            (index) int64 89kB 6 6 6 6 6 6 6 6 6 6 ... 6 6 6 6 6 6 6 6 6 6

        \n", + "
      \n", + "
      \n", + "
    • \n", + " \n", + "
    \n", + "
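+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick side-by-side of the two fits (sketch cell, not executed; assumes the `params_` attribute that `lifetimes` fitters expose):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical comparison of the MAP estimates with the lifetimes fit\n",
+    "pd.DataFrame(\n",
+    "    {\n",
+    "        \"pymc_marketing_map\": model.fit_summary(),\n",
+    "        \"lifetimes\": pd.Series(bgbb.params_),\n",
+    "    }\n",
+    ")"
+   ]
+  },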
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "80d11cc8-98fb-426e-89b2-693f0a8d22fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/svg+xml": [
+       "[graphviz SVG omitted: phi_purchase ~ Uniform and kappa_purchase ~ Pareto -> alpha, beta ~ Deterministic; phi_dropout ~ Uniform and kappa_dropout ~ Pareto -> gamma, delta ~ Deterministic; all -> recency_frequency ~ BetaGeoBetaBinom; customer_id (11104) x obs_var (2)]"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 51,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mcmc_model = BetaGeoBetaBinomModel(data=data)\n",
+    "mcmc_model.build_model()\n",
+    "pm.model_to_graphviz(mcmc_model.model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "a27cd610-74eb-4872-b5e3-4868f04ede6d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Sampling: [kappa_dropout, kappa_purchase, phi_dropout, phi_purchase, recency_frequency]\n"
+     ]
+    }
+   ],
+   "source": [
+    "with mcmc_model.model:\n",
+    "    prior_idata = pm.sample_prior_predictive()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "782608ed-e302-4591-9d51-a826086f5d98",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "[arviz InferenceData HTML repr omitted: prior (alpha, beta, delta, gamma, kappa_dropout, kappa_purchase, phi_dropout, phi_purchase; chain: 1, draw: 500), prior_predictive (recency_frequency; chain: 1, draw: 500, customer_id: 11104, obs_var: 2), observed_data (recency_frequency; customer_id: 11104, obs_var: 2)]"
+      ],
+      "text/plain": [
+       "Inference data with groups:\n",
+       "\t> prior\n",
+       "\t> prior_predictive\n",
+       "\t> observed_data"
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prior_idata"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eeec297b-0c6f-4146-b19c-6e7aa365c470",
+   "metadata": {},
+   "source": [
+    "`nutpie` is about 3x faster than the default NUTS sampler in `pymc`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "310e576b-8deb-45b9-8ba2-ce0904127efd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[three NumbaWarnings omitted: nutpie cannot cache the compiled functions because they use dynamic globals such as ctypes pointers and large global arrays]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "[progress-bar CSS omitted]\n"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "[nutpie sampler progress HTML omitted: 4 chains finished, 2000 draws each, 0 divergences, step sizes 0.647-0.693, sampled for about 6 minutes]
    \n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n",
+       "[arviz InferenceData HTML repr omitted: posterior (alpha, beta, delta, gamma, kappa_dropout, kappa_purchase, phi_dropout, phi_purchase plus interval__ transforms; chain: 4, draw: 1000; inference_library nutpie 0.13.2, sampling_time 371.9s), sample_stats (depth, diverging, energy, energy_error, index_in_trajectory, logp, maxdepth_reached, mean_tree_accept, mean_tree_accept_sym, n_steps, step_size, step_size_bar; chain: 4, draw: 1000), observed_data (recency_frequency; customer_id: 11104, obs_var: 2), fit_data (customer_id, frequency, recency, T; index: 11104)]
    • \n", - " \n", - " \n", + " \n", + " \n", "
      \n", "
      \n", "
        \n", @@ -408,43 +6261,121 @@ " stroke: currentColor;\n", " fill: currentColor;\n", "}\n", - "
        <xarray.Dataset> Size: 584B\n",
        -       "Dimensions:                    (chain: 1, draw: 10, beta_geo_beta_binom_dim_0: 2)\n",
        +       "
        <xarray.Dataset> Size: 392kB\n",
        +       "Dimensions:                    (chain: 4, draw: 1000)\n",
                "Coordinates:\n",
        -       "  * chain                      (chain) int64 8B 0\n",
        -       "  * draw                       (draw) int64 80B 0 1 2 3 4 5 6 7 8 9\n",
        -       "  * beta_geo_beta_binom_dim_0  (beta_geo_beta_binom_dim_0) int64 16B 0 1\n",
        +       "  * chain                      (chain) int64 32B 0 1 2 3\n",
        +       "  * draw                       (draw) int64 8kB 0 1 2 3 4 ... 996 997 998 999\n",
                "Data variables:\n",
        -       "    alpha                      (chain, draw) float64 80B 1.206 1.203 ... 1.205\n",
        -       "    beta                       (chain, draw) float64 80B 0.7491 ... 0.7506\n",
        -       "    beta_geo_beta_binom        (chain, draw, beta_geo_beta_binom_dim_0) float64 160B ...\n",
        -       "    delta                      (chain, draw) float64 80B 2.784 2.782 ... 2.782\n",
        -       "    gamma                      (chain, draw) float64 80B 0.6572 ... 0.6568\n",
        +       "    alpha                      (chain, draw) float64 32kB 1.217 1.217 ... 1.171\n",
        +       "    beta                       (chain, draw) float64 32kB 0.2809 ... 0.7511\n",
        +       "    delta                      (chain, draw) float64 32kB 1.244 1.244 ... 3.435\n",
        +       "    gamma                      (chain, draw) float64 32kB 1.931 1.931 ... 0.7753\n",
        +       "    kappa_dropout              (chain, draw) float64 32kB 3.175 3.175 ... 4.211\n",
        +       "    kappa_dropout_interval__   (chain, draw) float64 32kB 0.7772 ... 1.166\n",
        +       "    kappa_purchase             (chain, draw) float64 32kB 1.498 1.498 ... 1.922\n",
        +       "    kappa_purchase_interval__  (chain, draw) float64 32kB -0.6972 ... -0.08079\n",
        +       "    phi_dropout                (chain, draw) float64 32kB 0.6081 ... 0.1841\n",
        +       "    phi_dropout_interval__     (chain, draw) float64 32kB 0.4393 ... -1.489\n",
        +       "    phi_purchase               (chain, draw) float64 32kB 0.8125 ... 0.6093\n",
        +       "    phi_purchase_interval__    (chain, draw) float64 32kB 1.466 1.466 ... 0.4444\n",
                "Attributes:\n",
        -       "    created_at:                 2024-05-26T23:06:22.691294+00:00\n",
        -       "    arviz_version:              0.18.0\n",
        -       "    inference_library:          pymc\n",
        -       "    inference_library_version:  5.13.0

    \n", + " created_at: 2024-09-12T21:27:00.655861+00:00\n", + " arviz_version: 0.18.0

\n", " \n", " \n", " \n", " \n", "
  • \n", - " \n", - " \n", + " \n", + " \n", "
    \n", "
    \n", "
      \n", @@ -811,15 +6742,94 @@ " stroke: currentColor;\n", " fill: currentColor;\n", "}\n", - "
      <xarray.Dataset> Size: 4B\n",
      -       "Dimensions:  ()\n",
      +       "
      <xarray.Dataset> Size: 336kB\n",
      +       "Dimensions:               (chain: 4, draw: 1000)\n",
      +       "Coordinates:\n",
      +       "  * chain                 (chain) int64 32B 0 1 2 3\n",
      +       "  * draw                  (draw) int64 8kB 0 1 2 3 4 5 ... 995 996 997 998 999\n",
              "Data variables:\n",
      -       "    T        int32 4B 6\n",
      +       "    depth                 (chain, draw) uint64 32kB 0 0 2 1 0 0 ... 3 2 3 2 3 3\n",
      +       "    diverging             (chain, draw) bool 4kB True True False ... False False\n",
      +       "    energy                (chain, draw) float64 32kB 4.085e+04 ... 3.324e+04\n",
      +       "    energy_error          (chain, draw) float64 32kB 0.0 0.0 ... -0.0498 0.6378\n",
      +       "    index_in_trajectory   (chain, draw) int64 32kB 0 0 2 -1 0 ... -3 -4 -3 4 -5\n",
      +       "    logp                  (chain, draw) float64 32kB -4.084e+04 ... -3.323e+04\n",
      +       "    maxdepth_reached      (chain, draw) bool 4kB False False ... False False\n",
      +       "    mean_tree_accept      (chain, draw) float64 32kB 0.0 0.0 ... 0.9126 0.867\n",
      +       "    mean_tree_accept_sym  (chain, draw) float64 32kB 0.0 0.0 ... 0.9539 0.918\n",
      +       "    n_steps               (chain, draw) uint64 32kB 0 1 1 3 1 1 ... 15 7 11 3 15\n",
      +       "    step_size             (chain, draw) float64 32kB 3.2 7.472 ... 0.6007 0.6479\n",
      +       "    step_size_bar         (chain, draw) float64 32kB 3.2 7.472 ... 0.6418 0.6418\n",
              "Attributes:\n",
      -       "    created_at:                 2024-05-26T23:06:22.695749+00:00\n",
      -       "    arviz_version:              0.18.0\n",
      -       "    inference_library:          pymc\n",
      -       "    inference_library_version:  5.13.0

    \n", + " created_at: 2024-09-12T21:27:00.663215+00:00\n", + " arviz_version: 0.18.0
    \n", " \n", " \n", "
  • \n", @@ -1170,387 +7180,23 @@ ], "text/plain": [ "Inference data with groups:\n", - "\t> prior\n", - "\t> constant_data" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "T_true = 6\n", - "alpha_true = 1.204\n", - "beta_true = 0.750\n", - "gamma_true = 0.657\n", - "delta_true = 2.783\n", - "\n", - "with pm.Model():\n", - " alpha = pm.Normal(name=\"alpha\", mu=alpha_true, sigma=1e-3)\n", - " beta = pm.Normal(name=\"beta\", mu=beta_true, sigma=1e-3)\n", - " gamma = pm.Normal(name=\"gamma\", mu=gamma_true, sigma=1e-3)\n", - " delta = pm.Normal(name=\"delta\", mu=delta_true, sigma=1e-3)\n", - "\n", - " T = pm.MutableData(name=\"T\", value=np.array(T_true))\n", - "\n", - " BetaGeoBetaBinom(\n", - " name=\"beta_geo_beta_binom\",\n", - " alpha=alpha,\n", - " beta=beta,\n", - " gamma=gamma,\n", - " delta=delta,\n", - " T=T,\n", - " )\n", - " prior = pm.sample_prior_predictive(samples=10)\n", - "\n", - "# recency = prior[\"prior\"][\"beta_geo_beta_binom\"][0][:,0]\n", - "# #frequency = prior[\"prior\"][\"beta_geo_beta_binom\"][1]\n", - "# recency.mean()\n", - "prior" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "8ff42e71-fc43-4d45-8446-7205e3d37bce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ -3.94031398, -10.25427751, -6.82582822])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "value = np.array([[1.5, 1], [5.3, 4], [6, 2]])\n", - "alpha = 0.55\n", - "beta = 10.58\n", - "gamma = 0.61\n", - "delta = 11.67\n", - "T = 12\n", - "\n", - "BetaGeoBetaBinomFitter._loglikelihood((alpha, beta, gamma, delta), value[..., 1], value[..., 0], T)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "371bb7a1-9f5c-4bdf-81b1-a9504261badb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "discrete_noncontract_df = load_donations()\n", - "\n", - "periods = 6\n", - "bgbb = BetaGeoBetaBinomFitter().fit(discrete_noncontract_df['frequency'].values,\n", - " discrete_noncontract_df['recency'].values,\n", - " discrete_noncontract_df['periods'].values,\n", - " discrete_noncontract_df['weights'].values)\n", - "bgbb" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f8052f01-5aca-48fd-917e-1f2bbeb6326f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['conditional_expected_number_of_purchases_up_to_time', 'conditional_probability_alive', 'expected_number_of_transactions_in_first_n_periods', 'fit', 'load_model', 'save_model', 'summary']\n" - ] - } - ], - "source": [ - "method_list = [method for method in dir(BetaGeoBetaBinomFitter) if not method.startswith('_')]\n", - "print(method_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "f4f56b83-f830-4610-8f4f-e67ff242967e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0.072863\n", - "1 0.085696\n", - "2 0.314238\n", - "3 0.593853\n", - "4 0.839396\n", - "5 1.021689\n", - "6 1.147885\n", - "7 0.119121\n", - "8 0.536111\n", - "9 1.057604\n", - "10 1.443042\n", - "11 1.668817\n", - "12 0.223595\n", - "13 1.034572\n", - "14 1.804703\n", - "15 2.189749\n", - "16 0.583192\n", - "17 2.030024\n", - "18 2.710681\n", - "19 1.812942\n", - "20 3.231612\n", - "21 3.752544\n", - "dtype: float64" - ] - }, - "execution_count": 14, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# equation 13 in paper\n", - "bgbb.conditional_expected_number_of_purchases_up_to_time(5,\n", - " discrete_noncontract_df['frequency'],\n", - " discrete_noncontract_df['recency'],\n", - " discrete_noncontract_df['periods'])" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "7ea5dc42-160f-4f96-9e16-a97f01dd4bdc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 0.070072\n", - "1 0.045012\n", - "2 0.165056\n", - "3 0.311927\n", - "4 0.440900\n", - "5 0.536651\n", - "6 0.602936\n", - "7 0.043038\n", - "8 0.193695\n", - "9 0.382108\n", - "10 0.521365\n", - "11 0.602936\n", - "12 0.061566\n", - "13 0.284864\n", - "14 0.496916\n", - "15 0.602936\n", - "16 0.129719\n", - "17 0.451538\n", - "18 0.602936\n", - "19 0.338249\n", - "20 0.602936\n", - "21 0.602936\n", - "dtype: float64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# equation 11 in paper\n", - "bgbb.conditional_probability_alive(10,\n", - " discrete_noncontract_df['frequency'],\n", - " discrete_noncontract_df['recency'],\n", - " discrete_noncontract_df['periods'])" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "96bcab46-7279-400a-82c1-e8b509ece774", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
    " - ], - "text/plain": [ - " model\n", - "frequency \n", - "0 3454.881979\n", - "1 1888.654343\n", - "2 1348.882330\n", - "3 1113.378533\n", - "4 1017.922413\n", - "5 1027.166049\n", - "6 1253.114353" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# TODO: write and test (8) as a replacement. Compare against just aggregating means across the exploded DF \n", - "# TODO: Can the arviz functions in the BetaGeoBetaBinom distribution block preclude the need for this?\n", - "# TODO: Replace this with (9) or (10) in a future PR, since that expression can predict interval ranges\n", - "\n", - "# equation 7 in paper, but that's for probabilities. should it be 8 for predicting mean n?\n", - "# yeah, this function should be renamed for clarity. \n", - "# it distributes customers in the dataset across n transaction opportunies\n", - "# it works better as an evaluation function, since it assumes a fixed customer population size\n", - "# if n > n_periods, it will keep right on predicting. This may be a bug\n", - "bgbb.expected_number_of_transactions_in_first_n_periods(n=6)" - ] - }, - { - "cell_type": "markdown", - "id": "9d55e986-d1f2-4c0d-8c25-3e289e90d5fe", - "metadata": {}, - "source": [ - "### Expected transactions in N periods\n", - "This expression will blow up to inf with large values of n (n=167 in this example). Recalculating on the log scale will allow for larger values, but this isn't possible if gamma < 1 because term1 will be negative." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "2e82f5b4-1b4a-4477-843b-58cbd411d348", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "average of 1.938137499995133 purchases expected in 5 opportunities\n" - ] - } - ], - "source": [ - "from scipy import special\n", - "from numpy import log,exp\n", - "\n", - "n = 5\n", - "alpha,beta,delta,gamma = bgbb._unload_params('alpha','beta','delta','gamma')\n", - "\n", - "# add a larger gamma value for testing\n", - "#gamma = .9\n", - "\n", - "log_scale = False\n", - "\n", - "if not log_scale:\n", - " term1 = alpha/(alpha+beta)*delta/(gamma-1)\n", - " term2 = 1-(special.gamma(gamma+delta))/special.gamma(gamma+delta+n)*(special.gamma(1+delta+n))/special.gamma(1+delta)\n", - " expected_purchases_n_periods = term1 * term2\n", - "else:\n", - " term1 = log(alpha/(alpha+beta)) + log(delta/(gamma-1))\n", - " term2 = special.gammaln(gamma+delta) - special.gammaln(gamma+delta+n) + special.gammaln(1+delta+n) - special.gammaln(1+delta)\n", - " expected_purchases_n_periods = exp(term1) - exp(term2)\n", - "\n", - "print(f'average of {expected_purchases_n_periods} purchases expected in {n} opportunities')" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "5186cf4d-710d-4e85-bef9-b66ccced5586", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1.203522393608101, 0.7497163581757842, 2.783441982887136, 0.6567181695498788]" + "Warmup iterations saved (warmup_*)." 
]
    },
-   "execution_count": 17,
+   "execution_count": 52,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-    "bgbb._unload_params('alpha','beta','delta','gamma')"
+    "# add warning suppression here\n",
+    "mcmc_model.fit(nuts_sampler=\"nutpie\")"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "80d11cc8-98fb-426e-89b2-693f0a8d22fa",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
 ],
 "metadata": {
diff --git a/pymc_marketing/clv/__init__.py b/pymc_marketing/clv/__init__.py
index f80ce9a8b..be0ffa1a7 100644
--- a/pymc_marketing/clv/__init__.py
+++ b/pymc_marketing/clv/__init__.py
@@ -14,6 +14,7 @@
 """CLV models and utilities."""
 
 from pymc_marketing.clv.models import (
+    BetaGeoBetaBinomModel,
     BetaGeoModel,
     GammaGammaModel,
     GammaGammaModelIndividual,
@@ -34,6 +35,7 @@
 
 __all__ = (
     "BetaGeoModel",
+    "BetaGeoBetaBinomModel",
     "ParetoNBDModel",
     "GammaGammaModel",
     "GammaGammaModelIndividual",
diff --git a/pymc_marketing/clv/distributions.py b/pymc_marketing/clv/distributions.py
index e8a6f031e..77592167e 100644
--- a/pymc_marketing/clv/distributions.py
+++ b/pymc_marketing/clv/distributions.py
@@ -601,23 +601,17 @@
     def logp(value, alpha, beta, gamma, delta, T):
         """Log-likelihood of the distribution."""
         t_x = pt.atleast_1d(value[..., 0])
         x = pt.atleast_1d(value[..., 1])
-        scalar_case = t_x.type.broadcastable == (True,)
 
         for param in (t_x, x, alpha, beta, gamma, delta, T):
             if param.type.ndim > 1:
                 raise NotImplementedError(
                     f"BetaGeoBetaBinom logp only implemented for vector parameters, got ndim={param.type.ndim}"
                 )
-            if scalar_case:
-                if param.type.broadcastable == (False,):
-                    raise NotImplementedError(
-                        f"Parameter {param} cannot be larger than scalar value"
-                    )
 
         # Broadcast all the parameters so they are sequences.
         # Potentially inefficient, but otherwise ugly logic needed to unpack arguments in the scan function,
         # since sequences always precede non-sequences.
-        _, alpha, beta, gamma, delta, T = pt.broadcast_arrays(
+        t_x, alpha, beta, gamma, delta, T = pt.broadcast_arrays(
             t_x, alpha, beta, gamma, delta, T
         )
 
diff --git a/pymc_marketing/clv/models/__init__.py b/pymc_marketing/clv/models/__init__.py
index 2c3a5a375..1bf31321c 100644
--- a/pymc_marketing/clv/models/__init__.py
+++ b/pymc_marketing/clv/models/__init__.py
@@ -16,6 +16,7 @@
 
 from pymc_marketing.clv.models.basic import CLVModel
 from pymc_marketing.clv.models.beta_geo import BetaGeoModel
+from pymc_marketing.clv.models.beta_geo_beta_binom import BetaGeoBetaBinomModel
 from pymc_marketing.clv.models.gamma_gamma import (
     GammaGammaModel,
     GammaGammaModelIndividual,
@@ -25,6 +26,7 @@
 
 __all__ = (
     "CLVModel",
+    "BetaGeoBetaBinomModel",
     "GammaGammaModel",
     "GammaGammaModelIndividual",
     "BetaGeoModel",
diff --git a/pymc_marketing/clv/models/basic.py b/pymc_marketing/clv/models/basic.py
index cd8ddeda7..473799349 100644
--- a/pymc_marketing/clv/models/basic.py
+++ b/pymc_marketing/clv/models/basic.py
@@ -61,6 +61,7 @@
     def _validate_cols(
         data: pd.DataFrame,
         required_cols: Sequence[str],
         must_be_unique: Sequence[str] = (),
+        must_be_homogenous: Sequence[str] = (),
     ):
         existing_columns = set(data.columns)
         n = data.shape[0]
@@ -71,6 +72,11 @@
             if required_col in must_be_unique:
                 if data[required_col].nunique() != n:
                     raise ValueError(f"Column {required_col} has duplicate entries")
+            if required_col in must_be_homogenous:
+                if data[required_col].nunique() != 1:
+                    raise ValueError(
+                        f"Column {required_col} has non-homogeneous entries"
+                    )
 
     def __repr__(self) -> str:
         """Representation of the model."""
diff --git a/pymc_marketing/clv/models/beta_geo_beta_binom.py b/pymc_marketing/clv/models/beta_geo_beta_binom.py
new file mode 100644
index 000000000..45f8b68f0
--- /dev/null
+++ b/pymc_marketing/clv/models/beta_geo_beta_binom.py
@@ -0,0 +1,701 @@
+# Copyright 2024 The PyMC Labs Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Beta-Geometric/Beta-Binomial Model."""
+
+from collections.abc import Sequence
+from typing import Literal
+
+import numpy as np
+import pandas as pd
+import pymc as pm
+import pytensor.tensor as pt
+import xarray
+from numpy import exp
+from pymc.util import RandomState
+from pytensor.graph import vectorize_graph
+from scipy.special import betaln, gammaln
+
+from pymc_marketing.clv.distributions import BetaGeoBetaBinom
+from pymc_marketing.clv.models.basic import CLVModel
+from pymc_marketing.clv.utils import to_xarray
+from pymc_marketing.model_config import ModelConfig
+from pymc_marketing.prior import Prior
+
+
+class BetaGeoBetaBinomModel(CLVModel):
+    """Beta-Geometric/Beta-Binomial Model (BG/BB).
+
+    CLV model for non-contractual, discrete purchase opportunities, introduced by Fader et al. [1]_.
+
+    The BG/BB model assumes the probability a customer will become inactive follows a Beta distribution,
+    and the probability of making a purchase is also Beta-distributed while customers are still active.
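+
+    A minimal generative sketch of these assumptions (illustrative only; the
+    Beta parameter values below are arbitrary, not package defaults):
+
+    .. code-block:: python
+
+        import numpy as np
+
+        rng = np.random.default_rng(0)
+        p = rng.beta(1.2, 0.75)  # purchase probability while active ~ Beta(alpha, beta)
+        theta = rng.beta(0.66, 2.78)  # per-opportunity dropout probability ~ Beta(gamma, delta)
+        x = t_x = 0  # frequency and recency
+        alive = True
+        for t in range(1, 7):  # T = 6 purchase opportunities
+            if alive and rng.random() < p:
+                x, t_x = x + 1, t
+            alive = alive and rng.random() >= theta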
+
+    This model requires data to be summarized by *recency*, *frequency*, and *T* for each customer.
+    *T* should be the same value across all customers.
+
+    Parameters
+    ----------
+    data : ~pandas.DataFrame
+        DataFrame containing the following columns:
+
+        * `customer_id`: Unique customer identifier
+        * `frequency`: Number of repeat purchases
+        * `recency`: Purchase opportunities between the first and the last purchase
+        * `T`: Total purchase opportunities.
+          Model assumptions require *T >= recency* and all customers share the same value for *T*.
+
+    model_config : dict, optional
+        Dictionary containing model parameters:
+
+        * `alpha_prior`: Shape parameter of purchase process; defaults to `phi_purchase_prior` * `kappa_purchase_prior`
+        * `beta_prior`: Shape parameter of purchase process; defaults to `1-phi_purchase_prior` * `kappa_purchase_prior`
+        * `gamma_prior`: Shape parameter of dropout process; defaults to `phi_dropout_prior` * `kappa_dropout_prior`
+        * `delta_prior`: Shape parameter of dropout process; defaults to `1-phi_dropout_prior` * `kappa_dropout_prior`
+        * `phi_purchase_prior`: Nested prior for alpha and beta priors; defaults to `Prior("Uniform", lower=0, upper=1)`
+        * `kappa_purchase_prior`: Nested prior for alpha and beta priors; defaults to `Prior("Pareto", alpha=1, m=1)`
+        * `phi_dropout_prior`: Nested prior for gamma and delta priors; defaults to `Prior("Uniform", lower=0, upper=1)`
+        * `kappa_dropout_prior`: Nested prior for gamma and delta priors; defaults to `Prior("Pareto", alpha=1, m=1)`
+
+        If not provided, the model will use default priors specified in the `default_model_config` class attribute.
+    sampler_config : dict, optional
+        Dictionary of sampler parameters. Defaults to None.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        import pandas as pd
+        import pymc as pm
+
+        from pymc_marketing.prior import Prior
+        from pymc_marketing.clv import BetaGeoBetaBinomModel, rfm_summary
+
+        rfm_df = rfm_summary(raw_data, 'id_col_name', 'date_col_name')
+
+        # Initialize model with customer data; `model_config` parameter is optional
+        model = BetaGeoBetaBinomModel(
+            data=rfm_df,
+            model_config={
+                "alpha_prior": Prior("HalfFlat"),
+                "beta_prior": Prior("HalfFlat"),
+                "gamma_prior": Prior("HalfFlat"),
+                "delta_prior": Prior("HalfFlat"),
+            },
+        )
+
+        # Fit model quickly to large datasets via Maximum a Posteriori
+        model.fit(fit_method='map')
+        print(model.fit_summary())
+
+        # Fit with the default 'mcmc' for more informative predictions and reliable performance on smaller datasets
+        model.fit(fit_method='mcmc')
+        print(model.fit_summary())
+
+        # Predict number of purchases for customers over the next 10 time periods
+        expected_purchases = model.expected_purchases(
+            data=rfm_df,
+            future_t=10,
+        )
+
+        # Predict probability of customer making 'n' purchases over 't' time periods
+        # Data parameter is omitted here because predictions are run on the original dataset
+        expected_num_purchases = model.expected_purchase_probability(
+            n=[0, 1, 2, 3],
+            future_t=[10, 20, 30, 40],
+        )
+
+        new_data = pd.DataFrame(
+            data = {
+                "customer_id": [0, 1, 2, 3],
+                "frequency": [5, 2, 1, 8],
+                "recency": [7, 4, 2.5, 11],
+                "T": [12, 12, 12, 12]
+            }
+        )
+
+        # Predict probability customers will still be active in 'future_t' time periods
+        probability_alive = model.expected_probability_alive(
+            data=new_data,
+            future_t=[0, 3, 6, 9],
+        )
+
+        # Predict number of purchases for a new customer over 't' time periods.
+        expected_purchases_new_customer = model.expected_purchases_new_customer(
+            t=[2, 5, 7, 10],
+        )
+
+    References
+    ----------
+    .. [1] Peter Fader, Bruce Hardie, and Jen Shang.
+       "Customer-Base Analysis in a Discrete-Time Noncontractual Setting".
+       Marketing Science, Vol. 29, No. 6 (Nov-Dec, 2010), pp. 1086-1108.
+       https://www.brucehardie.com/papers/020/fader_et_al_mksc_10.pdf
+    """
+
+    _model_type = "BG/BB"  # Beta-Geometric, Beta-Binomial Distribution
+
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        *,
+        model_config: ModelConfig | None = None,
+        sampler_config: dict | None = None,
+    ):
+        super().__init__(
+            data=data,
+            model_config=model_config,
+            sampler_config=sampler_config,
+            non_distributions=None,
+        )
+        self._validate_cols(
+            data,
+            required_cols=[
+                "customer_id",
+                "frequency",
+                "recency",
+                "T",
+            ],
+            must_be_unique=["customer_id"],
+            must_be_homogenous=["T"],
+        )
+
+    @property
+    def default_model_config(self) -> ModelConfig:
+        """Default model configuration."""
+        return {
+            "phi_purchase_prior": Prior("Uniform", lower=0, upper=1),
+            "kappa_purchase_prior": Prior("Pareto", alpha=1, m=1),
+            "phi_dropout_prior": Prior("Uniform", lower=0, upper=1),
+            "kappa_dropout_prior": Prior("Pareto", alpha=1, m=1),
+        }
+
+    def build_model(self) -> None:  # type: ignore[override]
+        """Build the model."""
+        coords = {
+            "obs_var": ["recency", "frequency"],
+            "customer_id": self.data["customer_id"],
+        }
+        with pm.Model(coords=coords) as self.model:
+            # purchase rate priors
+            if "alpha_prior" in self.model_config and "beta_prior" in self.model_config:
+                alpha = self.model_config["alpha_prior"].create_variable("alpha")
+                beta = self.model_config["beta_prior"].create_variable("beta")
+            else:
+                # hierarchical pooling of purchase rate priors
+                phi_purchase = self.model_config["phi_purchase_prior"].create_variable(
+                    "phi_purchase"
+                )
+                kappa_purchase = self.model_config[
+                    "kappa_purchase_prior"
+                ].create_variable("kappa_purchase")
+
+                alpha = pm.Deterministic("alpha", phi_purchase * kappa_purchase)
+                beta = pm.Deterministic("beta", (1.0 - phi_purchase) * kappa_purchase)
+
+            # dropout priors
+            if (
+                "gamma_prior" in self.model_config
+                and "delta_prior" in self.model_config
+            ):
+                gamma = self.model_config["gamma_prior"].create_variable("gamma")
+                delta = self.model_config["delta_prior"].create_variable("delta")
+            else:
+                # hierarchical pooling of dropout rate priors
+                phi_dropout = self.model_config["phi_dropout_prior"].create_variable(
+                    "phi_dropout"
+                )
+                kappa_dropout = self.model_config[
+                    "kappa_dropout_prior"
+                ].create_variable("kappa_dropout")
+
+                gamma = pm.Deterministic("gamma", phi_dropout * kappa_dropout)
+                delta = pm.Deterministic("delta", (1.0 - phi_dropout) * kappa_dropout)
+
+            BetaGeoBetaBinom(
+                name="recency_frequency",
+                alpha=alpha,
+                beta=beta,
+                delta=delta,
+                gamma=gamma,
+                T=self.data["T"],
+                observed=np.stack(
+                    (self.data["recency"], self.data["frequency"]), axis=1
+                ),
+                dims=["customer_id", "obs_var"],
+            )
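+
+    # NOTE: the nested (phi, kappa) priors above are a mean/concentration
+    # reparametrization of the Beta shape parameters: alpha = phi * kappa and
+    # beta = (1 - phi) * kappa, so phi_purchase is the expected purchase
+    # probability and kappa_purchase its concentration (and likewise for the
+    # dropout pair).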
+
+    # TODO: cache this as a property
+    @staticmethod
+    def _logp(
+        alpha: xarray.DataArray,
+        beta: xarray.DataArray,
+        gamma: xarray.DataArray,
+        delta: xarray.DataArray,
+        x: xarray.DataArray,
+        t_x: xarray.DataArray,
+        T: xarray.DataArray,
+    ) -> xarray.DataArray:
+        """Log-likelihood of the BG/BB model.
+
+        Utility function for using BG/BB log-likelihood in predictive methods.
+        """
+        # The BetaGeoBetaBinom distribution only works with vector parameters.
+        # We stack the chain/draw dimensions in a long vector and use vectorize
+        # to broadcast along each customer `T`.
+        dummy_T = pt.tensor(shape=(1,), dtype=int)
+        dummy_values = pt.tensor(shape=(1, 2), dtype=int)
+        bgbb_dist = BetaGeoBetaBinom.dist(
+            alpha=alpha.stack(sample=("chain", "draw")).values,
+            beta=beta.stack(sample=("chain", "draw")).values,
+            gamma=gamma.stack(sample=("chain", "draw")).values,
+            delta=delta.stack(sample=("chain", "draw")).values,
+            T=dummy_T,
+        )
+        dummy_logp = pm.logp(bgbb_dist, dummy_values)
+        values = pt.constant(np.stack((t_x.values, x.values), axis=-1))
+        loglike = vectorize_graph(
+            dummy_logp,
+            replace={dummy_T: T.values[:, None], dummy_values: values[:, None, :]},
+        ).eval()
+        # Unstack chain/draw and put customer in last axis
+        loglike = np.moveaxis(
+            loglike.reshape((-1, alpha.sizes["chain"], alpha.sizes["draw"])), 0, -1
+        )
+        return xarray.DataArray(data=loglike, dims=("chain", "draw", "customer_id"))
+
+    # TODO: move this into BaseModel
+    def _extract_predictive_variables(
+        self,
+        data: pd.DataFrame,
+        customer_varnames: Sequence[str] = (),
+    ) -> xarray.Dataset:
+        """
+        Extract predictive variables from the data.
+
+        Utility function assigning default customer arguments
+        for predictive methods and converting to xarrays.
+        """
+        self._validate_cols(
+            data,
+            required_cols=[
+                "customer_id",
+                *customer_varnames,
+            ],
+            must_be_unique=["customer_id"],
+            must_be_homogenous=["T"],
+        )
+
+        alpha = self.fit_result["alpha"]
+        beta = self.fit_result["beta"]
+        gamma = self.fit_result["gamma"]
+        delta = self.fit_result["delta"]
+
+        customer_vars = to_xarray(
+            data["customer_id"],
+            *[data[customer_varname] for customer_varname in customer_varnames],
+        )
+        if len(customer_varnames) == 1:
+            customer_vars = [customer_vars]
+
+        return xarray.combine_by_coords(
+            (
+                alpha,
+                beta,
+                gamma,
+                delta,
+                *customer_vars,
+            )
+        )
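+
+    # For reference, `expected_purchases` below evaluates equation (13) of
+    # Fader, Hardie & Shang (2010) with log-Beta/log-Gamma functions for
+    # numerical stability (a sketch of the quantity, as implemented in code):
+    #
+    #   E[X(T, T + t*) | x, t_x, T] =
+    #       (1 / L) * B(alpha + x + 1, beta + T - x) / B(alpha, beta)
+    #       * (delta / (gamma - 1)) * Gamma(gamma + delta) / Gamma(1 + delta)
+    #       * [Gamma(1 + delta + T) / Gamma(gamma + delta + T)
+    #          - Gamma(1 + delta + T + t*) / Gamma(gamma + delta + T + t*)],
+    #
+    # where L is the BG/BB likelihood of (x, t_x) computed by `_logp`.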
+
+    def expected_purchases(
+        self,
+        data: pd.DataFrame | None = None,
+        *,
+        future_t: int | np.ndarray | pd.Series | None = None,
+    ) -> xarray.DataArray:
+        """
+        Predict expected number of future purchases.
+
+        Given *recency*, *frequency*, and *T* for an individual customer, this method estimates the
+        expected number of future purchases across *future_t* time periods.
+
+        Adapted from equation (13) in Bruce Hardie's notes [1]_, and `lifetimes` library:
+        https://github.com/CamDavidsonPilon/lifetimes/blob/master/lifetimes/fitters/beta_geo_beta_binom_fitter.py#L179
+
+        Parameters
+        ----------
+        data : ~pandas.DataFrame, optional
+            Dataframe containing the following columns:
+
+            * `customer_id`: Unique customer identifier
+            * `frequency`: Number of repeat purchases
+            * `recency`: Purchase opportunities between the first and the last purchase
+            * `T`: Total purchase opportunities.
+              Model assumptions require *T >= recency* and all customers share the same value for *T*.
+            * `future_t`: Optional column for *future_t* parametrization.
+
+            If not provided, predictions will be run with the data used to fit the model.
+        future_t : array_like
+            Number of time periods to predict expected purchases.
+            Not required if `data` Dataframe contains a *future_t* column.
+
+        References
+        ----------
+        .. [1] Peter Fader, Bruce Hardie, and Jen Shang.
+           "Customer-Base Analysis in a Discrete-Time Noncontractual Setting".
+           Marketing Science, Vol. 29, No. 6 (Nov-Dec, 2010), pp. 1086-1108.
+           https://www.brucehardie.com/papers/020/fader_et_al_mksc_10.pdf
+        """
+        if data is None:
+            data = self.data
+
+        if future_t is not None:
+            data = data.assign(future_t=future_t)
+
+        dataset = self._extract_predictive_variables(
+            data, customer_varnames=["frequency", "recency", "T", "future_t"]
+        )
+        alpha = dataset["alpha"]
+        beta = dataset["beta"]
+        gamma = dataset["gamma"]
+        delta = dataset["delta"]
+        x = dataset["frequency"]
+        t_x = dataset["recency"]
+        T = dataset["T"]
+        future_t = dataset["future_t"]
+
+        loglike = self._logp(alpha, beta, gamma, delta, x, t_x, T)
+
+        first_term = 1 / exp(loglike)
+        second_term = exp(betaln(alpha + x + 1, beta + T - x) - betaln(alpha, beta))
+        third_term = (
+            delta / (gamma - 1) * exp(gammaln(gamma + delta) - gammaln(1 + delta))
+        )
+        fourth_term = exp(gammaln(1 + delta + T) - gammaln(gamma + delta + T))
+        fifth_term = exp(
+            gammaln(1 + delta + T + future_t) - gammaln(gamma + delta + T + future_t)
+        )
+
+        exp_purchases = (
+            first_term * second_term * third_term * (fourth_term - fifth_term)
+        )
+
+        return exp_purchases.transpose(
+            "chain", "draw", "customer_id", missing_dims="ignore"
+        )
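+
+    # For reference, `expected_probability_alive` below evaluates equation (11)
+    # of Fader, Hardie & Shang (2010), as implemented in code:
+    #
+    #   P(alive at T + t* | x, t_x, T) =
+    #       B(alpha + x, beta + T - x) / B(alpha, beta)
+    #       * B(gamma, delta + T + t*) / B(gamma, delta) / L,
+    #
+    # with L again the BG/BB likelihood of (x, t_x) from `_logp`.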
+        """
+        if data is None:
+            data = self.data
+
+        if future_t is not None:
+            data = data.assign(future_t=future_t)
+
+        dataset = self._extract_predictive_variables(
+            data, customer_varnames=["frequency", "recency", "T", "future_t"]
+        )
+        alpha = dataset["alpha"]
+        beta = dataset["beta"]
+        gamma = dataset["gamma"]
+        delta = dataset["delta"]
+        x = dataset["frequency"]
+        t_x = dataset["recency"]
+        T = dataset["T"]
+        future_t = dataset["future_t"]
+
+        loglike = self._logp(alpha, beta, gamma, delta, x, t_x, T)
+
+        term1 = betaln(alpha + x, beta + T - x) - betaln(alpha, beta)
+        term2 = betaln(gamma, delta + T + future_t) - betaln(gamma, delta)
+
+        prob_alive = exp(term1 + term2) / exp(loglike)
+
+        return prob_alive.transpose(
+            "chain", "draw", "customer_id", missing_dims="ignore"
+        )
+
+    def expected_purchases_new_customer(
+        self,
+        data: pd.DataFrame | None = None,
+        *,
+        t: int | np.ndarray | pd.Series | None = None,
+    ) -> xarray.DataArray:
+        """Predict the expected number of purchases for a new customer across *t* time periods.
+
+        Adapted from equation (8) in Bruce Hardie's notes [1]_.
+
+        Parameters
+        ----------
+        data : ~pandas.DataFrame, optional
+            DataFrame containing a `customer_id` column and an optional `t` column.
+            If not provided, predictions will be run with the data used to fit the model.
+        t : array_like
+            Number of time periods over which to estimate purchases.
+            Not required if the `data` DataFrame contains a *t* column.
+
+        References
+        ----------
+        .. [1] Peter Fader, Bruce Hardie, and Jen Shang.
+               "Customer-Base Analysis in a Discrete-Time Noncontractual Setting".
+               Marketing Science, Vol. 29, No. 6 (Nov-Dec, 2010), pp. 1086-1108.
+               https://www.brucehardie.com/papers/020/fader_et_al_mksc_10.pdf
+        """
+        if data is None:
+            data = self.data
+
+        if t is not None:
+            data = data.assign(t=t)
+
+        dataset = self._extract_predictive_variables(data, customer_varnames=["t"])
+        alpha = dataset["alpha"]
+        beta = dataset["beta"]
+        gamma = dataset["gamma"]
+        delta = dataset["delta"]
+        t = dataset["t"]
+
+        first_term = alpha / (alpha + beta) * delta / (gamma - 1)
+        second_term = 1 - exp(
+            gammaln(gamma + delta)
+            + gammaln(1 + delta + t)
+            - gammaln(gamma + delta + t)
+            - gammaln(1 + delta)
+        )
+
+        return (first_term * second_term).transpose(
+            "chain", "draw", "customer_id", missing_dims="ignore"
+        )
+
+    def _distribution_new_customers(
+        self,
+        data: pd.DataFrame | None = None,
+        *,
+        T: int | np.ndarray | pd.Series | None = None,
+        random_seed: RandomState | None = None,
+        var_names: Sequence[
+            Literal["dropout", "purchase_rate", "recency_frequency"]
+        ] = (
+            "dropout",
+            "purchase_rate",
+            "recency_frequency",
+        ),
+    ) -> xarray.Dataset:
+        """Compute posterior predictive samples of dropout, purchase rate, and frequency/recency of new customers.
+
+        Parameters
+        ----------
+        data : ~pandas.DataFrame, optional
+            DataFrame containing the following columns:
+
+            * `customer_id`: Unique customer identifier
+            * `T`: Total number of purchase opportunities
+
+            If not provided, predictions will be run with the data used to fit the model.
+        T : array_like, optional
+            Total number of purchase opportunities. Not needed if the `data` parameter is provided with a `T` column.
+        random_seed : ~numpy.random.RandomState, optional
+            Random state to use for sampling.
+        var_names : sequence of str, optional
+            Names of the variables to sample from. Defaults to ["dropout", "purchase_rate", "recency_frequency"].
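+
+        Returns
+        -------
+        ~xarray.Dataset
+            Posterior predictive draws of the requested variables.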
+ + """ + if data is None: + data = self.data + + if T is not None: + data = data.assign(T=T) + + dataset = self._extract_predictive_variables(data, customer_varnames=["T"]) + T = dataset["T"].values + # Delete "T" so we can pass dataset directly to `sample_posterior_predictive` + del dataset["T"] + + if dataset.sizes["chain"] == 1 and dataset.sizes["draw"] == 1: + # For map fit add a dummy draw dimension + dataset = dataset.squeeze("draw").expand_dims(draw=range(1000)) + + coords = self.model.coords.copy() # type: ignore + coords["customer_id"] = data["customer_id"] + with pm.Model(coords=coords): + alpha = pm.Flat("alpha") + beta = pm.Flat("beta") + gamma = pm.Flat("gamma") + delta = pm.Flat("delta") + + pm.Beta( + "purchase_rate", + alpha=alpha, + beta=beta, + ) + pm.Beta( + "dropout", + alpha=gamma, + beta=delta, + ) + + BetaGeoBetaBinom( + name="recency_frequency", + alpha=alpha, + beta=beta, + gamma=gamma, + delta=delta, + T=T, + dims=["customer_id", "obs_var"], + ) + + return pm.sample_posterior_predictive( + dataset, + var_names=var_names, + random_seed=random_seed, + predictions=True, + ).predictions + + # TODO: Move into BaseModel? + def distribution_new_customer_dropout( + self, + data: pd.DataFrame | None = None, + *, + random_seed: RandomState | None = None, + ) -> xarray.Dataset: + """Sample from the Beta distribution representing dropout probabilities for new customers. + + This is the probability a new customer will drop out after making a purchase. + + Parameters + ---------- + data : ~pandas.DataFrame, optional + DataFrame containing the following columns: + + * `customer_id`: Unique customer identifier + + If not provided, predictions will be ran with data used to fit model. + random_seed : ~numpy.random.RandomState, optional + Random state to use for sampling. + + Returns + ------- + ~xarray.Dataset + Dataset containing the posterior samples for the population-level dropout rate. + """ + return self._distribution_new_customers( + data=data, + random_seed=random_seed, + var_names=["dropout"], + )["dropout"] + + # TODO: Move into BaseModel? + def distribution_new_customer_purchase_rate( + self, + data: pd.DataFrame | None = None, + *, + random_seed: RandomState | None = None, + ) -> xarray.Dataset: + """Sample from the Beta distribution representing purchase probabilities for new customers. + + This is the probability a new customer will make a purchase given they are still active. + + Parameters + ---------- + data : ~pandas.DataFrame, optional + DataFrame containing the following columns: + + * `customer_id`: Unique customer identifier + + If not provided, predictions will be ran with data used to fit model. + random_seed : ~numpy.random.RandomState, optional + Random state to use for sampling. + + Returns + ------- + ~xarray.Dataset + Dataset containing the posterior samples for the population-level purchase rate. + """ + return self._distribution_new_customers( + data=data, + random_seed=random_seed, + var_names=["purchase_rate"], + )["purchase_rate"] + + # TODO: This is quite slow due to sampling far more than necessary. Same for other CLV models. + def distribution_new_customer_recency_frequency( + self, + data: pd.DataFrame | None = None, + *, + T: int | np.ndarray | pd.Series | None = None, + random_seed: RandomState | None = None, + ) -> xarray.Dataset: + """BG/BB process representing purchases across the customer population. + + This is the distribution of purchase frequencies given 'T' observation periods for each customer. 
+
+        Parameters
+        ----------
+        data : ~pandas.DataFrame, optional
+            DataFrame containing the following columns:
+
+            * `customer_id`: Unique customer identifier
+            * `T`: Total purchase opportunities.
+
+            If not provided, the method will use the fit dataset.
+        T : array_like, optional
+            Total purchase opportunities. If not provided, T values from the fit dataset will be used.
+            Not required if the `data` DataFrame contains a `T` column.
+        random_seed : ~numpy.random.RandomState, optional
+            Random state to use for sampling.
+
+        Returns
+        -------
+        ~xarray.DataArray
+            DataArray containing the posterior samples for the customer population.
+        """
+        return self._distribution_new_customers(
+            data=data,
+            T=T,
+            random_seed=random_seed,
+            var_names=["recency_frequency"],
+        )["recency_frequency"]
diff --git a/tests/clv/models/test_beta_geo_beta_binom.py b/tests/clv/models/test_beta_geo_beta_binom.py
new file mode 100644
index 000000000..9121eb8a5
--- /dev/null
+++ b/tests/clv/models/test_beta_geo_beta_binom.py
@@ -0,0 +1,526 @@
+# Copyright 2024 The PyMC Labs Developers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import arviz as az
+import numpy as np
+import pandas as pd
+import pymc as pm
+import pytensor as pt
+import pytest
+import xarray as xr
+from lifetimes.fitters.beta_geo_beta_binom_fitter import BetaGeoBetaBinomFitter
+
+from pymc_marketing.clv.distributions import BetaGeoBetaBinom
+from pymc_marketing.clv.models import BetaGeoBetaBinomModel
+from pymc_marketing.prior import Prior
+
+
+class TestBetaGeoBetaBinomModel:
+    @classmethod
+    def setup_class(cls):
+        # Set random seed
+        cls.rng = np.random.default_rng(34)
+
+        # parameters
+        cls.alpha_true = 1.2035
+        cls.beta_true = 0.7497
+        cls.delta_true = 2.7834
+        cls.gamma_true = 0.6567
+
+        # Use the BG/BB donations dataset for testing
+        test_data = pd.read_csv("data/bgbb_donations.csv")
+
+        cls.data = test_data
+
+        # Take a sample of all unique recency/frequency/T combinations to test predictive methods
+        test_customer_ids = [
+            3463,
+            4554,
+            4831,
+            4960,
+            5038,
+            5159,
+            5286,
+            5899,
+            6154,
+            6309,
+            6482,
+            6716,
+            7038,
+            7219,
+            7444,
+            7801,
+            8041,
+            8235,
+            8837,
+            9172,
+            9900,
+            11103,
+        ]
+
+        cls.sample_data = test_data.query("customer_id.isin(@test_customer_ids)")
+        cls.sample_data_N = len(test_customer_ids)
+
+        # Instantiate model with the donations data for testing
+        cls.model = BetaGeoBetaBinomModel(cls.data)
+        cls.model.build_model()
+
+        # Also instantiate lifetimes model for comparison
+        cls.lifetimes_model = BetaGeoBetaBinomFitter()
+        cls.lifetimes_model.params_ = {
+            "alpha": cls.alpha_true,
+            "beta": cls.beta_true,
+            "delta": cls.delta_true,
+            "gamma": cls.gamma_true,
+        }
+
+        # Mock an idata object for tests requiring a fitted model
+        cls.N = len(cls.data)
+        cls.chains = 2
+        cls.draws = 50
+        cls.mock_fit = az.from_dict(
+            {
+                "alpha": cls.rng.normal(
+                    cls.alpha_true, 1e-3, size=(cls.chains, cls.draws)
+                ),
+                "beta": cls.rng.normal(
+                    cls.beta_true, 1e-3, size=(cls.chains, cls.draws)
+                ),
+                "delta": cls.rng.normal(
+                    cls.delta_true, 1e-3, size=(cls.chains, cls.draws)
+                ),
+                "gamma": cls.rng.normal(
+                    cls.gamma_true, 1e-3, size=(cls.chains, cls.draws)
+                ),
+            }
+        )
+
+        cls.model.idata = cls.mock_fit
+
+    @pytest.fixture(scope="class")
+    def model_config(self):
+        return {
+            "alpha_prior": Prior("HalfNormal"),
+            "beta_prior": Prior("HalfStudentT", nu=4),
+            "delta_prior": Prior("HalfCauchy", beta=2),
+            "gamma_prior": Prior("Gamma", alpha=1, beta=1),
+        }
+
+    def test_model(self, model_config):
+        # this test requires a different setup from other models due to default_model_config containing NoneTypes
+        default_model = BetaGeoBetaBinomModel(
+            data=self.data,
+            model_config=None,
+        )
+        custom_model = BetaGeoBetaBinomModel(
+            data=self.data,
+            model_config=model_config,
+        )
+
+        for model in (default_model, custom_model):
+            model.build_model()
+            assert isinstance(
+                model.model["alpha"].owner.op,
+                pt.tensor.elemwise.Elemwise
+                if "alpha_prior" not in model.model_config
+                else model.model_config["alpha_prior"].pymc_distribution,
+            )
+            assert isinstance(
+                model.model["beta"].owner.op,
+                pt.tensor.elemwise.Elemwise
+                if "beta_prior" not in model.model_config
+                else model.model_config["beta_prior"].pymc_distribution,
+            )
+            assert isinstance(
+                model.model["delta"].owner.op,
+                pt.tensor.elemwise.Elemwise
+                if "delta_prior" not in model.model_config
+                else model.model_config["delta_prior"].pymc_distribution,
+            )
+            assert isinstance(
+                model.model["gamma"].owner.op,
+                pt.tensor.elemwise.Elemwise
+                if "gamma_prior" not in model.model_config
+                else model.model_config["gamma_prior"].pymc_distribution,
+            )
+
+        assert default_model.model.eval_rv_shapes() == {
+            "kappa_dropout": (),
+            "kappa_dropout_interval__": (),
+            "kappa_purchase": (),
+            "kappa_purchase_interval__": (),
+            "phi_dropout": (),
+            "phi_dropout_interval__": (),
+            "phi_purchase": (),
+            "phi_purchase_interval__": (),
+        }
+
+        assert custom_model.model.eval_rv_shapes() == {
+            "alpha": (),
+            "alpha_log__": (),
+            "beta": (),
+            "beta_log__": (),
+            "delta": (),
+            "delta_log__": (),
+            "gamma": (),
+            "gamma_log__": (),
+        }
+
+    def test_missing_cols(self):
+        data_invalid = self.data.drop(columns="customer_id")
+
+        with pytest.raises(ValueError, match="Required column customer_id missing"):
+            BetaGeoBetaBinomModel(data=data_invalid)
+
+        data_invalid = self.data.drop(columns="frequency")
+
+        with pytest.raises(ValueError, match="Required column frequency missing"):
+            BetaGeoBetaBinomModel(data=data_invalid)
+
+        data_invalid = self.data.drop(columns="recency")
+
+        with pytest.raises(ValueError, match="Required column recency missing"):
+            BetaGeoBetaBinomModel(data=data_invalid)
+
+        data_invalid = self.data.drop(columns="T")
+
+        with pytest.raises(ValueError, match="Required column T missing"):
+            BetaGeoBetaBinomModel(data=data_invalid)
+
+    def test_customer_id_duplicate(self):
+        with pytest.raises(
+            ValueError, match="Column customer_id has duplicate entries"
+        ):
+            data = pd.DataFrame(
+                {
+                    "customer_id": np.asarray([1, 1]),
+                    "frequency": np.asarray([1, 1]),
+                    "recency": np.asarray([1, 1]),
+                    "T": np.asarray([1, 1]),
+                }
+            )
+
+            BetaGeoBetaBinomModel(
+                data=data,
+            )
+
+    def test_T_homogeneity(self):
+        with pytest.raises(ValueError, match="Column T has non-homogeneous entries"):
+            data = pd.DataFrame(
+                {
+                    "customer_id": np.asarray([1, 2]),
+                    "frequency": np.asarray([1, 2]),
+                    "recency": np.asarray([1, 2]),
+                    "T": np.asarray([1, 2]),
+                }
+            )
+
+            BetaGeoBetaBinomModel(
+                data=data,
+            )
+
+    @pytest.mark.parametrize("custom_config", (True, False))
+    def test_model_repr(self, custom_config):
+        if custom_config:
+            model_config = {
+                "alpha_prior": Prior("HalfFlat"),
+                "beta_prior": Prior("HalfFlat"),
+                "delta_prior": Prior("HalfFlat"),
+                "gamma_prior": Prior("HalfNormal", sigma=10),
+            }
+            repr = (
+                "BG/BB"
+                "\nalpha~HalfFlat()"
+                "\nbeta~HalfFlat()"
+                "\ngamma~HalfNormal(0,10)"
+                "\ndelta~HalfFlat()"
+                "\nrecency_frequency~BetaGeoBetaBinom(alpha,beta,gamma,delta,)"
+            )
+        else:
+            model_config = None
+            repr = (
+                "BG/BB"
+                "\nphi_purchase~Uniform(0,1)"
+                "\nkappa_purchase~Pareto(1,1)"
+                "\nphi_dropout~Uniform(0,1)"
+                "\nkappa_dropout~Pareto(1,1)"
+                "\nalpha~Deterministic(f(kappa_purchase,phi_purchase))"
+                "\nbeta~Deterministic(f(kappa_purchase,phi_purchase))"
+                "\ngamma~Deterministic(f(kappa_dropout,phi_dropout))"
+                "\ndelta~Deterministic(f(kappa_dropout,phi_dropout))"
+                "\nrecency_frequency~BetaGeoBetaBinom(alpha,beta,gamma,delta,)"
+            )
+        model = BetaGeoBetaBinomModel(
+            data=self.data,
+            model_config=model_config,
+        )
+        model.build_model()
+
+        assert model.__repr__().replace(" ", "") == repr
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "fit_method, rtol",
+        [
+            ("mcmc", 0.1),
+            ("map", 0.2),
+        ],
+    )
+    def test_model_convergence(self, fit_method, rtol, model_config):
+        model = BetaGeoBetaBinomModel(
+            data=self.data,
+            model_config=model_config,
+        )
+        model.build_model()
+
+        sample_kwargs = (
+            dict(random_seed=self.rng, chains=2) if fit_method == "mcmc" else {}
+        )
+        model.fit(fit_method=fit_method, progressbar=False, **sample_kwargs)
+
+        fit = model.idata.posterior
+        np.testing.assert_allclose(
+            [
+                fit["alpha"].mean(),
+                fit["beta"].mean(),
+                fit["delta"].mean(),
+                fit["gamma"].mean(),
+            ],
+            [self.alpha_true, self.beta_true, self.delta_true, self.gamma_true],
+            rtol=rtol,
+        )
+
+    def test_fit_result_without_fit(self, model_config):
+        model = BetaGeoBetaBinomModel(data=self.data, model_config=model_config)
+        with pytest.raises(RuntimeError, match="The model hasn't been fit yet"):
+            model.fit_result
+
+        idata = model.fit(
+            tune=5,
+            chains=2,
+            draws=10,
+            compute_convergence_checks=False,
+        )
+        assert isinstance(idata, az.InferenceData)
+        assert len(idata.posterior.chain) == 2
+        assert len(idata.posterior.draw) == 10
+        assert model.idata is idata
+
+    @pytest.mark.parametrize("test_t", [1, 3, 6])
+    def test_expected_purchases(self, test_t):
+        true_purchases = (
+            self.lifetimes_model.conditional_expected_number_of_purchases_up_to_time(
+                m_periods_in_future=test_t,
+                frequency=self.sample_data["frequency"],
+                recency=self.sample_data["recency"],
+                n_periods=self.sample_data["T"],
+            )
+        )
+
+        # test that parametrization with the default data has different dims
+        est_num_purchases = self.model.expected_purchases(future_t=test_t)
+        assert est_num_purchases.shape == (self.chains, self.draws, self.N)
+
+        data = self.sample_data.assign(future_t=test_t)
+        est_num_purchases = self.model.expected_purchases(data)
+
+        assert est_num_purchases.shape == (self.chains, self.draws, self.sample_data_N)
+        assert est_num_purchases.dims == ("chain", "draw", "customer_id")
+
+        np.testing.assert_allclose(
+            true_purchases,
+            est_num_purchases.mean(("chain", "draw")),
+            rtol=0.01,
+        )
+
+    def test_expected_purchases_new_customer(self):
+        # values obtained from cells B7:17 of the "Tracking Plot" sheet in https://www.brucehardie.com/notes/010/
+        true_purchases_new = np.array(
+            [
+                0.4985,
+                0.9233,
+                1.2969,
+                1.6323,
+                1.9381,
+                2.2202,
+                2.4826,
+                2.7285,
+                2.9603,
+                3.1798,
+                3.3887,
+            ]
+        )
+        time_periods = np.arange(1, 12)
+
+        # test dimensions for a single prediction
+        data = pd.DataFrame({"customer_id": [0], "t": [5]})
+        est_purchase_new = self.model.expected_purchases_new_customer(data)
+
+        assert est_purchase_new.shape == (self.chains, self.draws, 1)
+        assert est_purchase_new.dims == ("chain", "draw", "customer_id")
+
+        # compare against array of true values
+        est_purchases_new = (
+            xr.concat(
+                objs=[
+                    self.model.expected_purchases_new_customer(None, t=t).mean()
+                    for t in time_periods
+                ],
+                dim="t",
+            )
+            .transpose(..., "t")
+            .values
+        )
+
+        np.testing.assert_allclose(
+            true_purchases_new,
+            est_purchases_new,
+            rtol=0.001,
+        )
+
+    @pytest.mark.parametrize("test_t", [1, 3, 6])
+    def test_expected_probability_alive(self, test_t):
+        true_prob_alive = self.lifetimes_model.conditional_probability_alive(
+            m_periods_in_future=test_t,
+            frequency=self.sample_data["frequency"],
+            recency=self.sample_data["recency"],
+            n_periods=self.sample_data["T"],
+        )
+
+        # test parametrization with default data has different dims
+        est_prob_alive = self.model.expected_probability_alive(future_t=test_t)
+        assert est_prob_alive.shape == (self.chains, self.draws, self.N)
+
+        sample_data = self.sample_data.assign(future_t=test_t)
+        est_prob_alive = self.model.expected_probability_alive(sample_data)
+
+        assert est_prob_alive.shape == (self.chains, self.draws, self.sample_data_N)
+        assert est_prob_alive.dims == ("chain", "draw", "customer_id")
+        np.testing.assert_allclose(
+            true_prob_alive,
+            est_prob_alive.mean(("chain", "draw")),
+            rtol=0.01,
+        )
+
+        alt_data = self.sample_data.assign(future_t=7.5)
+        est_prob_alive_t = self.model.expected_probability_alive(alt_data)
+        assert est_prob_alive.mean() > est_prob_alive_t.mean()
+
+    def test_distribution_new_customer(self) -> None:
+        mock_model = BetaGeoBetaBinomModel(
+            data=self.data,
+        )
+        mock_model.build_model()
+        mock_model.idata = az.from_dict(
+            {
+                "alpha": [self.alpha_true],
+                "beta": [self.beta_true],
+                "delta": [self.delta_true],
+                "gamma": [self.gamma_true],
+            }
+        )
+
+        rng = np.random.default_rng(42)
+        new_customer_dropout = mock_model.distribution_new_customer_dropout(
+            random_seed=rng
+        )
+        new_customer_purchase_rate = mock_model.distribution_new_customer_purchase_rate(
+            random_seed=rng
+        )
+        customer_rec_freq = mock_model.distribution_new_customer_recency_frequency(
+            self.data, T=self.data["T"], random_seed=rng
+        )
+        customer_rec = customer_rec_freq.sel(obs_var="recency")
+        customer_freq = customer_rec_freq.sel(obs_var="frequency")
+
+        assert isinstance(new_customer_dropout, xr.DataArray)
+        assert isinstance(new_customer_purchase_rate, xr.DataArray)
+        assert isinstance(customer_rec, xr.DataArray)
+        assert isinstance(customer_freq, xr.DataArray)
+
+        N = 1000
+        p = pm.Beta.dist(self.alpha_true, self.beta_true, size=N)
+        theta = pm.Beta.dist(self.gamma_true, self.delta_true, size=N)
+        ref_rec, ref_freq = pm.draw(
+            BetaGeoBetaBinom.dist(
+                alpha=self.alpha_true,
+                beta=self.beta_true,
+                delta=self.delta_true,
+                gamma=self.gamma_true,
+                T=self.data["T"],
+            ),
+            random_seed=rng,
+        ).T
+
+        rtol = 0.15
+        np.testing.assert_allclose(
+            new_customer_dropout.mean(),
+            pm.draw(theta.mean(), random_seed=rng),
+            rtol=rtol,
+        )
+        np.testing.assert_allclose(
+            new_customer_dropout.var(), pm.draw(theta.var(), random_seed=rng), rtol=rtol
+        )
+        np.testing.assert_allclose(
+            new_customer_purchase_rate.mean(),
+            pm.draw(p.mean(), random_seed=rng),
+            rtol=rtol,
+        )
+        np.testing.assert_allclose(
+            new_customer_purchase_rate.var(),
+            pm.draw(p.var(), random_seed=rng),
+            rtol=rtol,
+        )
+        np.testing.assert_allclose(
+            customer_rec.mean(),
+            ref_rec.mean(),
+            rtol=rtol,
+        )
+        np.testing.assert_allclose(
+            customer_rec.var(),
+            ref_rec.var(),
+            rtol=rtol,
+        )
+        np.testing.assert_allclose(
+            customer_freq.mean(),
+            ref_freq.mean(),
+            rtol=rtol,
+        )
+        np.testing.assert_allclose(
+            customer_freq.var(),
+            ref_freq.var(),
+            rtol=rtol,
+        )
+
+    def test_save_load(self):
+        self.model.build_model()
+        self.model.fit("map", maxeval=1)
+        self.model.save("test_model")
+
+        # Test the valid case: loading a saved model
+        model2 = BetaGeoBetaBinomModel.load("test_model")
+
+        # Check that the loaded model is indeed an instance of the class
+        assert isinstance(model2, BetaGeoBetaBinomModel)
+        # Check that the loaded data matches the original model data
+        pd.testing.assert_frame_equal(self.model.data, model2.data, check_names=False)
+        assert self.model.model_config == model2.model_config
+        assert self.model.sampler_config == model2.sampler_config
+        assert self.model.idata == model2.idata
+        os.remove("test_model")
diff --git a/tests/clv/test_distributions.py b/tests/clv/test_distributions.py
index 663564669..212712eb5 100644
--- a/tests/clv/test_distributions.py
+++ b/tests/clv/test_distributions.py
@@ -378,19 +378,6 @@ def test_notimplemented_logp(self):
         with pytest.raises(NotImplementedError):
             pm.logp(dist, invalid_value)
 
-        invalid_dist = BetaGeoBetaBinom.dist(
-            alpha=np.ones(
-                5,
-            ),
-            beta=1,
-            gamma=2,
-            delta=2,
-            T=10,
-        )
-        value = np.array([1, 3])
-        with pytest.raises(NotImplementedError):
-            pm.logp(invalid_dist, value)
-
     @pytest.mark.parametrize(
         "alpha_size, beta_size, gamma_size, delta_size, beta_geo_beta_binom_size, expected_size",
         [