Quantco · MatthiasSchmidtblaicherQC · Jan 15, 2024 · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024
diff --git a/docs/tutorials/formula_interface/formula_interface.ipynb b/docs/tutorials/formula_interface/formula_interface.ipynb
@@ -23,7 +23,7 @@
     "Formulas can provide a concise and convenient way to specify many of the usual pre-processing steps, such as converting to categorical types, creating interactions, applying transformations, or even spline interpolation. As an example, consider the following formula:\n",
     "\n",
     "```\n",
-    "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3) + 1\n",
+    "{ClaimAmountCut / Exposure} ~ C(DrivAge, missing_method='convert') * C(VehPower, missing_method=\"zero\") + bs(BonusMalus, 3)\n",
     "```\n",
     "\n",
     "Despite its brevity, it describes all of the following:\n",
@@ -32,7 +32,6 @@
     " - If there are missing values in `DrivAge`, they should be treated as a separate category.\n",
     " - On the other hand, missing values in `VehPower` should be treated as all-zero indicators.\n",
     " - The predictors should also include a third degree B-spline interpolation of `BonusMalus`.\n",
-    " - The model should include an intercept.\n",
     "\n",
     "The following chapters demonstrate each of these features in some detail, as well as some additional advantages of using the formula interface."
    ]
@@ -59,6 +58,7 @@
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "import pandas as pd\n",
+    "import pytest\n",
     "import scipy.optimize as optimize\n",
     "import scipy.stats\n",
     "from dask_ml.preprocessing import Categorizer\n",
@@ -1261,144 +1261,27 @@
    "source": [
     "### Intercept Term\n",
     "\n",
-    "Just like in the case of the non-formula interface, an intercept term is added by default. This can be disabled by either setting the `fit_intercept` parameter to `False`, or adding `+0` or `-1` to the end of the formula. In the case of conflict, a warning is emitted, and the latter takes precedence."
+    "Just like in the non-formula interface, the presence of an intercept is determined by the `fit_intercept` argument. If the formula specifies a different behavior (e.g., adding `+0` or `-1` while `fit_intercept=True`), a `ValueError` is raised."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/stanmart/work/glum/src/glum/_glm.py:2354: UserWarning: The formula explicitly sets the intercept to False, overriding fit_intercept=True.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>intercept</th>\n",
-       "      <th>DrivAge__0</th>\n",
-       "      <th>DrivAge__1</th>\n",
-       "      <th>DrivAge__2</th>\n",
-       "      <th>DrivAge__3</th>\n",
-       "      <th>DrivAge__4</th>\n",
-       "      <th>DrivAge__5</th>\n",
-       "      <th>DrivAge__6</th>\n",
-       "      <th>VehPower__4</th>\n",
-       "      <th>VehPower__5</th>\n",
-       "      <th>...</th>\n",
-       "      <th>DrivAge__4__x__VehPower__8</th>\n",
-       "      <th>DrivAge__5__x__VehPower__8</th>\n",
-       "      <th>DrivAge__6__x__VehPower__8</th>\n",
-       "      <th>DrivAge__0__x__VehPower__9</th>\n",
-       "      <th>DrivAge__1__x__VehPower__9</th>\n",
-       "      <th>DrivAge__2__x__VehPower__9</th>\n",
-       "      <th>DrivAge__3__x__VehPower__9</th>\n",
-       "      <th>DrivAge__4__x__VehPower__9</th>\n",
-       "      <th>DrivAge__5__x__VehPower__9</th>\n",
-       "      <th>DrivAge__6__x__VehPower__9</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>coefficient</th>\n",
-       "      <td>0.0</td>\n",
-       "      <td>1.713298</td>\n",
-       "      <td>0.783505</td>\n",
-       "      <td>0.205914</td>\n",
-       "      <td>0.016085</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.000094</td>\n",
-       "      <td>0.223685</td>\n",
-       "      <td>4.66123</td>\n",
-       "      <td>4.736272</td>\n",
-       "      <td>...</td>\n",
-       "      <td>-0.144927</td>\n",
-       "      <td>0.001657</td>\n",
-       "      <td>0.515373</td>\n",
-       "      <td>0.714834</td>\n",
-       "      <td>-0.325666</td>\n",
-       "      <td>-0.370935</td>\n",
-       "      <td>0.20417</td>\n",
-       "      <td>0.013222</td>\n",
-       "      <td>-0.273913</td>\n",
-       "      <td>0.115693</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>1 rows × 56 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             intercept  DrivAge__0  DrivAge__1  DrivAge__2  DrivAge__3  \\\n",
-       "coefficient        0.0    1.713298    0.783505    0.205914    0.016085   \n",
-       "\n",
-       "             DrivAge__4  DrivAge__5  DrivAge__6  VehPower__4  VehPower__5  \\\n",
-       "coefficient         0.0    0.000094    0.223685      4.66123     4.736272   \n",
-       "\n",
-       "             ...  DrivAge__4__x__VehPower__8  DrivAge__5__x__VehPower__8  \\\n",
-       "coefficient  ...                   -0.144927                    0.001657   \n",
-       "\n",
-       "             DrivAge__6__x__VehPower__8  DrivAge__0__x__VehPower__9  \\\n",
-       "coefficient                    0.515373                    0.714834   \n",
-       "\n",
-       "             DrivAge__1__x__VehPower__9  DrivAge__2__x__VehPower__9  \\\n",
-       "coefficient                   -0.325666                   -0.370935   \n",
-       "\n",
-       "             DrivAge__3__x__VehPower__9  DrivAge__4__x__VehPower__9  \\\n",
-       "coefficient                     0.20417                    0.013222   \n",
-       "\n",
-       "             DrivAge__5__x__VehPower__9  DrivAge__6__x__VehPower__9  \n",
-       "coefficient                   -0.273913                    0.115693  \n",
-       "\n",
-       "[1 rows x 56 columns]"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "formula_noint = \"PurePremium ~ DrivAge * VehPower - 1\"\n",
     "\n",
-    "t_glm6 = GeneralizedLinearRegressor(\n",
-    "    family=TweedieDist,\n",
-    "    alpha_search=True,\n",
-    "    l1_ratio=1,\n",
-    "    fit_intercept=True,\n",
-    "    formula=formula_noint,\n",
-    "    interaction_separator=\"__x__\",\n",
-    "    categorical_format=\"{name}__{category}\",\n",
-    ")\n",
-    "t_glm6.fit(df_train, sample_weight=df[\"Exposure\"].values[train])\n",
-    "\n",
-    "pd.DataFrame(\n",
-    "    {\"coefficient\": np.concatenate(([t_glm6.intercept_], t_glm6.coef_))},\n",
-    "    index=[\"intercept\"] + t_glm6.feature_names_,\n",
-    ").T"
+    "with pytest.raises(ValueError, match=\"The formula sets the intercept to False\"):  # raised on fit\n",
+    "    t_glm6 = GeneralizedLinearRegressor(\n",
+    "        family=TweedieDist,\n",
+    "        alpha_search=True,\n",
+    "        l1_ratio=1,\n",
+    "        fit_intercept=True,\n",
+    "        formula=formula_noint,\n",
+    "        interaction_separator=\"__x__\",\n",
+    "        categorical_format=\"{name}__{category}\",\n",
+    "    ).fit(df_train, sample_weight=df[\"Exposure\"].values[train])"
    ]
   },
   {

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
@@ -244,8 +244,7 @@ def _parse_formula(
     formula : FormulaSpec
         The formula to parse.
     include_intercept: bool, default True
-        Whether to include an intercept column if the formula does not
-        include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly.
+        Whether to include an intercept column.
 
     Returns
     -------
@@ -2673,11 +2672,11 @@ def _set_up_and_check_fit_args(
 
                 intercept = "1" in X.model_spec.terms
                 if intercept != self.fit_intercept:
-                    warnings.warn(
-                        f"The formula explicitly sets the intercept to {intercept}, "
-                        f"overriding fit_intercept={self.fit_intercept}."
+                    raise ValueError(
+                        f"The formula sets the intercept to {intercept}, "
+                        f"contradicting fit_intercept={self.fit_intercept}. "
+                        "You should use fit_intercept to specify the intercept."
                     )
-                    self.fit_intercept = intercept
 
                 self.X_model_spec_ = X.model_spec
 
@@ -3104,6 +3103,7 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
     expected_information : bool, optional (default = False)
         If true, then the expected information matrix is computed by default.
         Only relevant when computing robust standard errors.
+
     formula : FormulaSpec
         A formula accepted by formulaic. It can either be a one-sided formula, in
         which case ``y`` must be specified in ``fit``, or a two-sided formula, in
@@ -3130,6 +3130,7 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
         - if 'zero', missing values will represent all-zero indicator columns.
         - if 'convert', missing values will be converted to the ``cat_missing_name``
           category.
+
     cat_missing_name: str, default='(MISSING)'
         Name of the category to which missing values will be converted if
         ``cat_missing_method='convert'``.  Only used if ``X`` is a pandas data frame.