diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b63f3376..508b78e9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,10 @@ Changelog - Added the complementary log-log (`cloglog`) link function. +**Bug fix:** + +- Fixed :meth:`~glum.GeneralizedLinearRegressorBase.covariance_matrix` mutating feature names when called with a data frame. See `here `_. + **Other changes:** - When computing the covariance matrix, check for ill-conditionedness for all types of input. Furthermore, do it in a more efficient way. diff --git a/src/glum/_glm.py b/src/glum/_glm.py index db8a2086..55171258 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -1456,26 +1456,31 @@ def covariance_matrix( Cambridge university press """ - ( - X, - y, - sample_weight, - offset, - sum_weights, - P1, - P2, - ) = self._set_up_and_check_fit_args( + + if isinstance(X, pd.DataFrame) and hasattr(self, "feature_dtypes_"): + X = _align_df_categories(X, self.feature_dtypes_) + + X, y = check_X_y_tabmat_compliant( X, y, - sample_weight, - offset, - solver=self.solver, - force_all_finite=self.force_all_finite, + accept_sparse=["csr", "csc", "coo"], + dtype="numeric", + copy=self._should_copy_X(), + ensure_2d=True, + allow_nd=False, + drop_first=self.drop_first, ) - # Here we don't want sample_weight to be normalized to sum up to 1 - # We want sample_weight to sum up to the number of samples - sample_weight = sample_weight * sum_weights + if isinstance(X, np.ndarray): + X = tm.DenseMatrix(X) + if sparse.issparse(X) and not isinstance(X, tm.SparseMatrix): + X = tm.SparseMatrix(X) + + sample_weight = _check_weights( + sample_weight, y.shape[0], X.dtype, force_all_finite=self.force_all_finite + ) + sum_weights = np.sum(sample_weight) + offset = _check_offset(offset, y.shape[0], X.dtype) mu = self.predict(X, offset=offset) if mu is None else np.asanyarray(mu)