diff --git a/.flake8 b/.flake8 deleted file mode 100644 index ebbd3d5a..00000000 --- a/.flake8 +++ /dev/null @@ -1,29 +0,0 @@ -# Taken directly from https://github.com/ambv/black/blob/master/.flake8 -[flake8] -ignore = - B905, - # allow calling dict() instead of the literal {} for readability - C408, - E203, - # too many leading # for block comments - E266, - # line too long, conflict with black - E501, - E731, - # linebreak before binary operator, conflict with black - W503, - C901, - D104, - D100, - # Why does flake8 think it should have any say in our docstring formatting?? - D205, - # see above... - D400 -max-line-length = 88 -max-complexity = 18 -select = B,C,E,F,W,T4,B9,D -enable-extensions = flake8-docstrings -per-file-ignores = - tests/**:D101,D102,D103 - src/glum/_glm.py:D -docstring-convention = numpy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3887a904..67fd3464 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,51 +1,24 @@ repos: - - repo: https://github.com/Quantco/pre-commit-mirrors-black - rev: 24.4.2 - hooks: - - id: black-conda - args: - - --safe - - --target-version=py39 - - repo: https://github.com/Quantco/pre-commit-mirrors-flake8 - rev: 7.0.0 - hooks: - - id: flake8-conda - additional_dependencies: [ - -c, - conda-forge, - flake8-bugbear=22.12.6, - flake8-builtins=2.1.0, - flake8-comprehensions=3.10.1, - flake8-docstrings=1.6.0, - flake8-print=5.0.0, - pep8-naming=0.13.3, - python<3.12, - ] - exclude: (^src/glum_benchmarks/orig_sklearn_fork/|^docs) - - repo: https://github.com/Quantco/pre-commit-mirrors-isort - rev: 5.13.2 - hooks: - - id: isort-conda - additional_dependencies: [toml] - - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.10.0" - hooks: - - id: mypy-conda - args: - - --check-untyped-defs - - --ignore-missing-imports - - --namespace-packages - exclude: ^tests/ - additional_dependencies: [-c, conda-forge, types-setuptools=67.5, attrs] - - repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade - rev: 3.15.2 - hooks: - - id: pyupgrade-conda - exclude: ^src/glum_benchmarks/orig_sklearn_fork/ - args: [--py39-plus] - - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint - rev: 0.16.2 - hooks: - - id: cython-lint-conda - args: [--no-pycodestyle] - - id: double-quote-cython-strings-conda + - repo: https://github.com/Quantco/pre-commit-mirrors-ruff + rev: 0.4.3 + hooks: + - id: ruff-conda + exclude: ^src/glum_benchmarks/orig_sklearn_fork/ + - id: ruff-format-conda + exclude: ^src/glum_benchmarks/orig_sklearn_fork/ + - repo: https://github.com/Quantco/pre-commit-mirrors-mypy + rev: "1.10.0" + hooks: + - id: mypy-conda + args: + - --check-untyped-defs + - --ignore-missing-imports + - --namespace-packages + exclude: (^tests/|^src/glum_benchmarks/orig_sklearn_fork/) + additional_dependencies: [-c, conda-forge, types-setuptools=67.5, attrs] + - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint + rev: 0.16.2 + hooks: + - id: cython-lint-conda + args: [--no-pycodestyle] + - id: double-quote-cython-strings-conda diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4a96aa8f..2b101a90 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,6 @@ Changelog ========= - Unreleased ---------- @@ -17,6 +16,7 @@ Unreleased **Other changes:** +- Moved the linting and formatting to ruff. - Removed libblas MKL from the development environment.
3.0.1 - 2024-05-23 diff --git a/docs/tutorials/formula_interface/load_transform_formula.py b/docs/tutorials/formula_interface/load_transform_formula.py index d0098f8b..ce272f83 100644 --- a/docs/tutorials/formula_interface/load_transform_formula.py +++ b/docs/tutorials/formula_interface/load_transform_formula.py @@ -8,17 +8,21 @@ def load_transform(): Summary of transformations: 1. We cut the number of claims to a maximum of 4, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). + (Case-study authors suspect a data error. See section 1 of their paper for + details). 2. We cut the exposure to a maximum of 1, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). - 3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim - (before aggregation per policy). Reason: For large claims, extreme value theory - might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of - the overall claim amount. This is a well known phenomenon for third-party liability. - 4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``. + (Case-study authors suspect a data error. See section 1 of their paper for + details). + 3. We define ``'ClaimAmountCut'`` as the claim amount cut at 100'000 per + single claim (before aggregation per policy). Reason: For large claims, + extreme value theory might apply. 100'000 is the 0.9984 quantile, any claims + larger account for 25% of the overall claim amount. This is a well-known + phenomenon for third-party liability. + 4. We aggregate the total claim amounts per policy ID and join them to + ``freMTPL2freq``. 5. We fix ``'ClaimNb'`` as the claim number with claim amount greater zero. - 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so - they can be used as categoricals later on. + 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized + into bins so they can be used as categoricals later on. """ # load the datasets # first row (=column names) uses "", all other rows use '' diff --git a/docs/tutorials/glm_french_motor_tutorial/load_transform.py b/docs/tutorials/glm_french_motor_tutorial/load_transform.py index d0098f8b..ce272f83 100644 --- a/docs/tutorials/glm_french_motor_tutorial/load_transform.py +++ b/docs/tutorials/glm_french_motor_tutorial/load_transform.py @@ -8,17 +8,21 @@ def load_transform(): Summary of transformations: 1. We cut the number of claims to a maximum of 4, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). + (Case-study authors suspect a data error. See section 1 of their paper for + details). 2. We cut the exposure to a maximum of 1, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). - 3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim - (before aggregation per policy). Reason: For large claims, extreme value theory - might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of - the overall claim amount. This is a well known phenomenon for third-party liability. - 4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``. + (Case-study authors suspect a data error. See section 1 of their paper for + details). + 3.
We define ``'ClaimAmountCut'`` as the claim amount cut at 100'000 per + single claim (before aggregation per policy). Reason: For large claims, + extreme value theory might apply. 100'000 is the 0.9984 quantile, any claims + larger account for 25% of the overall claim amount. This is a well-known + phenomenon for third-party liability. + 4. We aggregate the total claim amounts per policy ID and join them to + ``freMTPL2freq``. 5. We fix ``'ClaimNb'`` as the claim number with claim amount greater zero. - 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so - they can be used as categoricals later on. + 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized + into bins so they can be used as categoricals later on. """ # load the datasets # first row (=column names) uses "", all other rows use '' diff --git a/pyproject.toml b/pyproject.toml index fc6dc0a3..8ac88d8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,25 +8,30 @@ requires = [ 'scikit-learn', ] -[tool.black] -exclude = ''' -/( - \.eggs - | \.git - | \.venv - | build - | dist -)/ -''' - -[tool.isort] -multi_line_output = 3 -include_trailing_comma = true -ensure_newline_before_comments = true -line_length = 88 -known_first_party = "glum" -skip_glob = '\.eggs/*,\.git/*,\.venv/*,build/*,dist/*' -default_section = 'THIRDPARTY' +[tool.ruff] +line-length = 88 +target-version = "py39" + +[tool.ruff.lint] +ignore = ["E731", "N802", "N803", "N806"] +select = [ + # pyflakes + "F", + # pycodestyle + "E", "W", + # isort + "I", + # pep8-naming + "N", + # pyupgrade + "UP", +] + +[tool.ruff.lint.isort] +known-first-party = ["glum", "glum_benchmarks"] + +[tool.mypy] +python_version = '3.9' [tool.cibuildwheel] skip = [ @@ -51,6 +56,7 @@ CFLAGS="-I$CONDA/envs/build/include" CXXFLAGS="-I$CONDA/envs/build/include" CXX="/usr/bin/clang++" CC="/usr/bin/clang" +MACOSX_DEPLOYMENT_TARGET="10.13" [tool.pytest.ini_options] diff --git a/src/glum/_distribution.py b/src/glum/_distribution.py index f7facbd5..b7c42bd8 100644 --- a/src/glum/_distribution.py +++ b/src/glum/_distribution.py @@ -282,7 +282,9 @@ def _mu_deviance_derivative( link: Link, offset=None, ): - """Compute ``mu`` and the derivative of the deviance with respect to coefficients.""" + """Compute ``mu`` and the derivative of the deviance with respect to + coefficients.
+ """ lin_pred = _safe_lin_pred(X, coef, offset) mu = link.inverse(lin_pred) d1 = link.inverse_derivative(lin_pred) @@ -604,7 +606,6 @@ def power(self) -> float: @power.setter def power(self, power): - if not isinstance(power, (int, float, np.number)): raise TypeError(f"The power parameter must be numeric; got {power}.") if (power > 0) and (power < 1): @@ -622,7 +623,6 @@ def unit_variance_derivative(self, mu): # noqa D return numexpr.evaluate("p * mu ** (p - 1)") def deviance(self, y, mu, sample_weight=None) -> float: # noqa D - y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight @@ -641,7 +641,6 @@ def deviance(self, y, mu, sample_weight=None) -> float: # noqa D return tweedie_deviance(y, sample_weight, mu, p=float(self.power)) def unit_deviance(self, y, mu): # noqa D - if self.power == 0: # normal distribution return (y - mu) ** 2 if self.power == 1: # Poisson distribution @@ -803,7 +802,6 @@ def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D return 0 if np.isscalar(mu) else np.zeros_like(mu) def deviance(self, y, mu, sample_weight=None) -> float: # noqa D - y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight @@ -1031,7 +1029,6 @@ def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D return 2 * mu def deviance(self, y, mu, sample_weight=None) -> float: # noqa D - y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight @@ -1388,7 +1385,6 @@ def theta(self) -> float: @theta.setter def theta(self, theta): - if not isinstance(theta, (int, float)): raise TypeError(f"Theta must be numeric; got {theta}.") if not theta > 0: @@ -1517,7 +1513,6 @@ def guess_intercept( avg_y = np.average(y, weights=sample_weight) if isinstance(link, IdentityLink): - # This is only correct for the normal. For other distributions, the # answer is unknown, but we assume that we want `sum(y) = sum(mu)` @@ -1529,7 +1524,6 @@ def guess_intercept( return avg_y - avg_eta elif isinstance(link, LogLink): - # This is only correct for Tweedie log_avg_y = np.log(avg_y) @@ -1564,7 +1558,6 @@ def guess_intercept( return first - second elif isinstance(link, LogitLink): - log_odds = np.log(avg_y) - np.log(1 - avg_y) if eta is None: @@ -1575,7 +1568,6 @@ def guess_intercept( return log_odds - avg_eta else: - return link.link(y.dot(sample_weight)) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index edd0a1b6..8d60747b 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -5,7 +5,8 @@ https://github.com/scikit-learn/scikit-learn/pull/9405 Original attribution from: -https://github.com/scikit-learn/scikit-learn/pull/9405/files#diff-38e412190dc50455611b75cfcf2d002713dcf6d537a78b9a22cc6b1c164390d1 # noqa: B950 +https://github.com/scikit-learn/scikit-learn/pull/9405/files +#diff-38e412190dc50455611b75cfcf2d002713dcf6d537a78b9a22cc6b1c164390d1 ''' Author: Christian Lorentzen some parts and tricks stolen from other sklearn files. 
@@ -14,7 +15,6 @@ # License: BSD 3 clause - import copy import re import sys @@ -133,7 +133,9 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwar if res is not mat and original_type in (tm.DenseMatrix, tm.SparseMatrix): res = original_type( - res, column_names=mat.column_names, term_names=mat.term_names # type: ignore + res, + column_names=mat.column_names, # type: ignore + term_names=mat.term_names, # type: ignore ) return res @@ -462,7 +464,7 @@ def _standardize_warm_start( def get_family( - family: Union[str, ExponentialDispersionModel] + family: Union[str, ExponentialDispersionModel], ) -> ExponentialDispersionModel: if isinstance(family, ExponentialDispersionModel): return family @@ -552,7 +554,6 @@ def setup_p1( alpha: float, l1_ratio: float, ) -> np.ndarray: - if not isinstance(X, (tm.MatrixBase, tm.StandardizedMatrix)): raise TypeError @@ -593,7 +594,6 @@ def setup_p2( alpha: float, l1_ratio: float, ) -> Union[np.ndarray, sparse.spmatrix]: - if not isinstance(X, (tm.MatrixBase, tm.StandardizedMatrix)): raise TypeError @@ -2007,10 +2007,11 @@ def covariance_matrix( correction of :math:`\\frac{N}{N-p}`. The clustered covariance matrix uses a similar approach to the robust (HC-1) - covariance matrix. However, instead of using :math:`\\mathbf{G}^{T}(\\hat{\\theta} - \\mathbf{G}(\\hat{\\theta})` directly, we first sum over all the groups first. - The finite-sample correction is affected as well, becoming :math:`\\frac{M}{M-1} - \\frac{N}{N-p}` where :math:`M` is the number of groups. + covariance matrix. However, instead of using :math:`\\mathbf{G}^{T}( + \\hat{\\theta})\\mathbf{G}(\\hat{\\theta})` directly, we first sum over + all the groups. The finite-sample correction is affected as well, + becoming :math:`\\frac{M}{M-1}\\frac{N}{N-p}` where :math:`M` is the number + of groups. References ---------- @@ -2223,7 +2224,8 @@ def score( the deviance. Note that those two are equal for ``family='normal'``. :math:`D^2` is defined as - :math:`D^2 = 1 - \\frac{D(y_{\\mathrm{true}}, y_{\\mathrm{pred}})}{D_{\\mathrm{null}}}`, + :math:`D^2 = 1 - \\frac{D(y_{\\mathrm{true}}, y_{\\mathrm{pred}})} + {D_{\\mathrm{null}}}`, :math:`D_{\\mathrm{null}}` is the null deviance, i.e. the deviance of a model with intercept alone. The best possible score is one and it can be negative. @@ -2573,8 +2575,8 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): with inverse link function ``h`` and ``s=sample_weight``. Note that, for ``alpha=0`` the unregularized GLM is recovered. This is not the default behavior (see ``alpha`` parameter description for details). - Additionally, for ``sample_weight=None``, one has ``s_i=1`` and ``sum(s)=n_samples``. - For ``P1=P2='identity'``, the penalty is the elastic net:: + Additionally, for ``sample_weight=None``, one has ``s_i=1`` and + ``sum(s)=n_samples``. For ``P1=P2='identity'``, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2. @@ -2612,7 +2614,8 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {'identity', array-like, None}, shape (n_features,), optional (default='identity') + P1 : {'identity', array-like, None}, shape (n_features,), optional + (default='identity') This array controls the strength of the regularization for each coefficient independently.
A high value will lead to higher regularization while a value of zero will remove the regularization on this parameter. @@ -3019,7 +3022,7 @@ def _validate_hyperparameters(self) -> None: ): raise ValueError( "Penalty term must be a non-negative number;" - " got (alpha={})".format(self.alpha) + f" got (alpha={self.alpha})" ) if ( @@ -3031,7 +3034,7 @@ def _validate_hyperparameters(self) -> None: ): raise ValueError( "l1_ratio must be a number in interval [0, 1];" - " got (l1_ratio={})".format(self.l1_ratio) + f" got (l1_ratio={self.l1_ratio})" ) super()._validate_hyperparameters() @@ -3260,11 +3263,9 @@ def fit( _alpha = self.alpha if _alpha > 0 and self.l1_ratio > 0 and self._solver != "irls-cd": raise ValueError( - "The chosen solver (solver={}) can't deal " + f"The chosen solver (solver={self._solver}) can't deal " "with L1 penalties, which are included with " - "(alpha={}) and (l1_ratio={}).".format( - self._solver, _alpha, self.l1_ratio - ) + f"(alpha={_alpha}) and (l1_ratio={self.l1_ratio})." ) coef = self._solve( X=X, diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 04328d37..e4c2d8e6 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -40,7 +40,8 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): If you pass ``l1_ratio`` as an array, the ``fit`` method will choose the best value of ``l1_ratio`` and store it as ``self.l1_ratio``. - P1 : {'identity', array-like, None}, shape (n_features,), optional (default='identity') + P1 : {'identity', array-like, None}, shape (n_features,), optional + (default='identity') This array controls the strength of the regularization for each coefficient independently. A high value will lead to higher regularization while a value of zero will remove the regularization on this parameter. @@ -414,9 +415,8 @@ def _validate_hyperparameters(self) -> None: or np.any(l1_ratio > 1) ): raise ValueError( - "l1_ratio must be a number in interval [0, 1]; got l1_ratio={}".format( - self.l1_ratio - ) + "l1_ratio must be a number in interval [0, 1]; got " + f"l1_ratio={self.l1_ratio}" ) super()._validate_hyperparameters() diff --git a/src/glum/_link.py b/src/glum/_link.py index 179afac9..db2e44cc 100644 --- a/src/glum/_link.py +++ b/src/glum/_link.py @@ -234,7 +234,6 @@ def derivative(self, mu): # noqa D return 1.0 / (mu * (1 - mu)) def inverse(self, lin_pred): # noqa D - inv_logit = special.expit(lin_pred) eps50 = 50 * np.finfo(inv_logit.dtype).eps @@ -266,7 +265,6 @@ def derivative(self, mu): # noqa D return 1.0 / ((mu - 1) * (np.log1p(-mu))) def inverse(self, lin_pred): # noqa D - lin_pred = lin_pred inv_cloglog = -np.expm1(-np.exp(lin_pred)) eps50 = 50 * np.finfo(inv_cloglog.dtype).eps diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index ef8f6132..ccb7f4a5 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -105,7 +105,8 @@ def update_hessian(state, data, active_set): ``dH``. If ``threshold/data.hessian_approx == 0.0``, then we will always use every row. However, for ``data.hessian_approx != 0``, we include rows for which - ``include = (np.abs(hessian_rows_diff[i]) >= T * np.max(np.abs(hessian_rows_diff)))``. + ``include = (np.abs(hessian_rows_diff[i]) >= T * + np.max(np.abs(hessian_rows_diff)))``. Essentially, this criterion ignores data matrix rows that have not seen the second derivatives of their predictions change very much in the last @@ -344,7 +345,7 @@ def _irls_solver(inner_solver, coef, data) -> tuple[np.ndarray, int, int, list[l warnings.warn( "IRLS failed to converge. 
Increase" " the maximum number of iterations max_iter" - " (currently {})".format(data.max_iter), + f" (currently {data.max_iter})", ConvergenceWarning, ) return state.coef, state.n_iter, state.n_cycles, state.diagnostics diff --git a/src/glum/_util.py b/src/glum/_util.py index 83d30fe0..ec7b8d96 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -127,7 +127,6 @@ def _expand_categorical_penalties( penalty = np.asanyarray(penalty) if penalty.shape[0] == X.shape[1]: - if penalty.ndim == 2: raise ValueError( "When the penalty is two-dimensional, it must have the " @@ -147,7 +146,6 @@ def _expand_categorical_penalties( return np.array(expanded_penalty) else: - return penalty diff --git a/src/glum_benchmarks/bench_liblinear.py b/src/glum_benchmarks/bench_liblinear.py index 252efb4d..2890dbcd 100644 --- a/src/glum_benchmarks/bench_liblinear.py +++ b/src/glum_benchmarks/bench_liblinear.py @@ -47,8 +47,8 @@ def liblinear_bench( X = dat["X"] if not isinstance(X, (np.ndarray, sps.spmatrix, pd.DataFrame)): warnings.warn( - "liblinear requires data as scipy.sparse matrix, pandas dataframe, or numpy " - "array. Skipping." + "liblinear requires data as scipy.sparse matrix, pandas dataframe, or " + "numpy array. Skipping." ) return result diff --git a/src/glum_benchmarks/data/create_insurance.py b/src/glum_benchmarks/data/create_insurance.py index 9dcb587f..5d237555 100644 --- a/src/glum_benchmarks/data/create_insurance.py +++ b/src/glum_benchmarks/data/create_insurance.py @@ -14,7 +14,7 @@ from ..util import exposure_and_offset_to_weights -# taken from https://github.com/lorentzenchr/Tutorial_freMTPL2/blob/master/glm_freMTPL2_example.ipynb # noqa: B950 +# taken from https://github.com/lorentzenchr/Tutorial_freMTPL2/blob/master/glm_freMTPL2_example.ipynb # noqa: E501 # Modified to generate data sets of different sizes @@ -67,17 +67,15 @@ def create_insurance_raw_data(verbose=False) -> None: if verbose: print( - "Number or rows with ClaimAmountCut > 0 and ClaimNb == 0: {}".format( - df[(df.ClaimAmountCut > 0) & (df.ClaimNb == 0)].shape[0] - ) + "Number or rows with ClaimAmountCut > 0 and ClaimNb == 0: " + f"{df[(df.ClaimAmountCut > 0) & (df.ClaimNb == 0)].shape[0]}" ) # 9116 zero claims if verbose: print( - "Number or rows with ClaimAmountCut = 0 and ClaimNb >= 1: {}".format( - df[(df.ClaimAmountCut == 0) & (df.ClaimNb >= 1)].shape[0] - ) + "Number or rows with ClaimAmountCut = 0 and ClaimNb >= 1: " + f"{df[(df.ClaimAmountCut == 0) & (df.ClaimNb >= 1)].shape[0]}" ) # Note: Zero claims must be ignored in severity models, because the support is @@ -201,7 +199,7 @@ def make_column_transformer(*transformers, remainder: str = "drop"): # noqa: D1 def func_returns_df( - fn: Callable[[pd.DataFrame], np.ndarray] + fn: Callable[[pd.DataFrame], np.ndarray], ) -> Callable[[pd.DataFrame], pd.DataFrame]: """ Take a function that takes a dataframe and returns a Numpy array, and return a \ @@ -209,8 +207,8 @@ def func_returns_df( fn: Function that takes a dataframe and returns a numpy array Returns: Function that takes a dataframe and returns a dataframe with the values - determined by the original function, and the index and columns of the original - dataframe. + determined by the original function, and the index and columns of the + original dataframe. 
""" return lambda x: x.assign(**{x.columns[0]: fn(x)}) diff --git a/src/glum_benchmarks/orig_sklearn_fork/_glm.py b/src/glum_benchmarks/orig_sklearn_fork/_glm.py index 00f30f3a..84bb4afe 100644 --- a/src/glum_benchmarks/orig_sklearn_fork/_glm.py +++ b/src/glum_benchmarks/orig_sklearn_fork/_glm.py @@ -36,9 +36,6 @@ # So far, coefficients=w and sample weights=s. # - The intercept term is the first index, i.e. coef[0] - -from __future__ import division - import numbers import warnings from abc import ABCMeta, abstractmethod @@ -755,7 +752,7 @@ def power(self): @power.setter def power(self, power): if not isinstance(power, numbers.Real): - raise TypeError("power must be a real number, input was {}".format(power)) + raise TypeError(f"power must be a real number, input was {power}") self._upper_bound = np.inf self._include_upper_bound = False @@ -1056,7 +1053,7 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, max_iter, if not converged: warnings.warn( "irls failed to converge. Increase the number " - "of iterations (currently {})".format(max_iter), + f"of iterations (currently {max_iter})", ConvergenceWarning, ) @@ -1445,7 +1442,7 @@ def _cd_solver( warnings.warn( "Coordinate descent failed to converge. Increase" " the maximum number of iterations max_iter" - " (currently {})".format(max_iter), + f" (currently {max_iter})", ConvergenceWarning, ) @@ -1796,7 +1793,7 @@ def fit(self, X, y, sample_weight=None): "The family must be an instance of class" " ExponentialDispersionModel or an element of" " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={})".format(self.family) + f"'binomial']; got (family={self.family})" ) # Guarantee that self._link_instance is set to an instance of @@ -1819,7 +1816,7 @@ def fit(self, X, y, sample_weight=None): "No default link known for the " "specified distribution family. Please " "set link manually, i.e. 
not to 'auto'; " - "got (link='auto', family={}".format(self.family) + f"got (link='auto', family={self.family})" ) elif self.link == "identity": self._link_instance = IdentityLink() @@ -1831,13 +1828,13 @@ raise ValueError( "The link must be an instance of class Link or " "an element of ['auto', 'identity', 'log', 'logit']; " - "got (link={})".format(self.link) + f"got (link={self.link})" ) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError( "Penalty term must be a non-negative number;" - " got (alpha={})".format(self.alpha) + f" got (alpha={self.alpha})" ) if ( not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1 ): raise ValueError( "l1_ratio must be a number in interval [0, 1];" - " got (l1_ratio={})".format(self.l1_ratio) + f" got (l1_ratio={self.l1_ratio})" ) if not isinstance(self.fit_intercept, bool): raise ValueError( - "The argument fit_intercept must be bool;" - " got {}".format(self.fit_intercept) + f"The argument fit_intercept must be bool; got {self.fit_intercept}" ) if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]: raise ValueError( "GeneralizedLinearRegressor supports only solvers" " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" - " got {}".format(self.solver) + f" got {self.solver}" ) solver = self.solver if self.solver == "auto": @@ -1867,47 +1863,41 @@ solver = "cd" if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]: raise ValueError( - "The chosen solver (solver={}) can't deal " + f"The chosen solver (solver={solver}) can't deal " "with L1 penalties, which are included with " - "(alpha={}) and (l1_ratio={}).".format( - solver, self.alpha, self.l1_ratio - ) + f"(alpha={self.alpha}) and (l1_ratio={self.l1_ratio})."
) if not isinstance(self.max_iter, int) or self.max_iter <= 0: raise ValueError( "Maximum number of iteration must be a positive " "integer;" - " got (max_iter={!r})".format(self.max_iter) + f" got (max_iter={self.max_iter!r})" ) if not isinstance(self.tol, numbers.Number) or self.tol <= 0: raise ValueError( "Tolerance for stopping criteria must be " - "positive; got (tol={!r})".format(self.tol) + f"positive; got (tol={self.tol!r})" ) if not isinstance(self.warm_start, bool): raise ValueError( - "The argument warm_start must be bool;" - " got {}".format(self.warm_start) + f"The argument warm_start must be bool; got {self.warm_start}" ) if self.selection not in ["cyclic", "random"]: raise ValueError( "The argument selection must be 'cyclic' or " - "'random'; got (selection={})".format(self.selection) + f"'random'; got (selection={self.selection})" ) random_state = check_random_state(self.random_state) if not isinstance(self.diag_fisher, bool): raise ValueError( - "The argument diag_fisher must be bool;" - " got {}".format(self.diag_fisher) + f"The argument diag_fisher must be bool; got {self.diag_fisher}" ) if not isinstance(self.copy_X, bool): - raise ValueError( - "The argument copy_X must be bool;" " got {}".format(self.copy_X) - ) + raise ValueError(f"The argument copy_X must be bool; got {self.copy_X}") if not isinstance(self.check_input, bool): raise ValueError( "The argument check_input must be bool; got " - "(check_input={})".format(self.check_input) + f"(check_input={self.check_input})" ) family = self._family_instance @@ -1948,14 +1938,14 @@ except TypeError: raise TypeError( "The given P1 cannot be converted to a numeric" - "array; got (P1.dtype={}).".format(P1.dtype) + f" array; got (P1.dtype={P1.dtype})." ) if (P1.ndim != 1) or (P1.shape[0] != n_features): raise ValueError( "P1 must be either 'identity' or a 1d array " "with the length of X.shape[1]; " - "got (P1.shape[0]={}), " - "needed (X.shape[1]={}).".format(P1.shape[0], n_features) + f"got (P1.shape[0]={P1.shape[0]}), " + f"needed (X.shape[1]={n_features})." ) # If X is sparse, make P2 sparse, too.
if isinstance(self.P2, str) and self.P2 == "identity": @@ -1978,9 +1968,7 @@ "P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " - "got (P2.shape=({},)), needed ({},)".format( - P2.shape[0], X.shape[1] - ) + f"got (P2.shape=({P2.shape[0]},)), needed ({X.shape[1]},)" ) if sparse.issparse(X): P2 = ( @@ -1998,9 +1986,7 @@ "P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " - "got (P2.shape=({0}, {1})), needed ({2}, {2})".format( - P2.shape[0], P2.shape[1], X.shape[1] - ) + f"got (P2.shape=({P2.shape[0]}, {P2.shape[1]})), needed ({X.shape[1]}, {X.shape[1]})" ) start_params = self.start_params @@ -2009,7 +1995,7 @@ raise ValueError( "The argument start_params must be 'guess', " "'zero' or an array of correct length; " - "got(start_params={})".format(start_params) + f"got (start_params={start_params})" ) else: start_params = check_array( @@ -2026,11 +2012,7 @@ raise ValueError( "Start values for parameters must have the" "right length and dimension; required (length" - "={}, ndim=1); got (length={}, ndim={}).".format( - X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim, - ) + f"={X.shape[1] + self.fit_intercept}, ndim=1); got (length={start_params.shape[0]}, ndim={start_params.ndim})." ) l1 = self.alpha * self.l1_ratio @@ -2058,7 +2040,7 @@ if not np.all(family.in_y_range(y)): raise ValueError( "Some value(s) of y are out of the valid " - "range for family {}".format(family.__class__.__name__) + f"range for family {family.__class__.__name__}" ) # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. @@ -2405,8 +2387,8 @@ def estimate_phi(self, X, y, sample_weight=None): raise ValueError( "Estimation of dispersion parameter phi requires" " more samples than features, got" - " samples=X.shape[0]={} and" - " n_features=X.shape[1]+fit_intercept={}.".format(n_samples, n_features) + f" samples=X.shape[0]={n_samples} and" + f" n_features=X.shape[1]+fit_intercept={n_features}."
) mu = self._link_instance.inverse(eta) if self.fit_dispersion == "chisqr": diff --git a/tests/benchmark/test_cli.py b/tests/benchmark/test_cli.py index a609a3bb..b9672b0e 100644 --- a/tests/benchmark/test_cli.py +++ b/tests/benchmark/test_cli.py @@ -79,7 +79,7 @@ def test_correct_problems_run(): if not result.exit_code == 0: problem_name_str = " ".join(args) raise ValueError( - f"""Failed on problem {problem_name_str} with output: \n {result.output}""" + f"Failed on problem {problem_name_str} with output: \n {result.output}" ) problems_run = os.listdir(d) diff --git a/tests/glm/performance/test_performance.py b/tests/glm/performance/test_performance.py index 2dbc55d6..090de4e1 100644 --- a/tests/glm/performance/test_performance.py +++ b/tests/glm/performance/test_performance.py @@ -200,7 +200,8 @@ def runtime_checker(): force_all_finite=False, ) min_runtime, result = runtime( - lambda: model.fit(X=dat["X"], y=dat["y"]), 5 # noqa B023 + lambda: model.fit(X=dat["X"], y=dat["y"]), # noqa B023 + 5, ) # Let's just guess that we're about half flop-limited and half diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 7721f253..efa26075 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -1716,9 +1716,7 @@ def test_column_with_stddev_zero(): model = GeneralizedLinearRegressor( family="poisson", fit_intercept=False, scale_predictors=False - ).fit( - X, y - ) # noqa: F841 + ).fit(X, y) # noqa: F841 model = GeneralizedLinearRegressor(family="poisson").fit(X, y) # noqa: F841 @@ -3024,7 +3022,9 @@ def get_mixed_data(): "fit_intercept", [True, False], ids=["intercept", "no_intercept"] ) def test_formula(get_mixed_data, formula, drop_first, fit_intercept): - """Model with formula and model with externally constructed model matrix should match.""" + """Model with formula and model with externally constructed model matrix should + match. + """ data = get_mixed_data model_formula = GeneralizedLinearRegressor(