diff --git a/.flake8 b/.flake8 deleted file mode 100644 index ebbd3d5a..00000000 --- a/.flake8 +++ /dev/null @@ -1,29 +0,0 @@ -# Taken directly from https://github.com/ambv/black/blob/master/.flake8 -[flake8] -ignore = - B905, - # allow calling dict() instead of the literal {} for readability - C408, - E203, - # too many leading # for block comments - E266, - # line too long, conflict with black - E501, - E731, - # linebreak before binary operator, conflict with black - W503, - C901, - D104, - D100, - # Why does flake8 think it should have any say in our docstring formatting?? - D205, - # see above... - D400 -max-line-length = 88 -max-complexity = 18 -select = B,C,E,F,W,T4,B9,D -enable-extensions = flake8-docstrings -per-file-ignores = - tests/**:D101,D102,D103 - src/glum/_glm.py:D -docstring-convention = numpy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3887a904..67fd3464 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,51 +1,24 @@ repos: - - repo: https://github.com/Quantco/pre-commit-mirrors-black - rev: 24.4.2 - hooks: - - id: black-conda - args: - - --safe - - --target-version=py39 - - repo: https://github.com/Quantco/pre-commit-mirrors-flake8 - rev: 7.0.0 - hooks: - - id: flake8-conda - additional_dependencies: [ - -c, - conda-forge, - flake8-bugbear=22.12.6, - flake8-builtins=2.1.0, - flake8-comprehensions=3.10.1, - flake8-docstrings=1.6.0, - flake8-print=5.0.0, - pep8-naming=0.13.3, - python<3.12, - ] - exclude: (^src/glum_benchmarks/orig_sklearn_fork/|^docs) - - repo: https://github.com/Quantco/pre-commit-mirrors-isort - rev: 5.13.2 - hooks: - - id: isort-conda - additional_dependencies: [toml] - - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.10.0" - hooks: - - id: mypy-conda - args: - - --check-untyped-defs - - --ignore-missing-imports - - --namespace-packages - exclude: ^tests/ - additional_dependencies: [-c, conda-forge, types-setuptools=67.5, attrs] - - repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade - rev: 3.15.2 - hooks: - - id: pyupgrade-conda - exclude: ^src/glum_benchmarks/orig_sklearn_fork/ - args: [--py39-plus] - - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint - rev: 0.16.2 - hooks: - - id: cython-lint-conda - args: [--no-pycodestyle] - - id: double-quote-cython-strings-conda + - repo: https://github.com/Quantco/pre-commit-mirrors-ruff + rev: 0.4.3 + hooks: + - id: ruff-conda + exclude: ^src/glum_benchmarks/orig_sklearn_fork/ + - id: ruff-format-conda + exclude: ^src/glum_benchmarks/orig_sklearn_fork/ + - repo: https://github.com/Quantco/pre-commit-mirrors-mypy + rev: "1.10.0" + hooks: + - id: mypy-conda + args: + - --check-untyped-defs + - --ignore-missing-imports + - --namespace-packages + exclude: (^tests/|^src/glum_benchmarks/orig_sklearn_fork/) + additional_dependencies: [-c, conda-forge, types-setuptools=67.5, attrs] + - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint + rev: 0.16.2 + hooks: + - id: cython-lint-conda + args: [--no-pycodestyle] + - id: double-quote-cython-strings-conda diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4a96aa8f..2b101a90 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,6 @@ Changelog ========= - Unreleased ---------- @@ -17,6 +16,7 @@ Unreleased **Other changes:** +- Moved the linting and formatting to ruff. - Removed libblas MKL from the development environment.
3.0.1 - 2024-05-23 diff --git a/docs/tutorials/formula_interface/load_transform_formula.py b/docs/tutorials/formula_interface/load_transform_formula.py index d0098f8b..ce272f83 100644 --- a/docs/tutorials/formula_interface/load_transform_formula.py +++ b/docs/tutorials/formula_interface/load_transform_formula.py @@ -8,17 +8,21 @@ def load_transform(): Summary of transformations: 1. We cut the number of claims to a maximum of 4, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). + (Case-study authors suspect a data error. See section 1 of their paper for + details). 2. We cut the exposure to a maximum of 1, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). - 3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim - (before aggregation per policy). Reason: For large claims, extreme value theory - might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of - the overall claim amount. This is a well known phenomenon for third-party liability. - 4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``. + (Case-study authors suspect a data error. See section 1 of their paper for + details). + 3. We define ``'ClaimAmountCut'`` as the claim amount cut at 100'000 per + single claim (before aggregation per policy). Reason: For large claims, + extreme value theory might apply. 100'000 is the 0.9984 quantile, any claims + larger account for 25% of the overall claim amount. This is a well-known + phenomenon for third-party liability. + 4. We aggregate the total claim amounts per policy ID and join them to + ``freMTPL2freq``. 5. We fix ``'ClaimNb'`` as the claim number with claim amount greater zero. - 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so - they can be used as categoricals later on. + 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized + into bins so they can be used as categoricals later on. """ # load the datasets # first row (=column names) uses "", all other rows use '' diff --git a/docs/tutorials/glm_french_motor_tutorial/load_transform.py b/docs/tutorials/glm_french_motor_tutorial/load_transform.py index d0098f8b..ce272f83 100644 --- a/docs/tutorials/glm_french_motor_tutorial/load_transform.py +++ b/docs/tutorials/glm_french_motor_tutorial/load_transform.py @@ -8,17 +8,21 @@ def load_transform(): Summary of transformations: 1. We cut the number of claims to a maximum of 4, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). + (Case-study authors suspect a data error. See section 1 of their paper for + details). 2. We cut the exposure to a maximum of 1, as is done in the case study paper - (Case-study authors suspect a data error. See section 1 of their paper for details). - 3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim - (before aggregation per policy). Reason: For large claims, extreme value theory - might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of - the overall claim amount. This is a well known phenomenon for third-party liability. - 4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``. + (Case-study authors suspect a data error. See section 1 of their paper for + details). + 3.
We define ``'ClaimAmountCut'`` as the claim amount cut at 100'000 per + single claim (before aggregation per policy). Reason: For large claims, + extreme value theory might apply. 100'000 is the 0.9984 quantile, any claims + larger account for 25% of the overall claim amount. This is a well-known + phenomenon for third-party liability. + 4. We aggregate the total claim amounts per policy ID and join them to + ``freMTPL2freq``. 5. We fix ``'ClaimNb'`` as the claim number with claim amount greater zero. - 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so - they can be used as categoricals later on. + 6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized + into bins so they can be used as categoricals later on. """ # load the datasets # first row (=column names) uses "", all other rows use '' diff --git a/pyproject.toml b/pyproject.toml index fc6dc0a3..8ac88d8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,25 +8,30 @@ requires = [ 'scikit-learn', ] -[tool.black] -exclude = ''' -/( - \.eggs - | \.git - | \.venv - | build - | dist -)/ -''' - -[tool.isort] -multi_line_output = 3 -include_trailing_comma = true -ensure_newline_before_comments = true -line_length = 88 -known_first_party = "glum" -skip_glob = '\.eggs/*,\.git/*,\.venv/*,build/*,dist/*' -default_section = 'THIRDPARTY' +[tool.ruff] +line-length = 88 +target-version = "py39" + +[tool.ruff.lint] +ignore = ["E731", "N802", "N803", "N806"] +select = [ + # pyflakes + "F", + # pycodestyle + "E", "W", + # isort + "I", + # pep8-naming + "N", + # pyupgrade + "UP", +] + +[tool.ruff.lint.isort] +known-first-party = ["glum", "glum_benchmarks"] + +[tool.mypy] +python_version = '3.9' [tool.cibuildwheel] skip = [ @@ -51,6 +56,7 @@ CFLAGS="-I$CONDA/envs/build/include" CXXFLAGS="-I$CONDA/envs/build/include" CXX="/usr/bin/clang++" CC="/usr/bin/clang" +MACOSX_DEPLOYMENT_TARGET="10.13" [tool.pytest.ini_options] diff --git a/src/glum/_distribution.py b/src/glum/_distribution.py index f7facbd5..b7c42bd8 100644 --- a/src/glum/_distribution.py +++ b/src/glum/_distribution.py @@ -282,7 +282,9 @@ def _mu_deviance_derivative( link: Link, offset=None, ): - """Compute ``mu`` and the derivative of the deviance with respect to coefficients.""" + """Compute ``mu`` and the derivative of the deviance with respect to + coefficients.
+ """ lin_pred = _safe_lin_pred(X, coef, offset) mu = link.inverse(lin_pred) d1 = link.inverse_derivative(lin_pred) @@ -604,7 +606,6 @@ def power(self) -> float: @power.setter def power(self, power): - if not isinstance(power, (int, float, np.number)): raise TypeError(f"The power parameter must be numeric; got {power}.") if (power > 0) and (power < 1): @@ -622,7 +623,6 @@ def unit_variance_derivative(self, mu): # noqa D return numexpr.evaluate("p * mu ** (p - 1)") def deviance(self, y, mu, sample_weight=None) -> float: # noqa D - y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight @@ -641,7 +641,6 @@ def deviance(self, y, mu, sample_weight=None) -> float: # noqa D return tweedie_deviance(y, sample_weight, mu, p=float(self.power)) def unit_deviance(self, y, mu): # noqa D - if self.power == 0: # normal distribution return (y - mu) ** 2 if self.power == 1: # Poisson distribution @@ -803,7 +802,6 @@ def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D return 0 if np.isscalar(mu) else np.zeros_like(mu) def deviance(self, y, mu, sample_weight=None) -> float: # noqa D - y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight @@ -1031,7 +1029,6 @@ def unit_variance_derivative(self, mu) -> np.ndarray: # noqa D return 2 * mu def deviance(self, y, mu, sample_weight=None) -> float: # noqa D - y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight) sample_weight = np.ones_like(y) if sample_weight is None else sample_weight @@ -1388,7 +1385,6 @@ def theta(self) -> float: @theta.setter def theta(self, theta): - if not isinstance(theta, (int, float)): raise TypeError(f"Theta must be numeric; got {theta}.") if not theta > 0: @@ -1517,7 +1513,6 @@ def guess_intercept( avg_y = np.average(y, weights=sample_weight) if isinstance(link, IdentityLink): - # This is only correct for the normal. For other distributions, the # answer is unknown, but we assume that we want `sum(y) = sum(mu)` @@ -1529,7 +1524,6 @@ def guess_intercept( return avg_y - avg_eta elif isinstance(link, LogLink): - # This is only correct for Tweedie log_avg_y = np.log(avg_y) @@ -1564,7 +1558,6 @@ def guess_intercept( return first - second elif isinstance(link, LogitLink): - log_odds = np.log(avg_y) - np.log(1 - avg_y) if eta is None: @@ -1575,7 +1568,6 @@ def guess_intercept( return log_odds - avg_eta else: - return link.link(y.dot(sample_weight)) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index edd0a1b6..8d60747b 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -5,7 +5,8 @@ https://github.com/scikit-learn/scikit-learn/pull/9405 Original attribution from: -https://github.com/scikit-learn/scikit-learn/pull/9405/files#diff-38e412190dc50455611b75cfcf2d002713dcf6d537a78b9a22cc6b1c164390d1 # noqa: B950 +https://github.com/scikit-learn/scikit-learn/pull/9405/files +#diff-38e412190dc50455611b75cfcf2d002713dcf6d537a78b9a22cc6b1c164390d1 ''' Author: Christian Lorentzen some parts and tricks stolen from other sklearn files. 
@@ -14,7 +15,6 @@ # License: BSD 3 clause - import copy import re import sys @@ -133,7 +133,9 @@ def check_array_tabmat_compliant(mat: ArrayLike, drop_first: int = False, **kwar if res is not mat and original_type in (tm.DenseMatrix, tm.SparseMatrix): res = original_type( - res, column_names=mat.column_names, term_names=mat.term_names # type: ignore + res, + column_names=mat.column_names, # type: ignore + term_names=mat.term_names, # type: ignore ) return res @@ -462,7 +464,7 @@ def _standardize_warm_start( def get_family( - family: Union[str, ExponentialDispersionModel] + family: Union[str, ExponentialDispersionModel], ) -> ExponentialDispersionModel: if isinstance(family, ExponentialDispersionModel): return family @@ -552,7 +554,6 @@ def setup_p1( alpha: float, l1_ratio: float, ) -> np.ndarray: - if not isinstance(X, (tm.MatrixBase, tm.StandardizedMatrix)): raise TypeError @@ -593,7 +594,6 @@ def setup_p2( alpha: float, l1_ratio: float, ) -> Union[np.ndarray, sparse.spmatrix]: - if not isinstance(X, (tm.MatrixBase, tm.StandardizedMatrix)): raise TypeError @@ -2007,10 +2007,11 @@ def covariance_matrix( correction of :math:`\\frac{N}{N-p}`. The clustered covariance matrix uses a similar approach to the robust (HC-1) - covariance matrix. However, instead of using :math:`\\mathbf{G}^{T}(\\hat{\\theta} - \\mathbf{G}(\\hat{\\theta})` directly, we first sum over all the groups first. - The finite-sample correction is affected as well, becoming :math:`\\frac{M}{M-1} - \\frac{N}{N-p}` where :math:`M` is the number of groups. + covariance matrix. However, instead of using :math:`\\mathbf{G}^{T}( + \\hat{\\theta})\\mathbf{G}(\\hat{\\theta})` directly, we first sum over + all the groups. The finite-sample correction is affected as well, + becoming :math:`\\frac{M}{M-1}\\frac{N}{N-p}` where :math:`M` is the number + of groups. References ---------- @@ -2223,7 +2224,8 @@ def score( the deviance. Note that those two are equal for ``family='normal'``. :math:`D^2` is defined as - :math:`D^2 = 1 - \\frac{D(y_{\\mathrm{true}}, y_{\\mathrm{pred}})}{D_{\\mathrm{null}}}`, + :math:`D^2 = 1 - \\frac{D(y_{\\mathrm{true}}, y_{\\mathrm{pred}})} + {D_{\\mathrm{null}}}`, :math:`D_{\\mathrm{null}}` is the null deviance, i.e. the deviance of a model with intercept alone. The best possible score is one and it can be negative. @@ -2573,8 +2575,8 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): with inverse link function ``h`` and ``s=sample_weight``. Note that, for ``alpha=0`` the unregularized GLM is recovered. This is not the default behavior (see ``alpha`` parameter description for details). - Additionally, for ``sample_weight=None``, one has ``s_i=1`` and ``sum(s)=n_samples``. - For ``P1=P2='identity'``, the penalty is the elastic net:: + Additionally, for ``sample_weight=None``, one has ``s_i=1`` and + ``sum(s)=n_samples``. For ``P1=P2='identity'``, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2. @@ -2612,7 +2614,8 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {'identity', array-like, None}, shape (n_features,), optional (default='identity') + P1 : {'identity', array-like, None}, shape (n_features,), optional + (default='identity') This array controls the strength of the regularization for each coefficient independently.
A high value will lead to higher regularization while a value of zero will remove the regularization on this parameter. @@ -3019,7 +3022,7 @@ def _validate_hyperparameters(self) -> None: ): raise ValueError( "Penalty term must be a non-negative number;" - " got (alpha={})".format(self.alpha) + f" got (alpha={self.alpha})" ) if ( @@ -3031,7 +3034,7 @@ def _validate_hyperparameters(self) -> None: ): raise ValueError( "l1_ratio must be a number in interval [0, 1];" - " got (l1_ratio={})".format(self.l1_ratio) + f" got (l1_ratio={self.l1_ratio})" ) super()._validate_hyperparameters() @@ -3260,11 +3263,9 @@ def fit( _alpha = self.alpha if _alpha > 0 and self.l1_ratio > 0 and self._solver != "irls-cd": raise ValueError( - "The chosen solver (solver={}) can't deal " + f"The chosen solver (solver={self._solver}) can't deal " "with L1 penalties, which are included with " - "(alpha={}) and (l1_ratio={}).".format( - self._solver, _alpha, self.l1_ratio - ) + f"(alpha={_alpha}) and (l1_ratio={self.l1_ratio})." ) coef = self._solve( X=X, diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 04328d37..e4c2d8e6 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -40,7 +40,8 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): If you pass ``l1_ratio`` as an array, the ``fit`` method will choose the best value of ``l1_ratio`` and store it as ``self.l1_ratio``. - P1 : {'identity', array-like, None}, shape (n_features,), optional (default='identity') + P1 : {'identity', array-like, None}, shape (n_features,), optional + (default='identity') This array controls the strength of the regularization for each coefficient independently. A high value will lead to higher regularization while a value of zero will remove the regularization on this parameter. @@ -414,9 +415,8 @@ def _validate_hyperparameters(self) -> None: or np.any(l1_ratio > 1) ): raise ValueError( - "l1_ratio must be a number in interval [0, 1]; got l1_ratio={}".format( - self.l1_ratio - ) + "l1_ratio must be a number in interval [0, 1]; got " + f"l1_ratio={self.l1_ratio}" ) super()._validate_hyperparameters() diff --git a/src/glum/_link.py b/src/glum/_link.py index 179afac9..db2e44cc 100644 --- a/src/glum/_link.py +++ b/src/glum/_link.py @@ -234,7 +234,6 @@ def derivative(self, mu): # noqa D return 1.0 / (mu * (1 - mu)) def inverse(self, lin_pred): # noqa D - inv_logit = special.expit(lin_pred) eps50 = 50 * np.finfo(inv_logit.dtype).eps @@ -266,7 +265,6 @@ def derivative(self, mu): # noqa D return 1.0 / ((mu - 1) * (np.log1p(-mu))) def inverse(self, lin_pred): # noqa D - lin_pred = lin_pred inv_cloglog = -np.expm1(-np.exp(lin_pred)) eps50 = 50 * np.finfo(inv_cloglog.dtype).eps diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index ef8f6132..ccb7f4a5 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -105,7 +105,8 @@ def update_hessian(state, data, active_set): ``dH``. If ``threshold/data.hessian_approx == 0.0``, then we will always use every row. However, for ``data.hessian_approx != 0``, we include rows for which - ``include = (np.abs(hessian_rows_diff[i]) >= T * np.max(np.abs(hessian_rows_diff)))``. + ``include = (np.abs(hessian_rows_diff[i]) >= T * + np.max(np.abs(hessian_rows_diff)))``. Essentially, this criterion ignores data matrix rows that have not seen the second derivatives of their predictions change very much in the last @@ -344,7 +345,7 @@ def _irls_solver(inner_solver, coef, data) -> tuple[np.ndarray, int, int, list[l warnings.warn( "IRLS failed to converge. 
Increase" " the maximum number of iterations max_iter" - " (currently {})".format(data.max_iter), + f" (currently {data.max_iter})", ConvergenceWarning, ) return state.coef, state.n_iter, state.n_cycles, state.diagnostics diff --git a/src/glum/_util.py b/src/glum/_util.py index 83d30fe0..ec7b8d96 100644 --- a/src/glum/_util.py +++ b/src/glum/_util.py @@ -127,7 +127,6 @@ def _expand_categorical_penalties( penalty = np.asanyarray(penalty) if penalty.shape[0] == X.shape[1]: - if penalty.ndim == 2: raise ValueError( "When the penalty is two-dimensional, it must have the " @@ -147,7 +146,6 @@ def _expand_categorical_penalties( return np.array(expanded_penalty) else: - return penalty diff --git a/src/glum_benchmarks/bench_liblinear.py b/src/glum_benchmarks/bench_liblinear.py index 252efb4d..2890dbcd 100644 --- a/src/glum_benchmarks/bench_liblinear.py +++ b/src/glum_benchmarks/bench_liblinear.py @@ -47,8 +47,8 @@ def liblinear_bench( X = dat["X"] if not isinstance(X, (np.ndarray, sps.spmatrix, pd.DataFrame)): warnings.warn( - "liblinear requires data as scipy.sparse matrix, pandas dataframe, or numpy " - "array. Skipping." + "liblinear requires data as scipy.sparse matrix, pandas dataframe, or " + "numpy array. Skipping." ) return result diff --git a/src/glum_benchmarks/data/create_insurance.py b/src/glum_benchmarks/data/create_insurance.py index 9dcb587f..5d237555 100644 --- a/src/glum_benchmarks/data/create_insurance.py +++ b/src/glum_benchmarks/data/create_insurance.py @@ -14,7 +14,7 @@ from ..util import exposure_and_offset_to_weights -# taken from https://github.com/lorentzenchr/Tutorial_freMTPL2/blob/master/glm_freMTPL2_example.ipynb # noqa: B950 +# taken from https://github.com/lorentzenchr/Tutorial_freMTPL2/blob/master/glm_freMTPL2_example.ipynb # noqa: E501 # Modified to generate data sets of different sizes @@ -67,17 +67,15 @@ def create_insurance_raw_data(verbose=False) -> None: if verbose: print( - "Number or rows with ClaimAmountCut > 0 and ClaimNb == 0: {}".format( - df[(df.ClaimAmountCut > 0) & (df.ClaimNb == 0)].shape[0] - ) + "Number or rows with ClaimAmountCut > 0 and ClaimNb == 0: " + f"{df[(df.ClaimAmountCut > 0) & (df.ClaimNb == 0)].shape[0]}" ) # 9116 zero claims if verbose: print( - "Number or rows with ClaimAmountCut = 0 and ClaimNb >= 1: {}".format( - df[(df.ClaimAmountCut == 0) & (df.ClaimNb >= 1)].shape[0] - ) + "Number or rows with ClaimAmountCut = 0 and ClaimNb >= 1: " + f"{df[(df.ClaimAmountCut == 0) & (df.ClaimNb >= 1)].shape[0]}" ) # Note: Zero claims must be ignored in severity models, because the support is @@ -201,7 +199,7 @@ def make_column_transformer(*transformers, remainder: str = "drop"): # noqa: D1 def func_returns_df( - fn: Callable[[pd.DataFrame], np.ndarray] + fn: Callable[[pd.DataFrame], np.ndarray], ) -> Callable[[pd.DataFrame], pd.DataFrame]: """ Take a function that takes a dataframe and returns a Numpy array, and return a \ @@ -209,8 +207,8 @@ def func_returns_df( fn: Function that takes a dataframe and returns a numpy array Returns: Function that takes a dataframe and returns a dataframe with the values - determined by the original function, and the index and columns of the original - dataframe. + determined by the original function, and the index and columns of the + original dataframe. 
""" return lambda x: x.assign(**{x.columns[0]: fn(x)}) diff --git a/src/glum_benchmarks/orig_sklearn_fork/_glm.py b/src/glum_benchmarks/orig_sklearn_fork/_glm.py index 00f30f3a..84bb4afe 100644 --- a/src/glum_benchmarks/orig_sklearn_fork/_glm.py +++ b/src/glum_benchmarks/orig_sklearn_fork/_glm.py @@ -36,9 +36,6 @@ # So far, coefficients=w and sample weights=s. # - The intercept term is the first index, i.e. coef[0] - -from __future__ import division - import numbers import warnings from abc import ABCMeta, abstractmethod @@ -755,7 +752,7 @@ def power(self): @power.setter def power(self, power): if not isinstance(power, numbers.Real): - raise TypeError("power must be a real number, input was {}".format(power)) + raise TypeError(f"power must be a real number, input was {power}") self._upper_bound = np.inf self._include_upper_bound = False @@ -1056,7 +1053,7 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, max_iter, if not converged: warnings.warn( "irls failed to converge. Increase the number " - "of iterations (currently {})".format(max_iter), + f"of iterations (currently {max_iter})", ConvergenceWarning, ) @@ -1445,7 +1442,7 @@ def _cd_solver( warnings.warn( "Coordinate descent failed to converge. Increase" " the maximum number of iterations max_iter" - " (currently {})".format(max_iter), + f" (currently {max_iter})", ConvergenceWarning, ) @@ -1796,7 +1793,7 @@ def fit(self, X, y, sample_weight=None): "The family must be an instance of class" " ExponentialDispersionModel or an element of" " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={})".format(self.family) + f"'binomial']; got (family={self.family})" ) # Guarantee that self._link_instance is set to an instance of @@ -1819,7 +1816,7 @@ def fit(self, X, y, sample_weight=None): "No default link known for the " "specified distribution family. Please " "set link manually, i.e. 
not to 'auto'; " - "got (link='auto', family={}".format(self.family) + f"got (link='auto', family={self.family})" ) elif self.link == "identity": self._link_instance = IdentityLink() @@ -1831,13 +1828,13 @@ raise ValueError( "The link must be an instance of class Link or " "an element of ['auto', 'identity', 'log', 'logit']; " - "got (link={})".format(self.link) + f"got (link={self.link})" ) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError( "Penalty term must be a non-negative number;" - " got (alpha={})".format(self.alpha) + f" got (alpha={self.alpha})" ) if ( not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1 ): raise ValueError( "l1_ratio must be a number in interval [0, 1];" - " got (l1_ratio={})".format(self.l1_ratio) + f" got (l1_ratio={self.l1_ratio})" ) if not isinstance(self.fit_intercept, bool): raise ValueError( - "The argument fit_intercept must be bool;" - " got {}".format(self.fit_intercept) + f"The argument fit_intercept must be bool; got {self.fit_intercept}" ) if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]: raise ValueError( "GeneralizedLinearRegressor supports only solvers" " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" - " got {}".format(self.solver) + f" got {self.solver}" ) solver = self.solver if self.solver == "auto": @@ -1867,47 +1863,41 @@ solver = "cd" if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]: raise ValueError( - "The chosen solver (solver={}) can't deal " + f"The chosen solver (solver={solver}) can't deal " "with L1 penalties, which are included with " - "(alpha={}) and (l1_ratio={}).".format( - solver, self.alpha, self.l1_ratio - ) + f"(alpha={self.alpha}) and (l1_ratio={self.l1_ratio})."
) if not isinstance(self.max_iter, int) or self.max_iter <= 0: raise ValueError( "Maximum number of iteration must be a positive " "integer;" - " got (max_iter={!r})".format(self.max_iter) + f" got (max_iter={self.max_iter!r})" ) if not isinstance(self.tol, numbers.Number) or self.tol <= 0: raise ValueError( "Tolerance for stopping criteria must be " - "positive; got (tol={!r})".format(self.tol) + f"positive; got (tol={self.tol!r})" ) if not isinstance(self.warm_start, bool): raise ValueError( - "The argument warm_start must be bool;" - " got {}".format(self.warm_start) + f"The argument warm_start must be bool; got {self.warm_start}" ) if self.selection not in ["cyclic", "random"]: raise ValueError( "The argument selection must be 'cyclic' or " - "'random'; got (selection={})".format(self.selection) + f"'random'; got (selection={self.selection})" ) random_state = check_random_state(self.random_state) if not isinstance(self.diag_fisher, bool): raise ValueError( - "The argument diag_fisher must be bool;" - " got {}".format(self.diag_fisher) + f"The argument diag_fisher must be bool; got {self.diag_fisher}" ) if not isinstance(self.copy_X, bool): - raise ValueError( - "The argument copy_X must be bool;" " got {}".format(self.copy_X) - ) + raise ValueError(f"The argument copy_X must be bool; got {self.copy_X}") if not isinstance(self.check_input, bool): raise ValueError( "The argument check_input must be bool; got " - "(check_input={})".format(self.check_input) + f"(check_input={self.check_input})" ) family = self._family_instance @@ -1948,14 +1938,14 @@ except TypeError: raise TypeError( "The given P1 cannot be converted to a numeric" - "array; got (P1.dtype={}).".format(P1.dtype) + f" array; got (P1.dtype={P1.dtype})." ) if (P1.ndim != 1) or (P1.shape[0] != n_features): raise ValueError( "P1 must be either 'identity' or a 1d array " "with the length of X.shape[1]; " - "got (P1.shape[0]={}), " - "needed (X.shape[1]={}).".format(P1.shape[0], n_features) + f"got (P1.shape[0]={P1.shape[0]}), " + f"needed (X.shape[1]={n_features})." ) # If X is sparse, make P2 sparse, too.
if isinstance(self.P2, str) and self.P2 == "identity": @@ -1978,9 +1968,7 @@ "P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " - "got (P2.shape=({},)), needed ({},)".format( - P2.shape[0], X.shape[1] - ) + f"got (P2.shape=({P2.shape[0]},)), needed ({X.shape[1]},)" ) if sparse.issparse(X): P2 = ( @@ -1998,9 +1986,7 @@ "P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " - "got (P2.shape=({0}, {1})), needed ({2}, {2})".format( - P2.shape[0], P2.shape[1], X.shape[1] - ) + f"got (P2.shape=({P2.shape[0]}, {P2.shape[1]})), needed ({X.shape[1]}, {X.shape[1]})" ) start_params = self.start_params @@ -2009,7 +1995,7 @@ raise ValueError( "The argument start_params must be 'guess', " "'zero' or an array of correct length; " - "got(start_params={})".format(start_params) + f"got (start_params={start_params})" ) else: start_params = check_array( @@ -2026,11 +2012,7 @@ raise ValueError( "Start values for parameters must have the" "right length and dimension; required (length" - "={}, ndim=1); got (length={}, ndim={}).".format( - X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim, - ) + f"={X.shape[1] + self.fit_intercept}, ndim=1); got (length={start_params.shape[0]}, ndim={start_params.ndim})." ) l1 = self.alpha * self.l1_ratio @@ -2058,7 +2040,7 @@ if not np.all(family.in_y_range(y)): raise ValueError( "Some value(s) of y are out of the valid " - "range for family {}".format(family.__class__.__name__) + f"range for family {family.__class__.__name__}" ) # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. @@ -2405,8 +2387,8 @@ def estimate_phi(self, X, y, sample_weight=None): raise ValueError( "Estimation of dispersion parameter phi requires" " more samples than features, got" - " samples=X.shape[0]={} and" - " n_features=X.shape[1]+fit_intercept={}.".format(n_samples, n_features) + f" samples=X.shape[0]={n_samples} and" + f" n_features=X.shape[1]+fit_intercept={n_features}."
) mu = self._link_instance.inverse(eta) if self.fit_dispersion == "chisqr": diff --git a/tests/benchmark/test_cli.py b/tests/benchmark/test_cli.py index a609a3bb..b9672b0e 100644 --- a/tests/benchmark/test_cli.py +++ b/tests/benchmark/test_cli.py @@ -79,7 +79,7 @@ def test_correct_problems_run(): if not result.exit_code == 0: problem_name_str = " ".join(args) raise ValueError( - f"""Failed on problem {problem_name_str} with output: \n {result.output}""" + f"Failed on problem {problem_name_str} with output: \n {result.output}" ) problems_run = os.listdir(d) diff --git a/tests/glm/performance/test_performance.py b/tests/glm/performance/test_performance.py index 2dbc55d6..090de4e1 100644 --- a/tests/glm/performance/test_performance.py +++ b/tests/glm/performance/test_performance.py @@ -200,7 +200,8 @@ def runtime_checker(): force_all_finite=False, ) min_runtime, result = runtime( - lambda: model.fit(X=dat["X"], y=dat["y"]), 5 # noqa B023 + lambda: model.fit(X=dat["X"], y=dat["y"]), # noqa B023 + 5, ) # Let's just guess that we're about half flop-limited and half diff --git a/tests/glm/test_glm.py b/tests/glm/test_glm.py index 7721f253..efa26075 100644 --- a/tests/glm/test_glm.py +++ b/tests/glm/test_glm.py @@ -1716,9 +1716,7 @@ def test_column_with_stddev_zero(): model = GeneralizedLinearRegressor( family="poisson", fit_intercept=False, scale_predictors=False - ).fit( - X, y - ) # noqa: F841 + ).fit(X, y) # noqa: F841 model = GeneralizedLinearRegressor(family="poisson").fit(X, y) # noqa: F841 @@ -3024,7 +3022,9 @@ def get_mixed_data(): "fit_intercept", [True, False], ids=["intercept", "no_intercept"] ) def test_formula(get_mixed_data, formula, drop_first, fit_intercept): - """Model with formula and model with externally constructed model matrix should match.""" + """Model with formula and model with externally constructed model matrix should + match. + """ data = get_mixed_data model_formula = GeneralizedLinearRegressor(