Use ruff as our linter and formatter (#800)
* new linting system

* changelog

* add glum_benchmarks as known first party

* Update test_performance.py

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>

* Update create_insurance.py

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>

* Update create_insurance.py

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>

* also check for line length

---------

Co-authored-by: Luca Bittarello <15511539+lbittarello@users.noreply.github.com>
MarcAntoineSchmidtQC and lbittarello authored May 29, 2024
1 parent 4570c52 commit 15dda3b
Showing 18 changed files with 152 additions and 223 deletions.
29 changes: 0 additions & 29 deletions .flake8

This file was deleted.

73 changes: 23 additions & 50 deletions .pre-commit-config.yaml
@@ -1,51 +1,24 @@
 repos:
-  - repo: https://github.com/Quantco/pre-commit-mirrors-black
-    rev: 24.4.2
-    hooks:
-      - id: black-conda
-        args:
-          - --safe
-          - --target-version=py39
-  - repo: https://github.com/Quantco/pre-commit-mirrors-flake8
-    rev: 7.0.0
-    hooks:
-      - id: flake8-conda
-        additional_dependencies: [
-            -c,
-            conda-forge,
-            flake8-bugbear=22.12.6,
-            flake8-builtins=2.1.0,
-            flake8-comprehensions=3.10.1,
-            flake8-docstrings=1.6.0,
-            flake8-print=5.0.0,
-            pep8-naming=0.13.3,
-            python<3.12,
-        ]
-        exclude: (^src/glum_benchmarks/orig_sklearn_fork/|^docs)
-  - repo: https://github.com/Quantco/pre-commit-mirrors-isort
-    rev: 5.13.2
-    hooks:
-      - id: isort-conda
-        additional_dependencies: [toml]
-  - repo: https://github.com/Quantco/pre-commit-mirrors-mypy
-    rev: "1.10.0"
-    hooks:
-      - id: mypy-conda
-        args:
-          - --check-untyped-defs
-          - --ignore-missing-imports
-          - --namespace-packages
-        exclude: ^tests/
-        additional_dependencies: [-c, conda-forge, types-setuptools=67.5, attrs]
-  - repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade
-    rev: 3.15.2
-    hooks:
-      - id: pyupgrade-conda
-        exclude: ^src/glum_benchmarks/orig_sklearn_fork/
-        args: [--py39-plus]
-  - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint
-    rev: 0.16.2
-    hooks:
-      - id: cython-lint-conda
-        args: [--no-pycodestyle]
-      - id: double-quote-cython-strings-conda
+  - repo: https://github.com/Quantco/pre-commit-mirrors-ruff
+    rev: 0.4.3
+    hooks:
+      - id: ruff-conda
+        exclude: ^src/glum_benchmarks/orig_sklearn_fork/
+      - id: ruff-format-conda
+        exclude: ^src/glum_benchmarks/orig_sklearn_fork/
+  - repo: https://github.com/Quantco/pre-commit-mirrors-mypy
+    rev: "1.10.0"
+    hooks:
+      - id: mypy-conda
+        args:
+          - --check-untyped-defs
+          - --ignore-missing-imports
+          - --namespace-packages
+        exclude: (^tests/|^src/glum_benchmarks/orig_sklearn_fork/)
+        additional_dependencies: [-c, conda-forge, types-setuptools=67.5, attrs]
+  - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint
+    rev: 0.16.2
+    hooks:
+      - id: cython-lint-conda
+        args: [--no-pycodestyle]
+      - id: double-quote-cython-strings-conda
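For context, the new `ruff-conda` and `ruff-format-conda` hooks replace black, flake8, and isort with two ruff passes: lint, then format. The sketch below is a hedged illustration of the command shapes involved, based on ruff's documented CLI; the actual hook entry points live in the Quantco mirror repositories and are an assumption here.

```python
# Hedged sketch (not part of this commit): build the ruff invocations that
# roughly correspond to the two pre-commit hooks above.

def ruff_commands(paths, fix=False):
    """Return the lint and format commands for the given paths."""
    # `ruff check` lints (optionally autofixing); `ruff format` reformats.
    check = ["ruff", "check"] + (["--fix"] if fix else []) + list(paths)
    fmt = ["ruff", "format"] + list(paths)
    return [check, fmt]


print(ruff_commands(["src", "tests"]))
# → [['ruff', 'check', 'src', 'tests'], ['ruff', 'format', 'src', 'tests']]
```

These command lists could then be handed to `subprocess.run` in a local script, mirroring what pre-commit does per hook.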
2 changes: 1 addition & 1 deletion CHANGELOG.rst
@@ -7,7 +7,6 @@
 Changelog
 =========
 
-
 Unreleased
 ----------
 
@@ -17,6 +16,7 @@ Unreleased
 
 **Other changes:**
 
+- Move the linting and formatting to ruff.
 - Removed libblas MKL from the development environment.
 
 3.0.1 - 2024-05-23
22 changes: 13 additions & 9 deletions docs/tutorials/formula_interface/load_transform_formula.py
@@ -8,17 +8,21 @@ def load_transform():
     Summary of transformations:
 
     1. We cut the number of claims to a maximum of 4, as is done in the case study paper
-       (Case-study authors suspect a data error. See section 1 of their paper for details).
+       (Case-study authors suspect a data error. See section 1 of their paper for
+       details).
     2. We cut the exposure to a maximum of 1, as is done in the case study paper
-       (Case-study authors suspect a data error. See section 1 of their paper for details).
-    3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim
-       (before aggregation per policy). Reason: For large claims, extreme value theory
-       might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of
-       the overall claim amount. This is a well known phenomenon for third-party liability.
-    4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``.
+       (Case-study authors suspect a data error. See section 1 of their paper for
+       details).
+    3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per
+       single claim (before aggregation per policy). Reason: For large claims,
+       extreme value theory might apply. 100'000 is the 0.9984 quantile, any claims
+       larger account for 25% of the overall claim amount. This is a well known
+       phenomenon for third-party liability.
+    4. We aggregate the total claim amounts per policy ID and join them to
+       ``freMTPL2freq``.
     5. We fix ``'ClaimNb'`` as the claim number with claim amount greater zero.
-    6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so
-       they can be used as categoricals later on.
+    6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized
+       into bins so they can be used as categoricals later on.
     """
     # load the datasets
     # first row (=column names) uses "", all other rows use ''
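For orientation, the numbered transformations in this docstring can be sketched on plain records. This is a hedged illustration, not the tutorial's actual pandas code; the column names (`IDpol`, `ClaimNb`, `Exposure`, `ClaimAmount`) follow the freMTPL2 datasets, and the thresholds (4 claims, exposure 1, 100'000 per claim) come straight from the docstring.

```python
# Hedged sketch of the docstring's transformation steps on lists of dicts.

def cut_claims(freq_records):
    """Steps 1-2: cap the claim count at 4 and the exposure at 1."""
    for r in freq_records:
        r["ClaimNb"] = min(r["ClaimNb"], 4)
        r["Exposure"] = min(r["Exposure"], 1.0)
    return freq_records


def aggregate_severity(freq_records, sev_records):
    """Steps 3-5: cap single claims at 100'000, sum per policy, join, fix ClaimNb."""
    totals = {}
    for s in sev_records:
        capped = min(s["ClaimAmount"], 100_000)  # step 3: per-claim cut
        totals[s["IDpol"]] = totals.get(s["IDpol"], 0.0) + capped
    for r in freq_records:
        r["ClaimAmountCut"] = totals.get(r["IDpol"], 0.0)  # step 4: join
        if r["ClaimAmountCut"] <= 0:
            r["ClaimNb"] = 0  # step 5: only count claims with positive amount
    return freq_records
```

Step 6 (clipping/digitizing `VehPower`, `VehAge`, and `DrivAge` into bins) is omitted here; in the tutorial it is a straightforward per-column binning.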
22 changes: 13 additions & 9 deletions docs/tutorials/glm_french_motor_tutorial/load_transform.py
@@ -8,17 +8,21 @@ def load_transform():
     Summary of transformations:
 
     1. We cut the number of claims to a maximum of 4, as is done in the case study paper
-       (Case-study authors suspect a data error. See section 1 of their paper for details).
+       (Case-study authors suspect a data error. See section 1 of their paper for
+       details).
    2. We cut the exposure to a maximum of 1, as is done in the case study paper
-       (Case-study authors suspect a data error. See section 1 of their paper for details).
-    3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per single claim
-       (before aggregation per policy). Reason: For large claims, extreme value theory
-       might apply. 100'000 is the 0.9984 quantile, any claims larger account for 25% of
-       the overall claim amount. This is a well known phenomenon for third-party liability.
-    4. We aggregate the total claim amounts per policy ID and join them to ``freMTPL2freq``.
+       (Case-study authors suspect a data error. See section 1 of their paper for
+       details).
+    3. We define ``'ClaimAmountCut'`` as the the claim amount cut at 100'000 per
+       single claim (before aggregation per policy). Reason: For large claims,
+       extreme value theory might apply. 100'000 is the 0.9984 quantile, any claims
+       larger account for 25% of the overall claim amount. This is a well known
+       phenomenon for third-party liability.
+    4. We aggregate the total claim amounts per policy ID and join them to
+       ``freMTPL2freq``.
     5. We fix ``'ClaimNb'`` as the claim number with claim amount greater zero.
-    6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized into bins so
-       they can be used as categoricals later on.
+    6. ``'VehPower'``, ``'VehAge'``, and ``'DrivAge'`` are clipped and/or digitized
+       into bins so they can be used as categoricals later on.
     """
     # load the datasets
     # first row (=column names) uses "", all other rows use ''
44 changes: 25 additions & 19 deletions pyproject.toml
@@ -8,25 +8,30 @@ requires = [
     'scikit-learn',
 ]
 
-[tool.black]
-exclude = '''
-/(
-    \.eggs
-  | \.git
-  | \.venv
-  | build
-  | dist
-)/
-'''
-
-[tool.isort]
-multi_line_output = 3
-include_trailing_comma = true
-ensure_newline_before_comments = true
-line_length = 88
-known_first_party = "glum"
-skip_glob = '\.eggs/*,\.git/*,\.venv/*,build/*,dist/*'
-default_section = 'THIRDPARTY'
+[tool.ruff]
+line-length = 88
+target-version = "py39"
+
+[tool.ruff.lint]
+ignore = ["E731", "N802", "N803", "N806"]
+select = [
+    # pyflakes
+    "F",
+    # pycodestyle
+    "E", "W",
+    # isort
+    "I",
+    # pep8-naming
+    "N",
+    # pyupgrade
+    "UP",
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["glum", "glum_benchmarks"]
 
 [tool.mypy]
 python_version = '3.9'
 
 [tool.cibuildwheel]
 skip = [
@@ -51,6 +56,7 @@ CFLAGS="-I$CONDA/envs/build/include"
 CXXFLAGS="-I$CONDA/envs/build/include"
 CXX="/usr/bin/clang++"
 CC="/usr/bin/clang"
+MACOSX_DEPLOYMENT_TARGET="10.13"
 
 
 [tool.pytest.ini_options]
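As a hedged illustration (not taken from the commit) of what the selected rule families and the ignore list mean in practice, consider:

```python
# Illustrative only: patterns governed by the ruff config above.
from typing import Optional  # "I" (isort rules) keeps import blocks sorted


def unit_variance(mu: Optional[float] = None) -> float:
    # "UP" (pyupgrade rules): with target-version py39, `Optional[float]` is
    # only rewritten to `float | None` when `from __future__ import
    # annotations` is present, so this annotation survives as-is.
    return 1.0 if mu is None else mu


# "E731" (do not assign a lambda) is explicitly ignored, so this is allowed;
# N802/N803/N806 are also ignored, matching the scikit-learn-style convention
# of names like `X` for data matrices.
unit_deviance = lambda y, mu: (y - mu) ** 2
```

The `F` (pyflakes) and `E`/`W` (pycodestyle) families would still flag unused imports, undefined names, and lines over the 88-character limit.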
14 changes: 3 additions & 11 deletions src/glum/_distribution.py
@@ -282,7 +282,9 @@ def _mu_deviance_derivative(
     link: Link,
     offset=None,
 ):
-    """Compute ``mu`` and the derivative of the deviance with respect to coefficients."""
+    """Compute ``mu`` and the derivative of the deviance with respect to
+    coefficients.
+    """
     lin_pred = _safe_lin_pred(X, coef, offset)
     mu = link.inverse(lin_pred)
     d1 = link.inverse_derivative(lin_pred)
@@ -604,7 +606,6 @@ def power(self) -> float:
 
     @power.setter
     def power(self, power):
-
         if not isinstance(power, (int, float, np.number)):
             raise TypeError(f"The power parameter must be numeric; got {power}.")
         if (power > 0) and (power < 1):
@@ -622,7 +623,6 @@ def unit_variance_derivative(self, mu):  # noqa D
         return numexpr.evaluate("p * mu ** (p - 1)")
 
     def deviance(self, y, mu, sample_weight=None) -> float:  # noqa D
-
         y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight)
         sample_weight = np.ones_like(y) if sample_weight is None else sample_weight
 
@@ -641,7 +641,6 @@ def deviance(self, y, mu, sample_weight=None) -> float:  # noqa D
         return tweedie_deviance(y, sample_weight, mu, p=float(self.power))
 
     def unit_deviance(self, y, mu):  # noqa D
-
         if self.power == 0:  # normal distribution
             return (y - mu) ** 2
         if self.power == 1:  # Poisson distribution
@@ -803,7 +802,6 @@ def unit_variance_derivative(self, mu) -> np.ndarray:  # noqa D
         return 0 if np.isscalar(mu) else np.zeros_like(mu)
 
     def deviance(self, y, mu, sample_weight=None) -> float:  # noqa D
-
         y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight)
         sample_weight = np.ones_like(y) if sample_weight is None else sample_weight
 
@@ -1031,7 +1029,6 @@ def unit_variance_derivative(self, mu) -> np.ndarray:  # noqa D
         return 2 * mu
 
     def deviance(self, y, mu, sample_weight=None) -> float:  # noqa D
-
         y, mu, sample_weight = _as_float_arrays(y, mu, sample_weight)
         sample_weight = np.ones_like(y) if sample_weight is None else sample_weight
 
@@ -1388,7 +1385,6 @@ def theta(self) -> float:
 
     @theta.setter
     def theta(self, theta):
-
         if not isinstance(theta, (int, float)):
             raise TypeError(f"Theta must be numeric; got {theta}.")
         if not theta > 0:
@@ -1517,7 +1513,6 @@ def guess_intercept(
     avg_y = np.average(y, weights=sample_weight)
 
     if isinstance(link, IdentityLink):
-
         # This is only correct for the normal. For other distributions, the
         # answer is unknown, but we assume that we want `sum(y) = sum(mu)`
 
@@ -1529,7 +1524,6 @@ def guess_intercept(
         return avg_y - avg_eta
 
     elif isinstance(link, LogLink):
-
         # This is only correct for Tweedie
 
         log_avg_y = np.log(avg_y)
@@ -1564,7 +1558,6 @@ def guess_intercept(
         return first - second
 
     elif isinstance(link, LogitLink):
-
         log_odds = np.log(avg_y) - np.log(1 - avg_y)
 
         if eta is None:
@@ -1575,7 +1568,6 @@ def guess_intercept(
         return log_odds - avg_eta
 
     else:
-
         return link.link(y.dot(sample_weight))
