[dask] remove 'client' kwarg from fit() and predict() (fixes #3808) #3883

Merged Feb 3, 2021 (21 commits)

Changes from 6 commits
175 changes: 148 additions & 27 deletions python-package/lightgbm/dask.py
@@ -434,6 +434,25 @@ def _predict(

class _DaskLGBMModel:

# self._client is set in the constructor of lightgbm.sklearn.LGBMModel
_client: Optional[Client] = None

@property
def client(self) -> Client:
"""Dask client

This property can be passed in the constructor or directly assigned
like ``model.client = client``.
"""
if self._client is None:
Collaborator:

sklearn requires that unfitted models raise NotFittedError, and it checks this by accessing "post-fitted" attributes: attributes with trailing underscores.
BTW, I think it would be great to set up sklearn integration tests for the Dask classes in our CI. It won't let us be sure that our classes are fully compatible with sklearn, but it will at least check basic compatibility. WDYT?

For example, a test shaped like ``test_sklearn_integration(estimator, check)``, sketched below.
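A minimal sketch of such a test using scikit-learn's ``parametrize_with_checks`` (illustrative only; the estimator list and module layout are assumptions, not code from this PR):

from sklearn.utils.estimator_checks import parametrize_with_checks

from lightgbm import LGBMClassifier, LGBMRegressor


@parametrize_with_checks([LGBMClassifier(), LGBMRegressor()])
def test_sklearn_integration(estimator, check):
    # `check` is one of sklearn's generic estimator checks; running it
    # against the estimator verifies basic API compatibility.
    check(estimator)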

Suggested change:

-    if self._client is None:
+    if self._n_features is None:
+        raise LGBMNotFittedError('No client found. Need to call fit beforehand.')
+    if self._client is None:

Collaborator Author:

Oh I see, ok. Why should the check be on ._n_features? That seems kind of indirect. Shouldn't it be on .fitted_ directly for a check of whether or not the model has been fit?

I'm also confused how you would like _fit() to work. I'm currently passing client=self.client_ into self._fit(), and relying on that to resolve whether there is a client stored on the object or get_default_client() should be used. This suggested change would break that behavior, because of course the model is probably not fitted yet at the time you call .fit(). What do you suggest I do for that?

BTW, I think it would be great to set up sklearn integration tests for the Dask classes in our CI. It won't let us be sure that our classes are fully compatible with sklearn, but it will at least check basic compatibility. WDYT?

Sure, I think that's fine. I think it's an ok fit for this PR and will add it here.

Collaborator:

Why should the check be on ._n_features?

For historical reasons. fitted_ was introduced quite recently in our sklearn wrapper. But now you can use it, indeed 👍.

I'm also confused how you would like _fit() to work.

Oh I see now! Can you pass self.client into _fit() and assign the result of the check to self._client there? Then client_ will always return self._client or raise an error, just like all the other properties in the sklearn wrapper.

Collaborator:

I think it's an ok fit for this PR and will add it here.

Maybe in a follow-up PR? I'm afraid it could fail right now and block merging the client argument migration.

Collaborator Author:

I want to be really sure that there is exactly one place where we resolve which client to use (#3883 (comment)).

So I don't want to put this code into the body of _fit().

if self.client is None:
    client = default_client()
else:
    client = self.client

I think it would work to pull that out into a small function like this:

def _choose_client(client: Optional[Client]) -> Client:
    if client is None:
        return default_client()
    else:
        return client

Then use that in both the client_ property (after the fitted check) and in _fit(). That would give us confidence that accessing .client_ returns the same client that will be used in training.
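A sketch of how those pieces could fit together (the fitted_ check and error message follow the discussion above; this is not the PR's final code):

@property
def client_(self) -> Client:
    """Dask client that will be used for training; raises if the model is unfitted."""
    if not getattr(self, "fitted_", False):
        raise LGBMNotFittedError('No client found. Need to call fit beforehand.')
    return _choose_client(self._client)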

Collaborator:

OK, sounds good!

Collaborator Author:

Added #3894 to capture the task of adding a scikit-learn compatibility test. I made this a "good first issue" because I think it could be done without deep knowledge of LightGBM, but I also think it's ok for you or me to pick up in the near future (we don't need to reserve it for new contributors).

Collaborator Author:

Ok, I think I've addressed these suggestions in 555a57a:

  • added an LGBMNotFittedError raised when accessing .client_ on a not-yet-fitted model
  • added an internal function _get_dask_cluster() to hold the logic of using default_client() if client is None (sketched below)
  • changed the unit tests to use two different clusters and clients
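A minimal sketch of that helper as described above (the actual function in 555a57a may differ in its exact signature and docstring):

from typing import Optional

from dask.distributed import Client, default_client


def _get_dask_cluster(client: Optional[Client]) -> Client:
    # Resolve which client to use in exactly one place:
    # fall back to Dask's default client when none was provided.
    if client is None:
        return default_client()
    return client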

Note that the change to have two clusters active in the same test will result in a lot of this warning in the logs:

test_dask.py::test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly[False-ranking-cloudpickle]
/home/jlamb/miniconda3/lib/python3.7/site-packages/distributed/node.py:155: UserWarning: Port 8787 is already in use.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 40743 instead
http_address["port"], self.http_server.port

This is not something we have to worry about, and I'd rather leave it alone and let Dask do the right thing (picking a random port for the scheduler when the default one is unavailable) than add more complexity to the tests by adding our own logic to set the scheduler port.
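For reference, a sketch of the kind of two-cluster setup that triggers that warning (assuming distributed.LocalCluster; not the exact test code):

from distributed import Client, LocalCluster

# The second cluster finds the default dashboard port 8787 already taken
# and picks a random free port instead, emitting the warning above.
cluster_1 = LocalCluster(n_workers=1)
cluster_2 = LocalCluster(n_workers=1)
client_1 = Client(cluster_1)
client_2 = Client(cluster_2)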

return default_client()
else:
return self._client

@client.setter
def client(self, client: Client) -> None:
self._client = client
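As a usage note (illustrative; DaskLGBMClassifier is defined later in this module, and this snippet is not part of the diff), the client can be supplied either way:

from distributed import Client

client = Client()  # connect to (or start) a local cluster

clf = DaskLGBMClassifier(client=client)  # passed in the constructor
clf = DaskLGBMClassifier()
clf.client = client                      # or assigned via the property setter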

def _fit(
self,
model_factory: Type[LGBMModel],
@@ -446,13 +465,11 @@ def _fit(
) -> "_DaskLGBMModel":
if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)):
raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask')
if client is None:
client = default_client()

params = self.get_params(True)

model = _train(
client=client,
client=self.client,
data=X,
label=y,
params=params,
@@ -462,9 +479,9 @@
**kwargs
)

# at this point, self._client is still set
self.set_params(**model.get_params())
self._copy_extra_params(model, self)

return self

def _to_local(self, model_factory: Type[LGBMModel]) -> LGBMModel:
@@ -478,18 +495,58 @@ def _copy_extra_params(source: Union["_DaskLGBMModel", LGBMModel], dest: Union["
attributes = source.__dict__
extra_param_names = set(attributes.keys()).difference(params.keys())
for name in extra_param_names:
setattr(dest, name, attributes[name])
if name != "_client":
setattr(dest, name, attributes[name])


class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
"""Distributed version of lightgbm.LGBMClassifier."""

def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
Collaborator Author:

This seems to have the desired effect! When I built the docs locally, I saw that __init__() docs for DaskLGBMClassifier, DaskLGBMRegressor, and DaskLGBMRanker have the client doc added, and the docs for their scikit-learn equivalents do not.

DaskLGBMClassifier.__init__(): [screenshot of the rendered docs showing the client parameter]

LGBMClassifier.__init__(): [screenshot of the rendered docs without the client parameter]

Why I think copying is the best alternative

I tried several other ways to update the docstrings, but none of them quite worked. I'll explain in terms of *Classifier, but this applies to *Regressor and *Ranker as well.

  1. Editing DaskLGBMClassifier.__init__.__doc__
class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
    ...

_base_doc = LGBMClassifier.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
DaskLGBMClassifier.__init__.__doc__ = (
    _before_kwargs
    + 'client : dask.distributed.Client or None, optional (default=None)\n'
    + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. This client will not be saved if the model object is pickled.\n'
    + ' ' * 8 + _kwargs + _after_kwargs
)

Doing this for each of the 3 model objects, the API docs show 3 copies of the doc for client. Those 3 copies also show up on the docs for lightgbm.sklearn.LGBMClassifier.

[screenshot: the client doc rendered three times in the API docs]

  2. Setting up a pass-through __init__()
class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    _base_doc = __init__.__doc__
    _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
    __init__.__doc__ = (
        _before_kwargs
        + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. This client will not be saved if the model object is pickled.\n'
        + ' ' * 8 + _kwargs + _after_kwargs
    )

This results in this error at runtime:

RuntimeError: scikit-learn estimators should always specify their parameters in
the signature of their __init__ (no varargs).
<class 'lightgbm.dask.DaskLGBMClassifier'> with constructor (self, *args, **kwargs)
doesn't follow this convention.
  3. Just copying __init__

Both of these variations:

class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
     __init__ = LGBMClassifier.__init__

    _base_doc = __init__.__doc__
    _before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
    __init__.__doc__ = (
        _before_kwargs
        + 'client : dask.distributed.Client or None, optional (default=None)\n'
        + ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. This client will not be saved if the model object is pickled.\n'
        + ' ' * 8 + _kwargs + _after_kwargs
    )

Doing this for each of the 3 model objects, the API docs show 3 copies of the doc for client. Those 3 copies also show up on the docs for lightgbm.sklearn.LGBMClassifier. (same as in the image above)
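For context on why the pass-through __init__ fails: scikit-learn discovers an estimator's parameters by introspecting the signature of __init__, and varargs hide all of them. A quick way to see what sklearn sees (illustrative; assumes the fully spelled-out constructor from this diff):

import inspect

from lightgbm.dask import DaskLGBMClassifier

# sklearn's get_params() walks this signature; with (self, *args, **kwargs)
# there would be no named parameters left to discover.
sig = inspect.signature(DaskLGBMClassifier.__init__)
print([name for name in sig.parameters if name != 'self'])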

learning_rate=0.1, n_estimators=100,
subsample_for_bin=200000, objective=None, class_weight=None,
min_split_gain=0., min_child_weight=1e-3, min_child_samples=20,
subsample=1., subsample_freq=0, colsample_bytree=1.,
reg_alpha=0., reg_lambda=0., random_state=None,
n_jobs=-1, silent=True, importance_type='split', **kwargs):
super().__init__(
boosting_type=boosting_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
subsample_for_bin=subsample_for_bin,
objective=objective,
class_weight=class_weight,
min_split_gain=min_split_gain,
min_child_weight=min_child_weight,
min_child_samples=min_child_samples,
subsample=subsample,
subsample_freq=subsample_freq,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
random_state=random_state,
n_jobs=n_jobs,
silent=silent,
importance_type=importance_type,
**kwargs
)

_base_doc = LGBMClassifier.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. This client will not be saved if the model object is pickled.\n'
Collaborator:

TBH, it was not clear enough that any client will not be saved. "This" may be read as referring to distributed.default_client(), I guess, and could confuse users into thinking that a custom client will be saved...

Collaborator Author:

Ok, I can change it.

Collaborator Author:

Updated in 555a57a to this:

The Dask client used by this class will not be saved if the model object is pickled.

+ ' ' * 8 + _kwargs + _after_kwargs
)

def fit(
self,
X: _DaskMatrixLike,
y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None,
client: Optional[Client] = None,
**kwargs: Any
) -> "DaskLGBMClassifier":
"""Docstring is inherited from the lightgbm.LGBMClassifier.fit."""
@@ -498,16 +555,11 @@ def fit(
X=X,
y=y,
sample_weight=sample_weight,
client=client,
client=self.client,
**kwargs
)

_base_doc = LGBMClassifier.fit.__doc__
_before_init_score, _init_score, _after_init_score = _base_doc.partition('init_score :')
fit.__doc__ = (_before_init_score
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client.\n'
+ ' ' * 8 + _init_score + _after_init_score)
fit.__doc__ = LGBMClassifier.fit.__doc__

def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMClassifier.predict."""
@@ -545,6 +597,46 @@ def to_local(self) -> LGBMClassifier:
class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):
"""Distributed version of lightgbm.LGBMRegressor."""

def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=100,
subsample_for_bin=200000, objective=None, class_weight=None,
min_split_gain=0., min_child_weight=1e-3, min_child_samples=20,
subsample=1., subsample_freq=0, colsample_bytree=1.,
reg_alpha=0., reg_lambda=0., random_state=None,
n_jobs=-1, silent=True, importance_type='split', **kwargs):
super().__init__(
boosting_type=boosting_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
subsample_for_bin=subsample_for_bin,
objective=objective,
class_weight=class_weight,
min_split_gain=min_split_gain,
min_child_weight=min_child_weight,
min_child_samples=min_child_samples,
subsample=subsample,
subsample_freq=subsample_freq,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
random_state=random_state,
n_jobs=n_jobs,
silent=silent,
importance_type=importance_type,
**kwargs
)

_base_doc = LGBMRegressor.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. This client will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)

def fit(
self,
X: _DaskMatrixLike,
@@ -563,12 +655,7 @@ def fit(
**kwargs
)

_base_doc = LGBMRegressor.fit.__doc__
_before_init_score, _init_score, _after_init_score = _base_doc.partition('init_score :')
fit.__doc__ = (_before_init_score
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client.\n'
+ ' ' * 8 + _init_score + _after_init_score)
fit.__doc__ = LGBMRegressor.fit.__doc__

def predict(self, X: _DaskMatrixLike, **kwargs) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRegressor.predict."""
@@ -594,14 +681,53 @@ def to_local(self) -> LGBMRegressor:
class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):
"""Distributed version of lightgbm.LGBMRanker."""

def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=100,
subsample_for_bin=200000, objective=None, class_weight=None,
min_split_gain=0., min_child_weight=1e-3, min_child_samples=20,
subsample=1., subsample_freq=0, colsample_bytree=1.,
reg_alpha=0., reg_lambda=0., random_state=None,
n_jobs=-1, silent=True, importance_type='split', **kwargs):
super().__init__(
boosting_type=boosting_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
subsample_for_bin=subsample_for_bin,
objective=objective,
class_weight=class_weight,
min_split_gain=min_split_gain,
min_child_weight=min_child_weight,
min_child_samples=min_child_samples,
subsample=subsample,
subsample_freq=subsample_freq,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
random_state=random_state,
n_jobs=n_jobs,
silent=silent,
importance_type=importance_type,
**kwargs
)

_base_doc = LGBMRanker.__init__.__doc__
_before_kwargs, _kwargs, _after_kwargs = _base_doc.partition('**kwargs')
__init__.__doc__ = (
_before_kwargs
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client. If ``None``, ``distributed.default_client()`` will be used at runtime. This client will not be saved if the model object is pickled.\n'
+ ' ' * 8 + _kwargs + _after_kwargs
)

def fit(
self,
X: _DaskMatrixLike,
y: _DaskCollection,
sample_weight: Optional[_DaskCollection] = None,
init_score: Optional[_DaskCollection] = None,
group: Optional[_DaskCollection] = None,
client: Optional[Client] = None,
**kwargs: Any
) -> "DaskLGBMRanker":
"""Docstring is inherited from the lightgbm.LGBMRanker.fit."""
@@ -614,16 +740,11 @@ def fit(
y=y,
sample_weight=sample_weight,
group=group,
client=client,
client=self.client,
**kwargs
)

_base_doc = LGBMRanker.fit.__doc__
_before_eval_set, _eval_set, _after_eval_set = _base_doc.partition('eval_set :')
fit.__doc__ = (_before_eval_set
+ 'client : dask.distributed.Client or None, optional (default=None)\n'
+ ' ' * 12 + 'Dask client.\n'
+ ' ' * 8 + _eval_set + _after_eval_set)
fit.__doc__ = LGBMRanker.fit.__doc__

def predict(self, X: _DaskMatrixLike, **kwargs: Any) -> dask_Array:
"""Docstring is inherited from the lightgbm.LGBMRanker.predict."""
10 changes: 10 additions & 0 deletions python-package/lightgbm/sklearn.py
@@ -291,6 +291,9 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
if not SKLEARN_INSTALLED:
raise LightGBMError('scikit-learn is required for lightgbm.sklearn')

# Dask estimators inherit from this and may pass an argument "client"
self._client = kwargs.pop("client", None)

self.boosting_type = boosting_type
self.objective = objective
self.num_leaves = num_leaves
@@ -325,6 +328,13 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
self._n_classes = None
self.set_params(**kwargs)

def __getstate__(self):
"""Remove un-picklable attributes before serialization"""
client = self.__dict__.pop("_client", None)
out = copy.deepcopy(self.__dict__)
self._client = client
return out
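As an illustration of the effect (a sketch, not part of the diff; it assumes an unfitted model and a live local cluster):

import pickle

from distributed import Client, LocalCluster
from lightgbm.dask import DaskLGBMRegressor

client = Client(LocalCluster(n_workers=1))
model = DaskLGBMRegressor(client=client)

payload = pickle.dumps(model)     # __getstate__ drops _client, so this succeeds
restored = pickle.loads(payload)

assert restored._client is None   # the client does not survive pickling
assert model._client is client    # the live object still holds its client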

def _more_tags(self):
return {
'allow_nan': True,