From 4675a88bba98e422f97fd0f8e6a939468a793d35 Mon Sep 17 00:00:00 2001 From: traderbxy <1533405582@qq.com> Date: Mon, 16 Oct 2023 19:49:16 +0800 Subject: [PATCH 01/13] Add new .rst file --- .../xorbits_data/lightgbm_example.rst | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 doc/source/libraries/xorbits_data/lightgbm_example.rst diff --git a/doc/source/libraries/xorbits_data/lightgbm_example.rst b/doc/source/libraries/xorbits_data/lightgbm_example.rst new file mode 100644 index 000000000..c4edab514 --- /dev/null +++ b/doc/source/libraries/xorbits_data/lightgbm_example.rst @@ -0,0 +1,103 @@ +.. _10min_lightgbm: + +=================================== +10 minutes to :code:`xorbits.lightgbm` +=================================== + +.. currentmodule:: xorbits.lightgbm + +This is a short introduction to :code:`xorbits.lightgbm` which is originated from LightGBM's quickstart. + +Let's take :code:`LGBMRegressor` as an example and explain how to build a regression model, find the relationship between independent variables (features) and the dependent variable (target), and make predictions based on these relationships. + +Customarily, we import and init as follows: + +.. ipython:: python + + import xorbits + import xorbits.numpy as np + from xorbits.lightgbm import LGBMRegressor + from xorbits.sklearn.model_selection import train_test_split + from xorbits.sklearn.metrics import mean_squared_error + xorbits.init() + +Model Creation +--------------- +First, we build a :code:`LGBMRegressor` model and define its parameters. + +This model has many adjustable hyperparameters that allow you to configure parameters such as tree depth, the number of leaf nodes, learning rate, and more to optimize the model's performance. + +.. ipython:: python + + lgbm_regressor = LGBMRegressor(learning_rate=0.05,n_estimators=100) + +:code:`.get_params` method returns a dictionary containing all the parameter names of the model along with their corresponding values. You can inspect these values to understand the current configuration of the model. + +Inspect the parameters of the LightGBM regressor. + +.. ipython:: python + + paras=lgbm_regressor.get_params() + paras + +Set/modify parameters. + +:code:`.set_params` method allows you to dynamically modify the parameter settings of a machine learning model by specifying parameter names and their corresponding values, without the need to recreate the model object. + +.. ipython:: python + + lgbm_regressor.set_params(learning_rate=0.1, n_estimators=100) + lgbm_regressor.get_params() + +Data Preparation +--------------- +We can use real data as input. For the sake of simplicity, we will use randomly generated x and y data as an example. + +.. ipython:: python + + x = np.random.rand(100) + y_regression = 2 * x + 1 + 0.1 * np.random.randn(100) + x=x.reshape(-1, 1) + +In order to train the model, we split the dataset into a training set and a test set. + +.. ipython:: python + + X_train, X_test, y_train, y_test = train_test_split(x, y_regression, test_size=0.2) + +Model Training +--------------- +The :code:`.fit` method takes the training data (independent variable x and dependent variable y) and fits the model to the data. + +The model adjusts its parameters to minimize the error between the predicted values and the actual observations. + +.. ipython:: python + + lgbm_regressor.fit(X_train, y_train) + +Model Prediction +--------------- + +Once you have trained a model, you can use the :code:`.predict` method to apply that model to new data and generate predictions for the new data. + +.. ipython:: python + + y_pred = lgbm_regressor.predict(X_test) + y_pred + +Model Evaluation +--------------- + +:code:`.score` is typically used to assess the performance of a machine learning model. + +In regression problems, the :code:`.score` method usually returns the coefficient of determination (R-squared) score, which represents the model's ability to explain the variability in the dependent variable. + +Calculate the model's estimated accuracy on the test set. + +.. ipython:: python + + mse = mean_squared_error(y_test, y_pred) + mse + + accuracy = lgbm_regressor.score(X_test, y_test) + accuracy From 9aeac51df85e565e14fb6d61651b2dac37333289 Mon Sep 17 00:00:00 2001 From: traderbxy <1533405582@qq.com> Date: Wed, 15 Nov 2023 12:58:07 +0800 Subject: [PATCH 02/13] Resolve merge conflicts --- .../xorbits_data/lightgbm_example.rst | 103 ------------------ 1 file changed, 103 deletions(-) delete mode 100644 doc/source/libraries/xorbits_data/lightgbm_example.rst diff --git a/doc/source/libraries/xorbits_data/lightgbm_example.rst b/doc/source/libraries/xorbits_data/lightgbm_example.rst deleted file mode 100644 index c4edab514..000000000 --- a/doc/source/libraries/xorbits_data/lightgbm_example.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. _10min_lightgbm: - -=================================== -10 minutes to :code:`xorbits.lightgbm` -=================================== - -.. currentmodule:: xorbits.lightgbm - -This is a short introduction to :code:`xorbits.lightgbm` which is originated from LightGBM's quickstart. - -Let's take :code:`LGBMRegressor` as an example and explain how to build a regression model, find the relationship between independent variables (features) and the dependent variable (target), and make predictions based on these relationships. - -Customarily, we import and init as follows: - -.. ipython:: python - - import xorbits - import xorbits.numpy as np - from xorbits.lightgbm import LGBMRegressor - from xorbits.sklearn.model_selection import train_test_split - from xorbits.sklearn.metrics import mean_squared_error - xorbits.init() - -Model Creation ---------------- -First, we build a :code:`LGBMRegressor` model and define its parameters. - -This model has many adjustable hyperparameters that allow you to configure parameters such as tree depth, the number of leaf nodes, learning rate, and more to optimize the model's performance. - -.. ipython:: python - - lgbm_regressor = LGBMRegressor(learning_rate=0.05,n_estimators=100) - -:code:`.get_params` method returns a dictionary containing all the parameter names of the model along with their corresponding values. You can inspect these values to understand the current configuration of the model. - -Inspect the parameters of the LightGBM regressor. - -.. ipython:: python - - paras=lgbm_regressor.get_params() - paras - -Set/modify parameters. - -:code:`.set_params` method allows you to dynamically modify the parameter settings of a machine learning model by specifying parameter names and their corresponding values, without the need to recreate the model object. - -.. ipython:: python - - lgbm_regressor.set_params(learning_rate=0.1, n_estimators=100) - lgbm_regressor.get_params() - -Data Preparation ---------------- -We can use real data as input. For the sake of simplicity, we will use randomly generated x and y data as an example. - -.. ipython:: python - - x = np.random.rand(100) - y_regression = 2 * x + 1 + 0.1 * np.random.randn(100) - x=x.reshape(-1, 1) - -In order to train the model, we split the dataset into a training set and a test set. - -.. ipython:: python - - X_train, X_test, y_train, y_test = train_test_split(x, y_regression, test_size=0.2) - -Model Training ---------------- -The :code:`.fit` method takes the training data (independent variable x and dependent variable y) and fits the model to the data. - -The model adjusts its parameters to minimize the error between the predicted values and the actual observations. - -.. ipython:: python - - lgbm_regressor.fit(X_train, y_train) - -Model Prediction ---------------- - -Once you have trained a model, you can use the :code:`.predict` method to apply that model to new data and generate predictions for the new data. - -.. ipython:: python - - y_pred = lgbm_regressor.predict(X_test) - y_pred - -Model Evaluation ---------------- - -:code:`.score` is typically used to assess the performance of a machine learning model. - -In regression problems, the :code:`.score` method usually returns the coefficient of determination (R-squared) score, which represents the model's ability to explain the variability in the dependent variable. - -Calculate the model's estimated accuracy on the test set. - -.. ipython:: python - - mse = mean_squared_error(y_test, y_pred) - mse - - accuracy = lgbm_regressor.score(X_test, y_test) - accuracy From ed2b4562772d4120101ad7f92e9ee85b78e6c438 Mon Sep 17 00:00:00 2001 From: traderbxy <1533405582@qq.com> Date: Wed, 15 Nov 2023 13:12:49 +0800 Subject: [PATCH 03/13] fix read_csv Path --- .../_mars/dataframe/datasource/read_csv.py | 7 ++- .../tests/test_datasource_execution.py | 49 +++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/python/xorbits/_mars/dataframe/datasource/read_csv.py b/python/xorbits/_mars/dataframe/datasource/read_csv.py index 43a29c47e..29c07354b 100644 --- a/python/xorbits/_mars/dataframe/datasource/read_csv.py +++ b/python/xorbits/_mars/dataframe/datasource/read_csv.py @@ -18,6 +18,7 @@ import numpy as np import pandas as pd +from pathlib import Path try: from pyarrow import NativeFile @@ -187,6 +188,8 @@ def _tile(cls, op: "DataFrameReadCSV"): dtypes = df.dtypes path_prefix = "" + if isinstance(op.path, Path): + op.path = op.path.as_posix() if isinstance(op.path, (tuple, list)): paths = op.path elif get_fs(op.path, op.storage_options).isdir(op.path): @@ -407,7 +410,7 @@ def __call__( def read_csv( - path: str, + path: Union[str, Path], names: Union[List, Tuple] = None, sep: str = ",", index_col: Union[int, str, List[int], List[str]] = None, @@ -743,6 +746,8 @@ def read_csv( return op() # infer dtypes and columns + if isinstance(path, Path): + path = path.as_posix() if isinstance(path, (list, tuple)): file_path = path[0] elif get_fs(path, storage_options).isdir(path): diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py index 1e7148934..5a57fc63f 100644 --- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py @@ -24,6 +24,7 @@ import numpy as np import pandas as pd import pytest +from pathlib import Path from ....tensor.core import TENSOR_TYPE @@ -489,6 +490,54 @@ def test_from_records_execution(setup): pd.testing.assert_frame_equal(df2_result, pdf_expected) +def test_read_csv_execution_with_pathlib_Path(setup): + with tempfile.TemporaryDirectory() as tempdir: + file_path = os.path.join(tempdir, "test.csv") + + df = pd.DataFrame( + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), + columns=["a", "b", "c"], + ) + df.to_csv(file_path) + + # test the pathlib.Path type for a Single file + file_path = Path(file_path) + pdf = pd.read_csv(file_path, index_col=0) + r = md.read_csv(file_path, index_col=0) + mdf = r.execute().fetch() + pd.testing.assert_frame_equal(pdf, mdf) + # size_res = self.executor.execute_dataframe(r, mock=True) + # assert sum(s[0] for s in size_res) == os.stat(file_path).st_size + + mdf2 = md.read_csv(file_path, index_col=0, chunk_bytes=10).execute().fetch() + pd.testing.assert_frame_equal(pdf, mdf2) + + mdf = md.read_csv(file_path, index_col=0, nrows=1).execute().fetch() + pd.testing.assert_frame_equal(df[:1], mdf) + + # test read directory + with tempfile.TemporaryDirectory() as tempdir: + testdir = os.path.join(tempdir, "test_dir") + os.makedirs(testdir, exist_ok=True) + + df = pd.DataFrame(np.random.rand(300, 3), columns=["a", "b", "c"]) + + file_paths = [os.path.join(testdir, f"test{i}.csv") for i in range(3)] + df[:100].to_csv(file_paths[0]) + df[100:200].to_csv(file_paths[1]) + df[200:].to_csv(file_paths[2]) + + # test the pathlib.Path type for a directory + testdir=Path(testdir) + # As we can not guarantee the order in which these files are processed, + # the result may not keep the original order. + mdf = md.read_csv(testdir, index_col=0).execute().fetch() + pd.testing.assert_frame_equal(df, mdf.sort_index()) + + mdf2 = md.read_csv(testdir, index_col=0, chunk_bytes=50).execute().fetch() + pd.testing.assert_frame_equal(df, mdf2.sort_index()) + + def test_read_csv_execution(setup): with tempfile.TemporaryDirectory() as tempdir: file_path = os.path.join(tempdir, "test.csv") From 4a0ebcdedec65b5e7c5029aaad4a5e630ed8d147 Mon Sep 17 00:00:00 2001 From: traderbxy <1533405582@qq.com> Date: Wed, 15 Nov 2023 17:13:14 +0800 Subject: [PATCH 04/13] use str --- python/xorbits/_mars/dataframe/datasource/read_csv.py | 4 ++-- .../dataframe/datasource/tests/test_datasource_execution.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/xorbits/_mars/dataframe/datasource/read_csv.py b/python/xorbits/_mars/dataframe/datasource/read_csv.py index 29c07354b..f2b695480 100644 --- a/python/xorbits/_mars/dataframe/datasource/read_csv.py +++ b/python/xorbits/_mars/dataframe/datasource/read_csv.py @@ -189,7 +189,7 @@ def _tile(cls, op: "DataFrameReadCSV"): path_prefix = "" if isinstance(op.path, Path): - op.path = op.path.as_posix() + op.path = str(op.path) if isinstance(op.path, (tuple, list)): paths = op.path elif get_fs(op.path, op.storage_options).isdir(op.path): @@ -747,7 +747,7 @@ def read_csv( # infer dtypes and columns if isinstance(path, Path): - path = path.as_posix() + path = str(path) if isinstance(path, (list, tuple)): file_path = path[0] elif get_fs(path, storage_options).isdir(path): diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py index 5a57fc63f..7a50d871c 100644 --- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py @@ -528,7 +528,7 @@ def test_read_csv_execution_with_pathlib_Path(setup): df[200:].to_csv(file_paths[2]) # test the pathlib.Path type for a directory - testdir=Path(testdir) + testdir = Path(testdir) # As we can not guarantee the order in which these files are processed, # the result may not keep the original order. mdf = md.read_csv(testdir, index_col=0).execute().fetch() From e38dcefe2eba696152244fa47832df156303b551 Mon Sep 17 00:00:00 2001 From: traderbxy <1533405582@qq.com> Date: Thu, 16 Nov 2023 11:00:18 +0800 Subject: [PATCH 05/13] isort --- python/xorbits/_mars/dataframe/datasource/read_csv.py | 2 +- .../dataframe/datasource/tests/test_datasource_execution.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/xorbits/_mars/dataframe/datasource/read_csv.py b/python/xorbits/_mars/dataframe/datasource/read_csv.py index f2b695480..5bd869788 100644 --- a/python/xorbits/_mars/dataframe/datasource/read_csv.py +++ b/python/xorbits/_mars/dataframe/datasource/read_csv.py @@ -13,12 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. from io import BytesIO +from pathlib import Path from typing import Any, Dict, List, Tuple, Union from urllib.parse import urlparse import numpy as np import pandas as pd -from pathlib import Path try: from pyarrow import NativeFile diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py index 7a50d871c..9292dd39f 100644 --- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py @@ -19,12 +19,12 @@ import time from collections import OrderedDict from datetime import datetime +from pathlib import Path from string import printable import numpy as np import pandas as pd import pytest -from pathlib import Path from ....tensor.core import TENSOR_TYPE From a77bebefdbf55d1be77b69a78b5d37fb862b310d Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 17 Nov 2023 11:58:38 +0800 Subject: [PATCH 06/13] Pin minikube version --- .github/workflows/python.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index feb75e3d0..4c8a67971 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -122,6 +122,7 @@ jobs: with: driver: none kubernetes-version: v1.23.12 + minikube-version: 1.31.2 - name: Install ucx dependencies if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest') && (matrix.python-version != '3.11') }} From 84cd01fc5c6b933b5f0ed87199dbbce2b7eaaa54 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 17 Nov 2023 12:29:27 +0800 Subject: [PATCH 07/13] try to fix --- python/xorbits/deploy/kubernetes/tests/test_kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py index 77106ec10..16d232497 100644 --- a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py +++ b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py @@ -270,5 +270,5 @@ async def test_request_workers_insufficient(): use_local_image=True, ) as cluster_client: cluster_api = WebClusterAPI(address=cluster_client.endpoint) - with pytest.raises(SystemError, match=r".*Insufficient cpu.*"): + with pytest.raises(Exception, match=r".*Insufficient cpu.*"): await cluster_api.request_workers(worker_num=1, timeout=30) From 35dba24d53971a8970c4999410c9a063d27f184b Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 17 Nov 2023 12:29:32 +0800 Subject: [PATCH 08/13] try to fix --- python/xorbits/deploy/kubernetes/tests/test_kubernetes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py index 16d232497..d6758fe61 100644 --- a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py +++ b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py @@ -270,5 +270,6 @@ async def test_request_workers_insufficient(): use_local_image=True, ) as cluster_client: cluster_api = WebClusterAPI(address=cluster_client.endpoint) - with pytest.raises(Exception, match=r".*Insufficient cpu.*"): + with pytest.raises(Exception) as e: + print(e) await cluster_api.request_workers(worker_num=1, timeout=30) From 1fd9470461fc6f39a11a2cde206cc16cb1e20e1f Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 17 Nov 2023 13:49:03 +0800 Subject: [PATCH 09/13] request 10 workers --- python/xorbits/deploy/kubernetes/tests/test_kubernetes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py index d6758fe61..f005c0138 100644 --- a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py +++ b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py @@ -272,4 +272,4 @@ async def test_request_workers_insufficient(): cluster_api = WebClusterAPI(address=cluster_client.endpoint) with pytest.raises(Exception) as e: print(e) - await cluster_api.request_workers(worker_num=1, timeout=30) + await cluster_api.request_workers(worker_num=10, timeout=30) From ef55e4cdeb15f9eac1556a63fbb9de01c5453040 Mon Sep 17 00:00:00 2001 From: ChengjieLi Date: Fri, 17 Nov 2023 14:23:16 +0800 Subject: [PATCH 10/13] fix --- python/xorbits/deploy/kubernetes/tests/test_kubernetes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py index f005c0138..ae94434ff 100644 --- a/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py +++ b/python/xorbits/deploy/kubernetes/tests/test_kubernetes.py @@ -270,6 +270,5 @@ async def test_request_workers_insufficient(): use_local_image=True, ) as cluster_client: cluster_api = WebClusterAPI(address=cluster_client.endpoint) - with pytest.raises(Exception) as e: - print(e) + with pytest.raises(Exception): await cluster_api.request_workers(worker_num=10, timeout=30) From 10db25b177b1685bc15f94ea453f3c74f9d9af4f Mon Sep 17 00:00:00 2001 From: traderbxy <1533405582@qq.com> Date: Sat, 18 Nov 2023 13:42:09 +0800 Subject: [PATCH 11/13] fix xgboost.rst --- doc/source/libraries/xorbits_train/xgboost.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/libraries/xorbits_train/xgboost.rst b/doc/source/libraries/xorbits_train/xgboost.rst index 96e37f203..8253ff144 100644 --- a/doc/source/libraries/xorbits_train/xgboost.rst +++ b/doc/source/libraries/xorbits_train/xgboost.rst @@ -47,6 +47,7 @@ In order to train the model, we split the dataset into a training set and a test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) + Model Training --------------- The :code:`.fit` method takes the training data (independent variable x and dependent variable y) and fits the model to the data. From c0f4e250e4aa551f9902a9eef6a248f37ec6964b Mon Sep 17 00:00:00 2001 From: Lu Weizheng Date: Mon, 20 Nov 2023 10:33:25 +0800 Subject: [PATCH 12/13] fix doc --- .../libraries/xorbits_train/lightgbm.rst | 32 +++++++++---------- .../libraries/xorbits_train/xgboost.rst | 23 +++++++------ 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/doc/source/libraries/xorbits_train/lightgbm.rst b/doc/source/libraries/xorbits_train/lightgbm.rst index 49e9e59c3..fefa275a8 100644 --- a/doc/source/libraries/xorbits_train/lightgbm.rst +++ b/doc/source/libraries/xorbits_train/lightgbm.rst @@ -1,8 +1,8 @@ .. _10min_lightgbm: -=================================== +====================================== 10 minutes to :code:`xorbits.lightgbm` -=================================== +====================================== .. currentmodule:: xorbits.lightgbm @@ -21,20 +21,20 @@ Customarily, we import and init as follows: xorbits.init() Model Creation ---------------- +-------------- First, we build a :code:`LGBMRegressor` model and define its parameters. This model has many adjustable hyperparameters that allow you to configure parameters such as tree depth, the number of leaf nodes, learning rate, and more to optimize the model's performance. -.. ipython:: python +.. ipython:: python - lgbm_regressor = LGBMRegressor(learning_rate=0.05,n_estimators=100) + lgbm_regressor = LGBMRegressor(learning_rate=0.05, n_estimators=100) :code:`.get_params` method returns a dictionary containing all the parameter names of the model along with their corresponding values. You can inspect these values to understand the current configuration of the model. Inspect the parameters of the LightGBM regressor. -.. ipython:: python +.. ipython:: python paras=lgbm_regressor.get_params() paras @@ -43,16 +43,16 @@ Set/modify parameters. :code:`.set_params` method allows you to dynamically modify the parameter settings of a machine learning model by specifying parameter names and their corresponding values, without the need to recreate the model object. -.. ipython:: python +.. ipython:: python lgbm_regressor.set_params(learning_rate=0.1, n_estimators=100) lgbm_regressor.get_params() Data Preparation ---------------- +---------------- We can use real data as input. For the sake of simplicity, we will use randomly generated x and y data as an example. -.. ipython:: python +.. ipython:: python x = np.random.rand(100) y_regression = 2 * x + 1 + 0.1 * np.random.randn(100) @@ -60,32 +60,32 @@ We can use real data as input. For the sake of simplicity, we will use randomly In order to train the model, we split the dataset into a training set and a test set. -.. ipython:: python +.. ipython:: python X_train, X_test, y_train, y_test = train_test_split(x, y_regression, test_size=0.2) Model Training ---------------- +-------------- The :code:`.fit` method takes the training data (independent variable x and dependent variable y) and fits the model to the data. The model adjusts its parameters to minimize the error between the predicted values and the actual observations. -.. ipython:: python +.. ipython:: python lgbm_regressor.fit(X_train, y_train) Model Prediction ---------------- +---------------- Once you have trained a model, you can use the :code:`.predict` method to apply that model to new data and generate predictions for the new data. -.. ipython:: python +.. ipython:: python y_pred = lgbm_regressor.predict(X_test) y_pred Model Evaluation ---------------- +---------------- :code:`.score` is typically used to assess the performance of a machine learning model. @@ -93,7 +93,7 @@ In regression problems, the :code:`.score` method usually returns the coefficien Calculate the model's estimated accuracy on the test set. -.. ipython:: python +.. ipython:: python accuracy = lgbm_regressor.score(X_test, y_test) accuracy diff --git a/doc/source/libraries/xorbits_train/xgboost.rst b/doc/source/libraries/xorbits_train/xgboost.rst index 8253ff144..1e36f69b0 100644 --- a/doc/source/libraries/xorbits_train/xgboost.rst +++ b/doc/source/libraries/xorbits_train/xgboost.rst @@ -1,8 +1,8 @@ .. _10min_xgboost: -=================================== +===================================== 10 minutes to :code:`xorbits.xgboost` -=================================== +===================================== .. currentmodule:: xorbits.xgboost @@ -19,7 +19,7 @@ Customarily, we import and init as follows: from xorbits.sklearn.model_selection import train_test_split Model Creation ---------------- +-------------- First, we build a :code:`XGBClassifier` model and define its parameters. This model implements the Gradient Boosting Decision Tree algorithm, improving model performance by training multiple decision trees. Gradient Boosting Trees is an ensemble learning method that builds a powerful model by combining multiple weak learners, typically decision trees. @@ -27,13 +27,13 @@ This model implements the Gradient Boosting Decision Tree algorithm, improving m .. ipython:: python bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic') + Data Preparation ---------------- +---------------- We import the scikit-learn/iris dataset from Hugging Face as the input data for our model. .. ipython:: python - # read data dataset = xdatasets.from_huggingface("scikit-learn/iris", split="train") iris_df = dataset.to_dataframe() @@ -43,13 +43,12 @@ We import the scikit-learn/iris dataset from Hugging Face as the input data for In order to train the model, we split the dataset into a training set and a test set. -.. ipython:: python - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) +.. ipython:: python + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) Model Training ---------------- +-------------- The :code:`.fit` method takes the training data (independent variable x and dependent variable y) and fits the model to the data. The model adjusts its parameters to minimize the error between the predicted values and the actual observations. @@ -61,7 +60,7 @@ The model adjusts its parameters to minimize the error between the predicted val bst.fit(X_train, y_train) Model Prediction ---------------- +---------------- Once you have trained a model, you can use the :code:`.predict` method to apply that model to new data and generate predictions for the new data. .. ipython:: python @@ -71,14 +70,14 @@ Once you have trained a model, you can use the :code:`.predict` method to apply preds Model Evaluation ---------------- +---------------- :code:`.score` is typically used to assess the performance of a machine learning model. In regression problems, the :code:`.score` method usually returns the coefficient of determination (R-squared) score, which represents the model's ability to explain the variability in the dependent variable. Calculate the model's estimated accuracy on the test set. -.. ipython:: python +.. ipython:: python accuracy = bst.score(X_test, y_test) accuracy From 2be01d619b11cbf4d6225c4a05b96913bfbea71e Mon Sep 17 00:00:00 2001 From: Lu Weizheng Date: Mon, 20 Nov 2023 14:22:54 +0800 Subject: [PATCH 13/13] fix doc --- doc/source/libraries/xorbits_train/lightgbm.rst | 10 +++++----- doc/source/libraries/xorbits_train/xgboost.rst | 3 --- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/doc/source/libraries/xorbits_train/lightgbm.rst b/doc/source/libraries/xorbits_train/lightgbm.rst index fefa275a8..8b9555539 100644 --- a/doc/source/libraries/xorbits_train/lightgbm.rst +++ b/doc/source/libraries/xorbits_train/lightgbm.rst @@ -14,11 +14,11 @@ Customarily, we import and init as follows: .. ipython:: python - import xorbits - import xorbits.numpy as np - from xorbits.lightgbm import LGBMRegressor - from xorbits.sklearn.model_selection import train_test_split - xorbits.init() + import xorbits + import xorbits.numpy as np + from xorbits.lightgbm import LGBMRegressor + from xorbits.sklearn.model_selection import train_test_split + xorbits.init() Model Creation -------------- diff --git a/doc/source/libraries/xorbits_train/xgboost.rst b/doc/source/libraries/xorbits_train/xgboost.rst index 1e36f69b0..675776452 100644 --- a/doc/source/libraries/xorbits_train/xgboost.rst +++ b/doc/source/libraries/xorbits_train/xgboost.rst @@ -55,8 +55,6 @@ The model adjusts its parameters to minimize the error between the predicted val .. ipython:: python - - # fit model bst.fit(X_train, y_train) Model Prediction @@ -65,7 +63,6 @@ Once you have trained a model, you can use the :code:`.predict` method to apply .. ipython:: python - # make predictions preds = bst.predict(X_test) preds