diff --git a/README.rst b/README.rst index 6a59e48a5c..c8491e44a5 100644 --- a/README.rst +++ b/README.rst @@ -141,7 +141,7 @@ Mars learn provides a familiar interface like scikit-learn. Mars learn also integrates with many libraries: - `TensorFlow `_ -- PyTorch +- `PyTorch <https://pytorch.org/>`_ - `XGBoost `_ - `LightGBM `_ - `Joblib `_ diff --git a/docs/source/getting_started/learn.rst b/docs/source/getting_started/learn.rst index d84982f69f..941bbe5419 100644 --- a/docs/source/getting_started/learn.rst +++ b/docs/source/getting_started/learn.rst @@ -33,9 +33,9 @@ For implemented learn API, refer to :ref:`learn API reference `. Mars learn can integrate with XGBoost, LightGBM, TensorFlow and PyTorch. -- For XGBoost, refer to :ref:`integrate_xgboost`. -- For LightGBM, refer to :ref:`integrate_lightgbm`. -- For TensorFlow, refer to :ref:`integrate_tensorflow`. -- For PyTorch, doc is coming soon. -- For Joblib, refer to :ref:`integrate_joblib`. -- For Statsmodels, refer to :ref:`integrate_statsmodels`. +- :ref:`XGBoost <integrate_xgboost>`. +- :ref:`LightGBM <integrate_lightgbm>`. +- :ref:`TensorFlow <integrate_tensorflow>`. +- :ref:`PyTorch <integrate_pytorch>`. +- :ref:`Joblib <integrate_joblib>`. +- :ref:`Statsmodels <integrate_statsmodels>`. diff --git a/docs/source/index.rst b/docs/source/index.rst index 059057d919..2157719c46 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -114,8 +114,14 @@ Mars learn provides a familiar interface like scikit-learn. | | | +---------------------------------------------+----------------------------------------------------+ -Mars learn has also integrated many libraries, including :ref:`tensorflow`, -:ref:`xgboost`, :ref:`lightgbm`, :ref:`joblib` and :ref:`statsmodels`. +Mars learn also integrates with many libraries: + +- :ref:`TensorFlow <integrate_tensorflow>` +- :ref:`PyTorch <integrate_pytorch>` +- :ref:`XGBoost <integrate_xgboost>` +- :ref:`LightGBM <integrate_lightgbm>` +- :ref:`Joblib <integrate_joblib>` +- :ref:`Statsmodels <integrate_statsmodels>` Mars remote ----------- diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/getting_started/learn.po b/docs/source/locale/zh_CN/LC_MESSAGES/getting_started/learn.po index 38895fb57b..6b1326f2d2 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/getting_started/learn.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/getting_started/learn.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: mars 0.5.0a2\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-08-03 18:42+0800\n" +"POT-Creation-Date: 2021-08-20 11:06+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -53,18 +53,25 @@ msgid "Mars learn can integrate with XGBoost, LightGBM, TensorFlow and PyTorch." msgstr "Mars learn 能和 XGBoost、LightGBM、TensorFlow 以及 PyTorch 集成。" #: ../../source/getting_started/learn.rst:36 -msgid "For XGBoost, refer to :ref:`xgboost`." -msgstr "XGBoost 参考 :ref:`xgboost`。" +msgid ":ref:`XGBoost <integrate_xgboost>`." +msgstr "" #: ../../source/getting_started/learn.rst:37 -msgid "For LightGBM, refer to :ref:`lightgbm`." -msgstr "LightGBM 参考 :ref:`lightgbm`。" +msgid ":ref:`LightGBM <integrate_lightgbm>`." +msgstr "" #: ../../source/getting_started/learn.rst:38 -msgid "For TensorFlow, refer to :ref:`tensorflow`." -msgstr "TensorFlow 参考 :ref:`tensorflow`。" +msgid ":ref:`TensorFlow <integrate_tensorflow>`." +msgstr "" #: ../../source/getting_started/learn.rst:39 -msgid "For PyTorch, doc is coming soon." -msgstr "PyTorch 集成的文档尚在编写中。" +msgid ":ref:`PyTorch <integrate_pytorch>`." +msgstr "" +#: ../../source/getting_started/learn.rst:40 +msgid ":ref:`Joblib <integrate_joblib>`." +msgstr "" + +#: ../../source/getting_started/learn.rst:41 +msgid ":ref:`Statsmodels <integrate_statsmodels>`."
+msgstr "" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/index.po index 1e8609757c..90c9f4d9d7 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/index.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/index.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: mars \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-08-03 18:42+0800\n" +"POT-Creation-Date: 2021-08-20 11:28+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -25,8 +25,9 @@ msgstr "Mars 文档" msgid "" "Mars is a tensor-based unified framework for large-scale data computation" " which scales numpy, pandas, scikit-learn and many other libraries." -msgstr "Mars 是基于张量的,用于进行大规模数据计算的统一计算框架," -"它可以用来并行和分布式 numpy、pandas、scikit-learn 以及众多其他 Python 库。" +msgstr "" +"Mars 是基于张量的,用于进行大规模数据计算的统一计算框架,它可以用来并行和" +"分布式 numpy、pandas、scikit-learn 以及众多其他 Python 库。" #: ../../source/index.rst:11 msgid "Architecture Overview" @@ -111,32 +112,50 @@ msgid "**Mars learn**" msgstr "**Mars learn**" #: ../../source/index.rst:117 -msgid "" -"Mars learn has also integrated many libraries, including " -":ref:`tensorflow`, :ref:`xgboost`, :ref:`lightgbm`, :ref:`joblib` and " -":ref:`statsmodels`." +msgid "Mars learn also integrates with many libraries:" +msgstr "Mars learn 也集成了许多库:" + +#: ../../source/index.rst:119 +msgid ":ref:`TensorFlow `" +msgstr "" + +#: ../../source/index.rst:120 +msgid ":ref:`PyTorch `" msgstr "" -"Mars learn 还集成了很多库,包括 " -":ref:`tensorflow`, :ref:`xgboost`, :ref:`lightgbm`, :ref:`joblib` and " -":ref:`statsmodels`." #: ../../source/index.rst:121 -msgid "Mars remote" +msgid ":ref:`XGBoost `" +msgstr "" + +#: ../../source/index.rst:122 +msgid ":ref:`LightGBM `" msgstr "" #: ../../source/index.rst:123 +msgid ":ref:`Joblib `" +msgstr "" + +#: ../../source/index.rst:124 +msgid ":ref:`Statsmodels `" +msgstr "" + +#: ../../source/index.rst:127 +msgid "Mars remote" +msgstr "" + +#: ../../source/index.rst:129 msgid ":doc:`documentation `" msgstr ":doc:`文档 `" -#: ../../source/index.rst:125 +#: ../../source/index.rst:131 msgid "Mars remote allows users to execute functions in parallel." msgstr "Mars remote 允许用户并行执行函数。" -#: ../../source/index.rst:161 +#: ../../source/index.rst:167 msgid "Easy to scale in and scale out" msgstr "适应各种数据规模" -#: ../../source/index.rst:163 +#: ../../source/index.rst:169 msgid "" "Mars can scale in to a single machine, and scale out to a cluster with " "hundreds of machines. 
Both the local and distributed version share the " @@ -147,23 +166,23 @@ msgstr "" "两种环境下可使用相同的代码。因此,Mars 可以方便地从单台机器迁移到集群,以" "处理更多数据或者获得更好的性能。" -#: ../../source/index.rst:168 +#: ../../source/index.rst:174 msgid "Mars can run in a few ways:" msgstr "Mars 能以若干种方式运行:" -#: ../../source/index.rst:170 +#: ../../source/index.rst:176 msgid ":ref:`Local scheduling `" msgstr ":ref:`本地执行 `" -#: ../../source/index.rst:171 +#: ../../source/index.rst:177 msgid ":ref:`Run on cluster `" msgstr ":ref:`在集群中运行 `" -#: ../../source/index.rst:172 +#: ../../source/index.rst:178 msgid ":ref:`Run on Kubernetes `" msgstr ":ref:`在 Kubernetes 中部署 `" -#: ../../source/index.rst:173 +#: ../../source/index.rst:179 msgid ":ref:`Run on Yarn `" msgstr ":ref:`在 Yarn 中部署 `" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/frame.po b/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/frame.po index 18d190e3e7..439695267d 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/frame.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/frame.po @@ -1192,6 +1192,14 @@ msgstr "" msgid "Create a scatter plot with varying marker point size and color." msgstr "" +#: ../../source/reference/dataframe/frame.rst:244::1 +msgid ":obj:`DataFrame.boxplot `\\" +msgstr "" + +#: ../../source/reference/dataframe/frame.rst:244::1 +msgid ":obj:`DataFrame.hist `\\" +msgstr "" + #: ../../source/reference/dataframe/frame.rst:248 msgid "Serialization / IO / conversion" msgstr "序列化、IO 和转换" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/indexing.po b/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/indexing.po index 8cd08563cd..a8704fce09 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/indexing.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/indexing.po @@ -35,6 +35,14 @@ msgstr "" msgid "Properties" msgstr "属性" +#: ../../source/reference/dataframe/indexing.rst:28::1 +msgid ":obj:`Index.dtype `\\" +msgstr "" + +#: ../../source/reference/dataframe/indexing.rst:28::1 +msgid ":obj:`Index.inferred_type `\\" +msgstr "" + #: ../../source/reference/dataframe/indexing.rst:28::1 msgid ":obj:`Index.is_monotonic `\\" msgstr "" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/series.po b/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/series.po index 410a48099c..3e61e353e5 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/series.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/reference/dataframe/series.po @@ -8,14 +8,14 @@ msgid "" msgstr "" "Project-Id-Version: mars 0.5.0a2\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-08-04 16:55+0800\n" +"POT-Creation-Date: 2021-08-03 18:42+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.7.0\n" #: ../../source/reference/dataframe/series.rst:3 msgid "Series" @@ -1462,6 +1462,10 @@ msgstr "" "``Series.str`` 可被用于以字符串方式访问 Series,并应用若干方法。这些功能" "可通过 ``Series.str.`` 来调用。" +#: ../../source/reference/dataframe/series.rst:371::1 +msgid ":obj:`Series.str.cgeneratedtalize `\\" +msgstr "" + #: ../../source/reference/dataframe/series.rst:371::1 msgid ":obj:`Series.str.casefold `\\ \\(\\)" msgstr "" @@ -1893,6 +1897,10 @@ msgstr "" msgid ":obj:`Series.str.isspace `\\ \\(\\)" msgstr "" +#: ../../source/reference/dataframe/series.rst:371::1 +msgid 
"Check whether all characters in each string are whitespace." +msgstr "" + #: ../../source/reference/dataframe/series.rst:371::1 msgid ":obj:`Series.str.islower `\\ \\(\\)" msgstr "" @@ -2039,6 +2047,10 @@ msgstr "" msgid "Generate a pie plot." msgstr "" +#: ../../source/reference/dataframe/series.rst:413::1 +msgid ":obj:`Series.hist `\\" +msgstr "" + #: ../../source/reference/dataframe/series.rst:415 msgid "Serialization / IO / conversion" msgstr "序列化、IO 和转换" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.po b/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.po index c892de6431..36491a0fb5 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: mars 0.5.0a2\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2020-06-16 16:57+0800\n" +"POT-Creation-Date: 2021-08-20 11:06+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -21,3 +21,34 @@ msgstr "" msgid "mars.learn.contrib.pytorch.MarsDataset" msgstr "" +#: mars.learn.contrib.pytorch.MarsDataset:1 of +msgid "" +"MarsDataset that inherit from torch.utils.data.Dataset. It converts from " +"Mars basic datatype such as Tensor, DataFrame, Series. Additionally, it's" +" constructor can receive np.ndarray, pd.DataFrame, pd.Series type." +msgstr "" + +#: mars.learn.contrib.pytorch.MarsDataset.__init__:1 of +msgid "Initialize self. See help(type(self)) for accurate signature." +msgstr "" + +#: ../../source/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.rst:13 +msgid "Methods" +msgstr "" + +#: ../../source/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.rst:17::1 +msgid "" +":obj:`__init__ `\\ " +"\\(\\*tileables\\[\\, fetch\\_kwargs\\]\\)" +msgstr "" + +#: ../../source/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.rst:17::1 +msgid "Initialize self." +msgstr "" + +#: ../../source/reference/learn/generated/mars.learn.contrib.pytorch.MarsDataset.rst:17::1 +msgid "" +":obj:`get_data `\\ " +"\\(t\\, index\\)" +msgstr "" + diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/set.po b/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/set.po index bcf620bc57..c6c333ee0d 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/set.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/set.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: mars 0.5.0a2\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2020-06-16 16:57+0800\n" +"POT-Creation-Date: 2021-08-20 11:06+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -25,19 +25,29 @@ msgstr "集合函数" msgid "Boolean operations" msgstr "布尔运算" -#: ../../source/reference/tensor/set.rst:12::1 +#: ../../source/reference/tensor/set.rst:13::1 +msgid ":obj:`mars.tensor.in1d `" +msgstr "" + +#: ../../source/reference/tensor/set.rst:13::1 +msgid "" +"Test whether each element of a 1-D tensor is also present in a second " +"tensor." 
+msgstr "" + +#: ../../source/reference/tensor/set.rst:13::1 msgid ":obj:`mars.tensor.isin `" msgstr "" -#: ../../source/reference/tensor/set.rst:12::1 +#: ../../source/reference/tensor/set.rst:13::1 msgid "Calculates `element in test_elements`, broadcasting over `element` only." msgstr "" -#: ../../source/reference/tensor/set.rst:12::1 +#: ../../source/reference/tensor/set.rst:13::1 msgid ":obj:`mars.tensor.union1d `" msgstr "" -#: ../../source/reference/tensor/set.rst:12::1 +#: ../../source/reference/tensor/set.rst:13::1 msgid "Find the union of two tensors." msgstr "" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/statistics.po b/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/statistics.po index 4c4d0d7bbb..f09b852451 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/statistics.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/reference/tensor/statistics.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: mars 0.5.0a2\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-08-03 18:42+0800\n" +"POT-Creation-Date: 2021-08-20 11:06+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -185,55 +185,71 @@ msgstr "" msgid "Statistical tests" msgstr "统计检验" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid ":obj:`mars.tensor.stats.chisquare `" msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "Calculate a one-way chi-square test." msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 +msgid ":obj:`mars.tensor.stats.ks_1samp `" +msgstr "" + +#: ../../source/reference/tensor/statistics.rst:71::1 +msgid "Performs the one-sample Kolmogorov-Smirnov test for goodness of fit." +msgstr "" + +#: ../../source/reference/tensor/statistics.rst:71::1 +msgid ":obj:`mars.tensor.stats.ks_2samp `" +msgstr "" + +#: ../../source/reference/tensor/statistics.rst:71::1 +msgid "Compute the Kolmogorov-Smirnov statistic on 2 samples." +msgstr "" + +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "" ":obj:`mars.tensor.stats.power_divergence " "`" msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "Cressie-Read power divergence statistic and goodness of fit test." msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid ":obj:`mars.tensor.stats.ttest_1samp `" msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "Calculate the T-test for the mean of ONE group of scores." msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid ":obj:`mars.tensor.stats.ttest_ind `" msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "Calculate the T-test for the means of *two independent* samples of scores." 
msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "" ":obj:`mars.tensor.stats.ttest_ind_from_stats " "`" msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "T-test for means of two independent samples from descriptive statistics." msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid ":obj:`mars.tensor.stats.ttest_rel `" msgstr "" -#: ../../source/reference/tensor/statistics.rst:69::1 +#: ../../source/reference/tensor/statistics.rst:71::1 msgid "Calculate the t-test on TWO RELATED samples of scores, a and b." msgstr "" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/contrib/dask.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/contrib/dask.po index 37e620243a..5b61d23571 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/contrib/dask.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/contrib/dask.po @@ -8,26 +8,30 @@ msgid "" msgstr "" "Project-Id-Version: mars 0.8.0a1\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-08-16 21:57+0800\n" +"POT-Creation-Date: 2021-08-20 11:06+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.9.1\n" +"Generated-By: Babel 2.7.0\n" #: ../../source/user_guide/contrib/dask.rst:5 msgid "Dask on Mars" msgstr "Dask on Mars" #: ../../source/user_guide/contrib/dask.rst:7 +msgid "New in version 0.8.0a2" +msgstr "自 0.8.0a2 起支持" + +#: ../../source/user_guide/contrib/dask.rst:9 msgid "" "Dask-on-Mars provides a simple way to execute the entire Dask ecosystem " "on top of Mars." msgstr "Dask-on-Mars 使得用户能通过简单的 API 调用,在 Mars 中运行大部分 Dask 任务" -#: ../../source/user_guide/contrib/dask.rst:9 +#: ../../source/user_guide/contrib/dask.rst:11 msgid "" "`Dask `__ is a flexible library for parallel computing" " in Python, geared towards scaling analytics and scientific computing " @@ -38,7 +42,7 @@ msgstr "" "`Dask `__ 是一个用于并行计算的 Python 库,旨在为大规模" "数据的分析和科学计算提供并行的计算解决方案。" -#: ../../source/user_guide/contrib/dask.rst:15 +#: ../../source/user_guide/contrib/dask.rst:17 msgid "" "For execution on Mars, you should *not* use the `Dask.distributed " "`__ client, " @@ -48,11 +52,11 @@ msgstr "" "distributed.dask.org/en/latest/quickstart.html>`__ 相关特性,只需使用普通" "的 Dask 特性和功能" -#: ../../source/user_guide/contrib/dask.rst:20 +#: ../../source/user_guide/contrib/dask.rst:22 msgid "Scheduler" msgstr "使用 Dask 调度器" -#: ../../source/user_guide/contrib/dask.rst:22 +#: ../../source/user_guide/contrib/dask.rst:24 msgid "" "The main API for Dask-on-Mars is :meth:`mars.contrib.dask.mars_scheduler`" ". It uses Dask’s scheduler API, which allows you to specify any callable" @@ -63,11 +67,11 @@ msgstr "" "兼容了 Dask 的 scheduler 接口,这使得用户可以直接指定使用 mars_scheduler " "来调度执行 Dask 任务。" -#: ../../source/user_guide/contrib/dask.rst:39 +#: ../../source/user_guide/contrib/dask.rst:41 msgid "Convert Dask Collections" msgstr "将 Dask 任务转变为 Mars 任务" -#: ../../source/user_guide/contrib/dask.rst:41 +#: ../../source/user_guide/contrib/dask.rst:43 msgid "" ":meth:`mars.contrib.dask.convert_dask_collection` can be used when user " "needs to manipulate dask collections with :ref:`Mars remote API `" @@ -80,7 +84,7 @@ msgstr "" "将 Dask 任务转变为 Mars 任务。这一函数返回的 Mars 对象与 :meth:`mars." 
"remote.spawn` 返回的对象类型一致。" -#: ../../source/user_guide/contrib/dask.rst:63 +#: ../../source/user_guide/contrib/dask.rst:65 msgid "" "Dask-on-Mars is an ongoing project. Please open an issue if you find that" " one of these dask functionalities doesn’t run on Mars." diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/learn/pytorch.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/learn/pytorch.po new file mode 100644 index 0000000000..0e65a209fd --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/learn/pytorch.po @@ -0,0 +1,161 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 1999-2020, The Alibaba Group Holding Ltd. +# This file is distributed under the same license as the mars package. +# FIRST AUTHOR , 2021. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: mars 0.8.0a1\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2021-08-20 11:13+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.7.0\n" + +#: ../../source/user_guide/learn/pytorch.rst:5 +msgid "Integrate with PyTorch" +msgstr "和Pytorch集成" + +#: ../../source/user_guide/learn/pytorch.rst:9 +msgid "" +"This introduction will give a brief tour about how to integrate `PyTorch " +"`_ in Mars." +msgstr "这篇指引会介绍如何在 Mars 里集成 `PyTorch `_。" + +#: ../../source/user_guide/learn/pytorch.rst:13 +msgid "Installation" +msgstr "安装" + +#: ../../source/user_guide/learn/pytorch.rst:15 +msgid "" +"If you are trying to use Mars on a single machine, e.g. on your laptop, " +"make sure PyTorch is installed." +msgstr "如果尝试在单机比如你的笔记本上使用 Mars,确保 PyTorch 已经安装。" + +#: ../../source/user_guide/learn/pytorch.rst:18 +msgid "You can install PyTorch via pip:" +msgstr "通过 pip 安装 PyTorch:" + +#: ../../source/user_guide/learn/pytorch.rst:24 +msgid "" +"Visit `installation guide for PyTorch `_ for more information." +msgstr "" +"访问 `PyTorch 安装指南 `_ 获取" +"更多信息" + +#: ../../source/user_guide/learn/pytorch.rst:27 +msgid "" +"On the other hand, if you are about to use Mars on a cluster, maker sure " +"PyTorch is installed on each worker." +msgstr "" +"另一方面,如果你打算在集群中使用 Mars,确保 PyTorch 在每一个worker上已" +"安装。" + +#: ../../source/user_guide/learn/pytorch.rst:31 +msgid "Prepare data" +msgstr "准备数据" + +#: ../../source/user_guide/learn/pytorch.rst:33 +msgid "" +"The dataset here we used is `ionosphere dataset " +"`_," +" click link to download data." +msgstr "" +"这里我们使用 `ionosphere 数据集 `_,点击链接下载数据。" + +#: ../../source/user_guide/learn/pytorch.rst:38 +msgid "Prepare PyTorch script" +msgstr "编写 PyTorch 脚本" + +#: ../../source/user_guide/learn/pytorch.rst:40 +msgid "" +"Now we create a Python file called ``torch_demo.py`` which contains the " +"logic of PyTorch." +msgstr "" +"现在我们创建一个命名为 `torch_demo.py` 的 Python 文件,它包含了 PyTorch " +"的处理逻辑。" + +#: ../../source/user_guide/learn/pytorch.rst:119 +msgid "" +"Mars libraries including DataFrame and so forth could be used directly to" +" process massive data and accelerate preprocess." +msgstr "Mars DataFrame 等模块可以直接在脚本里使用,以处理大规模数据和加速预处理。" + +#: ../../source/user_guide/learn/pytorch.rst:123 +msgid "Run PyTorch script via Mars" +msgstr "通过 Mars 运行 PyTorch 脚本" + +#: ../../source/user_guide/learn/pytorch.rst:125 +msgid "The PyTorch script can be submitted via :meth:`run_pytorch_script` now." 
+msgstr "现在可以通过 :meth:`run_pytorch_script` 提交 PyTorch 脚本。" + +#: ../../source/user_guide/learn/pytorch.rst:141 +msgid "Distributed training or inference" +msgstr "分布式训练和推理" + +#: ../../source/user_guide/learn/pytorch.rst:143 +msgid "" +"Refer to :ref:`deploy` section for deployment, or :ref:`k8s` section for " +"running Mars on Kubernetes." +msgstr "部署参考 :ref:`deploy` 部分, 在 Kubernetes 上运行参考 :ref:`k8s` 部分。" + +#: ../../source/user_guide/learn/pytorch.rst:146 +msgid "" +"As you can tell from ``torch_demo.py``, Mars will set environment " +"variable automatically. Thus you don't need to worry about the " +"distributed setting, what you need do is to write a proper `distributed " +"PyTorch script. " +"`_." +msgstr "" +"你能从 ``torch_demo.py`` 中看出,Mars 会自动设置相关的环境变量。你不需要" +"考虑分布式相关的设置,你需要做的是写一个正确的 `分布式PyTorch脚本 `_。" + +#: ../../source/user_guide/learn/pytorch.rst:151 +msgid "" +"Once a cluster exists, you can either set the session as default, the " +"training and prediction shown above will be submitted to the cluster, or " +"you can specify ``session=***`` explicitly as well." +msgstr "" +"一旦一个集群存在,你可以要么设置默认 session,训练和预测就会自动提交到" +"集群,要么你可以通过 ``session=***`` 显示指定运行的 session。" + +#: ../../source/user_guide/learn/pytorch.rst:169 +msgid "MarsDataset" +msgstr "MarsDataset" + +#: ../../source/user_guide/learn/pytorch.rst:171 +msgid "" +"In order to use Mars to process data, we implemented a " +":class:`MarsDataset` that can convert Mars object " +"(:class:`mars.tensor.Tensor`, :class:`mars.dataframe.DataFrame`, " +":class:`mars.dataframe.Series`) to ``torch.util.data.Dataset``." +msgstr "" +"为了更好地使用 Mars 处理数据,我们实现了一个 MarsDataset 对象,继承 ``" +"torch.util.data.Dataset``,能够接收Mars 类型的数据(:class:`mars.tensor." +"Tensor`、:class:`mars.dataframe.DataFrame`、:class:`mars.dataframe.Series" +"`)。" + +#: ../../source/user_guide/learn/pytorch.rst:188 +msgid "" +"Now, :meth:`run_pytorch_script` allow pass data to script. So you can " +"preprocess data via mars, then pass data to script." +msgstr "" +"现在,:meth:`run_pytorch_script` 允许传送 Mars 类型的数据到脚本中。所以你" +"可以先通过 Mars 处理数据,再将其传送到脚本中。" + +#: ../../source/user_guide/learn/pytorch.rst:208 +msgid "``torch_script.py``" +msgstr "``torch_script.py``" + +#: ../../source/user_guide/learn/pytorch.rst:270 +msgid "result:" +msgstr "运行结果:" + diff --git a/docs/source/reference/learn/reference.rst b/docs/source/reference/learn/reference.rst index ded1d8759f..54adb6de31 100644 --- a/docs/source/reference/learn/reference.rst +++ b/docs/source/reference/learn/reference.rst @@ -225,7 +225,7 @@ LightGBM Integration PyTorch Integration ====================== -.. automodule:: mars.learn.contrib.tensorflow +.. automodule:: mars.learn.contrib.pytorch :no-members: :no-inherited-members: @@ -236,8 +236,10 @@ PyTorch Integration contrib.pytorch.run_pytorch_script contrib.pytorch.MarsDataset - contrib.pytorch.MarsDistributedSampler - contrib.pytorch.MarsRandomSampler + contrib.pytorch.SequentialSampler + contrib.pytorch.RandomSampler + contrib.pytorch.SubsetRandomSampler + contrib.pytorch.DistributedSampler .. _statsmodels_ref: diff --git a/docs/source/user_guide/learn/index.rst b/docs/source/user_guide/learn/index.rst index bcd44150f8..63310926de 100644 --- a/docs/source/user_guide/learn/index.rst +++ b/docs/source/user_guide/learn/index.rst @@ -14,6 +14,7 @@ can be obtained in the :ref:`API reference `. 
joblib lightgbm + pytorch statsmodels tensorflow xgboost diff --git a/docs/source/user_guide/learn/pytorch.rst b/docs/source/user_guide/learn/pytorch.rst new file mode 100644 index 0000000000..11820a2b97 --- /dev/null +++ b/docs/source/user_guide/learn/pytorch.rst @@ -0,0 +1,280 @@ +.. _integrate_pytorch: + +************************* +Integrate with PyTorch +************************* + +.. currentmodule:: mars.learn.contrib.pytorch + +This introduction will give a brief tour of how to integrate `PyTorch +`_ in Mars. + +Installation +------------ + +If you are trying to use Mars on a single machine, e.g. on your laptop, make +sure PyTorch is installed. + +You can install PyTorch via pip: + +.. code-block:: bash + + pip3 install torch torchvision torchaudio + +Visit `installation guide for PyTorch `_ +for more information. + +On the other hand, if you are about to use Mars on a cluster, make sure +PyTorch is installed on each worker. + +Prepare data +------------ + +The dataset we use here is the `ionosphere dataset +`_, click the +link to download the data. + +Prepare PyTorch script +------------------------- + +Now we create a Python file called ``torch_demo.py`` which contains the +PyTorch training logic. + +.. code-block:: python + + import os + + import mars.dataframe as md + import torch + import torch.nn as nn + import torch.distributed as dist + import torch.optim as optim + import torch.utils.data + from sklearn.preprocessing import LabelEncoder + + + def prepare_data(): + df = md.read_csv('ionosphere.data', header=None) + + # split into input and output columns + X = df.iloc[:, :-1].to_tensor().astype('float32') + y = df.iloc[:, -1].to_tensor() + + # convert Mars tensor to numpy ndarray + X, y = X.to_numpy(), y.to_numpy() + + # encode string to integer + y = LabelEncoder().fit_transform(y) + + return X, y + + + def get_model(): + return nn.Sequential( + nn.Linear(34, 10), + nn.ReLU(), + nn.Linear(10, 8), + nn.ReLU(), + nn.Linear(8, 1), + nn.Sigmoid(), + ) + + + def train(): + dist.init_process_group(backend="gloo") + torch.manual_seed(42) + + data, labels = prepare_data() + data = torch.from_numpy(data) + labels = torch.from_numpy(labels) + train_dataset = torch.utils.data.TensorDataset(data, labels.float()) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + shuffle=False, + sampler=train_sampler) + + + model = nn.parallel.DistributedDataParallel(get_model()) + optimizer = optim.Adam(model.parameters(), + lr=0.001) + criterion = nn.BCELoss() + + for epoch in range(150): # 150 epochs + running_loss = 0.0 + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + running_loss += loss.item() + print(f"epoch {epoch}, running_loss is {running_loss}") + + + if __name__ == "__main__": + train() + +Mars libraries including DataFrame and so forth can be used directly to +process massive data and accelerate preprocessing. + +Run PyTorch script via Mars +------------------------------ + +The PyTorch script can be submitted via :meth:`run_pytorch_script` now. + +.. code-block:: ipython + + In [1]: from mars.learn.contrib.pytorch import run_pytorch_script + + In [2]: run_pytorch_script("torch_demo.py", n_workers=2) + task: wait_for=()]>> + ...
+ epoch 148, running_loss is 0.27749747782945633 + epoch 148, running_loss is 0.29025389067828655 + epoch 149, running_loss is 0.2736152168363333 + epoch 149, running_loss is 0.2884620577096939 + Out[4]: Object + +Distributed training or inference +--------------------------------- + +Refer to :ref:`deploy` section for deployment, or :ref:`k8s` section for +running Mars on Kubernetes. + +As you can tell from ``torch_demo.py``, Mars will set environment variables +automatically. Thus you don't need to worry about the distributed setting; what +you need to do is to write a proper `distributed PyTorch script +`_. + +Once a cluster exists, you can either set the session as the default, so the training +and prediction shown above will be submitted to the cluster, or you can specify +``session=***`` explicitly as well. + +.. code-block:: python + + # A cluster has been configured, and web UI is started on : + import mars + # set the session as the default one + sess = mars.new_session('http://:') + + # submitted to cluster by default + run_pytorch_script('torch_demo.py', n_workers=2) + + # Or, session could be specified as well + run_pytorch_script('torch_demo.py', n_workers=2, session=sess) + +MarsDataset +------------ + +In order to use Mars to process data, we implemented a :class:`MarsDataset` that can convert +Mars objects (:class:`mars.tensor.Tensor`, :class:`mars.dataframe.DataFrame`, +:class:`mars.dataframe.Series`) to a ``torch.utils.data.Dataset``. + +.. code-block:: python + + from mars.learn.contrib.pytorch import MarsDataset, RandomSampler + + data = mt.random.rand(1000, 32, dtype='f4') + labels = mt.random.randint(0, 2, (1000, 10), dtype='f4') + + train_dataset = MarsDataset(data, labels) + train_sampler = RandomSampler(train_dataset) + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + sampler=train_sampler) + +Now, :meth:`run_pytorch_script` allows passing data to the script, so you can preprocess data +via Mars and then pass it to the script. + +.. code-block:: python + + import mars.dataframe as md + from sklearn.preprocessing import LabelEncoder + + + df = md.read_csv('ionosphere.data', header=None) + feature_data = df.iloc[:, :-1].astype('float32') + feature_data.execute() + labels = df.iloc[:, -1] + labels = LabelEncoder().fit_transform(labels.execute().fetch()) + labels = labels.astype('float32') + + run_pytorch_script( + "torch_script.py", n_workers=2, data={'feature_data': feature_data, 'labels': labels}, + port=9945, session=sess) + +``torch_script.py`` + +.. 
code-block:: python + + from mars.learn.contrib.pytorch import DistributedSampler + from mars.learn.contrib.pytorch import MarsDataset + import torch + import torch.nn as nn + import torch.distributed as dist + import torch.optim as optim + import torch.utils.data + + + def get_model(): + return nn.Sequential( + nn.Linear(34, 10), + nn.ReLU(), + nn.Linear(10, 8), + nn.ReLU(), + nn.Linear(8, 1), + nn.Sigmoid(), + ) + + + def train(feature_data, labels): + + dist.init_process_group(backend='gloo') + torch.manual_seed(42) + + data = feature_data + labels = labels + + train_dataset = MarsDataset(data, labels) + train_sampler = DistributedSampler(train_dataset) + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + shuffle=False, + sampler=train_sampler) + + model = nn.parallel.DistributedDataParallel(get_model()) + optimizer = optim.Adam(model.parameters(), + lr=0.001) + criterion = nn.BCELoss() + + for epoch in range(150): + # 150 epochs + running_loss = 0.0 + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + running_loss += loss.item() + print(f"epoch {epoch}, running_loss is {running_loss}") + + + if __name__ == "__main__": + feature_data = globals()['feature_data'] + labels = globals()['labels'] + train(feature_data, labels) + +result: + +.. code-block:: ipython + + epoch 147, running_loss is 0.29225416854023933 + epoch 147, running_loss is 0.28132784366607666 + epoch 148, running_loss is 0.27749747782945633 + epoch 148, running_loss is 0.29025389067828655 + epoch 149, running_loss is 0.2736152168363333 + epoch 149, running_loss is 0.2884620577096939 + Out[7]: Object diff --git a/mars/learn/contrib/pytorch/__init__.py b/mars/learn/contrib/pytorch/__init__.py index 3fb5126a99..9c1d856348 100644 --- a/mars/learn/contrib/pytorch/__init__.py +++ b/mars/learn/contrib/pytorch/__init__.py @@ -13,6 +13,9 @@ # limitations under the License. from .run_script import run_pytorch_script +from .dataset import MarsDataset # noqa: F401 # pylint: disable=unused-import +from .sampler import SequentialSampler, RandomSampler, \ + SubsetRandomSampler, DistributedSampler # noqa: F401 # pylint: disable=unused-import def register_op(): diff --git a/mars/learn/contrib/pytorch/dataset.py b/mars/learn/contrib/pytorch/dataset.py new file mode 100644 index 0000000000..849a322806 --- /dev/null +++ b/mars/learn/contrib/pytorch/dataset.py @@ -0,0 +1,91 @@ +# Copyright 1999-2020 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import List + +import numpy as np +import pandas as pd +try: + import torch + from torch.utils.data import Dataset +except ImportError: # pragma: no cover + torch = None + Dataset = object + +from .... 
import execute +from ....core.context import get_context +from ....tensor.core import TENSOR_TYPE +from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE +from ....utils import require_not_none + + +ACCEPT_TYPE = (TENSOR_TYPE, DATAFRAME_TYPE, SERIES_TYPE, + np.ndarray, pd.DataFrame, pd.Series, List) + + +@require_not_none(torch) +class MarsDataset(Dataset): + r"""MarsDataset that inherits from torch.utils.data.Dataset. + It converts Mars basic datatypes such as Tensor, + DataFrame and Series. Additionally, its constructor can receive + np.ndarray, pd.DataFrame and pd.Series objects. + """ + def __init__(self, *tileables, fetch_kwargs=None): + + self._context = get_context() + self._tileables = tileables + self._fetch_kwargs = fetch_kwargs or dict() + self._executed = False + self._check_type() + + def _check_type(self): + for t in self._tileables: + if not isinstance(t, ACCEPT_TYPE): + raise TypeError(f"Unexpected dataset type: {type(t)}") + + def _execute(self): + execute_data = [t for t in self._tileables + if isinstance(t, ACCEPT_TYPE[:3])] + if len(execute_data): + execute(execute_data) + + def __len__(self): + return self._tileables[0].shape[0] + + def __getitem__(self, index): + if not self._executed: + self._execute() + self._executed = True + return tuple(self.get_data(t, index) for t in self._tileables) + + def get_data(self, t, index): + fetch_kwargs = dict() + if self._fetch_kwargs: + fetch_kwargs = copy.deepcopy(self._fetch_kwargs) + + if isinstance(t, TENSOR_TYPE): + return t[index].fetch(**fetch_kwargs) + elif isinstance(t, np.ndarray): + return t[index] + elif isinstance(t, DATAFRAME_TYPE): + return t.iloc[index].fetch(**fetch_kwargs).values + elif isinstance(t, SERIES_TYPE): + return t.iloc[index].fetch(**fetch_kwargs) + elif isinstance(t, pd.DataFrame): + return t.iloc[index].values + elif isinstance(t, pd.Series): + return t.iloc[index] + else: + return t[index] diff --git a/mars/learn/contrib/pytorch/run_script.py b/mars/learn/contrib/pytorch/run_script.py index baa8473f83..72486264b2 100644 --- a/mars/learn/contrib/pytorch/run_script.py +++ b/mars/learn/contrib/pytorch/run_script.py @@ -57,17 +57,19 @@ def tile(cls, op): ctx = get_context() workers = pick_workers(ctx.get_worker_addresses(), op.world_size) + data, input_chunks = cls._get_chunk_data(op) out_chunks = [] for i in range(op.world_size): chunk_op = op.copy().reset_key() + chunk_op._data = data chunk_op.expect_worker = workers[i] if op.init_method is None: chunk_op._master_port = op.master_port chunk_op._master_addr = workers[0].split(':', 1)[0] chunk_op._rank = i chunk_op._init_method = op.init_method - out_chunks.append(chunk_op.new_chunk(None, index=(i,))) + out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,))) new_op = op.copy() return new_op.new_tileables(op.inputs, chunks=out_chunks, diff --git a/mars/learn/contrib/pytorch/sampler.py b/mars/learn/contrib/pytorch/sampler.py new file mode 100644 index 0000000000..62c3e569c9 --- /dev/null +++ b/mars/learn/contrib/pytorch/sampler.py @@ -0,0 +1,255 @@ +# Copyright 1999-2020 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterator, Optional, Sized, Sequence +import math + +try: + import torch + from torch.utils.data import Sampler +except ImportError: # pragma: no cover + torch = None + Sampler = object + +from ....utils import require_not_none + + +@require_not_none(torch) +class SequentialSampler(Sampler): + r""""Samples elements sequentially, always in the same order. + + Args: + data_source (Dataset): dataset to sample from + """ + data_source: Sized + + def __init__(self, data_source): + self.data_source = data_source + + def __iter__(self) -> Iterator[int]: + return iter(range(len(self.data_source))) + + def __len__(self) -> int: + return len(self.data_source) + + +@require_not_none(torch) +class RandomSampler(Sampler): + r"""" + Samples elements randomly. If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify :attr:`num_samples` to draw. + + Args: + data_source (Dataset): dataset to sample from + replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False`` + num_samples (int): number of samples to draw, default=`len(dataset)`. This argument + is supposed to be specified only when `replacement` is ``True``. + generator (Generator): Generator used in sampling. + """ + data_source: Sized + replacement: bool + + def __init__(self, data_source, replacement=False, num_samples=None, generator=None): + + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.generator = generator + + if not isinstance(self.replacement, bool): + raise ValueError("replacement should be a boolean value, but got " + f"replacement={self.replacement}") + + if self._num_samples is not None and not replacement: + raise ValueError("With replacement=False, num_samples should not be specified, " + "since a random permute will be performed.") + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + f"value, but got num_samples={self.num_samples}") + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + if self.generator is None: + generator = torch.Generator() + generator.manual_seed(int(torch.empty((), dtype=torch.int64).random_().item())) + else: + generator = self.generator + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist() + yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist() + else: + yield from torch.randperm(n, generator=generator).tolist() + + def __len__(self) -> int: + return self.num_samples + + +@require_not_none(torch) +class SubsetRandomSampler(Sampler): + """ + Samples elements randomly from a given list of indices, without replacement. + + Args: + indices (sequence): a sequence of indices + generator (Generator): Generator used in sampling. 
+ """ + indices: Sequence[int] + + def __init__(self, indices: Sequence[int], generator=None) -> None: + self.indices = indices + self.generator = generator + + def __iter__(self) -> Iterator[int]: + return (self.indices[i] for i in torch.randperm(len(self.indices), generator=self.generator)) + + def __len__(self) -> int: + return len(self.indices) + + +@require_not_none(torch) +class DistributedSampler(Sampler): + r"""Sampler that restricts data loading to a subset of the dataset. + + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such a case, each + process can pass a :class:`~torch.utils.data.DistributedSampler` instance as a + :class:`~torch.utils.data.DataLoader` sampler, and load a subset of the + original dataset that is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Args: + dataset: Dataset used for sampling. + num_replicas (int, optional): Number of processes participating in + distributed training. By default, :attr:`world_size` is retrieved from the + current distributed group. + rank (int, optional): Rank of the current process within :attr:`num_replicas`. + By default, :attr:`rank` is retrieved from the current distributed + group. + shuffle (bool, optional): If ``True`` (default), sampler will shuffle the + indices. + seed (int, optional): random seed used to shuffle the sampler if + :attr:`shuffle=True`. This number should be identical across all + processes in the distributed group. Default: ``0``. + drop_last (bool, optional): if ``True``, then the sampler will drop the + tail of the data to make it evenly divisible across the number of + replicas. If ``False``, the sampler will add extra indices to make + the data evenly divisible across the replicas. Default: ``False``. + + .. warning:: + In distributed mode, calling the :meth:`set_epoch` method at + the beginning of each epoch **before** creating the :class:`DataLoader` iterator + is necessary to make shuffling work properly across multiple epochs. Otherwise, + the same ordering will be always used. + + Example:: + + >>> sampler = DistributedSampler(dataset) if is_distributed else None + >>> loader = DataLoader(dataset, shuffle=(sampler is None), + ... sampler=sampler) + >>> for epoch in range(start_epoch, n_epochs): + ... if is_distributed: + ... sampler.set_epoch(epoch) + ... train(loader) + """ + def __init__(self, dataset, num_replicas: Optional[int] = None, + rank: Optional[int] = None, shuffle: bool = True, + seed: int = 0, drop_last: bool = False) -> None: + import torch.distributed as dist + + if num_replicas is None: # pragma: no cover + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: # pragma: no cover + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + if rank >= num_replicas or rank < 0: + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1)) + + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.drop_last = drop_last + # If the dataset length is evenly divisible by # of replicas, then there + # is no need to drop any data, since the dataset will be split equally. + if self.drop_last and len(self.dataset) % self.num_replicas != 0: + # Split to nearest available length that is evenly divisible. 
+ # This is to ensure each rank receives the same amount of data when + # using this Sampler. + self.num_samples = math.ceil( + (len(self.dataset) - self.num_replicas) / self.num_replicas + ) + else: + self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + self.seed = seed + + def generate_indices(self): + if self.shuffle: + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + if not self.drop_last: + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size <= len(indices): + indices += indices[:padding_size] + else: + indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size] + else: + # remove tail of data to make it evenly divisible. + indices = indices[:self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return indices + + def __iter__(self): + return iter(self.generate_indices()) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch: int): + r"""Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas + use a different random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch diff --git a/mars/learn/contrib/pytorch/tests/pytorch_dataset.py b/mars/learn/contrib/pytorch/tests/pytorch_dataset.py new file mode 100644 index 0000000000..74c4be638e --- /dev/null +++ b/mars/learn/contrib/pytorch/tests/pytorch_dataset.py @@ -0,0 +1,78 @@ +# Copyright 1999-2021 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + + +def get_model(): + import torch.nn as nn + return nn.Sequential( + nn.Linear(32, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, 10), + nn.Softmax(), + ) + + +def main(feature_data, labels): + import torch.nn as nn + import torch.distributed as dist + import torch.optim as optim + import torch.utils.data + from mars.learn.contrib.pytorch import MarsDataset, DistributedSampler + + dist.init_process_group(backend='gloo') + torch.manual_seed(42) + + data = feature_data + labels = labels + + train_dataset = MarsDataset(data, labels) + assert len(train_dataset) == 1000 + + train_sampler = DistributedSampler(train_dataset, shuffle=True) + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + shuffle=(train_sampler is None), + sampler=train_sampler) + + model = nn.parallel.DistributedDataParallel(get_model()) + optimizer = optim.SGD(model.parameters(), + lr=0.01, momentum=0.5) + criterion = nn.BCELoss() + + for i in range(2): + # 2 epochs + train_sampler.set_epoch(i) + running_loss = 0.0 + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + running_loss += loss.item() + print(f"running_loss is {loss.item()}") + + print("Done!") + + +if __name__ == "__main__": + assert len(sys.argv) == 2 + assert sys.argv[1] == 'multiple' + feature_data = globals()['feature_data'] + labels = globals()['labels'] + main(feature_data, labels) diff --git a/mars/learn/contrib/pytorch/tests/test_dataset.py b/mars/learn/contrib/pytorch/tests/test_dataset.py new file mode 100644 index 0000000000..b2bee2ee99 --- /dev/null +++ b/mars/learn/contrib/pytorch/tests/test_dataset.py @@ -0,0 +1,298 @@ +# Copyright 1999-2020 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import os + +import mars.tensor as mt +import mars.dataframe as md +from mars.utils import lazy_import +from mars.learn.contrib.pytorch import MarsDataset, RandomSampler, SequentialSampler, \ + SubsetRandomSampler, DistributedSampler, run_pytorch_script + +torch_installed = lazy_import('torch', globals=globals()) is not None + + +@pytest.mark.skipif(not torch_installed, reason='pytorch not installed') +def test_mars_dataset(setup): + from torch.utils.data import Dataset + import numpy as np + import pandas as pd + + # Mars tensor + data = mt.random.rand(1000, 32, dtype='f4') + labels = mt.random.randint(0, 2, (1000, 10), dtype='f4') + + data_verify = data[1].execute().fetch() + labels_verify = labels[1].execute().fetch() + + train_dataset = MarsDataset(data, labels) + + assert isinstance(train_dataset, Dataset) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # np ndarray + data = np.random.rand(1000, 32) + labels = np.random.randint(0, 2, (1000, 10)) + + data_verify = data[1] + labels.dtype = "float32" + labels_verify = labels[1] + + train_dataset = MarsDataset(data, labels) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # Mars dataframe + data = md.DataFrame(data) + labels = md.DataFrame(labels) + + data_verify = data.iloc[1].execute().fetch().values + labels_verify = labels.iloc[1].execute().fetch().values + + train_dataset = MarsDataset(data, labels, fetch_kwargs={ + 'extra_config': {'check_series_name': False}}) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # Mars Series + label = labels[1] + + label_verify = label[1].execute().fetch() + + train_dataset = MarsDataset(data, label, fetch_kwargs={ + 'extra_config': {'check_series_name': False}}) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + assert train_dataset[1][1] == label_verify + assert len(train_dataset) == 1000 + + # pandas dataframe + data = pd.DataFrame(np.random.rand(1000, 32)) + labels = pd.DataFrame(np.random.randint(0, 2, (1000, 10)), dtype="float32") + + data_verify = data.iloc[1].values + labels_verify = labels.iloc[1].values + + train_dataset = MarsDataset(data, labels) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + np.testing.assert_array_equal(train_dataset[1][1], labels_verify) + assert len(train_dataset) == 1000 + + # pands series + label = labels[1] + label_verify = label[1] + + train_dataset = MarsDataset(data, label) + np.testing.assert_array_equal(train_dataset[1][0], data_verify) + assert train_dataset[1][1] == label_verify + assert len(train_dataset) == 1000 + + # test TypeError + label = tuple(range(1000)) + + with pytest.raises(TypeError) as e: + train_dataset = MarsDataset(data, label) + exec_msg = e.value.args[0] + assert exec_msg == "Unexpected dataset type: " + + +@pytest.mark.skipif(not torch_installed, reason='pytorch not installed') +def test_sequential_sampler(setup_cluster): + import torch + + data = mt.random.rand(1000, 32, dtype='f4') + labels = mt.random.randint(0, 2, (1000, 10), dtype='f4') + + train_dataset = MarsDataset(data, labels) + assert len(train_dataset) == 1000 + + train_sampler = SequentialSampler(train_dataset) + assert len(train_sampler) == 1000 + + 
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + sampler=train_sampler) + + model = torch.nn.Sequential( + torch.nn.Linear(32, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 10), + torch.nn.Softmax(dim=1), + ) + + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = torch.nn.BCELoss() + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +@pytest.mark.skipif(not torch_installed, reason='pytorch not installed') +def test_random_sampler(setup_cluster): + import torch + + data = mt.random.rand(1000, 32, dtype='f4') + labels = mt.random.randint(0, 2, (1000, 10), dtype='f4') + + train_dataset = MarsDataset(data, labels) + + # test __init__() + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, replacement=1) + exec_msg = e.value.args[0] + assert exec_msg == "replacement should be a boolean value, but got replacement=1" + + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, num_samples=900) + exec_msg = e.value.args[0] + assert exec_msg == "With replacement=False, num_samples should not " + \ + "be specified, since a random permute will be performed." + + with pytest.raises(ValueError) as e: + train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=-1) + exec_msg = e.value.args[0] + assert exec_msg == "num_samples should be a positive integer value, but got num_samples=-1" + + train_sampler = RandomSampler(train_dataset) + + # test __len__ num_samples() + assert len(train_sampler) == 1000 + assert train_sampler.num_samples == 1000 + + # test __iter__ + g_cpu = torch.Generator() + g_cpu.manual_seed(2147483647) + + train_sampler = RandomSampler(train_dataset, generator=g_cpu) + assert len(train_sampler) == 1000 + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + sampler=train_sampler) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + train_sampler = RandomSampler(train_dataset, replacement=True, num_samples=900) + assert len(train_sampler) == 900 + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + sampler=train_sampler) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + # torch train + model = torch.nn.Sequential( + torch.nn.Linear(32, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 64), + torch.nn.ReLU(), + torch.nn.Linear(64, 10), + torch.nn.Softmax(dim=1), + ) + + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) + criterion = torch.nn.BCELoss() + for _ in range(2): + # 2 epochs + for _, (batch_data, batch_labels) in enumerate(train_loader): + outputs = model(batch_data) + loss = criterion(outputs.squeeze(), batch_labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + +@pytest.mark.skipif(not torch_installed, reason='pytorch not installed') +def test_subset_random_sampler(setup_cluster): + import numpy as np + import torch + + data = mt.random.rand(1000, 32, dtype='f4') + labels = mt.random.randint(0, 2, (1000, 10), dtype='f4') + data.execute() + labels.execute() + + train_dataset = MarsDataset(data, labels) + train_sampler = 
SubsetRandomSampler( + np.random.choice(range(len(train_dataset)), len(train_dataset))) + + assert len(train_sampler) == 1000 + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + sampler=train_sampler) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + +@pytest.mark.skipif(not torch_installed, reason='pytorch not installed') +def test_distributed_sampler(setup_cluster): + import torch + + data = mt.random.rand(1001, 32, dtype='f4') + labels = mt.random.randint(0, 2, (1001, 10), dtype='f4') + + train_dataset = MarsDataset(data, labels) + + with pytest.raises(ValueError) as e: + train_sampler = DistributedSampler(train_dataset, num_replicas=2, rank=-1) + exec_msg = e.value.args[0] + assert exec_msg == "Invalid rank -1, rank should be in the interval [0, 1]" + + train_sampler = DistributedSampler(train_dataset, num_replicas=2, rank=0, + drop_last=True, shuffle=True) + assert len(train_sampler) == 500 + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + sampler=train_sampler) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + train_sampler = DistributedSampler(train_dataset, num_replicas=2, rank=0, + drop_last=False, shuffle=False) + train_sampler.set_epoch(10) + assert len(train_sampler) == 501 + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=32, + sampler=train_sampler) + for _, (batch_data, batch_labels) in enumerate(train_loader): + assert len(batch_data[0]) == 32 + assert len(batch_labels[0]) == 10 + + +@pytest.mark.skipif(not torch_installed, reason='pytorch not installed') +def test_mars_dataset_script(setup_cluster): + sess = setup_cluster + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'pytorch_dataset.py') + + data = mt.random.rand(1000, 32, dtype='f4') + labels = mt.random.randint(0, 2, (1000, 10), dtype='f4') + + assert run_pytorch_script( + path, n_workers=2, data={'feature_data': data, 'labels': labels}, + command_argv=['multiple'], port=9945, session=sess).fetch()['status'] == 'ok' diff --git a/mars/learn/contrib/tensorflow/run_script.py b/mars/learn/contrib/tensorflow/run_script.py index 235709da1a..c5ed285888 100644 --- a/mars/learn/contrib/tensorflow/run_script.py +++ b/mars/learn/contrib/tensorflow/run_script.py @@ -79,7 +79,6 @@ def tf_task_index(self): def tile(cls, op): ctx = get_context() - port = op.port or 2221 cluster_conf = {'worker': []} if op.n_ps > 0: cluster_conf['ps'] = [] @@ -88,6 +87,7 @@ def tile(cls, op): out_chunks = [] worker_addresses = ctx.get_worker_addresses() picked_workers = pick_workers(worker_addresses, op.n_roles) + data, input_chunks = cls._get_chunk_data(op) ports = yield from recursive_tile( collect_ports(worker_addresses)) @@ -98,6 +98,7 @@ def tile(cls, op): for worker, port in zip(picked_workers, ports): worker_addr = worker.rsplit(':', 1)[0] chunk_op = op.copy().reset_key() + chunk_op._data = data addr = f'{worker_addr}:{port}' # tell graph actor that the chunk should be executed on the exact worker chunk_op.expect_worker = worker @@ -108,7 +109,7 @@ def tile(cls, op): cluster_conf[tp].append(addr) chunk_op._tf_config = {'cluster': cluster_conf, 'task': {'type': tp, 'index': idx}} - out_chunks.append(chunk_op.new_chunk(None, index=(i,))) + out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,))) i += 1 new_op = op.copy() diff --git 
a/mars/remote/run_script.py b/mars/remote/run_script.py index 76b485cf3e..e537dfb9bc 100644 --- a/mars/remote/run_script.py +++ b/mars/remote/run_script.py @@ -74,11 +74,7 @@ def __call__(self, inputs): return self.new_tileable(inputs) @classmethod - def tile(cls, op: "RunScript"): - if len(op.inputs) > 0: - # trigger inputs to execute - yield - + def _get_chunk_data(cls, op: "RunScript"): new_data = None input_chunks = [] inputs_iter = iter(op.inputs) @@ -91,13 +87,22 @@ def tile(cls, op: "RunScript"): input_chunks.extend(v.chunks) else: new_data[k] = v + return new_data, input_chunks + + @classmethod + def tile(cls, op: "RunScript"): + if len(op.inputs) > 0: + # trigger inputs to execute + yield + + new_data, input_chunks = cls._get_chunk_data(op) out_chunks = [] for i in range(op.world_size): chunk_op = op.copy().reset_key() chunk_op._data = new_data chunk_op._rank = i - out_chunks.append(chunk_op.new_chunk(None, index=(i,))) + out_chunks.append(chunk_op.new_chunk(input_chunks, index=(i,))) new_op = op.copy() return new_op.new_tileables(op.inputs, chunks=out_chunks, @@ -114,7 +119,8 @@ def _build_envs(cls, ctx, op): @classmethod def _build_locals(cls, ctx: Union[Context, dict], op: "RunScript"): sess = ctx.get_current_session().as_default() - local = {'session': sess} + local = {'session': sess, + '__name__': '__main__'} if op.data is not None: local.update(op.data) return local
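For quick reference, here is a minimal local sketch (not part of the patch itself) of how the pieces added above fit together: a ``MarsDataset`` built from Mars tensors is fed to a plain ``torch.utils.data.DataLoader`` through the new ``SequentialSampler``. It mirrors ``test_sequential_sampler`` in ``test_dataset.py``; the model shape and the call to ``mars.new_session()`` for a local session are illustrative assumptions, not requirements of the API.

.. code-block:: python

    import mars
    import mars.tensor as mt
    import torch
    import torch.utils.data
    from mars.learn.contrib.pytorch import MarsDataset, SequentialSampler

    # Assumption: a local Mars session; in the tests this is provided by a fixture.
    mars.new_session()

    # Random Mars tensors standing in for real features and labels,
    # mirroring mars/learn/contrib/pytorch/tests/test_dataset.py.
    data = mt.random.rand(1000, 32, dtype='f4')
    labels = mt.random.randint(0, 2, (1000, 10), dtype='f4')

    # MarsDataset executes its inputs lazily and fetches rows on demand.
    train_dataset = MarsDataset(data, labels)
    train_sampler = SequentialSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=32,
                                               sampler=train_sampler)

    # A small model matching the 32-feature / 10-label toy data above.
    model = torch.nn.Sequential(
        torch.nn.Linear(32, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, 10),
        torch.nn.Softmax(dim=1),
    )
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    criterion = torch.nn.BCELoss()

    for batch_data, batch_labels in train_loader:
        outputs = model(batch_data)
        loss = criterion(outputs.squeeze(), batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()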