diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/reference.po b/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/reference.po index e5c8e9f471..e0b0a0ae99 100644 --- a/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/reference.po +++ b/docs/source/locale/zh_CN/LC_MESSAGES/reference/learn/reference.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: mars 0.5.0a2\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2021-08-23 16:36+0800\n" +"POT-Creation-Date: 2021-09-02 18:08+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -101,483 +101,567 @@ msgstr "" msgid "Matrix Decomposition" msgstr "矩阵分解" -#: ../../source/reference/learn/reference.rst:79::1 +#: ../../source/reference/learn/reference.rst:78::1 msgid "" ":obj:`decomposition.PCA `\\ " "\\(\\[n\\_components\\, copy\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:79::1 +#: ../../source/reference/learn/reference.rst:78::1 msgid "Principal component analysis (PCA)" msgstr "" -#: ../../source/reference/learn/reference.rst:79::1 +#: ../../source/reference/learn/reference.rst:78::1 msgid "" ":obj:`decomposition.TruncatedSVD " "`\\ \\(\\[n\\_components\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:79::1 +#: ../../source/reference/learn/reference.rst:78::1 msgid "Dimensionality reduction using truncated SVD (aka LSA)." msgstr "" -#: ../../source/reference/learn/reference.rst:83 +#: ../../source/reference/learn/reference.rst:82 +msgid "Ensemble Methods" +msgstr "集成方法" + +#: ../../source/reference/learn/reference.rst:95::1 +msgid "" +":obj:`ensemble.BlockwiseVotingClassifier " +"`\\ \\(estimator\\)" +msgstr "" + +#: ../../source/reference/learn/reference.rst:95::1 +msgid "Blockwise training and ensemble voting classifier." 
+msgstr "" + +#: ../../source/reference/learn/reference.rst:95::1 +msgid "" +":obj:`ensemble.BlockwiseVotingRegressor " +"`\\ \\(estimator\\)" +msgstr "" + +#: ../../source/reference/learn/reference.rst:95::1 +msgid "Blockwise training and ensemble voting regressor." +msgstr "" + +#: ../../source/reference/learn/reference.rst:99 +msgid "Linear Models" +msgstr "线性模型" + +#: ../../source/reference/learn/reference.rst:102 +msgid "Classical linear regressors" +msgstr "" + +#: ../../source/reference/learn/reference.rst:110::1 +msgid "" +":obj:`linear_model.LinearRegression " +"`\\ \\(\\*\\[\\, ...\\]\\)" +msgstr "" + +#: ../../source/reference/learn/reference.rst:110::1 +msgid "Ordinary least squares Linear Regression." +msgstr "" + +#: ../../source/reference/learn/reference.rst:114 msgid "Metrics" msgstr "评估" -#: ../../source/reference/learn/reference.rst:92 +#: ../../source/reference/learn/reference.rst:123 msgid "Classification metrics" msgstr "分类评估" -#: ../../source/reference/learn/reference.rst:100::1 +#: ../../source/reference/learn/reference.rst:131::1 msgid "" ":obj:`metrics.accuracy_score `\\ " "\\(y\\_true\\, y\\_pred\\[\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:100::1 +#: ../../source/reference/learn/reference.rst:131::1 msgid "Accuracy classification score." 
msgstr "" -#: ../../source/reference/learn/reference.rst:100::1 +#: ../../source/reference/learn/reference.rst:131::1 msgid "" ":obj:`metrics.auc `\\ \\(x\\, y\\[\\, session\\, " "run\\_kwargs\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:100::1 +#: ../../source/reference/learn/reference.rst:131::1 msgid "Compute Area Under the Curve (AUC) using the trapezoidal rule" msgstr "" -#: ../../source/reference/learn/reference.rst:100::1 +#: ../../source/reference/learn/reference.rst:131::1 msgid "" ":obj:`metrics.roc_curve `\\ \\(y\\_true\\, " "y\\_score\\[\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:100::1 +#: ../../source/reference/learn/reference.rst:131::1 msgid "Compute Receiver operating characteristic (ROC)" msgstr "" -#: ../../source/reference/learn/reference.rst:102 +#: ../../source/reference/learn/reference.rst:133 +msgid "Regression metrics" +msgstr "回归评估" + +#: ../../source/reference/learn/reference.rst:140::1 +msgid "" +":obj:`metrics.r2_score `\\ \\(y\\_true\\, " +"y\\_pred\\, \\*\\[\\, ...\\]\\)" +msgstr "" + +#: ../../source/reference/learn/reference.rst:140::1 +msgid ":math:`R^2` (coefficient of determination) regression score function." +msgstr "" + +#: ../../source/reference/learn/reference.rst:142 msgid "Pairwise metrics" msgstr "Pairwise 评估" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" ":obj:`metrics.pairwise.cosine_similarity " "`\\ \\(X\\[\\, Y\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "Compute cosine similarity between samples in X and Y." 
msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" ":obj:`metrics.pairwise.cosine_distances " "`\\ \\(X\\[\\, Y\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "Compute cosine distance between samples in X and Y." msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" ":obj:`metrics.pairwise.euclidean_distances " "`\\ \\(X\\[\\, Y\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" "Considering the rows of X (and Y=X) as vectors, compute the distance " "matrix between each pair of vectors." msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" ":obj:`metrics.pairwise.haversine_distances " "`\\ \\(X\\[\\, Y\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "Compute the Haversine distance between samples in X and Y" msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" ":obj:`metrics.pairwise.manhattan_distances " "`\\ \\(X\\[\\, Y\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "Compute the L1 distances between the vectors in X and Y." msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" ":obj:`metrics.pairwise.rbf_kernel " "`\\ \\(X\\[\\, Y\\, gamma\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "Compute the rbf (gaussian) kernel between X and Y." 
msgstr "" -#: ../../source/reference/learn/reference.rst:120::1 +#: ../../source/reference/learn/reference.rst:160::1 msgid "" ":obj:`metrics.pairwise_distances " "`\\ \\(X\\[\\, Y\\, metric\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:124 +#: ../../source/reference/learn/reference.rst:164 msgid "Model Selection" msgstr "模型选择" -#: ../../source/reference/learn/reference.rst:127 +#: ../../source/reference/learn/reference.rst:167 msgid "Splitter Classes" msgstr "划分类" -#: ../../source/reference/learn/reference.rst:135::1 +#: ../../source/reference/learn/reference.rst:175::1 msgid "" ":obj:`model_selection.KFold `\\ " "\\(\\[n\\_splits\\, shuffle\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:135::1 +#: ../../source/reference/learn/reference.rst:175::1 msgid "K-Folds cross-validator" msgstr "" -#: ../../source/reference/learn/reference.rst:137 +#: ../../source/reference/learn/reference.rst:177 msgid "Splitter Functions" msgstr "划分函数" -#: ../../source/reference/learn/reference.rst:145::1 +#: ../../source/reference/learn/reference.rst:185::1 msgid "" ":obj:`model_selection.train_test_split " "`\\ \\(\\*arrays\\, ...\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:145::1 +#: ../../source/reference/learn/reference.rst:185::1 msgid "Split arrays or matrices into random train and test subsets" msgstr "" -#: ../../source/reference/learn/reference.rst:149 +#: ../../source/reference/learn/reference.rst:189 msgid "Nearest Neighbors" msgstr "最邻近" -#: ../../source/reference/learn/reference.rst:161::1 +#: ../../source/reference/learn/reference.rst:201::1 msgid "" ":obj:`neighbors.NearestNeighbors " "`\\ \\(\\[n\\_neighbors\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:165 +#: ../../source/reference/learn/reference.rst:205 msgid "Preprocessing and Normalization" msgstr "预处理和标准化" -#: ../../source/reference/learn/reference.rst:179::1 +#: ../../source/reference/learn/reference.rst:221::1 +msgid "" 
+":obj:`preprocessing.LabelBinarizer " +"`\\ \\(\\*\\[\\, neg\\_label\\, " +"...\\]\\)" +msgstr "" + +#: ../../source/reference/learn/reference.rst:221::1 +msgid "Binarize labels in a one-vs-all fashion." +msgstr "" + +#: ../../source/reference/learn/reference.rst:221::1 msgid "" ":obj:`preprocessing.MinMaxScaler " "`\\ \\(\\[feature\\_range\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:179::1 +#: ../../source/reference/learn/reference.rst:221::1 msgid "Transform features by scaling each feature to a given range." msgstr "" -#: ../../source/reference/learn/reference.rst:179::1 +#: ../../source/reference/learn/reference.rst:221::1 msgid "" ":obj:`preprocessing.minmax_scale " "`\\ \\(X\\[\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:179::1 +#: ../../source/reference/learn/reference.rst:221::1 +msgid "" +":obj:`preprocessing.label_binarize " +"`\\ \\(y\\, \\*\\, classes\\)" +msgstr "" + +#: ../../source/reference/learn/reference.rst:221::1 msgid "" ":obj:`preprocessing.normalize `\\ " "\\(X\\[\\, norm\\, axis\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:179::1 +#: ../../source/reference/learn/reference.rst:221::1 msgid "Scale input vectors individually to unit norm (vector length)." 
msgstr "" -#: ../../source/reference/learn/reference.rst:183 +#: ../../source/reference/learn/reference.rst:225 msgid "Semi-Supervised Learning" msgstr "半监督学习" -#: ../../source/reference/learn/reference.rst:195::1 +#: ../../source/reference/learn/reference.rst:237::1 msgid "" ":obj:`semi_supervised.LabelPropagation " "`\\ \\(\\[kernel\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:195::1 +#: ../../source/reference/learn/reference.rst:237::1 msgid "Label Propagation classifier" msgstr "" -#: ../../source/reference/learn/reference.rst:199 +#: ../../source/reference/learn/reference.rst:241 msgid "Utilities" msgstr "工具" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.assert_all_finite `\\ " "\\(X\\[\\, allow\\_nan\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.check_X_y `\\ \\(X\\, y\\[\\, " "accept\\_sparse\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "Input validation for standard estimators." msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.check_array `\\ \\(array\\[\\, " "accept\\_sparse\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "Input validation on a tensor, list, sparse matrix or similar." msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.check_consistent_length " "`\\ \\(\\*arrays\\[\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "Check that all arrays have consistent first dimensions." 
msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.multiclass.type_of_target " "`\\ \\(y\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "Determine the type of data indicated by the target." msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.multiclass.is_multilabel " "`\\ \\(y\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "Check if ``y`` is in a multilabel format." msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.shuffle `\\ \\(\\*arrays\\, " "\\*\\*options\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.validation.check_is_fitted " "`\\ \\(estimator\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "Perform is_fitted validation for estimator." 
msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "" ":obj:`utils.validation.column_or_1d " "`\\ \\(y\\[\\, warn\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:219::1 +#: ../../source/reference/learn/reference.rst:261::1 msgid "Ravel column or 1d numpy array, else raises an error" msgstr "" -#: ../../source/reference/learn/reference.rst:223 +#: ../../source/reference/learn/reference.rst:265 msgid "LightGBM Integration" msgstr "LightGBM 集成" -#: ../../source/reference/learn/reference.rst:237::1 +#: ../../source/reference/learn/reference.rst:279::1 msgid "" ":obj:`contrib.lightgbm.LGBMClassifier " "`\\ \\(\\*args\\, " "\\*\\*kwargs\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:237::1 +#: ../../source/reference/learn/reference.rst:279::1 msgid "" ":obj:`contrib.lightgbm.LGBMRegressor " "`\\ \\(\\*args\\, " "\\*\\*kwargs\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:237::1 +#: ../../source/reference/learn/reference.rst:279::1 msgid "" ":obj:`contrib.lightgbm.LGBMRanker " "`\\ \\(\\*args\\, \\*\\*kwargs\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:241 +#: ../../source/reference/learn/reference.rst:283 msgid "PyTorch Integration" msgstr "PyTorch 集成" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "" ":obj:`contrib.pytorch.run_pytorch_script " "`\\ \\(script\\, ...\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "Run PyTorch script in Mars cluster." 
msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "" ":obj:`contrib.pytorch.MarsDataset " "`\\ \\(\\*tileables\\[\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "MarsDataset that inherit from torch.utils.data.Dataset." msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "" ":obj:`contrib.pytorch.SequentialSampler " "`\\ \\(data\\_source\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "\"Samples elements sequentially, always in the same order." msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "" ":obj:`contrib.pytorch.RandomSampler " "`\\ \\(data\\_source\\[\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "\"" msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "" ":obj:`contrib.pytorch.SubsetRandomSampler " "`\\ \\(indices\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "" "Samples elements randomly from a given list of indices, without " "replacement." msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "" ":obj:`contrib.pytorch.DistributedSampler " "`\\ \\(dataset\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:258::1 +#: ../../source/reference/learn/reference.rst:300::1 msgid "Sampler that restricts data loading to a subset of the dataset." 
msgstr "" -#: ../../source/reference/learn/reference.rst:262 +#: ../../source/reference/learn/reference.rst:304 msgid "StatsModels Integration" msgstr "StatsModels 集成" -#: ../../source/reference/learn/reference.rst:275::1 +#: ../../source/reference/learn/reference.rst:317::1 msgid "" ":obj:`contrib.statsmodels.MarsDistributedModel " "`\\ \\(\\[...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:275::1 +#: ../../source/reference/learn/reference.rst:317::1 msgid "" ":obj:`contrib.statsmodels.MarsResults " "`\\ \\(model\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:279 +#: ../../source/reference/learn/reference.rst:321 msgid "TensorFlow Integration" msgstr "TensorFlow 集成" -#: ../../source/reference/learn/reference.rst:291::1 +#: ../../source/reference/learn/reference.rst:334::1 msgid "" ":obj:`contrib.tensorflow.run_tensorflow_script " "`\\ \\(...\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:291::1 +#: ../../source/reference/learn/reference.rst:334::1 msgid "Run TensorFlow script in Mars cluster." msgstr "" -#: ../../source/reference/learn/reference.rst:295 +#: ../../source/reference/learn/reference.rst:334::1 +msgid "" +":obj:`contrib.tensorflow.gen_tensorflow_dataset " +"`\\ \\(tensors\\)" +msgstr "" + +#: ../../source/reference/learn/reference.rst:334::1 +msgid "convert mars data type to tf.data.Dataset." 
+msgstr "" + +#: ../../source/reference/learn/reference.rst:338 msgid "XGBoost Integration" msgstr "XGBoost 集成" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "" ":obj:`contrib.xgboost.MarsDMatrix " "`\\ \\(data\\[\\, label\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "" ":obj:`contrib.xgboost.train `\\ " "\\(params\\, dtrain\\[\\, evals\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "Train XGBoost model in Mars manner." msgstr "" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "" ":obj:`contrib.xgboost.predict `\\ " "\\(model\\, data\\[\\, ...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "" ":obj:`contrib.xgboost.XGBClassifier " "`\\ \\(\\[max\\_depth\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "Implementation of the scikit-learn API for XGBoost classification." msgstr "" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "" ":obj:`contrib.xgboost.XGBRegressor " "`\\ \\(\\[max\\_depth\\, " "...\\]\\)" msgstr "" -#: ../../source/reference/learn/reference.rst:310::1 +#: ../../source/reference/learn/reference.rst:353::1 msgid "Implementation of the scikit-learn API for XGBoost regressor." 
msgstr "" + diff --git a/docs/source/reference/learn/reference.rst b/docs/source/reference/learn/reference.rst index cda324cf71..d7091278d9 100644 --- a/docs/source/reference/learn/reference.rst +++ b/docs/source/reference/learn/reference.rst @@ -93,6 +93,21 @@ Ensemble Methods ensemble.BlockwiseVotingClassifier ensemble.BlockwiseVotingRegressor +.. _linear_model_ref: + +Linear Models +============= + +Classical linear regressors +--------------------------- + +.. currentmodule:: mars.learn + +.. autosummary:: + :toctree: generated/ + + linear_model.LinearRegression + .. _metrics_ref: Metrics @@ -198,8 +213,10 @@ Preprocessing and Normalization .. autosummary:: :toctree: generated/ + preprocessing.LabelBinarizer preprocessing.MinMaxScaler preprocessing.minmax_scale + preprocessing.label_binarize preprocessing.normalize .. _semi_supervised_ref: diff --git a/mars/learn/contrib/tensorflow/dataset.py b/mars/learn/contrib/tensorflow/dataset.py index 26e95052eb..fed327c380 100644 --- a/mars/learn/contrib/tensorflow/dataset.py +++ b/mars/learn/contrib/tensorflow/dataset.py @@ -35,8 +35,8 @@ @require_not_none(tf) class MarsDataset: def __init__(self, tensors, - output_shapes=None, - output_types=None, + output_shapes: Tuple[int, ...]=None, + output_types: Tuple[np.dtype, ...]=None, fetch_kwargs=None): self._context = get_context() @@ -123,8 +123,8 @@ def make_generator(): # pragma: no cover def gen_tensorflow_dataset(tensors, - output_shapes=None, - output_types=None, + output_shapes: Tuple[int, ...]=None, + output_types: Tuple[np.dtype, ...]=None, fetch_kwargs=None): """ convert mars data type to tf.data.Dataset. 
Note this is based tensorflow 2.0 diff --git a/mars/learn/preprocessing/__init__.py b/mars/learn/preprocessing/__init__.py index 37887ec535..2a9dd4502a 100644 --- a/mars/learn/preprocessing/__init__.py +++ b/mars/learn/preprocessing/__init__.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -try: - from ._data import MinMaxScaler - from ._data import minmax_scale -except ImportError: # pragma: no cover - # sklearn not installed - pass +from ._data import MinMaxScaler +from ._data import minmax_scale +from ._label import LabelBinarizer, label_binarize from .normalize import normalize diff --git a/mars/learn/preprocessing/_label.py b/mars/learn/preprocessing/_label.py new file mode 100644 index 0000000000..57f51af8b2 --- /dev/null +++ b/mars/learn/preprocessing/_label.py @@ -0,0 +1,661 @@ +# Copyright 1999-2021 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import numpy as np +import scipy.sparse as sp +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.sparsefuncs import min_max_axis + +from ... import execute, fetch +from ... import opcodes +from ... 
import tensor as mt +from ...core import ENTITY_TYPE, OutputType, recursive_tile +from ...core.context import get_context, Context +from ...lib.sparse import SparseNDArray +from ...serialization.serializables import AnyField, BoolField, \ + Int32Field, StringField +from ...tensor.core import TensorOrder +from ...typing import TileableType +from ...utils import has_unknown_shape +from ..operands import LearnOperand, LearnOperandMixin +from ..utils import column_or_1d +from ..utils.multiclass import unique_labels, type_of_target +from ..utils.validation import _num_samples, check_is_fitted, check_array + + +class LabelBinarizer(TransformerMixin, BaseEstimator): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). LabelBinarizer makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. LabelBinarizer makes this easy + with the inverse_transform method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False + True if the returned array from transform is desired to be in sparse + CSR format. + + Attributes + ---------- + + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + y_type_ : str + Represents the type of the target data as evaluated by + utils.multiclass.type_of_target. 
Possible type are 'continuous', + 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'. + + sparse_input_ : bool + True if the input data to transform is given as a sparse matrix, False + otherwise. + + Examples + -------- + >>> from mars.learn import preprocessing + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit([1, 2, 6, 4, 2]) + LabelBinarizer() + >>> lb.classes_ + array([1, 2, 4, 6]) + >>> lb.transform([1, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + Binary targets transform to a column vector + + >>> lb = preprocessing.LabelBinarizer() + >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + Passing a 2D matrix for multilabel classification + + >>> import numpy as np + >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]])) + LabelBinarizer() + >>> lb.classes_ + array([0, 1, 2]) + >>> lb.transform([0, 1, 2, 1]) + array([[1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, 1, 0]]) + + See Also + -------- + label_binarize : Function to perform the transform operation of + LabelBinarizer with fixed classes. + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + """ + + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): + if neg_label >= pos_label: + raise ValueError("neg_label={0} must be strictly less than " + "pos_label={1}.".format(neg_label, pos_label)) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError("Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label)) + + self.neg_label = neg_label + self.pos_label = pos_label + self.sparse_output = sparse_output + + def fit(self, y, session=None, run_kwargs=None): + """Fit label binarizer. + + Parameters + ---------- + y : ndarray of shape (n_samples,) or (n_samples, n_classes) + Target values. 
The 2-d matrix should only contain 0 and 1, + represents multilabel classification. + + Returns + ------- + self : returns an instance of self. + """ + self.y_type_ = fetch(execute( + type_of_target(y), session=session, **(run_kwargs or dict()))) + if 'multioutput' in self.y_type_: + raise ValueError("Multioutput target data is not supported with " + "label binarization") + if _num_samples(y) == 0: # pragma: no cover + raise ValueError('y has 0 samples: %r' % y) + + self.sparse_input_ = mt.tensor(y).issparse() + self.classes_ = unique_labels(y).execute( + session=session, **(run_kwargs or dict())) + return self + + def fit_transform(self, y, session=None, run_kwargs=None): + """Fit label binarizer and transform multi-class labels to binary + labels. + + The output of transform is sometimes referred to as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {ndarray, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + return self.fit(y, session=session, run_kwargs=run_kwargs)\ + .transform(y, session=session, run_kwargs=run_kwargs) + + def transform(self, y, session=None, run_kwargs=None): + """Transform multi-class labels to binary labels. + + The output of transform is sometimes referred to by some authors as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {array, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. 
+ + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + check_is_fitted(self) + + target = fetch(execute(type_of_target(y), session=session, + **(run_kwargs or dict()))) + y_is_multilabel = target.startswith('multilabel') + if y_is_multilabel and not self.y_type_.startswith('multilabel'): + raise ValueError("The object was not fitted with multilabel" + " input.") + + return label_binarize(y, classes=self.classes_, + pos_label=self.pos_label, + neg_label=self.neg_label, + sparse_output=self.sparse_output) + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels. + + Parameters + ---------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Target values. All sparse matrices are converted to CSR before + inverse transformation. + + threshold : float, default=None + Threshold used in the binary and multi-label cases. + + Use 0 when ``Y`` contains the output of decision_function + (classifier). + Use 0.5 when ``Y`` contains the output of predict_proba. + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y : {ndarray, sparse matrix} of shape (n_samples,) + Target values. Sparse matrix will be of CSR format. + + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), inverse_transform chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's decision_function method directly as the input + of inverse_transform. + """ + check_is_fitted(self) + + if threshold is None: + threshold = (self.pos_label + self.neg_label) / 2. 
+ + Y = mt.asarray(Y) + if self.y_type_ == "multiclass": + y_inv = Y.map_chunk(_inverse_binarize_multiclass, + args=(self.classes_,), dtype=self.classes_.dtype, + shape=(Y.shape[0],)) + else: + shape = (Y.shape[0],) if self.y_type_ != 'multilabel-indicator' else Y.shape + y_inv = Y.map_chunk(_inverse_binarize_thresholding, + args=(self.y_type_, self.classes_, threshold), + dtype=self.classes_.dtype, + shape=shape) + + if self.sparse_input_: + y_inv = y_inv.tosparse() + elif y_inv.issparse(): + y_inv = y_inv.todense() + + return y_inv + + def _more_tags(self): # pragma: no cover # noqa: R0201 # pylint: disable=no-self-use + return {'X_types': ['1dlabels']} + + +class LabelBinarize(LearnOperand, LearnOperandMixin): + _op_type_ = opcodes.LABEL_BINARIZE + + y = AnyField('y') + classes = AnyField('classes') + neg_label = Int32Field('neg_label') + pos_label = Int32Field('pos_label') + sparse_output = BoolField('sparse_output') + # for chunk + y_type = StringField('y_type') + pos_switch = BoolField('pos_switch') + + def __call__(self, y: TileableType, classes: TileableType): + inputs = [] + if isinstance(y, ENTITY_TYPE): + inputs.append(y) + if isinstance(classes, ENTITY_TYPE): + inputs.append(classes) + self.sparse = self.sparse_output + self.output_types = [OutputType.tensor] + return self.new_tileable(inputs, shape=(np.nan,), + dtype=np.dtype(int), + order=TensorOrder.C_ORDER) + + def _set_inputs(self, inputs): + super()._set_inputs(inputs) + if isinstance(self.y, ENTITY_TYPE): + self.y = self._inputs[0] + if isinstance(self.classes, ENTITY_TYPE): + self.classes = self._inputs[-1] + + @classmethod + def tile(cls, op: "LabelBinarize"): + y = op.y + classes = op.classes + neg_label = op.neg_label + pos_label = op.pos_label + sparse_output = op.sparse_output + out = op.outputs[0] + ctx = get_context() + + if (isinstance(y, ENTITY_TYPE) and has_unknown_shape(y)) or ( + isinstance(classes, ENTITY_TYPE) and has_unknown_shape(classes)): # pragma: no cover + yield + if 
isinstance(classes, ENTITY_TYPE) and len(classes.chunks) > 1: # pragma: no cover + classes = yield from recursive_tile( + classes.rechunk(classes.shape)) + + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + y = check_array(y, accept_sparse=True, ensure_2d=False, dtype=None) + else: + if _num_samples(y) == 0: + raise ValueError('y has 0 samples: %r' % y) + + y = yield from recursive_tile(mt.tensor(y)) + + if neg_label >= pos_label: + raise ValueError("neg_label={0} must be strictly less than " + "pos_label={1}.".format(neg_label, pos_label)) + + if (sparse_output and (pos_label == 0 or neg_label != 0)): + raise ValueError("Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label)) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = yield from recursive_tile(type_of_target(y)) + yield y_type.chunks + y_type = ctx.get_chunks_result([y_type.chunks[0].key])[0] + y_type = y_type.item() if hasattr(y_type, 'item') else y_type + if 'multioutput' in y_type: + raise ValueError("Multioutput target data is not supported with label " + "binarization") + if y_type == 'unknown': + raise ValueError("The type of target data is not known") + + n_samples = mt.tensor(y).shape[0] + n_classes = len(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return (yield from recursive_tile( + mt.zeros((n_samples, 1), dtype=int, sparse=True))) + else: + Y = mt.zeros((len(y), 1), dtype=int) + Y += neg_label + return (yield from recursive_tile(Y)) + elif len(classes) >= 3: + y_type = "multiclass" + + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, 'shape') else len(y[0]) + if mt.tensor(classes).size != y_n_classes: + raise ValueError("classes {0} mismatch with the labels {1}" + " found in the 
data" + .format(classes, unique_labels(y))) + + if y_type in ("binary", "multiclass"): + y = yield from recursive_tile(column_or_1d(y)) + if y_type == 'binary': + out_shape = (n_samples, 1) + else: + out_shape = (n_samples, n_classes) + elif y_type == 'multilabel-indicator': + out_shape = y.shape + else: + raise ValueError("%s target data is not supported with label " + "binarization" % y_type) + + out_chunks = [] + for y_chunk in y.chunks: + chunk_inputs = [y_chunk] + classes_chunk = classes + if isinstance(classes, ENTITY_TYPE): + chunk_inputs.append(classes.chunks[0]) + classes_chunk = classes.chunks[0] + chunk_op = LabelBinarize( + y=y_chunk, classes=classes_chunk, neg_label=neg_label, + pos_label=pos_label, sparse_output=sparse_output, + y_type=y_type, pos_switch=pos_switch, + _output_types=op.output_types) + if len(out_shape) == 2: + chunk_shape = (y_chunk.shape[0], out_shape[1]) + chunk_index = (y_chunk.index[0], 0) + else: # pragma: no cover + chunk_shape = (y_chunk.shape[0],) + chunk_index = (y_chunk.index[0],) + out_chunk = chunk_op.new_chunk(chunk_inputs, shape=chunk_shape, + dtype=out.dtype, order=out.order, + index=chunk_index) + out_chunks.append(out_chunk) + + params = out.params.copy() + params['chunks'] = out_chunks + params['shape'] = out_shape + if len(out_shape) == 2: + nsplits = (y.nsplits[0], (out_shape[1],)) + else: # pragma: no cover + nsplits = (y.nsplits[0],) + params['nsplits'] = nsplits + return op.copy().new_tileables(op.inputs, kws=[params]) + + @classmethod + def execute(cls, + ctx: Union[dict, Context], + op: "LabelBinarize"): + y = ctx[op.y.key] + if hasattr(y, 'raw'): + # SparseNDArray + y = y.raw + if isinstance(op.classes, ENTITY_TYPE): + classes = ctx[op.classes.key] + else: + classes = op.classes + y_type = op.y_type + sparse_output = op.sparse_output + pos_label = op.pos_label + neg_label = op.neg_label + pos_switch = op.pos_switch + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + 
sorted_class = np.sort(classes) + + if y_type in ('binary', 'multiclass'): + # pick out the known labels from y + y_in_classes = np.in1d(y, classes) + y_seen = y[y_in_classes] + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = sp.csr_matrix((data, indices, indptr), + shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: # pragma: no cover + raise ValueError("%s target data is not supported with label " + "binarization" % y_type) + + if not sparse_output: + Y = Y.toarray() + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == "binary": + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) + + if sp.issparse(Y): + Y = SparseNDArray(Y) + ctx[op.outputs[0].key] = Y + + +def label_binarize(y, *, classes, neg_label=0, pos_label=1, + sparse_output=False, execute=True): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + This function makes it possible to compute this transformation for a + fixed set of class labels known ahead of time. + + Parameters + ---------- + y : array-like + Sequence of integer labels or multilabel data to encode. + + classes : array-like of shape (n_classes,) + Uniquely holds the label for each class. + + neg_label : int, default=0 + Value with which negative labels must be encoded. 
+ + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False, + Set to true if output binary array is desired in CSR sparse format. + + Returns + ------- + Y : {tensor, sparse tensor} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. + + Examples + -------- + >>> from mars.learn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + See Also + -------- + LabelBinarizer : Class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation. + """ + op = LabelBinarize(y=y, classes=classes, neg_label=neg_label, + pos_label=pos_label, sparse_output=sparse_output) + result = op(y, classes) + return result.execute() if execute else result + + +def _inverse_binarize_multiclass(y, classes): # pragma: no cover + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. 
+ """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & + (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i]:y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding(y, output_type, classes, threshold): # pragma: no cover + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}". 
+ format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError("The number of class is not equal to the number of " + "dimension of y.") + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ('csr', 'csc'): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=int) + else: + y = np.array(y > threshold, dtype=int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) diff --git a/mars/learn/preprocessing/tests/test_label.py b/mars/learn/preprocessing/tests/test_label.py new file mode 100644 index 0000000000..4a640c5704 --- /dev/null +++ b/mars/learn/preprocessing/tests/test_label.py @@ -0,0 +1,258 @@ +# Copyright 1999-2021 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pytest +import scipy.sparse as sp +from sklearn.preprocessing._label import _inverse_binarize_thresholding +from sklearn.preprocessing._label import _inverse_binarize_multiclass +from sklearn.utils._testing import assert_array_equal, ignore_warnings +from sklearn.utils.multiclass import type_of_target + +from .... import tensor as mt +from .. import LabelBinarizer, label_binarize + + +def test_label_binarizer(setup): + # one-class case defaults to negative label + # For dense case: + inp = ["pos", "pos", "pos", "pos"] + lb = LabelBinarizer(sparse_output=False) + expected = np.array([[0, 0, 0, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # For sparse case: + lb = LabelBinarizer(sparse_output=True) + got = lb.fit_transform(inp) + assert got.issparse() + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got.fetch().toarray()) + assert_array_equal(lb.inverse_transform(got.todense()), inp) + + lb = LabelBinarizer(sparse_output=False) + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["neg", "pos"]) + assert_array_equal(expected, got) + + to_invert = np.array([[1, 0], + [0, 1], + [0, 1], + [1, 0]]) + assert_array_equal(lb.inverse_transform(to_invert), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array([[0, 0, 0, 1], + [0, 0, 1, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [1, 0, 0, 0]]) + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam']) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_set_label_encoding(setup): + lb = LabelBinarizer(neg_label=-2, pos_label=0) + + # two-class case with pos_label=0 + inp = np.array([0, 1, 1, 0]) + expected = 
np.array([[-2, 0, 0, -2]]).T + got = lb.fit_transform(mt.tensor(inp)) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # multi-class case + inp = np.array([3, 2, 1, 2, 0]) + expected = np.array([[-2, -2, -2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2]]) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +@ignore_warnings +def test_label_binarizer_errors(setup): + # Check that invalid arguments yield ValueError + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + + multi_label = [(2, 3), (0,), (0, 2)] + with pytest.raises(ValueError): + lb.transform(multi_label) + + lb = LabelBinarizer() + with pytest.raises(ValueError): + lb.transform([]) + with pytest.raises(ValueError): + lb.inverse_transform([]) + + with pytest.raises(ValueError): + LabelBinarizer(neg_label=2, pos_label=1) + with pytest.raises(ValueError): + LabelBinarizer(neg_label=2, pos_label=2) + + with pytest.raises(ValueError): + LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + + # Sequence of seq type should raise ValueError + y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] + with pytest.raises(ValueError): + LabelBinarizer().fit_transform(y_seq_of_seqs) + + # Fail on multioutput data + with pytest.raises(ValueError): + LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) + with pytest.raises(ValueError): + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) + + +def test_label_binarize_with_class_order(setup): + out = label_binarize([1, 6], classes=[1, 2, 4, 6]) + expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]]) + assert_array_equal(out, expected) + + # Modified class order + out = label_binarize([1, 6], classes=[1, 6, 4, 2]) + expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]]) + assert_array_equal(out, expected) + + out = label_binarize([0, 1, 2, 3], 
classes=[3, 2, 0, 1]) + expected = np.array([[0, 0, 1, 0], + [0, 0, 0, 1], + [0, 1, 0, 0], + [1, 0, 0, 0]]) + assert_array_equal(out, expected) + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def check_binarized_results(y, classes, pos_label, neg_label, expected): + for sparse_output in [True, False]: + if ((pos_label == 0 or neg_label != 0) and sparse_output): + with pytest.raises(ValueError): + label_binarize(y, classes=classes, neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output) + continue + + # check label_binarize + binarized = label_binarize(y, classes=classes, neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output) + binarized = binarized.fetch() + if hasattr(binarized, 'raw'): + binarized = binarized.raw + assert_array_equal(toarray(binarized), expected) + assert sp.issparse(binarized) == sparse_output + + # check inverse + y_type = type_of_target(y) + if y_type == "multiclass": + inversed = _inverse_binarize_multiclass(binarized, classes=classes) + + else: + inversed = _inverse_binarize_thresholding(binarized, + output_type=y_type, + classes=classes, + threshold=((neg_label + + pos_label) / + 2.)) + + assert_array_equal(toarray(inversed), toarray(y)) + + # Check label binarizer + lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label, + sparse_output=sparse_output) + binarized = lb.fit_transform(y) + assert_array_equal(toarray(binarized), expected) + assert binarized.issparse() == sparse_output + inverse_output = lb.inverse_transform(binarized) + assert_array_equal(toarray(inverse_output), toarray(y)) + assert inverse_output.issparse() == sp.issparse(y) + + +def test_label_binarize_binary(setup): + y = [0, 1, 0] + classes = [0, 1] + pos_label = 2 + neg_label = -1 + expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + # Binary case where sparse_output = True will not result in a 
ValueError + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + +def test_label_binarize_multiclass(setup): + y = [0, 1, 2] + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = 2 * np.eye(3) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, + sparse_output=True) + + +def test_label_binarize_multilabel(setup): + y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]]) + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = pos_label * y_ind + y_sparse = [sp.csr_matrix(y_ind)] + + for y in [y_ind] + y_sparse: + check_binarized_results(y, classes, pos_label, neg_label, + expected) + + with pytest.raises(ValueError): + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, + sparse_output=True) + + +def test_invalid_input_label_binarize(setup): + with pytest.raises(ValueError): + label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1) + with pytest.raises(ValueError, match="continuous target data is not "): + label_binarize([1.2, 2.7], classes=[0, 1]) + with pytest.raises(ValueError, match="mismatch with the labels"): + label_binarize([[1, 3]], classes=[1, 2, 3]) diff --git a/mars/tensor/base/map_chunk.py b/mars/tensor/base/map_chunk.py index 19aaf1d945..d43081901b 100644 --- a/mars/tensor/base/map_chunk.py +++ b/mars/tensor/base/map_chunk.py @@ -66,7 +66,7 @@ def _set_inputs(self, inputs): self._args = replace_objects(self._args, mapping) self._kwargs = replace_objects(self._kwargs, mapping) - def __call__(self, t, dtype=None): + def __call__(self, t, dtype=None, shape=None): if dtype is None: try: kwargs = self.kwargs or dict() @@ -79,7 +79,10 @@ def __call__(self, t, dtype=None): raise TypeError('Cannot estimate output type of map_chunk 
call') dtype = mock_result.dtype - new_shape = t.shape if self.elementwise else (np.nan,) * t.ndim + if shape is not None: + new_shape = shape + else: + new_shape = t.shape if self.elementwise else (np.nan,) * t.ndim inputs = [t] + find_objects(self.args, ENTITY_TYPE) + \ find_objects(self.kwargs, ENTITY_TYPE) return self.new_tensor(inputs, dtype=dtype, shape=new_shape) @@ -100,9 +103,10 @@ def tile(cls, op: 'TensorMapChunk'): chunks = [] for c in inp.chunks: params = c.params - params['dtype'] = inp.dtype + params['dtype'] = out.dtype if not op.elementwise: - params['shape'] = (np.nan,) * c.ndim + params['shape'] = (np.nan,) * out.ndim + params['index'] = params['index'][:out.ndim] new_op = op.copy().reset_key() new_op.tileable_op_key = out.key @@ -113,7 +117,7 @@ def tile(cls, op: 'TensorMapChunk'): new_op = op.copy().reset_key() params = out.params - nsplits = inp.nsplits + nsplits = inp.nsplits[:out.ndim] if not op.elementwise: nsplits = tuple((np.nan,) * len(sp) for sp in nsplits) return new_op.new_tileables([inp], chunks=chunks, nsplits=nsplits, **params) @@ -176,8 +180,9 @@ def map_chunk(t, func, args=(), **kwargs): """ elementwise = kwargs.pop('elementwise', None) dtype = np.dtype(kwargs.pop('dtype')) if 'dtype' in kwargs else None + shape = kwargs.pop('shape', None) with_chunk_index = kwargs.pop('with_chunk_index', False) op = TensorMapChunk(func=func, args=args, kwargs=kwargs, elementwise=elementwise, with_chunk_index=with_chunk_index) - return op(t, dtype=dtype) + return op(t, dtype=dtype, shape=shape) diff --git a/mars/tensor/datasource/array.py b/mars/tensor/datasource/array.py index 4bb9d80547..2c46941a75 100644 --- a/mars/tensor/datasource/array.py +++ b/mars/tensor/datasource/array.py @@ -265,7 +265,7 @@ def array(x, dtype=None, copy=True, order='K', ndmin=None, chunk_size=None): x = x.copy(order=order) elif not copy and isinstance(raw_x, TENSOR_TYPE) and raw_x.dtype == x.dtype and \ raw_x.order == x.order and raw_x.shape == x.shape and \ - raw_x is 
not x: + raw_x is not x and hasattr(raw_x, 'data'): raw_x.data = x.data return x