diff --git a/src/core/expr/fexpr_qcut.cc b/src/core/expr/fexpr_qcut.cc new file mode 100644 index 0000000000..0f5894a1c2 --- /dev/null +++ b/src/core/expr/fexpr_qcut.cc @@ -0,0 +1,198 @@ +//------------------------------------------------------------------------------ +// Copyright 2020 H2O.ai +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+//------------------------------------------------------------------------------ +#include "_dt.h" +#include "column/latent.h" +#include "column/sentinel_fw.h" +#include "column/qcut.h" +#include "datatablemodule.h" +#include "expr/eval_context.h" +#include "expr/fexpr_func.h" +#include "frame/py_frame.h" +#include "parallel/api.h" +namespace dt { +namespace expr { + + + +//------------------------------------------------------------------------------ +// FExpr_Qcut +//------------------------------------------------------------------------------ + +class FExpr_Qcut : public FExpr_Func { + private: + ptrExpr arg_; + py::oobj py_nquantiles_; + + public: + FExpr_Qcut(py::oobj arg, py::robj py_nquantiles) + : arg_(as_fexpr(arg)), + py_nquantiles_(py_nquantiles) + {} + + std::string repr() const override { + std::string out = "qcut("; + out += arg_->repr(); + if (!py_nquantiles_.is_none()) { + out += ", nquantiles="; + out += py_nquantiles_.repr().to_string(); + } + out += ")"; + return out; + } + + + Workframe evaluate_n(EvalContext& ctx) const override { + if (ctx.has_groupby()) { + throw NotImplError() << "qcut() cannot be used in a groupby context"; + } + + Workframe wf = arg_->evaluate_n(ctx); + const size_t ncols = wf.ncols(); + + int32vec nquantiles(ncols); + bool defined_nquantiles = !py_nquantiles_.is_none(); + bool nquantiles_list_or_tuple = py_nquantiles_.is_list_or_tuple(); + + if (nquantiles_list_or_tuple) { + py::oiter py_nquantiles = py_nquantiles_.to_oiter(); + if (py_nquantiles.size() != ncols) { + throw ValueError() << "When `nquantiles` is a list or a tuple, its " + << "length must be the same as the number of input columns, i.e. 
`" + << ncols << "`, instead got: `" << py_nquantiles.size() << "`"; + + } + + size_t i = 0; + for (auto py_nquantile : py_nquantiles) { + int32_t nquantile = py_nquantile.to_int32_strict(); + if (nquantile <= 0) { + throw ValueError() << "All elements in `nquantiles` must be positive, " + << "got `nquantiles[" << i << "]`: `" << nquantile << "`"; + } + + nquantiles[i++] = nquantile; + } + xassert(i == ncols); + } + else { + int32_t nquantiles_default = 10; + if (defined_nquantiles) { + nquantiles_default = py_nquantiles_.to_int32_strict(); + if (nquantiles_default <= 0) { + throw ValueError() << "Number of quantiles must be positive, " + "instead got: `" << nquantiles_default << "`"; + } + } + + for (size_t i = 0; i < ncols; ++i) { + nquantiles[i] = nquantiles_default; + } + } + + // Qcut workframe in-place + for (size_t i = 0; i < ncols; ++i) { + Column coli = wf.retrieve_column(i); + + if (coli.ltype() == dt::LType::STRING || coli.ltype() == dt::LType::OBJECT) + { + throw TypeError() << "`qcut()` cannot be applied to " + << "string or object columns, instead column `" << i + << "` has an stype: `" << coli.stype() << "`"; + } + + coli = Column(new Latent_ColumnImpl(new Qcut_ColumnImpl( + std::move(coli), nquantiles[i] + ))); + wf.replace_column(i, std::move(coli)); + } + + return wf; + } +}; + + + + +//------------------------------------------------------------------------------ +// Python-facing `qcut()` function +//------------------------------------------------------------------------------ + +static const char* doc_qcut = +R"(qcut(cols, nquantiles=10) +-- + +Bin all the columns from `cols` into intervals with approximately +equal populations. Thus, the intervals are chosen according to +the sample quantiles of the data. + +If there are duplicate values in the data, they will all be placed +into the same bin. In extreme cases this may cause the bins to be +highly unbalanced. + +Parameters +---------- +cols: FExpr + Input data for quantile binning. 
+ +nquantiles: int | List[int] + When a single number is specified, this number of quantiles + will be used to bin each column in `cols`. + + When a list or a tuple is provided, each column will be binned + by using its own number of quantiles. In the latter case, + the list/tuple length must be equal to the number of columns + in `cols`. + +return: FExpr + f-expression that converts input columns into the columns filled + with the respective quantile ids. +)"; + +static py::PKArgs args_qcut( + 1, 0, 1, false, false, + { + "cols", "nquantiles" + }, + "qcut", doc_qcut +); + +static py::oobj pyfn_qcut(const py::PKArgs& args) { + if (args[0].is_none_or_undefined()) { + throw TypeError() << "Function `qcut()` requires one positional argument, " + << "but none were given"; + } + + auto arg0 = args[0].to_oobj(); + auto arg1 = args[1].is_none_or_undefined()? py::None() : args[1].to_oobj(); + + return PyFExpr::make(new FExpr_Qcut(arg0, arg1)); +} + + + + +}} // dt::expr + + +void py::DatatableModule::init_methods_qcut() { + ADD_FN(&dt::expr::pyfn_qcut, dt::expr::args_qcut); +} diff --git a/src/core/expr/head_func.cc b/src/core/expr/head_func.cc index 9dc3882362..5944efd5f1 100644 --- a/src/core/expr/head_func.cc +++ b/src/core/expr/head_func.cc @@ -170,7 +170,6 @@ void Head_Func::init() { factory[static_cast(Op::SETMINUS)] = make_colsetop; factory[static_cast(Op::SHIFTFN)] = &Head_Func_Shift::make; factory[static_cast(Op::CUT)] = &Head_Func_Cut::make; - factory[static_cast(Op::QCUT)] = &Head_Func_Qcut::make; factory[static_cast(Op::COUNT0)] = make_reduce0; factory[static_cast(Op::COV)] = make_reduce2; factory[static_cast(Op::CORR)] = make_reduce2; diff --git a/src/core/expr/head_func.h b/src/core/expr/head_func.h index d9ea980559..ed7f4542b3 100644 --- a/src/core/expr/head_func.h +++ b/src/core/expr/head_func.h @@ -157,17 +157,6 @@ class Head_Func_Cut : public Head_Func { -class Head_Func_Qcut : public Head_Func { - private: - py::oobj py_nquantiles_; - - public: - 
Head_Func_Qcut(py::oobj py_nquantiles); - static ptrHead make(Op, const py::otuple& params); - - Workframe evaluate_n(const vecExpr&, EvalContext&) const override; -}; - }} // namespace dt::expr #endif diff --git a/src/core/expr/head_func_cut.cc b/src/core/expr/head_func_cut.cc index 0de5b38ca8..8c0df433ba 100644 --- a/src/core/expr/head_func_cut.cc +++ b/src/core/expr/head_func_cut.cc @@ -27,8 +27,6 @@ #include "expr/head_func.h" #include "frame/py_frame.h" #include "parallel/api.h" - - namespace dt { namespace expr { diff --git a/src/core/expr/head_func_qcut.cc b/src/core/expr/head_func_qcut.cc deleted file mode 100644 index 3f1fef7f03..0000000000 --- a/src/core/expr/head_func_qcut.cc +++ /dev/null @@ -1,193 +0,0 @@ -//------------------------------------------------------------------------------ -// Copyright 2020 H2O.ai -// -// Permission is hereby granted, free of charge, to any person obtaining a -// copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -// IN THE SOFTWARE. 
-//------------------------------------------------------------------------------ -#include "_dt.h" -#include "column/latent.h" -#include "column/sentinel_fw.h" -#include "column/qcut.h" -#include "datatablemodule.h" -#include "expr/eval_context.h" -#include "expr/head_func.h" -#include "frame/py_frame.h" -#include "parallel/api.h" - -namespace dt { -namespace expr { - - - -//------------------------------------------------------------------------------ -// Head_Func_Qcut -//------------------------------------------------------------------------------ - -Head_Func_Qcut::Head_Func_Qcut(py::oobj py_nquantiles) - : py_nquantiles_(py_nquantiles) {} - - -ptrHead Head_Func_Qcut::make(Op, const py::otuple& params) { - xassert(params.size() == 1); - return ptrHead(new Head_Func_Qcut(params[0])); -} - - - -Workframe Head_Func_Qcut::evaluate_n( - const vecExpr& args, EvalContext& ctx) const -{ - - if (ctx.has_groupby()) { - throw NotImplError() << "qcut() cannot be used in a groupby context"; - } - - int32_t nquantiles_default = 10; - Workframe wf = args[0]->evaluate_n(ctx); - const size_t ncols = wf.ncols(); - - int32vec nquantiles(ncols); - bool defined_nquantiles = !py_nquantiles_.is_none(); - bool nquantiles_list_or_tuple = py_nquantiles_.is_list_or_tuple(); - - if (nquantiles_list_or_tuple) { - py::oiter py_nquantiles = py_nquantiles_.to_oiter(); - if (py_nquantiles.size() != ncols) { - throw ValueError() << "When `nquantiles` is a list or a tuple, its " - << "length must be the same as the number of input columns, i.e. 
`" - << ncols << "`, instead got: `" << py_nquantiles.size() << "`"; - - } - - size_t i = 0; - for (auto py_nquantile : py_nquantiles) { - int32_t nquantile = py_nquantile.to_int32_strict(); - if (nquantile <= 0) { - throw ValueError() << "All elements in `nquantiles` must be positive, " - << "got `nquantiles[" << i << "`]: `" << nquantile << "`"; - } - - nquantiles[i++] = nquantile; - } - xassert(i == ncols); - - } else { - if (defined_nquantiles) { - nquantiles_default = py_nquantiles_.to_int32_strict(); - if (nquantiles_default <= 0) { - throw ValueError() << "Number of quantiles must be positive, " - "instead got: `" << nquantiles_default << "`"; - } - } - - for (size_t i = 0; i < ncols; ++i) { - nquantiles[i] = nquantiles_default; - } - } - - // Qcut workframe in-place - for (size_t i = 0; i < ncols; ++i) { - Column coli = wf.retrieve_column(i); - - if (coli.ltype() == dt::LType::STRING || coli.ltype() == dt::LType::OBJECT) - { - throw TypeError() << "`qcut()` cannot be applied to " - << "string or object columns, instead column `" << i - << "` has an stype: `" << coli.stype() << "`"; - } - - coli = Column(new Latent_ColumnImpl(new Qcut_ColumnImpl( - std::move(coli), nquantiles[i] - ))); - wf.replace_column(i, std::move(coli)); - } - - return wf; -} - -}} // dt::expr - - -namespace py { - -static oobj make_pyexpr(dt::expr::Op opcode, otuple targs, otuple tparams) { - size_t op = static_cast(opcode); - return robj(Expr_Type).call({ oint(op), targs, tparams }); -} - - -static const char* doc_qcut = -R"(qcut(cols, nquantiles=10) --- - -Bin all the columns from `cols` into equal-population -discrete intervals, i.e. quantiles. In reality, for some data -these quantiles may not have exactly the same population. - -Parameters ----------- -cols: FExproid - Input data for quantile binning. -nquantiles: int | list of ints | tuple of ints - When a single number is specified, this number of quantiles - will be used to bin each column in `cols`. 
- When a list or a tuple is provided, each column will be binned - by using its own number of quantiles. In the latter case, - the list/tuple length must be equal to the number of columns - in `cols`. - -return: Expr - f-expression that converts input columns into the - columns filled with the respective quantile ids. -)"; - -static PKArgs args_qcut( - 1, 0, 1, false, false, - { - "cols", "nquantiles" - }, - "qcut", doc_qcut -); - - -/** - * Python-facing function for `qcut()`. - */ -static oobj pyfn_qcut(const PKArgs& args) -{ - if (args[0].is_none_or_undefined()) { - throw TypeError() << "Function `qcut()` requires one positional argument, " - << "but none were given"; - } - - oobj arg0 = args[0].to_oobj(); - oobj arg1 = args[1].is_none_or_undefined()? py::None() : args[1].to_oobj(); - - return make_pyexpr(dt::expr::Op::QCUT, - otuple{ arg0 }, - otuple{ arg1 }); -} - - -void DatatableModule::init_methods_qcut() { - ADD_FN(&py::pyfn_qcut, py::args_qcut); -} - -} // namespace py - - diff --git a/src/core/expr/op.h b/src/core/expr/op.h index 426d75fe47..a2039b83f3 100644 --- a/src/core/expr/op.h +++ b/src/core/expr/op.h @@ -53,7 +53,6 @@ enum class Op : size_t { SETMINUS = 4, SHIFTFN = 5, // head_func_shift.cc CUT = 7, - QCUT = 8, // Unary UPLUS = UNOP_FIRST, // funary/basic.cc diff --git a/src/datatable/expr/expr.py b/src/datatable/expr/expr.py index 6fa68e18d6..7937f0a75d 100644 --- a/src/datatable/expr/expr.py +++ b/src/datatable/expr/expr.py @@ -39,7 +39,6 @@ class OpCodes(enum.Enum): SETMINUS = 4 SHIFTFN = 5 CUT = 7 - QCUT = 8 # Unary UPLUS = 101 diff --git a/tests/expr/test-qcut.py b/tests/expr/test-qcut.py index e59ca733f4..e2d8666960 100644 --- a/tests/expr/test-qcut.py +++ b/tests/expr/test-qcut.py @@ -20,17 +20,17 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. 
+#------------------------------------------------------------------------------- import math import pytest import random -from datatable import dt, stype, f, qcut -from datatable.expr import Expr +from datatable import dt, stype, f, qcut, FExpr from datatable.internal import frame_integrity_check from tests import assert_equals #------------------------------------------------------------------------------- -# qcut() +# Errors #------------------------------------------------------------------------------- def test_qcut_error_noargs(): @@ -100,10 +100,25 @@ def test_qcut_error_groupby(): DT[:, qcut(f[0]), f[0]] + + +#------------------------------------------------------------------------------- +# Normal +#------------------------------------------------------------------------------- + +def test_qcut_str(): + assert str(qcut(f.A)) == "FExpr" + assert str(qcut(f.A) + 1) == "FExpr" + assert str(qcut(f.A + f.B)) == "FExpr" + assert str(qcut(f.B, nquantiles=3)) == "FExpr" + assert str(qcut(f[:2], nquantiles=[3, 4])) == \ + "FExpr" + + def test_qcut_empty_frame(): DT = dt.Frame() expr_qcut = qcut(DT) - assert isinstance(expr_qcut, Expr) + assert isinstance(expr_qcut, FExpr) assert_equals(DT[:, f[:]], DT) @@ -111,7 +126,7 @@ def test_qcut_zerorow_frame(): DT = dt.Frame([[], []]) DT_qcut = DT[:, qcut(f[:])] expr_qcut = qcut(DT) - assert isinstance(expr_qcut, Expr) + assert isinstance(expr_qcut, FExpr) assert_equals(DT_qcut, dt.Frame([[] / dt.int32, [] / dt.int32])) @@ -119,7 +134,7 @@ def test_qcut_trivial(): DT = dt.Frame({"trivial": range(10)}) DT_qcut = DT[:, qcut(f[:])] expr_qcut = qcut(DT) - assert isinstance(expr_qcut, Expr) + assert isinstance(expr_qcut, FExpr) assert_equals(DT, DT_qcut)