diff --git a/CHANGELOG.md b/CHANGELOG.md index de73bea3b3..0116873840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,18 +12,42 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ### [Unreleased](https://github.com/h2oai/datatable/compare/HEAD...v0.7.0) #### Added + - methods `Frame.to_tuples()` and `Frame.to_dict()` (#1400). -- methods `Frame.head(n)` and `Frame.tail(n)` (#1307). -- `Frame` objects are now pickle-able (#1442). + +- methods `Frame.head(n)` and `Frame.tail(n)` to return the first/last + `n` rows respectively (#1307). + +- `Frame` objects can now be pickled (#1442). + +- function `dt.repeat(frame, n)` that creates a Frame by row-binding `n` + copies of the `frame`. + #### Fixed + - crash when an int-column row selector is applied to a Frame which already had another row filter applied (#1437). + - Frame.copy() now retains the key (#1443). + - rendering of "view" Frames in Jupyter notebook (#1448). + - installation from source distribution (#1451). +#### Changed + +- Setting `frame.nrows` now always pads the Frame with NAs, even if the Frame + had only 1 row originally. Use `dt.repeat()` in order to expand the Frame + by copying its values. + +- Improved the performance of setting `frame.nrows`. Now if the frame has + multiple columns, a view will be created. + + + + ### [v0.7.0](https://github.com/h2oai/datatable/compare/0.7.0...v0.6.0) — 2018-11-16 #### Added - Frame can now be created from a list/dict of numpy arrays. 
diff --git a/c/column.cc b/c/column.cc index beeb28e928..be900aad82 100644 --- a/c/column.cc +++ b/c/column.cc @@ -210,6 +210,12 @@ Column* Column::rbind(std::vector& columns) } +RowIndex Column::remove_rowindex() { + RowIndex res(std::move(ri)); + xassert(!ri); + return res; +} + void Column::replace_rowindex(const RowIndex& newri) { ri = newri; nrows = ri.size(); diff --git a/c/column.h b/c/column.h index fdce4296ff..ff60fd3f1a 100644 --- a/c/column.h +++ b/c/column.h @@ -1,9 +1,23 @@ //------------------------------------------------------------------------------ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. +// Copyright 2018 H2O.ai // -// © H2O.ai 2018 +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
//------------------------------------------------------------------------------ #ifndef dt_COLUMN_h #define dt_COLUMN_h @@ -105,6 +119,8 @@ class Column virtual size_t elemsize() const = 0; virtual bool is_fixedwidth() const = 0; + const RowIndex& rowindex() const { return ri; } + RowIndex remove_rowindex(); void replace_rowindex(const RowIndex& newri); MemoryRange data_buf() const { return mbuf; } @@ -113,7 +129,6 @@ class Column PyObject* mbuf_repr() const; size_t alloc_size() const; - const RowIndex& rowindex() const { return ri; } virtual size_t data_nrows() const = 0; size_t memory_footprint() const; diff --git a/c/datatable.cc b/c/datatable.cc index b0171a1f6b..baf6a0dd8d 100644 --- a/c/datatable.cc +++ b/c/datatable.cc @@ -128,31 +128,62 @@ DataTable* DataTable::delete_columns(std::vector& cols_to_remove) } +// Split all columns into groups, by their `RowIndex`es +static std::pair, std::vector>> +_split_columns_by_rowindices(const DataTable* dt) +{ + std::vector rowindices; + std::vector> colindices; + for (size_t i = 0; i < dt->ncols; ++i) { + RowIndex r = dt->columns[i]->rowindex(); + size_t j = 0; + for (; j < rowindices.size(); ++j) { + if (rowindices[j] == r) break; + } + if (j == rowindices.size()) { + rowindices.push_back(std::move(r)); + colindices.resize(j + 1); + } + colindices[j].push_back(i); + } + return std::make_pair(std::move(rowindices), std::move(colindices)); +} + void DataTable::resize_rows(size_t new_nrows) { - if (rowindex) { - if (new_nrows < nrows) { - rowindex.shrink(new_nrows, ncols); - replace_rowindex(rowindex); - return; + if (new_nrows == nrows) return; + + // Split all columns into groups, by their `RowIndex`es + std::vector rowindices; + std::vector> colindices; + for (size_t i = 0; i < ncols; ++i) { + RowIndex r = columns[i]->remove_rowindex(); + size_t j = 0; + for (; j < rowindices.size(); ++j) { + if (rowindices[j] == r) break; } - if (new_nrows > nrows) { - reify(); - // fall-through + if (j == rowindices.size()) { + 
rowindices.push_back(std::move(r)); + colindices.resize(j + 1); } + colindices[j].push_back(i); } - if (new_nrows != nrows) { - for (size_t i = 0; i < ncols; ++i) { - columns[i]->resize_and_fill(new_nrows); + + for (size_t j = 0; j < rowindices.size(); ++j) { + RowIndex& r = rowindices[j]; + if (!r) r = RowIndex(size_t(0), nrows, size_t(1)); + r.resize(new_nrows); + for (size_t i : colindices[j]) { + columns[i]->replace_rowindex(r); } - nrows = new_nrows; } + nrows = new_nrows; } void DataTable::replace_rowindex(const RowIndex& newri) { - if (newri.isabsent() && rowindex.isabsent()) return; + if (!newri && !rowindex) return; rowindex = newri; nrows = rowindex.size(); for (size_t i = 0; i < ncols; ++i) { @@ -161,6 +192,25 @@ void DataTable::replace_rowindex(const RowIndex& newri) { } +/** + * Equivalent of ``dt[ri, :]``. + */ +DataTable* apply_rowindex(const DataTable* dt, const RowIndex& ri) { + auto rc = _split_columns_by_rowindices(dt); + auto& rowindices = rc.first; + auto& colindices = rc.second; + + colvec newcols(dt->ncols); + for (size_t j = 0; j < rowindices.size(); ++j) { + RowIndex newri = ri * rowindices[j]; + for (size_t i : colindices[j]) { + newcols[i] = dt->columns[i]->shallowcopy(newri); + } + } + return new DataTable(std::move(newcols), dt); +} + + void DataTable::replace_groupby(const Groupby& newgb) { int32_t last_offset = newgb.offsets_r()[newgb.ngroups()]; if (static_cast(last_offset) != nrows) { @@ -178,7 +228,7 @@ void DataTable::replace_groupby(const Groupby& newgb) { * Do nothing if the DataTable is not a view. 
*/ void DataTable::reify() { - if (rowindex.isabsent()) return; + // if (rowindex.isabsent()) return; for (size_t i = 0; i < ncols; ++i) { columns[i]->reify(); } diff --git a/c/datatable.h b/c/datatable.h index f94621db81..45777dc012 100644 --- a/c/datatable.h +++ b/c/datatable.h @@ -159,6 +159,8 @@ DataTable* open_jay_from_file(const std::string& path); DataTable* open_jay_from_bytes(const char* ptr, size_t len); DataTable* open_jay_from_mbuf(const MemoryRange&); +DataTable* apply_rowindex(const DataTable*, const RowIndex& ri); + //============================================================================== diff --git a/c/datatablemodule.cc b/c/datatablemodule.cc index d572235ea3..76736bc313 100644 --- a/c/datatablemodule.cc +++ b/c/datatablemodule.cc @@ -199,11 +199,13 @@ void DatatableModule::init_methods() { add(METHODv(expr_unaryop)); add(METHOD0(is_debug_mode)); add(METHOD0(has_omp_support)); + init_methods_aggregate(); - init_methods_str(); + init_methods_join(); init_methods_options(); + init_methods_repeat(); init_methods_sets(); - init_methods_join(); + init_methods_str(); #ifdef DTTEST init_tests(); #endif diff --git a/c/datatablemodule.h b/c/datatablemodule.h index 8547314c8c..a739ba6c3f 100644 --- a/c/datatablemodule.h +++ b/c/datatablemodule.h @@ -29,10 +29,11 @@ class DatatableModule : public py::ExtModule { void init_methods(); void init_methods_aggregate();// extra/aggergate.cc - void init_methods_str(); // str/py_str.cc + void init_methods_join(); // frame/join.cc void init_methods_options(); // options.cc + void init_methods_repeat(); // frame/repeat.cc void init_methods_sets(); // set_funcs.cc - void init_methods_join(); // frame/join.cc + void init_methods_str(); // str/py_str.cc #ifdef DTTEST void init_tests(); diff --git a/c/frame/py_frame.h b/c/frame/py_frame.h index 6b67e93014..950244b758 100644 --- a/c/frame/py_frame.h +++ b/c/frame/py_frame.h @@ -105,6 +105,7 @@ class Frame : public PyObject { oobj to_tuples(const NoArgs&); oobj 
head(const PKArgs&); oobj tail(const PKArgs&); + void repeat(const PKArgs&); private: static bool internal_construction; diff --git a/c/frame/repeat.cc b/c/frame/repeat.cc new file mode 100644 index 0000000000..55604549ca --- /dev/null +++ b/c/frame/repeat.cc @@ -0,0 +1,135 @@ +//------------------------------------------------------------------------------ +// Copyright 2018 H2O.ai +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. 
+//------------------------------------------------------------------------------ +#include "datatablemodule.h" +#include "frame/py_frame.h" +#include "python/args.h" +#include "rowindex.h" + + +//------------------------------------------------------------------------------ +// Static helpers +//------------------------------------------------------------------------------ + +/** + * Create and return a RowIndex with elements + * + * list(range(nrows)) * nreps + * + * The returned RowIndex will be either ARR32 or ARR64 depending on how many + * elements are in it. + */ +template +static RowIndex _make_repeat_rowindex(size_t nrows, size_t nreps) { + size_t new_nrows = nrows * nreps; + dt::array indices(new_nrows); + T* ptr = indices.data(); + for (size_t i = 0; i < nrows; ++i) { + ptr[i] = static_cast(i); + } + size_t nrows_filled = nrows; + while (nrows_filled < new_nrows) { + size_t nrows_copy = std::min(new_nrows - nrows_filled, nrows_filled); + std::memcpy(ptr + nrows_filled, ptr, nrows_copy * sizeof(T)); + nrows_filled += nrows_copy; + xassert(nrows_filled % nrows == 0); + } + xassert(nrows_filled == new_nrows); + return RowIndex(std::move(indices), 0, nrows - 1); +} + + +static Column* repeat_column(const Column* col, size_t nreps) { + size_t elemsize = col->elemsize(); + size_t nrows = col->nrows; + size_t new_nrows = nrows * nreps; + xassert(!col->rowindex()); + + Column* newcol = Column::new_data_column(col->stype(), new_nrows); + const void* olddata = col->data(); + void* newdata = newcol->data_w(); + + std::memcpy(newdata, olddata, nrows * elemsize); + size_t nrows_filled = nrows; + while (nrows_filled < new_nrows) { + size_t nrows_copy = std::min(new_nrows - nrows_filled, nrows_filled); + std::memcpy(static_cast(newdata) + nrows_filled * elemsize, + newdata, + nrows_copy * elemsize); + nrows_filled += nrows_copy; + xassert(nrows_filled % nrows == 0); + } + xassert(nrows_filled == new_nrows); + + return newcol; +} + + + 
+//------------------------------------------------------------------------------ +// datatable.repeat() +//------------------------------------------------------------------------------ + +static py::PKArgs fn_repeat( + 2, 0, 0, false, false, {"frame", "n"}, + "repeat", +R"(repeat(frame, n) +-- + +Concatenate `n` copies of the `frame` by rows and return the result. + +This is equivalent to ``dt.rbind([self] * n)``. +)", + + +[](const py::PKArgs& args) -> py::oobj { + DataTable* dt = args[0].to_frame(); + size_t n = args[1].to_size_t(); + + // Empty Frame: repeating is a noop + if (dt->ncols == 0 || dt->nrows == 0) { + return py::oobj::from_new_reference(py::Frame::from_datatable(dt->copy())); + } + + // Single-colum fixed-width Frame: + Column* col0 = dt->columns[0]; + if (dt->ncols == 1 && + !info(col0->stype()).is_varwidth() && + !col0->rowindex()) + { + Column* newcol = repeat_column(col0, n); + DataTable* newdt = new DataTable({newcol}, dt); // copy names from dt + return py::oobj::from_new_reference(py::Frame::from_datatable(newdt)); + } + + RowIndex ri = (dt->nrows * n < std::numeric_limits::max()) + ? 
_make_repeat_rowindex(dt->nrows, n) + : _make_repeat_rowindex(dt->nrows, n); + + DataTable* newdt = apply_rowindex(dt, ri); + return py::oobj::from_new_reference(py::Frame::from_datatable(newdt)); +}); + + + +void DatatableModule::init_methods_repeat() { + ADDFN(fn_repeat); +} diff --git a/c/rowindex.cc b/c/rowindex.cc index 12a6482a07..4f2f3da696 100644 --- a/c/rowindex.cc +++ b/c/rowindex.cc @@ -87,6 +87,16 @@ RowIndex::RowIndex(arr64_t&& arr, bool sorted) { impl->acquire(); } +RowIndex::RowIndex(arr32_t&& arr, size_t min, size_t max) { + impl = new ArrayRowIndexImpl(std::move(arr), min, max); + impl->acquire(); +} + +RowIndex::RowIndex(arr64_t&& arr, size_t min, size_t max) { + impl = new ArrayRowIndexImpl(std::move(arr), min, max); + impl->acquire(); +} + RowIndex::RowIndex(filterfn32* f, size_t n, bool sorted) { impl = new ArrayRowIndexImpl(f, n, sorted); impl->acquire(); @@ -174,16 +184,31 @@ void RowIndex::clear() { } -void RowIndex::shrink(size_t nrows, size_t ncols) { - xassert(impl && static_cast(impl->refcount) >= ncols + 1); - if (static_cast(impl->refcount) == ncols + 1) { - impl->shrink(nrows); - } else { - auto newimpl = impl->shrunk(nrows); +// void RowIndex::shrink(size_t nrows, size_t ncols) { +// xassert(impl && static_cast(impl->refcount) >= ncols + 1); +// if (static_cast(impl->refcount) == ncols + 1) { +// impl->shrink(nrows); +// } else { +// auto newimpl = impl->shrunk(nrows); +// xassert(newimpl->refcount == 0); +// impl->release(); +// impl = newimpl; +// impl->acquire(); +// } +// } + + +void RowIndex::resize(size_t nrows) { + xassert(impl); + if (impl->refcount > 1 || (impl->type == RowIndexType::SLICE && + impl->length < nrows)) { + auto newimpl = impl->resized(nrows); xassert(newimpl->refcount == 0); impl->release(); impl = newimpl; impl->acquire(); + } else { + impl->resize(nrows); } } diff --git a/c/rowindex.h b/c/rowindex.h index 7968abd22d..cfef463f94 100644 --- a/c/rowindex.h +++ b/c/rowindex.h @@ -65,6 +65,8 @@ class RowIndex { 
*/ RowIndex(arr32_t&& arr, bool sorted = false); RowIndex(arr64_t&& arr, bool sorted = false); + RowIndex(arr32_t&& arr, size_t min, size_t max); + RowIndex(arr64_t&& arr, size_t min, size_t max); /** * Construct a "slice" RowIndex from triple `(start, count, step)`. @@ -167,6 +169,20 @@ class RowIndex { */ void shrink(size_t nrows, size_t ncols); + /** + * Modifies the RowIndex so that its new size becomes `nrows`. This will + * either throw out the existing elements at the tail, or append "NA" + * indices. + * + * This method either modifies the existing `impl` in-place if its refcount + * is 1, or replaces it with a new "modified" impl. Additionally, a "slice" + * impl will be replaced with an "array" impl if `nrows` is greater than + * the current size of the RowIndex. + * + * This method should not be invoked on an "empty" RowIndex. + */ + void resize(size_t nrows); + size_t memory_footprint() const; /** diff --git a/c/rowindex_array.cc b/c/rowindex_array.cc index c3bd4127e6..f50e0b8abd 100644 --- a/c/rowindex_array.cc +++ b/c/rowindex_array.cc @@ -43,7 +43,7 @@ ArrayRowIndexImpl::ArrayRowIndexImpl(arr32_t&& array, bool sorted) { xassert(length <= std::numeric_limits::max()); owned = array.data_owned(); data = array.release(); - set_min_max(); + set_min_max(); } @@ -53,7 +53,32 @@ ArrayRowIndexImpl::ArrayRowIndexImpl(arr64_t&& array, bool sorted) { length = array.size(); owned = array.data_owned(); data = array.release(); - set_min_max(); + set_min_max(); +} + + +ArrayRowIndexImpl::ArrayRowIndexImpl(arr32_t&& array, size_t _min, size_t _max) +{ + type = RowIndexType::ARR32; + ascending = false; + length = array.size(); + xassert(length <= std::numeric_limits::max()); + owned = array.data_owned(); + data = array.release(); + min = _min; + max = _max; +} + + +ArrayRowIndexImpl::ArrayRowIndexImpl(arr64_t&& array, size_t _min, size_t _max) +{ + type = RowIndexType::ARR64; + ascending = false; + length = array.size(); + owned = array.data_owned(); + data = 
array.release(); + min = _min; + max = _max; } @@ -63,7 +88,7 @@ ArrayRowIndexImpl::ArrayRowIndexImpl( { size_t n = starts.size(); xassert(n == counts.size() && n == steps.size()); - ascending = false; + ascending = true; data = nullptr; owned = true; @@ -76,12 +101,16 @@ ArrayRowIndexImpl::ArrayRowIndexImpl( size_t start = static_cast(starts[i]); size_t step = static_cast(steps[i]); size_t len = static_cast(counts[i]); - SliceRowIndexImpl tmp(start, len, step); // check triple's validity - if (tmp.min < min) min = tmp.min; - if (tmp.max > max) max = tmp.max; + if (start == RowIndex::NA && step == 0 && len <= RowIndex::MAX) {} + else { + SliceRowIndexImpl tmp(start, len, step); // check triple's validity + if (!tmp.ascending || tmp.min < max) ascending = false; + if (tmp.min < min) min = tmp.min; + if (tmp.max > max) max = tmp.max; + } length += len; } - if (max == 0) { + if (min > max) { min = max = RowIndex::NA; } xassert(min >= 0 && min <= max); @@ -223,7 +252,7 @@ ArrayRowIndexImpl::ArrayRowIndexImpl(filterfn32* ff, size_t n, bool sorted) { // actual number of elements written. 
length = out_length; _resize_data(); - set_min_max(); + set_min_max(); } @@ -269,7 +298,7 @@ ArrayRowIndexImpl::ArrayRowIndexImpl(filterfn64* ff, size_t n, bool sorted) { } length = out_length; _resize_data(); - set_min_max(); + set_min_max(); } @@ -281,9 +310,16 @@ ArrayRowIndexImpl::~ArrayRowIndexImpl() { } +void ArrayRowIndexImpl::set_min_max() { + if (type == RowIndexType::ARR32) { + _set_min_max(); + } else { + _set_min_max(); + } +} template -void ArrayRowIndexImpl::set_min_max() { +void ArrayRowIndexImpl::_set_min_max() { const T* idata = static_cast(data); if (length == 1) ascending = true; if (length == 0) { @@ -345,8 +381,6 @@ void ArrayRowIndexImpl::init_from_boolean_column(const BoolColumn* col) { if (tdata[j] == 1) ind32[k++] = static_cast(j); }); - ascending = true; - set_min_max(); } else { type = RowIndexType::ARR64; _resize_data(); @@ -357,9 +391,9 @@ void ArrayRowIndexImpl::init_from_boolean_column(const BoolColumn* col) { if (tdata[j] == 1) ind64[k++] = static_cast(j); }); - ascending = true; - set_min_max(); } + ascending = true; + set_min_max(); } @@ -547,11 +581,7 @@ void ArrayRowIndexImpl::shrink(size_t n) { xassert(n < length); length = n; _resize_data(); - if (type == RowIndexType::ARR32) { - set_min_max(); - } else { - set_min_max(); - } + set_min_max(); } RowIndexImpl* ArrayRowIndexImpl::shrunk(size_t n) { @@ -568,6 +598,36 @@ RowIndexImpl* ArrayRowIndexImpl::shrunk(size_t n) { } +void ArrayRowIndexImpl::resize(size_t n) { + size_t oldlen = length; + length = n; + _resize_data(); + if (n <= oldlen) { + set_min_max(); + } else { + size_t elemsize = (type == RowIndexType::ARR32)? 
4 : 8; + std::memset(static_cast(data) + oldlen * elemsize, + -1, elemsize * (n - oldlen)); + } +} + +RowIndexImpl* ArrayRowIndexImpl::resized(size_t n) { + size_t ncopy = std::min(n, length); + if (type == RowIndexType::ARR32) { + arr32_t new_ind32(n); + std::memcpy(new_ind32.data(), data, ncopy * 4); + std::memset(new_ind32.data() + ncopy, -1, (n - ncopy) * 4); + return new ArrayRowIndexImpl(std::move(new_ind32), ascending); + } else { + arr64_t new_ind64(n); + std::memcpy(new_ind64.data(), data, ncopy * 8); // only `ncopy` source elements exist; the tail is NA-filled below + std::memset(new_ind64.data() + ncopy, -1, (n - ncopy) * 8); + return new ArrayRowIndexImpl(std::move(new_ind64), ascending); + } +} + + + size_t ArrayRowIndexImpl::nth(size_t i) const { if (type == RowIndexType::ARR32) return static_cast(static_cast(data)[i]); @@ -634,10 +694,17 @@ void ArrayRowIndexImpl::_resize_data() { } size_t elemsize = type == RowIndexType::ARR32? 4 : 8; size_t allocsize = length * elemsize; - void* ptr = std::realloc(data, allocsize); - if (!ptr) { - throw MemoryError() << "Cannot allocate " << allocsize << " bytes " - "for a RowIndex object"; + if (allocsize) { + void* ptr = std::realloc(data, allocsize); + if (!ptr) { + throw MemoryError() << "Cannot allocate " << allocsize << " bytes " + "for a RowIndex object"; + } + data = ptr; + } else { + // If allocsize==0, the behavior of std::realloc is implementation-defined + // See https://en.cppreference.com/w/cpp/memory/c/realloc + std::free(data); + data = nullptr; } - data = ptr; } diff --git a/c/rowindex_impl.h b/c/rowindex_impl.h index 4c54b0d838..1e9f6d997e 100644 --- a/c/rowindex_impl.h +++ b/c/rowindex_impl.h @@ -74,8 +74,12 @@ class RowIndexImpl { virtual size_t nth(size_t i) const = 0; virtual RowIndexImpl* uplift_from(const RowIndexImpl*) = 0; virtual RowIndexImpl* inverse(size_t nrows) const = 0; + virtual void shrink(size_t n) = 0; virtual RowIndexImpl* shrunk(size_t n) = 0; + virtual void resize(size_t n) = 0; + virtual RowIndexImpl* resized(size_t n) = 0; + virtual size_t 
memory_footprint() const = 0; virtual void verify_integrity() const; }; @@ -97,8 +101,12 @@ class SliceRowIndexImpl : public RowIndexImpl { size_t nth(size_t i) const override; RowIndexImpl* uplift_from(const RowIndexImpl*) override; RowIndexImpl* inverse(size_t nrows) const override; + void shrink(size_t n) override; RowIndexImpl* shrunk(size_t n) override; + void resize(size_t n) override; + RowIndexImpl* resized(size_t n) override; + size_t memory_footprint() const override; void verify_integrity() const override; @@ -128,6 +136,8 @@ class ArrayRowIndexImpl : public RowIndexImpl { public: ArrayRowIndexImpl(arr32_t&& indices, bool sorted); ArrayRowIndexImpl(arr64_t&& indices, bool sorted); + ArrayRowIndexImpl(arr32_t&& indices, size_t min, size_t max); + ArrayRowIndexImpl(arr64_t&& indices, size_t min, size_t max); ArrayRowIndexImpl(const arr64_t& starts, const arr64_t& counts, const arr64_t& steps); ArrayRowIndexImpl(filterfn32* f, size_t n, bool sorted); @@ -141,18 +151,23 @@ class ArrayRowIndexImpl : public RowIndexImpl { size_t nth(size_t i) const override; RowIndexImpl* uplift_from(const RowIndexImpl*) override; RowIndexImpl* inverse(size_t nrows) const override; + void shrink(size_t n) override; RowIndexImpl* shrunk(size_t n) override; + void resize(size_t n) override; + RowIndexImpl* resized(size_t n) override; + size_t memory_footprint() const override; void verify_integrity() const override; private: void _resize_data(); + void set_min_max(); // Helper function that computes and sets proper `min` / `max` fields for // this RowIndex. The `sorted` flag is a hint whether the indices are // sorted (if they are, computing min/max is much simpler). 
- template void set_min_max(); + template void _set_min_max(); // Helpers for `ArrayRowIndexImpl(Column*)` void init_from_boolean_column(const BoolColumn* col); diff --git a/c/rowindex_slice.cc b/c/rowindex_slice.cc index 33548a070b..e8eae7e312 100644 --- a/c/rowindex_slice.cc +++ b/c/rowindex_slice.cc @@ -212,6 +212,30 @@ RowIndexImpl* SliceRowIndexImpl::shrunk(size_t n) { } +void SliceRowIndexImpl::resize(size_t n) { + xassert(n <= length); + length = n; + min = start; + max = start + step*(n - 1); + if (!ascending) std::swap(min, max); +} + +RowIndexImpl* SliceRowIndexImpl::resized(size_t n) { + if (n <= length) { + return new SliceRowIndexImpl(start, n, step); + } else { + arr64_t starts(2), counts(2), steps(2); + starts[0] = static_cast(start); + counts[0] = static_cast(length); + steps[0] = static_cast(step); + starts[1] = -1; + counts[1] = static_cast(n - length); + steps[1] = 0; + return new ArrayRowIndexImpl(starts, counts, steps); + } +} + + size_t SliceRowIndexImpl::memory_footprint() const { return sizeof(*this); diff --git a/datatable/__init__.py b/datatable/__init__.py index ea7ff54cbb..33d57ee659 100644 --- a/datatable/__init__.py +++ b/datatable/__init__.py @@ -10,7 +10,10 @@ from .expr import mean, min, max, sd, isna, sum, count, first, abs, exp from .fread import fread, GenericReader, FreadWarning from .graph import f, g, join, by -from .lib import core as _core +from .lib._datatable import ( + unique, union, intersect, setdiff, symdiff, + repeat +) from .nff import save, open from .options import options from .str import split_into_nhot @@ -32,7 +35,7 @@ "DataTable", "options", "bool8", "int8", "int16", "int32", "int64", "float32", "float64", "str32", "str64", "obj64", - "cbind", "rbind", + "cbind", "rbind", "repeat", "unique", "union", "intersect", "setdiff", "symdiff", "split_into_nhot") @@ -47,10 +50,6 @@ str64 = stype.str64 obj64 = stype.obj64 DataTable = Frame -unique = _core.unique -union = _core.union -intersect = _core.intersect -setdiff 
= _core.setdiff -symdiff = _core.symdiff + Frame.__module__ = "datatable" diff --git a/datatable/graph/__init__.py b/datatable/graph/__init__.py index 9025a1e8a7..2ac129a1d6 100644 --- a/datatable/graph/__init__.py +++ b/datatable/graph/__init__.py @@ -87,7 +87,7 @@ def make_datatable(dt, rows, select, groupby=None, join=None, sort=None, if isinstance(replacement, (int, float, str, type(None))): replacement = datatable.Frame([replacement]) if allrows: - replacement.nrows = dt.nrows + replacement = datatable.repeat(replacement, dt.nrows) elif isinstance(replacement, datatable.Frame): pass elif isinstance(replacement, BaseExpr): diff --git a/tests/munging/test_dt_combo.py b/tests/munging/test_dt_combo.py index 3d5225dce6..24c5a46fdc 100644 --- a/tests/munging/test_dt_combo.py +++ b/tests/munging/test_dt_combo.py @@ -31,7 +31,8 @@ def test_columns_rows(): def test_issue1225(): f0 = dt.Frame(A=[1, 2, 3], B=[5, 6, 8]) f1 = f0[::-1, :][:, [dt.float64(f.A), f.B]] - assert f1.internal.isview + # TODO: restore this check after #1188 + # assert f1.internal.isview f1.materialize() assert f1.stypes == (stype.float64, stype.int8) assert f1.topython() == [[3.0, 2.0, 1.0], [8, 6, 5]] diff --git a/tests/test_dt.py b/tests/test_dt.py index e67c5ce88c..9cf3c4d1f1 100644 --- a/tests/test_dt.py +++ b/tests/test_dt.py @@ -526,7 +526,7 @@ def test_resize_rows_api(): f0.nrows = 3 f0.nrows = 5 f0.internal.check() - assert f0.topython() == [[20, 20, 20, None, None]] + assert f0.topython() == [[20, None, None, None, None]] def test_resize_rows0(): @@ -550,7 +550,7 @@ def test_resize_rows0(): f0.internal.check() assert f0.shape == (20, 1) assert f0.stypes == (dt.int32,) - assert f0.topython() == [[0] * 20] + assert f0.topython() == [[0] + [None] * 19] def test_resize_rows1(): @@ -564,12 +564,12 @@ def test_resize_rows1(): f0.internal.check() assert f0.shape == (7, 5) assert f0.stypes == stypes - assert f0.topython() == [src * 7 for src in srcs] + assert f0.topython() == [src + [None] * 6 for 
src in srcs] f0.nrows = 20 f0.internal.check() assert f0.shape == (20, 5) assert f0.stypes == stypes - assert f0.topython() == [src * 7 + [None] * 13 for src in srcs] + assert f0.topython() == [src + [None] * 19 for src in srcs] f0.nrows = 0 f0.internal.check() assert f0.shape == (0, 5) @@ -600,7 +600,7 @@ def test_resize_view_slice(): f1.nrows = 15 f1.internal.check() assert f1.shape == (15, 1) - assert not f1.internal.isview + assert f1.internal.isview assert f1.topython()[0] == list(range(8, 28, 2)) + [None] * 5 @@ -619,7 +619,7 @@ def test_resize_view_array(): f1.nrows = 5 f1.internal.check() assert f1.shape == (5, 1) - assert not f1.internal.isview + assert f1.internal.isview assert f1.topython() == [[1, 1, 2, 3, None]] @@ -641,6 +641,62 @@ def test_resize_bad(): +#------------------------------------------------------------------------------- +# Frame.repeat() +#------------------------------------------------------------------------------- + +def test_dt_repeat(): + f0 = dt.Frame(range(10)) + f1 = dt.repeat(f0, 3) + f1.internal.check() + assert f1.to_list() == [list(range(10)) * 3] + + +def test_dt_repeat2(): + f0 = dt.Frame(["A", "B", "CDE"]) + f1 = dt.repeat(f0, 7) + f1.internal.check() + assert f1.to_list() == [f0.to_list()[0] * 7] + + +def test_dt_repeat_multicol(): + f0 = dt.Frame(A=[None, 1.4, -2.6, 3.9998], + B=["row", "row", "row", "your boat"], + C=[25, -9, 18, 2], + D=[True, None, True, False]) + f1 = dt.repeat(f0, 4) + f1.internal.check() + assert f1.internal.isview + assert f1.names == f0.names + assert f1.stypes == f0.stypes + assert f1.to_list() == [col * 4 for col in f0.to_list()] + + +def test_dt_repeat_view(): + f0 = dt.Frame(A=[1, 3, 4, 5], B=[2, 6, 3, 1]) + f1 = f0[::2, :] + f2 = dt.repeat(f1, 5) + f2.internal.check() + assert f2.to_dict() == {"A": [1, 4] * 5, "B": [2, 3] * 5} + + +def test_dt_repeat_empty_frame(): + f0 = dt.Frame() + f1 = dt.repeat(f0, 5) + f1.internal.check() + assert f1.to_list() == [] + + +def 
test_repeat_empty_frame2(): + f0 = dt.Frame(A=[], B=[], C=[], stypes=[dt.int32, dt.str32, dt.float32]) + f1 = dt.repeat(f0, 1000) + f1.internal.check() + assert f1.names == f0.names + assert f1.stypes == f0.stypes + assert f1.to_list() == f0.to_list() + + + #------------------------------------------------------------------------------- # Renaming columns #-------------------------------------------------------------------------------