Skip to content

Commit

Permalink
Resize rows (#1459)
Browse files Browse the repository at this point in the history
- `DataTable.resize_rows()` now works correctly for a DataTable where its columns have different row-indices;
- This method is now more efficient: for multi-column Frames a view is created, without having to modify the data in each column;
- Python `Frame.nrows` setter now always pads the Frame with NAs, even when it had only 1 row (previously 1-row frames were tiled);
- Method `dt.repeat(frame, n)` added to create a frame by repeating an existing frame multiple times.

This is a WIP for #1188
  • Loading branch information
st-pasha authored Dec 1, 2018
1 parent 885e5e8 commit a623746
Show file tree
Hide file tree
Showing 18 changed files with 510 additions and 71 deletions.
28 changes: 26 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,42 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### [Unreleased](https://github.com/h2oai/datatable/compare/HEAD...v0.7.0)

#### Added

- methods `Frame.to_tuples()` and `Frame.to_dict()` (#1400).
- methods `Frame.head(n)` and `Frame.tail(n)` (#1307).
- `Frame` objects are now pickle-able (#1442).

- methods `Frame.head(n)` and `Frame.tail(n)` to return the first/last
`n` rows correspondingly (#1307).

- `Frame` objects can now be pickled (#1442).

- function `dt.repeat(frame, n)` that creates a Frame by row-binding `n`
copies of the `frame`.


#### Fixed

- crash when an int-column row selector is applied to a Frame which already
had another row filter applied (#1437).

- Frame.copy() now retains the key (#1443).

- rendering of "view" Frames in Jupyter notebook (#1448).

- installation from source distribution (#1451).


#### Changed

- Setting `frame.nrows` now always pads the Frame with NAs, even if the Frame
had only 1 row originally. Use `frame.repeat()` in order to expand the Frame
by copying its values.

- Improved the performance of setting `frame.nrows`. Now if the frame has
multiple columns, a view will be created.




### [v0.7.0](https://github.com/h2oai/datatable/compare/0.7.0...v0.6.0) — 2018-11-16
#### Added
- Frame can now be created from a list/dict of numpy arrays.
Expand Down
6 changes: 6 additions & 0 deletions c/column.cc
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,12 @@ Column* Column::rbind(std::vector<const Column*>& columns)
}


/**
 * Detach this column's RowIndex and hand it over to the caller.
 *
 * The RowIndex is moved out of `ri`, after which `ri` is asserted to test
 * false. No other member of the column is touched here.
 */
RowIndex Column::remove_rowindex() {
  RowIndex detached(std::move(ri));
  xassert(!ri);
  return detached;
}

void Column::replace_rowindex(const RowIndex& newri) {
ri = newri;
nrows = ri.size();
Expand Down
25 changes: 20 additions & 5 deletions c/column.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
//------------------------------------------------------------------------------
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Copyright 2018 H2O.ai
//
// © H2O.ai 2018
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//------------------------------------------------------------------------------
#ifndef dt_COLUMN_h
#define dt_COLUMN_h
Expand Down Expand Up @@ -105,6 +119,8 @@ class Column
virtual size_t elemsize() const = 0;
virtual bool is_fixedwidth() const = 0;

// Read-only access to the RowIndex currently attached to this column (`ri`).
const RowIndex& rowindex() const { return ri; }
// Moves the column's RowIndex out and returns it; `ri` tests false afterwards.
RowIndex remove_rowindex();
// Installs `newri` as the column's RowIndex and sets `nrows` to its size.
void replace_rowindex(const RowIndex& newri);

MemoryRange data_buf() const { return mbuf; }
Expand All @@ -113,7 +129,6 @@ class Column
PyObject* mbuf_repr() const;
size_t alloc_size() const;

const RowIndex& rowindex() const { return ri; }
virtual size_t data_nrows() const = 0;
size_t memory_footprint() const;

Expand Down
78 changes: 64 additions & 14 deletions c/datatable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,31 +128,62 @@ DataTable* DataTable::delete_columns(std::vector<size_t>& cols_to_remove)
}


// Split all columns into groups, by their `RowIndex`es
static std::pair<std::vector<RowIndex>, std::vector<std::vector<size_t>>>
_split_columns_by_rowindices(const DataTable* dt)
{
std::vector<RowIndex> rowindices;
std::vector<std::vector<size_t>> colindices;
for (size_t i = 0; i < dt->ncols; ++i) {
RowIndex r = dt->columns[i]->rowindex();
size_t j = 0;
for (; j < rowindices.size(); ++j) {
if (rowindices[j] == r) break;
}
if (j == rowindices.size()) {
rowindices.push_back(std::move(r));
colindices.resize(j + 1);
}
colindices[j].push_back(i);
}
return std::make_pair(std::move(rowindices), std::move(colindices));
}


// Resize this DataTable to `new_nrows` rows.
//
// NOTE(review): this listing comes from a rendered diff, so lines of the
// previous implementation (shrink / reify / resize_and_fill) appear
// interleaved with the new one and the braces do not balance as shown.
// The RowIndex-based lines read as follows: return early when the size is
// unchanged; detach every column's RowIndex with remove_rowindex() and group
// the columns whose RowIndexes compare equal; for each group, substitute
// RowIndex(0, nrows, 1) when the group's RowIndex tests false, call
// resize(new_nrows) on it and install it on each column of the group via
// replace_rowindex(); finally set `nrows = new_nrows`.
void DataTable::resize_rows(size_t new_nrows) {
if (rowindex) {
if (new_nrows < nrows) {
rowindex.shrink(new_nrows, ncols);
replace_rowindex(rowindex);
return;
if (new_nrows == nrows) return;

// Split all columns into groups, by their `RowIndex`es
std::vector<RowIndex> rowindices;
std::vector<std::vector<size_t>> colindices;
for (size_t i = 0; i < ncols; ++i) {
RowIndex r = columns[i]->remove_rowindex();
size_t j = 0;
for (; j < rowindices.size(); ++j) {
if (rowindices[j] == r) break;
}
if (new_nrows > nrows) {
reify();
// fall-through
if (j == rowindices.size()) {
rowindices.push_back(std::move(r));
colindices.resize(j + 1);
}
colindices[j].push_back(i);
}
if (new_nrows != nrows) {
for (size_t i = 0; i < ncols; ++i) {
columns[i]->resize_and_fill(new_nrows);

for (size_t j = 0; j < rowindices.size(); ++j) {
RowIndex& r = rowindices[j];
if (!r) r = RowIndex(size_t(0), nrows, size_t(1));
r.resize(new_nrows);
for (size_t i : colindices[j]) {
columns[i]->replace_rowindex(r);
}
nrows = new_nrows;
}
nrows = new_nrows;
}



void DataTable::replace_rowindex(const RowIndex& newri) {
if (newri.isabsent() && rowindex.isabsent()) return;
if (!newri && !rowindex) return;
rowindex = newri;
nrows = rowindex.size();
for (size_t i = 0; i < ncols; ++i) {
Expand All @@ -161,6 +192,25 @@ void DataTable::replace_rowindex(const RowIndex& newri) {
}


/**
 * Build a new DataTable by applying `ri` on top of every column of `dt`.
 * Equivalent of ``dt[ri, :]``.
 *
 * Columns of `dt` that share an equal RowIndex are processed together, so
 * the product `ri * <column RowIndex>` is computed once per distinct
 * RowIndex. Every resulting column is a `shallowcopy()` of the source column
 * made with that combined RowIndex.
 *
 * The result is allocated with `new`; `dt` is also passed to the DataTable
 * constructor.
 */
DataTable* apply_rowindex(const DataTable* dt, const RowIndex& ri) {
  auto split = _split_columns_by_rowindices(dt);
  std::vector<RowIndex>& groups = split.first;
  std::vector<std::vector<size_t>>& members = split.second;

  colvec result(dt->ncols);
  for (size_t g = 0; g < groups.size(); ++g) {
    RowIndex combined = ri * groups[g];
    const std::vector<size_t>& cols = members[g];
    for (size_t k = 0; k < cols.size(); ++k) {
      size_t icol = cols[k];
      result[icol] = dt->columns[icol]->shallowcopy(combined);
    }
  }
  return new DataTable(std::move(result), dt);
}


void DataTable::replace_groupby(const Groupby& newgb) {
int32_t last_offset = newgb.offsets_r()[newgb.ngroups()];
if (static_cast<size_t>(last_offset) != nrows) {
Expand All @@ -178,7 +228,7 @@ void DataTable::replace_groupby(const Groupby& newgb) {
* Do nothing if the DataTable is not a view.
*/
void DataTable::reify() {
if (rowindex.isabsent()) return;
// if (rowindex.isabsent()) return;
for (size_t i = 0; i < ncols; ++i) {
columns[i]->reify();
}
Expand Down
2 changes: 2 additions & 0 deletions c/datatable.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ DataTable* open_jay_from_file(const std::string& path);
DataTable* open_jay_from_bytes(const char* ptr, size_t len);
DataTable* open_jay_from_mbuf(const MemoryRange&);

DataTable* apply_rowindex(const DataTable*, const RowIndex& ri);


//==============================================================================

Expand Down
6 changes: 4 additions & 2 deletions c/datatablemodule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -199,11 +199,13 @@ void DatatableModule::init_methods() {
add(METHODv(expr_unaryop));
add(METHOD0(is_debug_mode));
add(METHOD0(has_omp_support));

init_methods_aggregate();
init_methods_str();
init_methods_join();
init_methods_options();
init_methods_repeat();
init_methods_sets();
init_methods_join();
init_methods_str();
#ifdef DTTEST
init_tests();
#endif
Expand Down
5 changes: 3 additions & 2 deletions c/datatablemodule.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ class DatatableModule : public py::ExtModule<DatatableModule> {

void init_methods();
void init_methods_aggregate();// extra/aggergate.cc
void init_methods_str(); // str/py_str.cc
void init_methods_join(); // frame/join.cc
void init_methods_options(); // options.cc
void init_methods_repeat(); // frame/repeat.cc
void init_methods_sets(); // set_funcs.cc
void init_methods_join(); // frame/join.cc
void init_methods_str(); // str/py_str.cc

#ifdef DTTEST
void init_tests();
Expand Down
1 change: 1 addition & 0 deletions c/frame/py_frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class Frame : public PyObject {
oobj to_tuples(const NoArgs&);
oobj head(const PKArgs&);
oobj tail(const PKArgs&);
void repeat(const PKArgs&);

private:
static bool internal_construction;
Expand Down
135 changes: 135 additions & 0 deletions c/frame/repeat.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
//------------------------------------------------------------------------------
// Copyright 2018 H2O.ai
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//------------------------------------------------------------------------------
#include "datatablemodule.h"
#include "frame/py_frame.h"
#include "python/args.h"
#include "rowindex.h"


//------------------------------------------------------------------------------
// Static helpers
//------------------------------------------------------------------------------

/**
 * Create and return a RowIndex with elements
 *
 *     list(range(nrows)) * nreps
 *
 * The indices are stored in a `dt::array<T>`; the caller selects `T`
 * (int32_t or int64_t in this file) according to how many elements the
 * result will contain.
 *
 * The first `nrows` entries are written directly, then the filled prefix is
 * repeatedly doubled with memcpy until all `nrows * nreps` entries are set.
 *
 * If either `nrows` or `nreps` is 0 the result has no elements: nothing is
 * written into the (zero-sized) array and the upper bound passed to the
 * RowIndex constructor is 0 instead of the wrapped-around `nrows - 1`.
 *
 * NOTE(review): `nrows * nreps` is not checked for overflow.
 */
template <typename T>
static RowIndex _make_repeat_rowindex(size_t nrows, size_t nreps) {
  size_t new_nrows = nrows * nreps;
  dt::array<T> indices(new_nrows);
  size_t max_index = 0;
  if (new_nrows > 0) {
    T* ptr = indices.data();
    for (size_t i = 0; i < nrows; ++i) {
      ptr[i] = static_cast<T>(i);
    }
    size_t nrows_filled = nrows;
    while (nrows_filled < new_nrows) {
      // Source [0, nrows_copy) and destination [nrows_filled, ...) never
      // overlap because nrows_copy <= nrows_filled.
      size_t nrows_copy = std::min(new_nrows - nrows_filled, nrows_filled);
      std::memcpy(ptr + nrows_filled, ptr, nrows_copy * sizeof(T));
      nrows_filled += nrows_copy;
      xassert(nrows_filled % nrows == 0);
    }
    xassert(nrows_filled == new_nrows);
    max_index = nrows - 1;
  }
  return RowIndex(std::move(indices), 0, max_index);
}


/**
 * Create a new data column holding `nreps` back-to-back copies of the raw
 * data of `col`.
 *
 * `col` must not carry a RowIndex (asserted). Data is copied bytewise using
 * `col->elemsize()` bytes per row: the original rows are copied once, then
 * the filled prefix is doubled with memcpy until the new column is full.
 *
 * If the result has 0 rows (`nreps == 0` or `col->nrows == 0`) the freshly
 * created column is returned as-is, without copying anything into it.
 *
 * NOTE(review): `nrows * nreps` is not checked for overflow.
 */
static Column* repeat_column(const Column* col, size_t nreps) {
  size_t elemsize = col->elemsize();
  size_t nrows = col->nrows;
  size_t new_nrows = nrows * nreps;
  xassert(!col->rowindex());

  Column* newcol = Column::new_data_column(col->stype(), new_nrows);
  // Nothing to copy into an empty column; copying `nrows` rows here would
  // write past the end of its buffer.
  if (new_nrows == 0) return newcol;

  const void* olddata = col->data();
  void* newdata = newcol->data_w();

  std::memcpy(newdata, olddata, nrows * elemsize);
  size_t nrows_filled = nrows;
  while (nrows_filled < new_nrows) {
    // Source and destination ranges never overlap: nrows_copy <= nrows_filled
    size_t nrows_copy = std::min(new_nrows - nrows_filled, nrows_filled);
    std::memcpy(static_cast<char*>(newdata) + nrows_filled * elemsize,
                newdata,
                nrows_copy * elemsize);
    nrows_filled += nrows_copy;
    xassert(nrows_filled % nrows == 0);
  }
  xassert(nrows_filled == new_nrows);

  return newcol;
}



//------------------------------------------------------------------------------
// datatable.repeat()
//------------------------------------------------------------------------------

// Python-facing function `repeat(frame, n)`.
// NOTE(review): the leading numeric/bool arguments presumably describe the
// argument layout for the two arguments named `frame` and `n` -- confirm
// against the py::PKArgs constructor.
static py::PKArgs fn_repeat(
    2, 0, 0, false, false, {"frame", "n"},
    "repeat",
R"(repeat(frame, n)
--
Concatenate `n` copies of the `frame` by rows and return the result.
This is equivalent to ``dt.rbind([frame] * n)``.
)",


[](const py::PKArgs& args) -> py::oobj {
  DataTable* dt = args[0].to_frame();
  size_t n = args[1].to_size_t();

  // Empty Frame: repeating is a noop, return a copy of the input
  if (dt->ncols == 0 || dt->nrows == 0) {
    return py::oobj::from_new_reference(py::Frame::from_datatable(dt->copy()));
  }

  // NOTE(review): `n == 0` is not special-cased here; for a non-empty frame
  // it reaches repeat_column() / _make_repeat_rowindex() as `nreps == 0`.

  // Single-column fixed-width Frame without a RowIndex: the column data is
  // physically copied `n` times.
  Column* col0 = dt->columns[0];
  if (dt->ncols == 1 &&
      !info(col0->stype()).is_varwidth() &&
      !col0->rowindex())
  {
    Column* newcol = repeat_column(col0, n);
    DataTable* newdt = new DataTable({newcol}, dt);  // copy names from dt
    return py::oobj::from_new_reference(py::Frame::from_datatable(newdt));
  }

  // General case: build a RowIndex that lists rows 0..nrows-1 `n` times and
  // apply it to every column. 32-bit indices are used when the total number
  // of rows is below the int32_t maximum, 64-bit indices otherwise.
  RowIndex ri = (dt->nrows * n < std::numeric_limits<int32_t>::max())
      ? _make_repeat_rowindex<int32_t>(dt->nrows, n)
      : _make_repeat_rowindex<int64_t>(dt->nrows, n);

  DataTable* newdt = apply_rowindex(dt, ri);
  return py::oobj::from_new_reference(py::Frame::from_datatable(newdt));
});



// Adds `fn_repeat` (the `repeat(frame, n)` function defined in this file) to
// the module through the ADDFN macro.
void DatatableModule::init_methods_repeat() {
ADDFN(fn_repeat);
}
Loading

0 comments on commit a623746

Please sign in to comment.