Skip to content

Commit

Permalink
Update h2o.createFrame with optional response column. Add a Runit tes…
Browse files Browse the repository at this point in the history
…t of this feature.
  • Loading branch information
anqi committed Jan 16, 2015
1 parent 83b4617 commit 372bb4a
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 38 deletions.
19 changes: 14 additions & 5 deletions R/h2o-package/R/ParseImport.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,13 @@ h2o.assign <- function(data, key) {
.h2o.exec2(expr = data@key, h2o = data@h2o, dest_key = key)
}

h2o.createFrame <- function(object, key, rows, cols, seed, randomize, value, real_range, categorical_fraction, factors, integer_fraction, integer_range, binary_fraction=0, binary_ones_fraction=0.5, missing_fraction, response_factors) {
h2o.createFrame <- function(object, key, rows = 10000, cols = 10, seed, randomize = TRUE, value = 0, real_range = 100, categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100, binary_fraction = 0.1, binary_ones_fraction = 0.02, missing_fraction = 0.01, response_factors = 2, has_response = FALSE) {
if(class(object) != "H2OClient") stop("object must be of class H2OClient")
if(!is.character(key)) stop("key must be a character string")
if(!is.numeric(rows)) stop("rows must be a numeric value")
if(!is.numeric(cols)) stop("cols must be a numeric value")
if(!is.numeric(seed)) stop("seed must be a numeric value")
if(!is.logical(randomize)) stop("randomize must be a boolean value")
if(!missing(seed) && !is.numeric(seed)) stop("seed must be a numeric value")
if(!is.logical(randomize)) stop("randomize must be a logical value")
if(!is.numeric(value)) stop("value must be a numeric value")
if(!is.numeric(real_range)) stop("real_range must be a numeric value")
if(!is.numeric(categorical_fraction)) stop("categorical_fraction must be a numeric value")
Expand All @@ -120,9 +122,16 @@ h2o.createFrame <- function(object, key, rows, cols, seed, randomize, value, rea
if(!is.numeric(response_factors)) stop("response_factors must be a numeric value")
if(!is.numeric(binary_fraction)) stop("binary_fraction must be a numeric value")
if(!is.numeric(binary_ones_fraction)) stop("binary_ones_fraction must be a numeric value")
if(!is.logical(has_response)) stop("has_response must be a logical value")

res <- .h2o.__remoteSend(object, .h2o.__PAGE_CreateFrame, key = key, rows = rows, cols = cols, seed = seed, randomize = as.numeric(randomize), value = value, real_range = real_range,
categorical_fraction = categorical_fraction, factors = factors, integer_fraction = integer_fraction, integer_range = integer_range, binary_fraction = binary_fraction, binary_ones_fraction=binary_ones_fraction, missing_fraction = missing_fraction, response_factors = response_factors)
if(missing(seed))
res <- .h2o.__remoteSend(object, .h2o.__PAGE_CreateFrame, key = key, rows = rows, cols = cols, randomize = as.numeric(randomize), value = value, real_range = real_range,
categorical_fraction = categorical_fraction, factors = factors, integer_fraction = integer_fraction, integer_range = integer_range, binary_fraction = binary_fraction,
binary_ones_fraction = binary_ones_fraction, missing_fraction = missing_fraction, response_factors = response_factors, has_response = as.numeric(has_response))
else
res <- .h2o.__remoteSend(object, .h2o.__PAGE_CreateFrame, key = key, rows = rows, cols = cols, seed = seed, randomize = as.numeric(randomize), value = value, real_range = real_range,
categorical_fraction = categorical_fraction, factors = factors, integer_fraction = integer_fraction, integer_range = integer_range, binary_fraction = binary_fraction,
binary_ones_fraction = binary_ones_fraction, missing_fraction = missing_fraction, response_factors = response_factors, has_response = as.numeric(has_response))
.h2o.exec2(expr = key, h2o = object, dest_key = key)
}

Expand Down
13 changes: 6 additions & 7 deletions R/h2o-package/man/h2o.createFrame.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,14 @@ Create an H2O Frame
Create an H2O data frame from scratch, with optional randomization. Supports categoricals, integers, reals and missing values.
}
\usage{
h2o.createFrame(object, key, rows, cols, seed, randomize, value, real_range,
categorical_fraction, factors, integer_fraction, integer_range,
binary_fraction, binary_ones_fraction,
missing_fraction, response_factors)
h2o.createFrame(object, key, rows = 10000, cols = 10, seed, randomize = TRUE, value = 0, real_range = 100,
categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100, binary_fraction = 0.1,
binary_ones_fraction = 0.02, missing_fraction = 0.01, response_factors = 2, has_response = FALSE)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{object}{An \code{\linkS4class{H2OClient}} object containing the IP address and port of the server running H2O.}
\item{key}{
The unique hex key assigned to the created frame.}
\item{key}{The unique hex key assigned to the created frame.}
\item{rows}{Number of rows}
\item{cols}{Number of columns}
\item{seed}{Random number seed}
Expand All @@ -32,6 +30,7 @@ h2o.createFrame(object, key, rows, cols, seed, randomize, value, real_range,
\item{binary_ones_fraction}{Fraction of 1's in binary columns (for randomize=true)}
\item{missing_fraction}{Fraction of missing values}
\item{response_factors}{Number of factor levels of the response column (1=real, 2=binomial, N=multinomial). Only used when \code{has_response = TRUE}.}
\item{has_response}{Whether an additional response column should be generated. If \code{TRUE}, the final data frame will have \code{cols + 1} columns.}
}
\value{
Returns an H2O data frame.
Expand All @@ -44,7 +43,7 @@ myframe = h2o.createFrame(localH2O, 'myframekey', rows = 1000, cols = 10,
categorical_fraction = 0.2, factors = 100,
integer_fraction = 0.2, integer_range = 100,
binary_fraction = 0.1, binary_ones_fraction = 0.01,
missing_fraction = 0.1, response_factors = 2)
missing_fraction = 0.1, response_factors = 2, has_response = FALSE)
head(myframe)
summary(myframe)
h2o.shutdown(localH2O)
Expand Down
2 changes: 1 addition & 1 deletion R/tests/testdir_demos/runit_demo_random_data_glm.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ for(i in 1:length(rows)){ # changing number of rows
categorical_fraction = 0.0, factors = 10,
integer_fraction = 0.4, integer_range = 100,
missing_fraction = 0, response_factors = 1,
binary_fraction = 0, binary_ones_fraction = 0.5) )
binary_fraction = 0, binary_ones_fraction = 0.5, has_response = TRUE) )
create_frm_time[i,j] = as.numeric(sst[3])
mem = h2o.ls(conn,"myframe")
frm_size[i,j] = as.numeric(mem[2])
Expand Down
2 changes: 1 addition & 1 deletion R/tests/testdir_demos/runit_demo_random_data_pca.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ for(i in 1:length(rows)){ # changing number of rows
categorical_fraction = 0.0, factors = 10,
integer_fraction = 0.4, integer_range = 100,
missing_fraction = 0, response_factors = 1,
binary_fraction = 0, binary_ones_fraction = 0.5) )
binary_fraction = 0, binary_ones_fraction = 0.5, has_response = TRUE) )

create_frm_time[i,j] = as.numeric(sst[3])
mem = h2o.ls(conn,"myframe")
Expand Down
32 changes: 32 additions & 0 deletions R/tests/testdir_misc/runit_createFrame.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
##
# Testing creation of random data frame in H2O
##

# Make the script's own directory the working directory so that the relative
# path below resolves: the directory is derived from the value of the "f"
# command-line argument as reported by R.utils::commandArgs.
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
# Source the file one directory up; presumably it provides the helpers used
# further down (Log.info, doTest, testEnd) -- TODO confirm.
source('../findNSourceUtils.R')

# Runit test for h2o.createFrame.
#
# Builds two frames through `conn` and checks their shape and contents:
#   1. a randomized 10000 x 100 frame for which 10% of the columns are
#      requested to be categorical;
#   2. a constant 100 x 10 frame (value = 5, randomize = FALSE) created with
#      has_response = TRUE, which is expected to come back with one extra
#      column (11 in total).
#
# conn: connection object supplied by the test harness; it is passed as the
#       `object` argument of h2o.createFrame.
test.createFrame <- function(conn) {
  Log.info("Create a data frame with rows = 10000, cols = 100")
  hex <- h2o.createFrame(conn, "hex", rows = 10000, cols = 100,
                         categorical_fraction = 0.1, factors = 5,
                         integer_fraction = 0.5, integer_range = 1)
  expect_equal(dim(hex), c(10000, 100))
  expect_equal(length(colnames(hex)), 100)

  Log.info("Check that 0.1 * 100 = 10 columns are categorical")
  # vapply pins the result to one logical per column; as.logical keeps the
  # value a plain logical so the type check cannot trip on a 0/1 scalar.
  fac_col <- vapply(seq_len(100), function(i) as.logical(is.factor(hex[,i])), logical(1))
  num_fac <- sum(fac_col)
  expect_equal(num_fac/100, 0.1)

  Log.info("Create a data frame with rows = 100, cols = 10")
  hex2 <- h2o.createFrame(conn, "hex2", rows = 100, cols = 10,
                          randomize = FALSE, value = 5,
                          categorical_fraction = 0, integer_fraction = 0,
                          missing_fraction = 0, has_response = TRUE)
  print(summary(hex2))
  expect_equal(dim(hex2), c(100, 11))
  expect_equal(length(colnames(hex2)), 11)

  Log.info("Check that all data entries are equal to 5")
  # Walk every column reported by dim() rather than a fixed 1:10, so the
  # extra column added by has_response = TRUE is verified as well.
  cons_col <- vapply(seq_len(dim(hex2)[2]), function(i) {
    min(hex2[,i]) == 5 && max(hex2[,i]) == 5
  }, logical(1))
  expect_true(all(cons_col))

  testEnd()
}

# Hand the test function to doTest together with a description string.
# doTest is not defined in this file; presumably it comes from the sourced
# utilities and supplies the `conn` argument -- TODO confirm.
doTest("Create a random data frame in H2O", test.createFrame)
3 changes: 3 additions & 0 deletions src/main/java/hex/CreateFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ public class CreateFrame extends Request2 {

public boolean positive_response; // only for response_factors=1

@API(help = "Whether an additional response column should be generated", filter = Default.class, json=true)
public boolean has_response = false;

@Override public Response serve() {
try {
if (integer_fraction + binary_fraction + categorical_fraction > 1) throw new IllegalArgumentException("Integer, binary and categorical fractions must add up to <= 1.");
Expand Down
54 changes: 30 additions & 24 deletions src/main/java/water/fvec/FrameCreator.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public FrameCreator(CreateFrame createFrame, Key job) {
_job=job;
_createFrame = createFrame;

int[] idx = Utils.seq(1, _createFrame.cols+1);
int[] idx = _createFrame.has_response ? Utils.seq(1, _createFrame.cols + 1) : Utils.seq(0, _createFrame.cols);
int[] shuffled_idx = new int[idx.length];
Utils.shuffleArray(idx, idx.length, shuffled_idx, _createFrame.seed, 0);

Expand All @@ -44,13 +44,17 @@ public FrameCreator(CreateFrame createFrame, Key job) {

// create domains for categorical variables
if (_createFrame.randomize) {
assert(_createFrame.response_factors >= 1);
_domain = new String[_createFrame.cols+1][];
_domain[0] = _createFrame.response_factors == 1 ? null : new String[_createFrame.response_factors];
if (_domain[0] != null) {
for (int i=0; i <_domain[0].length; ++i) {
_domain[0][i] = "resp." + i;
if(_createFrame.has_response) {
assert(_createFrame.response_factors >= 1);
_domain = new String[_createFrame.cols+1][];
_domain[0] = _createFrame.response_factors == 1 ? null : new String[_createFrame.response_factors];
if (_domain[0] != null) {
for (int i = 0; i < _domain[0].length; ++i) {
_domain[0][i] = "resp." + i;
}
}
} else {
_domain = new String[_createFrame.cols][];
}

for (int c : _cat_cols) {
Expand All @@ -76,14 +80,19 @@ public FrameCreator(CreateFrame createFrame, Key job) {
final private Key _job;

@Override public void compute2() {
Vec[] vecs = Vec.makeNewCons(_createFrame.rows, _createFrame.cols+1, _createFrame.value, _domain);
int totcols = _createFrame.has_response ? (_createFrame.cols+1) : _createFrame.cols;
Vec[] vecs = Vec.makeNewCons(_createFrame.rows, totcols, _createFrame.value, _domain);
String[] names = new String[vecs.length];
names[0] = "response";
for( int i=1; i<vecs.length; i++ ) names[i] = "C"+i;
if(_createFrame.has_response) {
names[0] = "response";
for (int i = 1; i < vecs.length; i++) names[i] = "C" + i;
} else {
for (int i = 0; i < vecs.length; i++) names[i] = "C" + (i+1);
}

_out = new Frame(Key.make(_createFrame.key), names, vecs);
assert _out.numRows() == _createFrame.rows;
assert _out.numCols() == _createFrame.cols+1;
assert _out.numCols() == totcols;
_out.delete_and_lock(_job);

// fill with random values
Expand Down Expand Up @@ -125,14 +134,16 @@ public void map (Chunk[]cs){
final Random rng = new Random();

// response
for (int r = 0; r < cs[0]._len; r++) {
setSeed(rng, 0, cs[0]._start + r);
if (_createFrame.response_factors >1)
cs[0].set0(r, (int)(rng.nextDouble() * _createFrame.response_factors)); //classification
else if (_createFrame.positive_response)
cs[0].set0(r, _createFrame.real_range * rng.nextDouble()); //regression with positive response
else
cs[0].set0(r, _createFrame.real_range * (1 - 2 * rng.nextDouble())); //regression
if(_createFrame.has_response) {
for (int r = 0; r < cs[0]._len; r++) {
setSeed(rng, 0, cs[0]._start + r);
if (_createFrame.response_factors > 1)
cs[0].set0(r, (int) (rng.nextDouble() * _createFrame.response_factors)); //classification
else if (_createFrame.positive_response)
cs[0].set0(r, _createFrame.real_range * rng.nextDouble()); //regression with positive response
else
cs[0].set0(r, _createFrame.real_range * (1 - 2 * rng.nextDouble())); //regression
}
}

for (int c : _cat_cols) {
Expand Down Expand Up @@ -162,8 +173,6 @@ else if (_createFrame.positive_response)
}
}



public static class MissingInserter extends MRTask2<MissingInserter> {
final long _seed;
final double _frac;
Expand Down Expand Up @@ -192,7 +201,4 @@ public void map (Chunk[]cs){
}
}
}



}

0 comments on commit 372bb4a

Please sign in to comment.