Skip to content

Commit

Permalink
zipfian random sampler without replacement (apache#12113)
Browse files Browse the repository at this point in the history
* code compiles

* update doc

* fix bug and add test

* fix lint
  • Loading branch information
eric-haibin-lin authored and szha committed Aug 11, 2018
1 parent ef697a5 commit 925f5a0
Show file tree
Hide file tree
Showing 3 changed files with 259 additions and 0 deletions.
72 changes: 72 additions & 0 deletions src/operator/random/unique_sample_op.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2016 by Contributors
* \file unique_sample_op.cc
* \brief CPU Implementation of unique sample op
*/

#include "./unique_sample_op.h"
#include "../tensor/init_op.h"

namespace mxnet {
namespace op {

// Register SampleUniqueZifpianParam (declared in unique_sample_op.h) through
// dmlc's parameter registration macro.
DMLC_REGISTER_PARAMETER(SampleUniqueZifpianParam);

// Shared registration boilerplate for unique-sampling operators.
// Expands to an NNVM_REGISTER_OP(name) chain that declares an operator with
// no inputs and two outputs, parses its attributes into `ParamType`, requests
// the resources listed by UniqueSampleResource, and exposes the fields of
// `ParamType` as the operator's arguments. Callers append further
// `.describe(...)` / `.set_attr(...)` calls and the terminating semicolon.
#define MXNET_OPERATOR_REGISTER_UNIQUE_SAMPLE(name, ParamType) \
NNVM_REGISTER_OP(name) \
.set_num_inputs(0) \
.set_num_outputs(2) \
.set_attr_parser(ParamParser<ParamType>) \
.set_attr<FResourceRequest>("FResourceRequest", UniqueSampleResource) \
.add_arguments(ParamType::__FIELDS__())

// Operator `_sample_unique_zipfian`: no inputs, two outputs (the sampled
// classes and the number of trials per batch row). Shape and type inference
// are delegated to SampleUniqueShape / SampleUniqueType and the CPU kernel is
// SampleUniqueZifpian.
MXNET_OPERATOR_REGISTER_UNIQUE_SAMPLE(_sample_unique_zipfian,
                                      SampleUniqueZifpianParam)
.describe(R"code(Draw random samples from an approximately log-uniform
or Zipfian distribution without replacement.
This operation takes a 2-D shape `(batch_size, num_sampled)`,
and randomly generates *num_sampled* samples from the range of integers [0, range_max)
for each instance in the batch.
The elements in each instance are drawn without replacement from the base distribution.
The base distribution for this operator is an approximately log-uniform or Zipfian distribution:
P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
Additionally, it also returns the number of trials used to obtain `num_sampled` samples for
each instance in the batch.
Example::
samples, trials = _sample_unique_zipfian(750000, shape=(4, 8192))
unique(samples[0]) = 8192
unique(samples[3]) = 8192
trials[0] = 16435
)code" ADD_FILELINE)
.set_attr<nnvm::FInferShape>("FInferShape", SampleUniqueShape<SampleUniqueZifpianParam>)
.set_attr<nnvm::FInferType>("FInferType", SampleUniqueType<SampleUniqueZifpianParam>)
.set_attr<FCompute>("FCompute<cpu>", SampleUniqueZifpian);

} // namespace op
} // namespace mxnet
170 changes: 170 additions & 0 deletions src/operator/random/unique_sample_op.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2018 by Contributors
* \file unique_sample_op.h
* \brief Elementary unique sampling operators
*/
#ifndef MXNET_OPERATOR_RANDOM_UNIQUE_SAMPLE_OP_H_
#define MXNET_OPERATOR_RANDOM_UNIQUE_SAMPLE_OP_H_

#include <mxnet/operator_util.h>
#include <mshadow/base.h>
#include <string>
#include <vector>
#include <unordered_set>
#include <algorithm>
#include <cmath>
#include "../mxnet_op.h"
#include "../operator_common.h"
#include "./sampler.h"

namespace mxnet {
namespace op {

/*!
 * \brief Parameters of the unique Zipfian sampling operator.
 *
 * `range_max` has no default and must therefore be supplied by the caller;
 * `shape` defaults to an empty TShape, in which case SampleUniqueShape relies
 * on an already-known 2-D output shape.
 */
struct SampleUniqueZifpianParam : public dmlc::Parameter<SampleUniqueZifpianParam> {
// Number of possible classes; samples are drawn from [0, range_max).
int range_max;
// Requested output shape, expected to be (batch_size, num_sampled).
TShape shape;
DMLC_DECLARE_PARAMETER(SampleUniqueZifpianParam) {
DMLC_DECLARE_FIELD(range_max)
.describe("The number of possible classes.");
DMLC_DECLARE_FIELD(shape)
.set_default(TShape())
.describe("2-D shape of the output, where shape[0] is the batch size, and shape[1] "
"is the number of candidates to sample for each batch.");
}
};

/*!
 * \brief Shape inference for unique-sampling operators.
 *
 * Output 0 holds the samples with shape (batch_size, num_sampled); output 1
 * holds one trial counter per batch row, i.e. shape (batch_size,).
 * When the parameter shape is empty but output 0 is already known to be 2-D,
 * only output 1 is derived from it. Otherwise the parameter shape must be 2-D
 * and drives both outputs.
 *
 * \param attrs node attributes carrying the parsed `ParamType`
 * \param in_attrs input shapes (must be empty)
 * \param out_attrs output shapes (must have two entries), updated in place
 * \return always true; inconsistencies are reported by the CHECK / assign macros
 */
template<typename ParamType>
inline bool SampleUniqueShape(const nnvm::NodeAttrs& attrs,
                              std::vector<TShape> *in_attrs,
                              std::vector<TShape> *out_attrs) {
  const ParamType& param = nnvm::get<ParamType>(attrs.parsed);
  CHECK_EQ(in_attrs->size(), 0U);
  CHECK_EQ(out_attrs->size(), 2U);

  const bool infer_from_output =
      (*out_attrs)[0].ndim() == 2 && param.shape.ndim() == 0;
  if (!infer_from_output) {
    // The shape parameter is authoritative and has to be 2-D.
    CHECK_EQ(param.shape.ndim(), 2U);
    SHAPE_ASSIGN_CHECK(*out_attrs, 0, param.shape);
    SHAPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::Shape1(param.shape[0]));
  } else {
    // No shape parameter: take the batch size from the known sample output.
    SHAPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::Shape1((*out_attrs)[0][0]));
  }
  return true;
}

/*!
 * \brief Type inference for unique-sampling operators.
 *
 * Both outputs (samples and trial counts) are assigned the int64 type flag.
 * `ParamType` is not consulted.
 *
 * \param attrs node attributes (unused)
 * \param in_attrs input types (must be empty)
 * \param out_attrs output types (must have two entries), updated in place
 * \return always true; conflicts are reported by TYPE_ASSIGN_CHECK
 */
template<typename ParamType>
inline bool SampleUniqueType(const nnvm::NodeAttrs& attrs,
                             std::vector<int> *in_attrs,
                             std::vector<int> *out_attrs) {
  CHECK_EQ(in_attrs->size(), 0U);
  CHECK_EQ(out_attrs->size(), 2U);
  // Exactly two outputs are present at this point; give each the same type.
  for (size_t out_idx = 0; out_idx < out_attrs->size(); ++out_idx) {
    TYPE_ASSIGN_CHECK(*out_attrs, out_idx, mshadow::kInt64);
  }
  return true;
}

inline std::vector<ResourceRequest> UniqueSampleResource(const NodeAttrs& attrs) {
return {ResourceRequest::kParallelRandom};
}

/*!
 * \brief Launch a kernel that draws unique samples using the parallel random generator.
 *
 * The batch rows are split into contiguous chunks of `step` rows, one chunk
 * per kernel index; the number of kernel indices is capped by the number of
 * random states of the generator.
 *
 * \tparam GType floating point type of the random generator
 * \tparam DType element type of the samples
 * \tparam OP kernel type providing the static Map() function
 * \tparam Args types of the extra arguments forwarded to OP::Map()
 * \param s stream to launch the kernel on
 * \param gen parallel random generator
 * \param batch_size number of batch rows
 * \param num_sampled number of unique samples per batch row
 * \param results one set per batch row used to track already-drawn values
 * \param args extra arguments forwarded to OP::Map()
 */
template<typename GType, typename DType, typename OP, typename ...Args>
inline static void LaunchUniqueRNG(mshadow::Stream<cpu> *s,
                                   common::random::RandGenerator<cpu, GType> *gen,
                                   const int batch_size, const size_t num_sampled,
                                   std::vector<std::unordered_set<DType>> *results,
                                   Args... args) {
  // Nothing to do for an empty batch or an empty sample count; returning early
  // also keeps the chunk-size division below away from a zero divisor.
  const bool nothing_to_sample = (batch_size <= 0) || (num_sampled == 0);
  if (nothing_to_sample) return;

  const int max_workers = RandGenerator<cpu>::kNumRandomStates;
  const int nthread = batch_size < max_workers ? batch_size : max_workers;
  // Ceiling division: number of batch rows handled by each kernel index.
  const int step = (batch_size + nthread - 1) / nthread;
  Kernel<OP, cpu>::Launch(s, nthread, *gen, batch_size, num_sampled, results, step, args...);
}

/*!
 * \brief Kernel drawing `num_sampled` distinct values per batch row.
 *
 * Each kernel index `tid` handles the batch rows [tid * step, (tid + 1) * step),
 * clipped to `batch_size`. For every row it keeps drawing until the row's set
 * holds `num_sampled` distinct values, writing each newly seen value to
 * `samples` in draw order and the total number of draws to `num_tries`.
 */
struct UniqueSampleUniformKernel {
  /*!
   * \param tid kernel index, also used to select the random generator state
   * \param gen parallel random generator
   * \param batch_size number of batch rows
   * \param num_sampled number of distinct values to draw per row
   * \param results per-row sets of values drawn so far
   * \param step number of batch rows handled per kernel index
   * \param log_range_max natural logarithm of the number of classes
   * \param samples output buffer, indexed as row * num_sampled + position
   * \param num_tries output buffer with one draw counter per row
   */
  template<typename GType, typename DType>
  MSHADOW_XINLINE static void Map(int tid, RandGenerator<cpu, GType> gen,
                                  const int batch_size, const size_t num_sampled,
                                  std::vector<std::unordered_set<DType>> *results,
                                  const int step, const GType log_range_max,
                                  DType *samples, DType *num_tries) {
    const int begin = tid * step;
    const int end = (tid + 1) * step;
    typename RandGenerator<cpu, GType>::Impl generator(&gen, tid);
    for (int i = begin; i < end && i < batch_size; i++) {
      auto &result = results->at(i);
      // Compute the row offset in size_t: the product of row index and
      // num_sampled can exceed the range of int for large outputs.
      const size_t base = static_cast<size_t>(i) * num_sampled;
      DType tries = 0;
      while (result.size() != num_sampled) {
        // NOTE(review): assumes uniform() yields values in [0, 1), so that
        // exp(x * log_range_max) lies in [1, range_max) -- confirm in sampler.h.
        const double x = generator.uniform();
        const DType value = static_cast<DType>(lround(exp(x * log_range_max)) - 1);
        // Sampling without replacement: emplace reports whether the value is
        // new, which avoids a separate lookup before the insertion.
        if (result.emplace(value).second) {
          samples[base + result.size() - 1] = value;
        }
        tries += 1;
      }
      num_tries[i] = tries;
    }
  }
};

/*!
 * \brief CPU compute function of the unique Zipfian sampler.
 *
 * Fills outputs[0] with `num_sampled` distinct int64 classes per batch row and
 * outputs[1] with the number of draws each row needed.
 *
 * \param attrs node attributes carrying the parsed SampleUniqueZifpianParam
 * \param ctx operator context providing the random resource and the stream
 * \param inputs unused, the operator has no inputs
 * \param req unused
 * \param outputs samples (2-D) and trial counts (1-D)
 */
inline void SampleUniqueZifpian(const nnvm::NodeAttrs& attrs,
                                const OpContext& ctx,
                                const std::vector<TBlob>& inputs,
                                const std::vector<OpReqType>& req,
                                const std::vector<TBlob>& outputs) {
  using DType = int64_t;
  using GType = double;
  const SampleUniqueZifpianParam& param = nnvm::get<SampleUniqueZifpianParam>(attrs.parsed);
  CHECK_EQ(outputs.size(), 2U);
  // log() of a non-positive range is -inf / NaN, which would keep the sampling
  // loop from ever collecting enough distinct values.
  CHECK_GT(param.range_max, 0) << "range_max must be positive";
  // Read the dimensions from the output blob instead of param.shape:
  // SampleUniqueShape accepts an empty shape parameter when the output shape
  // is already known, in which case param.shape has no elements to index.
  const TShape& sample_shape = outputs[0].shape_;
  CHECK_EQ(sample_shape.ndim(), 2U) << "The sample output must be 2-D";
  const int batch_size = static_cast<int>(sample_shape[0]);
  const size_t num_sampled = static_cast<size_t>(sample_shape[1]);
  // range_max is known to be positive here, so the unsigned comparison is safe.
  CHECK_LE(num_sampled, static_cast<size_t>(param.range_max))
    << "Number of samples cannot exceed the number of possible classes";
  const double log_range_max = log(param.range_max);
  // rand generator resource and result sets
  RandGenerator<cpu, GType> *pgen = ctx.requested[0].get_parallel_random<cpu, GType>();
  std::vector<std::unordered_set<DType>> results(batch_size > 0 ? batch_size : 0);
  for (auto &row_set : results) {
    row_set.reserve(num_sampled);
  }

  DType *num_tries = outputs[1].dptr<DType>();
  DType *samples = outputs[0].dptr<DType>();
  Stream<cpu> *s = ctx.get_stream<cpu>();
  LaunchUniqueRNG<GType, DType, UniqueSampleUniformKernel>(s, pgen, batch_size, num_sampled,
                                                           &results, log_range_max, samples,
                                                           num_tries);
}


} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_RANDOM_UNIQUE_SAMPLE_OP_H_
17 changes: 17 additions & 0 deletions tests/python/unittest/test_random.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,23 @@ def check_data(a, b):
for j in range(i+1, num_seeds):
check_data(data[i],data[j])

@with_seed()
def test_unique_zipfian_generator():
    """Check the unique Zipfian sampler on CPU.

    Every batch row must contain `num_sampled` distinct classes, and the
    reported number of trials must fall inside the expected window.
    The test does nothing on non-CPU contexts.
    """
    if mx.context.current_context().device_type != 'cpu':
        return
    range_max = 793472
    batch_size, num_sampled = 4, 8192
    sampler = mx.nd._internal._sample_unique_zipfian
    classes, num_trials = sampler(range_max, shape=(batch_size, num_sampled))
    for row in range(batch_size):
        trials = num_trials[row].asscalar()
        # every sampled class in the row has to be distinct
        assert np.unique(classes[row].asnumpy()).size == num_sampled
        # reference trial count obtained from the pytorch implementation
        assert trials > 14500
        assert trials < 17000

@with_seed()
def test_zipfian_generator():
# dummy true classes
Expand Down

0 comments on commit 925f5a0

Please sign in to comment.