Add in optimizations for softmax for Fusion F1.
Confirmed that the test passes with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa OPTIMIZED_KERNEL_DIR=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade test_kernel_softmax_test -j8
```

However, the improvement in overall benchmark latency is only ~1000 ticks, as measured with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa OPTIMIZED_KERNEL_DIR=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade test_keyword_benchmark -j8
```

Since Softmax is currently only a small fraction of the overall keyword_benchmark latency, we will focus on the latency of this particular op alone.

With the optimized implementation:
```
SOFTMAX took 749 ticks (0 ms).
```

Reference implementation:
```
SOFTMAX took 2052 ticks (2 ms).
```

And with the LUT hifimini implementation (for completeness):
```
SOFTMAX took 1142 ticks (1 ms).
```
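
Summarizing the deltas (straight arithmetic on the numbers above):
```
optimized vs. reference:      2052 - 749 = 1303 ticks saved (~2.7x)
optimized vs. LUT hifimini:   1142 - 749 =  393 ticks saved (~1.5x)
```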

The gain of ~1300 ticks is still worth merging because, even after all the other optimizations land (e.g. tensorflow#47098), it will still mean a ~5% improvement for the keyword benchmark.

And the benefits might be more significant for other models too.
advaitjain committed Feb 12, 2021
1 parent ed58135 commit 06e80ff
Showing 2 changed files with 73 additions and 2 deletions.
74 changes: 72 additions & 2 deletions tensorflow/lite/micro/kernels/xtensa/softmax.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa/xtensa.h"

namespace tflite {
namespace {
@@ -32,7 +33,14 @@ namespace {
struct OpData {
  uint16_t* exp_lut;
};
#elif defined(FUSION_F1)
struct OpData {
  SoftmaxParams params;
  // Index of the scratch buffer requested in Prepare and used in Eval.
  int scratch_tensor_index;
};
#endif

#if defined(HIFIMINI)
// Number of unique int8_t and int16_t values. Used in exponent lookup table
// computation.
constexpr int kInt8Range =
@@ -173,8 +181,63 @@ TfLiteStatus PrepareHifimini(TfLiteContext* context, TfLiteNode* node) {
}
#endif // defined(HIFIMINI)

#if defined(FUSION_F1)
TfLiteStatus PrepareHifi4(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_OK(context, SoftmaxPrepare(context, node));

  // Calculate scratch memory requirements and request scratch buffer
  const TfLiteTensor* input = GetInput(context, node, 0);
  const TfLiteTensor* output = GetOutput(context, node, 0);

  const RuntimeShape& input_shape = GetTensorShape(input);
  const RuntimeShape& output_shape = GetTensorShape(output);
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

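  // The optimized NNLib kernel is used for the int8 input path; compute how
  // much scratch memory it needs and reserve it in the arena.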
  if (input->type == kTfLiteInt8) {
    int required_scratch =
        get_softmax_scratch_size(PREC_ASYM8S, PREC_ASYM8S, depth);
    TF_LITE_ENSURE(context, required_scratch > 0);

    auto* data = static_cast<OpData*>(node->user_data);
    TF_LITE_ENSURE_OK(
        context, context->RequestScratchBufferInArena(
                     context, required_scratch, &(data->scratch_tensor_index)));
  }

  return kTfLiteOk;
}

TfLiteStatus EvalHifi4(const OpData* op_data, const TfLiteEvalTensor* input,
                       TfLiteEvalTensor* output, TfLiteContext* context) {
  const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
  const int8_t* input_data = tflite::micro::GetTensorData<int8_t>(input);
  const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
  int16_t* output_data = tflite::micro::GetTensorData<int16_t>(output);
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

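  // Fetch the scratch buffer that was requested in PrepareHifi4.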
  void* p_scratch = static_cast<void*>(
      context->GetScratchBuffer(context, op_data->scratch_tensor_index));

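  // Run the NNLib softmax kernel on each depth-element slice: int8 input,
  // int16 output.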
  for (int i = 0; i < outer_size; ++i) {
    int err = xa_nn_vec_softmax_asym8s_16(
        &output_data[i * depth], &input_data[i * depth],
        op_data->params.diff_min, op_data->params.input_left_shift,
        op_data->params.input_multiplier, depth, p_scratch);
    TF_LITE_ENSURE(context, err == 0);
  }
  return kTfLiteOk;
}

#endif // defined(FUSION_F1)

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
#if defined(HIFIMINI)
#if defined(HIFIMINI) || defined(FUSION_F1)
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
#else
@@ -185,6 +248,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
#if defined(HIFIMINI)
  return PrepareHifimini(context, node);
#elif defined(FUSION_F1)
  return PrepareHifi4(context, node);
#else
  return SoftmaxPrepare(context, node);
#endif
@@ -208,7 +273,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                       TfLiteTypeGetName(input->type), input->type);
    return kTfLiteError;
  }
#else // !defined(HIFIMINI)
#else // !defined(HIFIMINI)
  switch (input->type) {
    case kTfLiteFloat32: {
      SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
@@ -221,12 +286,17 @@ }
    }
    case kTfLiteInt8: {
      if (output->type == kTfLiteInt16) {
#if defined(FUSION_F1)
        return EvalHifi4(static_cast<OpData*>(node->user_data), input, output,
                         context);
#else
        SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
        tflite::reference_ops::Softmax(
            op_data, tflite::micro::GetTensorShape(input),
            tflite::micro::GetTensorData<int8_t>(input),
            tflite::micro::GetTensorShape(output),
            tflite::micro::GetTensorData<int16_t>(output));
#endif
      } else {
        SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
        tflite::reference_ops::Softmax(
1 change: 1 addition & 0 deletions tensorflow/lite/micro/kernels/xtensa/xtensa.h
@@ -20,6 +20,7 @@ limitations under the License.
#include <xtensa/tie/xt_hifi2.h>
#elif defined(FUSION_F1)
#include "include/nnlib/xa_nnlib_api.h"
#include "include/nnlib/xa_nnlib_standards.h"
#endif

#endif // TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_XTENSA_H_
