Skip to content

Commit

Permalink
Use xa_nnlib for svdf for Fusion F1.
Browse files Browse the repository at this point in the history
The code in this change is the subset of functionality needed for int8
svdf for Hifi4 copied from https://github.com/pnikam-cad/tensorflow/blob/a737c1e3945bc70022259479ad24133a343ec906/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc

Note that the current change has not pulled in either the floating point
implementation or the Hifi5 implementation.

Profiled the keyword_benchmark with the following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa OPTIMIZED_KERNEL_DIR=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade run_keyword_benchmark -j8
```

This gives a latency of 38516 ticks with this change vs. 152642 ticks without it.

Per OP latency with this change:
```
KeywordRunNIerations(1) took 38516 ticks (38 ms)
QUANTIZE took 3758 ticks (3 ms).
SVDF took 4753 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 3145 ticks (3 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 2890 ticks (2 ms).
SVDF took 3583 ticks (3 ms).
SVDF took 3054 ticks (3 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 2042 ticks (2 ms).
QUANTIZE took 366 ticks (0 ms).
```

Without this change:
```
KeywordRunNIerations(1) took 152642 ticks (152 ms)
QUANTIZE took 3758 ticks (3 ms).
SVDF took 38003 ticks (38 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 18803 ticks (18 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 18803 ticks (18 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 18803 ticks (18 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 13907 ticks (13 ms).
SVDF took 15827 ticks (15 ms).
SVDF took 15827 ticks (15 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 2042 ticks (2 ms).
QUANTIZE took 366 ticks (0 ms).
```

Also confirmed that the kernel_svdf_test passes with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa OPTIMIZED_KERNEL_DIR=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade test_kernel_svdf_test -j8
```
  • Loading branch information
advaitjain committed Feb 11, 2021
1 parent ed58135 commit de53e77
Showing 1 changed file with 92 additions and 15 deletions.
107 changes: 92 additions & 15 deletions tensorflow/lite/micro/kernels/xtensa/svdf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@ constexpr int kOutputTensor = 0;
* Note: passing OpData by value might seem like an oversight but it helps
* reduce the latency. See b/155656675 for more details.
*/
void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
const TfLiteEvalTensor* input_tensor,
const TfLiteEvalTensor* weights_feature_tensor,
const TfLiteEvalTensor* weights_time_tensor,
const TfLiteEvalTensor* bias_tensor,
const TfLiteSVDFParams* params,
TfLiteEvalTensor* activation_state_tensor,
TfLiteEvalTensor* output_tensor, OpData data) {
void EvalIntegerSvdfHifimini(TfLiteContext* context, TfLiteNode* node,
const TfLiteEvalTensor* input_tensor,
const TfLiteEvalTensor* weights_feature_tensor,
const TfLiteEvalTensor* weights_time_tensor,
const TfLiteEvalTensor* bias_tensor,
const TfLiteSVDFParams* params,
TfLiteEvalTensor* activation_state_tensor,
TfLiteEvalTensor* output_tensor, OpData data) {
const int n_rank = params->rank;
const int n_batch = input_tensor->dims->data[0];
const int n_input = input_tensor->dims->data[1];
Expand Down Expand Up @@ -243,7 +243,75 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
}
}
}
#endif

#elif defined(FUSION_F1)

// Integer SVDF evaluation for the FUSION_F1 build, implemented with xa_nn_*
// kernel calls instead of hand-written loops.
//
// Steps, in order:
//   1. Move the int16 activation state down by one element (left shift).
//   2. For every batch, run the feature "matXvec" kernel on the int8 input
//      and int8 feature weights, writing int16 results into the state buffer.
//   3. For every batch, run the 16x16 dot-product kernel on the int16 time
//      weights and the state, with the int32 bias, writing int8 output.
//
// Unlike the HIFIMINI variant in this file, `data` is taken by const
// reference and the function reports a status.
//
// Returns kTfLiteOk when every xa_nn_* call returned 0. A non-zero return is
// routed through TF_LITE_ENSURE_EQ, which is expected to log and return an
// error status early (macro defined outside this view -- confirm).
//
// NOTE(review): `node` is not referenced anywhere in this body.
// NOTE(review): n_rank is used as a divisor without a zero check here;
// presumably validated before Eval is reached -- confirm.
TfLiteStatus EvalIntegerSvdfHifi4(
TfLiteContext* context, TfLiteNode* node,
const TfLiteEvalTensor* input_tensor,
const TfLiteEvalTensor* weights_feature_tensor,
const TfLiteEvalTensor* weights_time_tensor,
const TfLiteEvalTensor* bias_tensor, const TfLiteSVDFParams* params,
TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor,
const OpData& data) {
// Problem dimensions, read from the op params and the tensor dims.
const int n_rank = params->rank;
const int n_batch = input_tensor->dims->data[0];
const int n_input = input_tensor->dims->data[1];
const int n_filter = weights_feature_tensor->dims->data[0];
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];

TFLITE_DCHECK(context != nullptr);
// NOTE(review): GetScratchBuffer is checked but never called in this
// function; possibly carried over from another variant -- confirm whether
// the check is still wanted.
TFLITE_DCHECK(context->GetScratchBuffer != nullptr);

// Shift states.
int16_t* const state_ptr =
tflite::micro::GetTensorData<int16_t>(activation_state_tensor);

// Left shift the activation_state.
// num_bytes covers every int16 element of the state except one
// (n_batch * n_filter * n_memory - 1 elements), since the source pointer
// starts one element past the destination.
// NOTE(review): confirm that xa_nn_memmove_16 expects its length argument in
// bytes rather than in 16-bit elements.
int num_bytes = sizeof(*state_ptr) * (n_batch * n_filter * n_memory - 1);
xa_nn_memmove_16(state_ptr, state_ptr + 1, num_bytes);

// Note: no need to clear the latest activation, matmul is not accumulative.

// Feature matmul.
const int8_t* input = tflite::micro::GetTensorData<int8_t>(input_tensor);
const int8_t* weight_feature =
tflite::micro::GetTensorData<int8_t>(weights_feature_tensor);
// Output base is offset by (n_memory - 1) elements into the state buffer.
// Presumably this is the newest time slot of the first filter, with the
// n_memory argument below acting as the output stride between filters --
// confirm against the NNLib signature.
int16_t* result_in_batch = state_ptr + (n_memory - 1);

// One kernel call per batch: the input advances by n_input and the output by
// n_filter * n_memory per batch. NULL is passed in the bias position, and the
// input zero point is passed negated.
for (int b = 0; b < n_batch; b++) {
TF_LITE_ENSURE_EQ(context,
xa_nn_matXvec_out_stride_sym8sxasym8s_16(
&result_in_batch[b * n_filter * n_memory],
weight_feature, &input[b * n_input], NULL, n_filter,
n_input, n_input, n_memory, -data.input_zero_point,
(data.effective_scale_1_a), data.effective_scale_1_b),
0);
}

// Time-weights stage, again one kernel call per batch. The time weights and
// bias pointers are the same for every batch; the state pointer advances by
// n_memory * n_filter and the output pointer by n_unit per batch.
// Presumably the kernel produces n_unit outputs, each a dot product of
// length n_memory * n_rank, then adds bias and requantizes with
// effective_scale_2_{a,b} and output_zero_point -- confirm against NNLib docs.
for (int b = 0; b < n_batch; ++b) {
const int16_t* vector1_ptr =
tflite::micro::GetTensorData<int16_t>(weights_time_tensor);
const int16_t* vector2_ptr =
tflite::micro::GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
const int32_t* bias_ptr =
tflite::micro::GetTensorData<int32_t>(bias_tensor);
int8_t* output_ptr =
tflite::micro::GetTensorData<int8_t>(output_tensor) + b * n_unit;

TF_LITE_ENSURE_EQ(
context,
xa_nn_dot_prod_16x16_asym8s(
output_ptr, vector1_ptr, vector2_ptr, bias_ptr, n_memory * n_rank,
(data.effective_scale_2_a), data.effective_scale_2_b,
data.output_zero_point, n_unit),
0);
}
return kTfLiteOk;
}
#endif // defined(FUSION_F1) || defined(HIFIMINI)

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context != nullptr);
Expand Down Expand Up @@ -274,11 +342,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const int rank = params->rank;
const int input_size = input->dims->data[1];
const int batch_size = input->dims->data[0];

#if defined(HIFIMINI)
// Ensure the input size is a multiple of two. This is necessary since
// optimized kernels access the memory in chunks of two, and all accesses
// must be aligned to 16 bits.
// TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
#endif // defined(HIFIMINI)

const int num_filters = weights_feature->dims->data[0];
TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
Expand Down Expand Up @@ -339,9 +410,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
static_cast<double>(activation_state->params.scale *
weights_time->params.scale / output->params.scale);

TF_LITE_ENSURE_EQ(context, static_cast<double>(bias->params.scale),
static_cast<double>(activation_state->params.scale *
weights_time->params.scale));
TF_LITE_ENSURE_NEAR(context, static_cast<double>(bias->params.scale),
static_cast<double>(activation_state->params.scale *
weights_time->params.scale),
1e-5);

TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
Expand Down Expand Up @@ -396,13 +468,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const OpData& data = *(static_cast<const OpData*>(node->user_data));

#if defined(HIFIMINI)
EvalIntegerSVDF(context, node, input, weights_feature, weights_time, bias,
params, activation_state, output, data);
EvalIntegerSvdfHifimini(context, node, input, weights_feature, weights_time,
bias, params, activation_state, output, data);
return kTfLiteOk;
#elif defined(FUSION_F1)
return EvalIntegerSvdfHifi4(context, node, input, weights_feature,
weights_time, bias, params, activation_state,
output, data);
#else
EvalIntegerSvdfReference(context, node, input, weights_feature, weights_time,
bias, params, activation_state, output, data);
#endif
return kTfLiteOk;
#endif
}

} // namespace
Expand Down

0 comments on commit de53e77

Please sign in to comment.