From 76d6411f505cc39bdb6f0a159bc311f603152cc8 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 21 Jul 2022 10:04:28 +0000
Subject: [PATCH] [ETHOSN] Get buffer sizes from the compiled network

The NPU support library compiler sometimes adds padding to input
tensors, which means the buffer sizes calculated at runtime can be
smaller than necessary. Instead, buffer sizes are now
collected at compile time and passed to the runtime so that they match
the sizes expected by the compiled network. This was seen when running
a fully connected operation with an input that is not a multiple of
1024, so testing has been added to cover this case.

Additionally changed the fully connected test case to use pytest
parameterization as part of a general cleanup, and fixed the fully
connected testing to support output channels > 1.

Change-Id: Iad319d75326b9ac41950de982603660a084dc27b
---
 src/relay/backend/contrib/ethosn/codegen.cc   | 17 ++++
 .../backend/contrib/ethosn/codegen_ethosn.h   | 13 +++
 src/runtime/contrib/ethosn/ethosn_device.cc   | 38 ++++----
 src/runtime/contrib/ethosn/ethosn_device.h    |  8 +-
 src/runtime/contrib/ethosn/ethosn_runtime.cc  | 14 ++-
 src/runtime/contrib/ethosn/ethosn_runtime.h   |  4 +
 .../test_ethosn/test_fullyconnected.py        | 95 ++++++++++---------
 .../contrib/test_ethosn/test_networks.py      | 10 +-
 8 files changed, 126 insertions(+), 73 deletions(-)

diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index 67ae1d20e3d0e..587c338205611 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -629,6 +629,7 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const
   // Determine the order that the inputs/outputs are in and how that corresponds to the
   // order that the TVM runtime will expect them in
   auto input_output_order = GetInputOutputOrder(network_with_ids, compiled_network);
+  auto io_sizes = GetIOSizes(compiled_network);
   // Use the order information to create an 'ordered' network with includes how to map
   // the inputs/outputs from the TVM runtime to the inputs/outputs of the compiled network
   runtime::ethosn::OrderedCompiledNetwork ordered_network;
@@ -636,6 +637,8 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const
   ordered_network.compiled_cmm = std::move(compiled_network);
   ordered_network.inputs = input_output_order.first;
   ordered_network.outputs = input_output_order.second;
+  ordered_network.input_sizes = io_sizes.first;
+  ordered_network.output_sizes = io_sizes.second;
   return ordered_network;
 }
 
@@ -686,6 +689,20 @@ std::pair<std::vector<uint32_t>, std::vector<uint32_t>> EthosnCompiler::GetInput
   return std::make_pair(input_order, output_order);
 }
 
+std::pair<std::vector<uint32_t>, std::vector<uint32_t>> EthosnCompiler::GetIOSizes(
+    const std::unique_ptr<sl::CompiledNetwork>& compiled_network) {
+  std::vector<uint32_t> input_sizes;
+  std::vector<uint32_t> output_sizes;
+  for (const sl::InputBufferInfo& info : compiled_network->GetInputBufferInfos()) {
+    input_sizes.push_back(info.m_Size);
+  }
+  for (const sl::OutputBufferInfo& info : compiled_network->GetOutputBufferInfos()) {
+    output_sizes.push_back(info.m_Size);
+  }
+  // Hand the vectors over to the pair instead of copying them a second time.
+  return std::make_pair(std::move(input_sizes), std::move(output_sizes));
+}
+
 std::unique_ptr<sl::SupportQueries> EthosnCompiler::m_Queries;
 
 EthosnError EthosnCompiler::SupportedSetup() {
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index 9da4e5b18bd5d..f3c9c77826e98 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -350,6 +350,19 @@ class EthosnCompiler {
   static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> GetInputOutputOrder(
       NetworkWithIDs network, const std::unique_ptr<sl::CompiledNetwork>& compiled_network);
 
+  /*!
+   * \brief Collect the input and output buffer sizes of a compiled network.
+   *
+   * The sizes are read from the compiled network rather than derived from tensor
+   * shapes, as the compiler can choose to add additional padding on the
+   * input/output in certain cases.
+   *
+   * \param compiled_network The network compiled by the NPU compiler.
+   * \return Pair of (input buffer sizes, output buffer sizes), one entry per buffer.
+   */
+  static std::pair<std::vector<uint32_t>, std::vector<uint32_t>> GetIOSizes(
+      const std::unique_ptr<sl::CompiledNetwork>& compiled_network);
+
   /*!
    * \brief Query interface used to determine if the Ethos-N hardware supports an operation
    * with the supplied parameters.
diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc
index 98717036385da..5f657da1787ad 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.cc
+++ b/src/runtime/contrib/ethosn/ethosn_device.cc
@@ -95,28 +95,28 @@ void CopyOutput(dl::Buffer* source_buffers[], std::vector<DLTensor*>* outputs) {
 }
 
 void CreateBuffers(std::vector<std::shared_ptr<dl::Buffer> >* fm,
-                   const std::vector<DLTensor*>& tensors, bool input) {
-  int index = 0;
-  for (auto buffer : tensors) {
-    auto* data = static_cast<uint8_t*>(buffer->data);
-    // The NPU only needs the size of the tensor * uint8_t.
-    auto data_size = static_cast<uint32_t>(GetDataSize(*buffer));
+                   const std::vector<DLTensor*>& tensors, const std::vector<uint32_t>& tensor_sizes,
+                   bool input) {
+  for (size_t i = 0; i < tensors.size(); i++) {
+    auto* data = static_cast<uint8_t*>(tensors[i]->data);
     if (input) {
-      (*fm)[index++] = std::make_shared<dl::Buffer>(data, data_size, dl::DataFormat::NHWC);
+      (*fm)[i] = std::make_shared<dl::Buffer>(data, tensor_sizes[i], dl::DataFormat::NHWC);
     } else {
-      (*fm)[index++] = std::make_shared<dl::Buffer>(data_size, dl::DataFormat::NHWC);
+      (*fm)[i] = std::make_shared<dl::Buffer>(tensor_sizes[i], dl::DataFormat::NHWC);
     }
   }
 }
 
 #if _ETHOSN_API_VERSION_ <= 2102
 bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #else
 bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #endif
   // Unpack parameters
   uint8_t argc = 0;
@@ -133,11 +133,11 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
 
   // Set up input buffers
   std::vector<std::shared_ptr<dl::Buffer> > ifm(inputs.size());
-  CreateBuffers(&ifm, inputs, true);
+  CreateBuffers(&ifm, inputs, input_sizes, true);
 
   // Set up output buffers
   std::vector<std::shared_ptr<dl::Buffer> > ofm(outputs.size());
-  CreateBuffers(&ofm, outputs, false);
+  CreateBuffers(&ofm, outputs, output_sizes, false);
 
   // Raw pointers for the inference
   dl::Buffer* ifm_raw[inputs.size()];
@@ -222,12 +222,14 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result")
 // Allow the ethos-n support code to be tested without a device
 #if _ETHOSN_API_VERSION_ <= 2102
 bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #else
 bool Inference(tvm::runtime::TVMArgs args, dl::Network* /* npu */,
-               const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order) {
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes) {
 #endif
   std::vector<DLTensor*> outputs;
   for (int argc = input_order.size(); argc < args.size(); argc++) {
diff --git a/src/runtime/contrib/ethosn/ethosn_device.h b/src/runtime/contrib/ethosn/ethosn_device.h
index d69be62aa603c..2d1e536ef8e78 100644
--- a/src/runtime/contrib/ethosn/ethosn_device.h
+++ b/src/runtime/contrib/ethosn/ethosn_device.h
@@ -41,10 +41,12 @@ using tvm::runtime::TVMArgs;
 
 #if _ETHOSN_API_VERSION_ <= 2102
 bool Inference(TVMArgs args, sl::CompiledNetwork* npu, const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order);
+               const std::vector<uint32_t>& output_order, const std::vector<uint32_t>& input_sizes,
+               const std::vector<uint32_t>& output_sizes);
 #else
-bool Inference(TVMArgs args, dl::Network* npu, const std::vector<uint32_t>& input_order,
-               const std::vector<uint32_t>& output_order);
+bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu,
+               const std::vector<uint32_t>& input_order, const std::vector<uint32_t>& output_order,
+               const std::vector<uint32_t>& input_sizes, const std::vector<uint32_t>& output_sizes);
 #endif
 
 }  // namespace ethosn
diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.cc b/src/runtime/contrib/ethosn/ethosn_runtime.cc
index 962d4db47eb93..295ff537b3795 100644
--- a/src/runtime/contrib/ethosn/ethosn_runtime.cc
+++ b/src/runtime/contrib/ethosn/ethosn_runtime.cc
@@ -60,6 +60,8 @@ EthosnModule::EthosnModule(std::vector<OrderedCompiledNetwork>* cmms) {
 #endif
     network_map_[it.name].inputs = it.inputs;
     network_map_[it.name].outputs = it.outputs;
+    network_map_[it.name].input_sizes = it.input_sizes;
+    network_map_[it.name].output_sizes = it.output_sizes;
   }
 }
 
@@ -69,10 +71,12 @@ PackedFunc EthosnModule::GetFunction(const std::string& name,
     return PackedFunc([sptr_to_self, this, name](TVMArgs args, TVMRetValue* rv) {
 #if _ETHOSN_API_VERSION_ <= 2102
       *rv = Inference(args, network_map_[name].compiled_cmm.get(), network_map_[name].inputs,
-                      network_map_[name].outputs);
+                      network_map_[name].outputs, network_map_[name].input_sizes,
+                      network_map_[name].output_sizes);
 #else
       *rv = Inference(args, network_map_[name].runtime_cmm.get(), network_map_[name].inputs,
-                      network_map_[name].outputs);
+                      network_map_[name].outputs, network_map_[name].input_sizes,
+                      network_map_[name].output_sizes);
 #endif
     });
   } else {
@@ -90,8 +94,10 @@ void EthosnModule::SaveToBinary(dmlc::Stream* stream) {
     stream->Write(ss.str());
     stream->Write(it.second.inputs.size());
     stream->Write(&it.second.inputs[0], sizeof(uint32_t) * it.second.inputs.size());
+    stream->Write(&it.second.input_sizes[0], sizeof(uint32_t) * it.second.input_sizes.size());
     stream->Write(it.second.outputs.size());
     stream->Write(&it.second.outputs[0], sizeof(uint32_t) * it.second.outputs.size());
+    stream->Write(&it.second.output_sizes[0], sizeof(uint32_t) * it.second.output_sizes.size());
   }
 }
 
@@ -128,12 +134,16 @@ Module EthosnModule::LoadFromBinary(void* strm) {
     compiled.inputs.resize(size);
     // Read the order of inputs
     stream->Read(&compiled.inputs[0], sizeof(uint32_t) * size);
+    compiled.input_sizes.resize(size);
+    stream->Read(&compiled.input_sizes[0], sizeof(uint32_t) * size);
     // Read the number of outputs
     stream->Read<uint64_t>(&output_size);
     size = static_cast<size_t>(output_size);
     compiled.outputs.resize(size);
     // Read the order of outputs
     stream->Read(&compiled.outputs[0], sizeof(uint32_t) * size);
+    compiled.output_sizes.resize(size);
+    stream->Read(&compiled.output_sizes[0], sizeof(uint32_t) * size);
   }
   auto n = make_object<EthosnModule>(&cmms);
   return Module(n);
diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h
index ed5d04143e8e5..b60250754b31e 100644
--- a/src/runtime/contrib/ethosn/ethosn_runtime.h
+++ b/src/runtime/contrib/ethosn/ethosn_runtime.h
@@ -52,6 +52,8 @@ struct OrderedCompiledNetwork {
   std::string name;
   std::vector<uint32_t> inputs;
   std::vector<uint32_t> outputs;
+  std::vector<uint32_t> input_sizes;
+  std::vector<uint32_t> output_sizes;
 };
 
 class EthosnModule : public ModuleNode {
@@ -88,8 +90,10 @@ class EthosnModule : public ModuleNode {
    *         std::string : serialized command stream
    *         size_t      : number of inputs
    *         std::vector : order of inputs
+   *         std::vector : buffer sizes for inputs
    *         size_t      : number of outputs
    *         std::vector : order of outputs
+   *         std::vector : buffer sizes for outputs
    *       ] * number of functions
    */
   static Module LoadFromBinary(void* strm);
diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py
index 7a636561f9cef..5e15e5a7c3423 100644
--- a/tests/python/contrib/test_ethosn/test_fullyconnected.py
+++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py
@@ -42,9 +42,9 @@ def _get_model(
         units=weight_shape[0],
         out_dtype="int32",
     )
-    b = tvm.nd.array(np.random.randint(0, high=255, size=(shape[0],), dtype="int32"))
+    b = tvm.nd.array(np.random.randint(0, high=255, size=(weight_shape[0],), dtype="int32"))
     biasc = relay.const(b, "int32")
-    bias = relay.nn.bias_add(fc, biasc, axis=0)
+    bias = relay.nn.bias_add(fc, biasc)
     req = relay.qnn.op.requantize(
         bias,
         relay.const(input_sc * kernel_sc, "float32"),  # input zero scale
@@ -58,55 +58,60 @@ def _get_model(
 
 
 @requires_ethosn
-@pytest.mark.parametrize("dtype", ["uint8"])
-def test_fullyconnected(dtype):
-    zp_min = np.iinfo(dtype).min
-    zp_max = np.iinfo(dtype).max
-    trials = [
-        ((1, 1024), zp_min + 71, 0.580, zp_max - 176, 1.498),
-        ((1, 4096), zp_min + 166, 1.724, zp_max - 138, 0.180),
-        ((1, 16384), zp_min + 101, 1.372, zp_max - 234, 1.346),
-    ]
+@pytest.mark.parametrize(
+    "shape,out_channels",
+    [
+        ((1, 1024), 64),
+        ((1, 16384), 1),
+        ((1, 1280), 1000),
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype,input_zp,input_sc,kernel_zp,kernel_sc",
+    [
+        ("uint8", 71, 0.580, 176, 1.498),
+        ("uint8", 166, 1.724, 138, 0.180),
+        ("int8", 71, 0.580, 0, 1.498),
+        ("int8", 120, 1.724, 0, 0.180),
+    ],
+)
+def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_zp, kernel_sc):
+    """
+    Test fully connected offloading.
+    """
     np.random.seed(0)
-    for shape, input_zp, input_sc, kernel_zp, kernel_sc in trials:
-        kernel_zp = (
-            0
-            if dtype == "int8"
-            else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + 1
-        )
-        inputs = {
-            "a": tvm.nd.array(
-                np.random.randint(
-                    np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype
-                )
-            ),
-        }
-        outputs = []
-        output_zp, output_sc = tei.get_conv2d_qnn_params(
-            dtype,
+    inputs = {
+        "a": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        ),
+    }
+
+    outputs = []
+    output_zp, output_sc = tei.get_conv2d_qnn_params(
+        dtype,
+        input_zp,
+        input_sc,
+        kernel_zp,
+        kernel_sc,
+        shape[0],
+        shape[1],
+        1,
+    )
+    for npu in [False, True]:
+        model, params = _get_model(
+            shape,
+            (out_channels, shape[1]),
             input_zp,
             input_sc,
             kernel_zp,
             kernel_sc,
-            shape[0],
-            shape[1],
-            1,
+            output_zp,
+            output_sc,
+            dtype,
         )
-        for npu in [False, True]:
-            model, params = _get_model(
-                shape,
-                shape,
-                input_zp,
-                input_sc,  # input zp, sc
-                kernel_zp,
-                kernel_sc,  # kernel
-                output_zp,
-                output_sc,  # output
-                dtype,
-            )
-            mod = tei.make_module(model, params)
-            outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
-        tei.verify(outputs, dtype, 1)
+        mod = tei.make_module(model, params)
+        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+    tei.verify(outputs, dtype, 1)
 
 
 @requires_ethosn
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 143ec0b88dfe4..3ab609afd9fb5 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -125,7 +125,7 @@ def test_mobilenet_v1():
     # on hardware that isn't available in CI.
     _compile_hash = {"393a19dfb980345cdd3bbeddbc36424d"}
     if tei.get_ethosn_api_version() == 2111:
-        _compile_hash = {"5d1c6a6bd4df8963866cc90405bf92dd"}
+        _compile_hash = {"c523c3c2bb9add1fee508217eb73af1a"}
     if tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"46ccafc840633633aca441645e41b444"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -152,7 +152,7 @@ def test_resnet_50_int8():
     # on hardware that isn't available in CI.
     if tei.get_ethosn_api_version() > 2011:
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
-            _compile_hash = {"c0a01c547ed1b2e3308094508fa1bfea", "434f0c65c41e24d5482142c88b3438fe"}
+            _compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "5b9d72b9accfea7ed89eb09ca0aa5487"}
             _test_image_network(
                 model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/"
                 "models/Quantized/resnet_50_quantized.tflite",
@@ -174,7 +174,7 @@ def test_inception_v3():
     # on hardware that isn't available in CI.
     _compile_hash = {"2c7ff5487e1a21e62b3b42eec624fed4"}
     if tei.get_ethosn_api_version() == 2111:
-        _compile_hash = {"e6abe33a7bc4a4170da53eefa6577bba"}
+        _compile_hash = {"88db2c7928240be9833c1b5ef367de28"}
     if tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"43dc2097127eb224c0191b1a15f8acca"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -200,7 +200,7 @@ def test_inception_v4():
     # on hardware that isn't available in CI.
     _compile_hash = {"4245dbd02e1432dc261a67fc8e632a00"}
     if tei.get_ethosn_api_version() == 2111:
-        _compile_hash = {"42e43c323ed8202f7b720ba9029bbcb7"}
+        _compile_hash = {"37648682f97cbbcecdc13945b7f2212f"}
     if tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"fab6c2297502f95d33079c6ce1a737f9"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":
@@ -227,7 +227,7 @@ def test_ssd_mobilenet_v1():
     _compile_hash = {"5ee8ed6af9a7f31fc14957b51a8e7423", "e6a91ccc47ba4c6b4614fcd676bd726f"}
     if tei.get_ethosn_api_version() == 2111:
         # TODO(Leo-arm): review split operator
-        _compile_hash = {"a37f900601b9493bd142e8aed16205e5", "afb68ca8f452d1f4a674b457b5e30f59"}
+        _compile_hash = {"6b699f94795785d31b39940a5cf84a81", "7b8b0a3ad7cfe1695dee187f21f03785"}
     if tei.get_ethosn_api_version() == 2102:
         _compile_hash = {"7795b6c67178da9d1f9b98063bad75b1", "10826406ae724e52f360a06c35ced09d"}
         if tei.get_ethosn_variant() == "Ethos-N78_1TOPS_2PLE_RATIO":