modeld: PC Thneed prereqs (commaai#25615)
* pc thneed prereqs

* ugh, out of date

* that can stay private

* memcpy here is fine in SNPE variant

* release files

* thneed docs don't work anymore. they didn't look too useful

Co-authored-by: Comma Device <device@comma.ai>
2 people authored and rjsmith1999 committed Oct 8, 2022
1 parent d9c0576 commit 4d1b1cf
Showing 16 changed files with 366 additions and 263 deletions.
2 changes: 0 additions & 2 deletions docs/c_docs.rst
@@ -78,8 +78,6 @@ modeld
:project: selfdrive_modeld_transforms
.. autodoxygenindex::
:project: selfdrive_modeld_models
.. autodoxygenindex::
:project: selfdrive_modeld_thneed
.. autodoxygenindex::
:project: selfdrive_modeld_runners

4 changes: 3 additions & 1 deletion release/files_common
@@ -356,7 +356,9 @@ selfdrive/modeld/transforms/transform.h
selfdrive/modeld/transforms/transform.cl

selfdrive/modeld/thneed/*.py
selfdrive/modeld/thneed/thneed.*
selfdrive/modeld/thneed/thneed.h
selfdrive/modeld/thneed/thneed_common.cc
selfdrive/modeld/thneed/thneed_qcom2.cc
selfdrive/modeld/thneed/serialize.cc
selfdrive/modeld/thneed/compile.cc
selfdrive/modeld/thneed/optimizer.cc
3 changes: 2 additions & 1 deletion selfdrive/modeld/SConscript
@@ -23,7 +23,8 @@ common_src = [
]

thneed_src = [
"thneed/thneed.cc",
"thneed/thneed_common.cc",
"thneed/thneed_qcom2.cc",
"thneed/serialize.cc",
"thneed/optimizer.cc",
"runners/thneedmodel.cc",
2 changes: 1 addition & 1 deletion selfdrive/modeld/models/driving.cc
@@ -38,7 +38,7 @@ void model_init(ModelState* s, cl_device_id device_id, cl_context context) {
#else
s->m = std::make_unique<SNPEModel>("models/supercombo.dlc",
#endif
&s->output[0], NET_OUTPUT_SIZE, USE_GPU_RUNTIME, true);
&s->output[0], NET_OUTPUT_SIZE, USE_GPU_RUNTIME, true, false, context);

#ifdef TEMPORAL
s->m->addRecurrent(&s->output[OUTPUT_SIZE], TEMPORAL_SIZE);
35 changes: 24 additions & 11 deletions selfdrive/modeld/runners/onnx_runner.py
@@ -9,36 +9,46 @@

import onnxruntime as ort # pylint: disable=import-error

def read(sz):
def read(sz, tf8=False):
dd = []
gt = 0
while gt < sz * 4:
st = os.read(0, sz * 4 - gt)
szof = 1 if tf8 else 4
while gt < sz * szof:
st = os.read(0, sz * szof - gt)
assert(len(st) > 0)
dd.append(st)
gt += len(st)
return np.frombuffer(b''.join(dd), dtype=np.float32)
r = np.frombuffer(b''.join(dd), dtype=np.uint8 if tf8 else np.float32).astype(np.float32)
if tf8:
r = r / 255.
return r

def write(d):
os.write(1, d.tobytes())

def run_loop(m):
def run_loop(m, tf8_input=False):
ishapes = [[1]+ii.shape[1:] for ii in m.get_inputs()]
keys = [x.name for x in m.get_inputs()]

# run once to initialize CUDA provider
if "CUDAExecutionProvider" in m.get_providers():
m.run(None, dict(zip(keys, [np.zeros(shp, dtype=np.float32) for shp in ishapes])))

print("ready to run onnx model", keys, ishapes, file=sys.stderr)
while 1:
inputs = []
for shp in ishapes:
for k, shp in zip(keys, ishapes):
ts = np.product(shp)
#print("reshaping %s with offset %d" % (str(shp), offset), file=sys.stderr)
inputs.append(read(ts).reshape(shp))
inputs.append(read(ts, (k=='input_img' and tf8_input)).reshape(shp))
ret = m.run(None, dict(zip(keys, inputs)))
#print(ret, file=sys.stderr)
for r in ret:
write(r)


if __name__ == "__main__":
print(sys.argv, file=sys.stderr)
print("Onnx available providers: ", ort.get_available_providers(), file=sys.stderr)
options = ort.SessionOptions()
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
@@ -54,7 +64,10 @@ def run_loop(m):
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
provider = 'CPUExecutionProvider'

print("Onnx selected provider: ", [provider], file=sys.stderr)
ort_session = ort.InferenceSession(sys.argv[1], options, providers=[provider])
print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
run_loop(ort_session)
try:
print("Onnx selected provider: ", [provider], file=sys.stderr)
ort_session = ort.InferenceSession(sys.argv[1], options, providers=[provider])
print("Onnx using ", ort_session.get_providers(), file=sys.stderr)
run_loop(ort_session, tf8_input=("--use_tf8" in sys.argv))
except KeyboardInterrupt:
pass
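
The new tf8 path in read() moves the image input across the pipe as one uint8 per element instead of four bytes of float32, then rescales to [0, 1] on arrival — the same 1/255 scale the SNPE side declares with UserBufferEncodingTf8(0, 1./255). A minimal standalone sketch of that round trip (the shape is a made-up example, not the model's real input):

```python
import numpy as np

# Producer side: quantize a float image in [0, 1) down to one byte per element.
img_f32 = np.random.rand(1, 12, 128, 256).astype(np.float32)  # hypothetical shape
wire = (img_f32 * 255.).astype(np.uint8).tobytes()            # 4x smaller on the pipe

# Consumer side: what read(sz, tf8=True) does with the bytes it collects.
r = np.frombuffer(wire, dtype=np.uint8).astype(np.float32) / 255.

# Truncating to uint8 costs at most 1/255 of precision per element.
assert np.abs(r - img_f32.ravel()).max() < 1 / 255.
```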
6 changes: 4 additions & 2 deletions selfdrive/modeld/runners/onnxmodel.cc
@@ -14,12 +14,13 @@
#include "selfdrive/common/swaglog.h"
#include "selfdrive/common/util.h"

ONNXModel::ONNXModel(const char *path, float *_output, size_t _output_size, int runtime, bool _use_extra) {
ONNXModel::ONNXModel(const char *path, float *_output, size_t _output_size, int runtime, bool _use_extra, bool _use_tf8, cl_context context) {
LOGD("loading model %s", path);

output = _output;
output_size = _output_size;
use_extra = _use_extra;
use_tf8 = _use_tf8;

int err = pipe(pipein);
assert(err == 0);
@@ -28,11 +29,12 @@ ONNXModel::ONNXModel(const char *path, float *_output, size_t _output_size, int

std::string exe_dir = util::dir_name(util::readlink("/proc/self/exe"));
std::string onnx_runner = exe_dir + "/runners/onnx_runner.py";
std::string tf8_arg = use_tf8 ? "--use_tf8" : "";

proc_pid = fork();
if (proc_pid == 0) {
LOGD("spawning onnx process %s", onnx_runner.c_str());
char *argv[] = {(char*)onnx_runner.c_str(), (char*)path, nullptr};
char *argv[] = {(char*)onnx_runner.c_str(), (char*)path, (char*)tf8_arg.c_str(), nullptr};
dup2(pipein[0], 0);
dup2(pipeout[1], 1);
close(pipein[0]);
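On PC the ONNX runner is a forked child speaking raw bytes over stdin/stdout, and the optional --use_tf8 flag is simply appended to its argv. A hypothetical stand-in for the C++ parent, driving the runner from Python — the model path, input shape, and output size here are assumptions, not values from this commit:

```python
import subprocess
import numpy as np

OUTPUT_SIZE = 6504  # hypothetical; must match the model's output element count

proc = subprocess.Popen(
    ["python3", "onnx_runner.py", "supercombo.onnx", "--use_tf8"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE)

# With --use_tf8, the 'input_img' tensor travels as one uint8 per element;
# every other input would still be sent as raw float32.
img = np.zeros((1, 12, 128, 256), dtype=np.uint8)  # hypothetical shape
proc.stdin.write(img.tobytes())
proc.stdin.flush()

# The runner answers with each output tensor as raw float32 bytes.
out = np.frombuffer(proc.stdout.read(OUTPUT_SIZE * 4), dtype=np.float32)
print(out[:5])
```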
3 changes: 2 additions & 1 deletion selfdrive/modeld/runners/onnxmodel.h
@@ -6,7 +6,7 @@

class ONNXModel : public RunModel {
public:
ONNXModel(const char *path, float *output, size_t output_size, int runtime, bool use_extra = false);
ONNXModel(const char *path, float *output, size_t output_size, int runtime, bool use_extra = false, bool _use_tf8 = false, cl_context context = NULL);
~ONNXModel();
void addRecurrent(float *state, int state_size);
void addDesire(float *state, int state_size);
@@ -31,6 +31,7 @@ class ONNXModel : public RunModel {
int calib_size;
float *image_input_buf = NULL;
int image_buf_size;
bool use_tf8;
float *extra_input_buf = NULL;
int extra_buf_size;
bool use_extra;
1 change: 1 addition & 0 deletions selfdrive/modeld/runners/runmodel.h
@@ -1,4 +1,5 @@
#pragma once
#include "selfdrive/common/clutil.h"
class RunModel {
public:
virtual ~RunModel() {}
31 changes: 22 additions & 9 deletions selfdrive/modeld/runners/snpemodel.cc
@@ -6,19 +6,20 @@
#include <cstdlib>
#include <cstring>

#include "selfdrive/common/util.h"
#include "selfdrive/common/timing.h"
#include "common/util.h"
#include "common/timing.h"

void PrintErrorStringAndExit() {
std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
std::exit(EXIT_FAILURE);
}

SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra) {
SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra, bool luse_tf8, cl_context context) {
output = loutput;
output_size = loutput_size;
use_extra = luse_extra;
#if defined(QCOM) || defined(QCOM2)
use_tf8 = luse_tf8;
#ifdef QCOM2
if (runtime==USE_GPU_RUNTIME) {
Runtime = zdl::DlSystem::Runtime_t::GPU;
} else if (runtime==USE_DSP_RUNTIME) {
@@ -39,7 +40,7 @@ SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int
// create model runner
zdl::SNPE::SNPEBuilder snpeBuilder(container.get());
while (!snpe) {
#if defined(QCOM) || defined(QCOM2)
#ifdef QCOM2
snpe = snpeBuilder.setOutputLayers({})
.setRuntimeProcessor(Runtime)
.setUseUserSuppliedBuffers(true)
@@ -70,14 +71,16 @@ SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int
printf("model: %s -> %s\n", input_tensor_name, output_tensor_name);

zdl::DlSystem::UserBufferEncodingFloat userBufferEncodingFloat;
zdl::DlSystem::UserBufferEncodingTf8 userBufferEncodingTf8(0, 1./255); // network takes 0-1
zdl::DlSystem::IUserBufferFactory& ubFactory = zdl::SNPE::SNPEFactory::getUserBufferFactory();
size_t size_of_input = use_tf8 ? sizeof(uint8_t) : sizeof(float);

// create input buffer
{
const auto &inputDims_opt = snpe->getInputDimensions(input_tensor_name);
const zdl::DlSystem::TensorShape& bufferShape = *inputDims_opt;
std::vector<size_t> strides(bufferShape.rank());
strides[strides.size() - 1] = sizeof(float);
strides[strides.size() - 1] = size_of_input;
size_t product = 1;
for (size_t i = 0; i < bufferShape.rank(); i++) product *= bufferShape[i];
size_t stride = strides[strides.size() - 1];
@@ -86,7 +89,10 @@ SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int
strides[i-1] = stride;
}
printf("input product is %lu\n", product);
inputBuffer = ubFactory.createUserBuffer(NULL, product*sizeof(float), strides, &userBufferEncodingFloat);
inputBuffer = ubFactory.createUserBuffer(NULL,
product*size_of_input,
strides,
use_tf8 ? (zdl::DlSystem::UserBufferEncoding*)&userBufferEncodingTf8 : (zdl::DlSystem::UserBufferEncoding*)&userBufferEncodingFloat);

inputMap.add(input_tensor_name, inputBuffer.get());
}
@@ -123,6 +129,12 @@ SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int
outputBuffer = ubFactory.createUserBuffer(output, output_size * sizeof(float), outputStrides, &userBufferEncodingFloat);
outputMap.add(output_tensor_name, outputBuffer.get());
}

#ifdef USE_THNEED
if (Runtime == zdl::DlSystem::Runtime_t::GPU) {
thneed.reset(new Thneed());
}
#endif
}

void SNPEModel::addRecurrent(float *state, int state_size) {
@@ -176,7 +188,7 @@ std::unique_ptr<zdl::DlSystem::IUserBuffer> SNPEModel::addExtra(float *state, in
void SNPEModel::execute() {
#ifdef USE_THNEED
if (Runtime == zdl::DlSystem::Runtime_t::GPU) {
if (thneed == NULL) {
if (!thneed_recorded) {
bool ret = inputBuffer->setBufferAddress(input);
assert(ret == true);
if (use_extra) {
@@ -188,7 +200,7 @@ void SNPEModel::execute() {
PrintErrorStringAndExit();
}
memset(recurrent, 0, recurrent_size*sizeof(float));
thneed = new Thneed();
thneed->record = true;
if (!snpe->execute(inputMap, outputMap)) {
PrintErrorStringAndExit();
}
@@ -220,6 +232,7 @@ void SNPEModel::execute() {
assert(false);
}
free(outputs_golden);
thneed_recorded = true;
} else {
if (use_extra) {
float *inputs[5] = {recurrent, trafficConvention, desire, extra, input};
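The buffer setup derives byte strides innermost-first from the tensor shape; the only tf8-specific change is that the element size becomes sizeof(uint8_t) instead of sizeof(float). The same arithmetic in Python, with a made-up NHWC shape for illustration:

```python
def byte_strides(shape, elem_size):
    # Row-major: the stride of axis i is elem_size times the product of
    # every dimension after i — the loop mirrors the one in snpemodel.cc.
    strides = [elem_size] * len(shape)
    stride = elem_size
    for i in range(len(shape) - 1, 0, -1):
        stride *= shape[i]
        strides[i - 1] = stride
    return strides

print(byte_strides([1, 128, 256, 12], 1))  # tf8:     [393216, 3072, 12, 1]
print(byte_strides([1, 128, 256, 12], 4))  # float32: [1572864, 12288, 48, 4]
```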
9 changes: 6 additions & 3 deletions selfdrive/modeld/runners/snpemodel.h
@@ -1,4 +1,5 @@
#pragma once
#pragma clang diagnostic ignored "-Wdeprecated-declarations"

#include <DlContainer/IDlContainer.hpp>
#include <DlSystem/DlError.hpp>
@@ -22,7 +23,7 @@

class SNPEModel : public RunModel {
public:
SNPEModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra = false);
SNPEModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra = false, bool use_tf8 = false, cl_context context = NULL);
void addRecurrent(float *state, int state_size);
void addTrafficConvention(float *state, int state_size);
void addCalib(float *state, int state_size);
@@ -32,13 +33,14 @@ class SNPEModel : public RunModel {
void execute();

#ifdef USE_THNEED
Thneed *thneed = NULL;
std::unique_ptr<Thneed> thneed;
bool thneed_recorded = false;
#endif

private:
std::string model_data;

#if defined(QCOM) || defined(QCOM2)
#ifdef QCOM2
zdl::DlSystem::Runtime_t Runtime;
#endif

@@ -50,6 +52,7 @@
std::unique_ptr<zdl::DlSystem::IUserBuffer> inputBuffer;
float *input;
size_t input_size;
bool use_tf8;

// snpe output stuff
zdl::DlSystem::UserBufferMap outputMap;
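With the Thneed object now constructed up front and owned by a unique_ptr, execute() can no longer use "thneed == NULL" to mean "not captured yet"; the new thneed_recorded flag carries that state instead. The control flow reduces to a record-once, replay-after pattern — sketched here as toy Python, not the real Thneed API:

```python
class RecordOnce:
    """Toy stand-in for the thneed/thneed_recorded split in SNPEModel::execute()."""
    def __init__(self, slow_fn):
        self.slow_fn = slow_fn   # plays the role of snpe->execute()
        self.recorded = False
        self.trace = None        # plays the role of the captured GPU command stream

    def __call__(self, x):
        if not self.recorded:
            out = self.slow_fn(x)      # first call: run the full graph once...
            self.trace = self.slow_fn  # ...and keep the "recording" for later
            self.recorded = True
            return out
        return self.trace(x)           # later calls: replay, skipping SNPE entirely

model = RecordOnce(lambda x: x * 2)    # toy "network"
print(model(3), model(5))              # 6 10 — the second call takes the replay path
```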
5 changes: 2 additions & 3 deletions selfdrive/modeld/runners/thneedmodel.cc
@@ -2,9 +2,8 @@

#include <cassert>

ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra) {
thneed = new Thneed(true);
thneed->record = 0;
ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra, bool luse_tf8, cl_context context) {
thneed = new Thneed(true, context);
thneed->load(path);
thneed->clexec();
thneed->find_inputs_outputs();
2 changes: 1 addition & 1 deletion selfdrive/modeld/runners/thneedmodel.h
@@ -5,7 +5,7 @@

class ThneedModel : public RunModel {
public:
ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra = false);
ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra = false, bool use_tf8 = false, cl_context context = NULL);
void addRecurrent(float *state, int state_size);
void addTrafficConvention(float *state, int state_size);
void addDesire(float *state, int state_size);