From a3ba5d0dbc01f51fcec8f488305c90c2d33e16bf Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 17 May 2022 17:39:18 -0700 Subject: [PATCH] thneed: add flag to enable optimizer (#24568) * improve the thneed compiler * only init thneed if we are using the GPU Co-authored-by: Comma Device --- selfdrive/modeld/SConscript | 2 +- selfdrive/modeld/runners/snpemodel.cc | 11 +++++-- selfdrive/modeld/runners/snpemodel.h | 3 +- selfdrive/modeld/runners/thneedmodel.cc | 1 - selfdrive/modeld/thneed/compile.cc | 41 ++++++++++++++++++++++--- selfdrive/modeld/thneed/thneed.cc | 14 ++------- selfdrive/modeld/thneed/thneed.h | 3 +- 7 files changed, 53 insertions(+), 22 deletions(-) diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 859488356e3f79..1f1c661c8be3e9 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -65,7 +65,7 @@ common_model = lenv.Object(common_src) if use_thneed and arch == "larch64": fn = File("models/supercombo").abspath compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs) - cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} {fn}.dlc {fn}_badweights.thneed --binary" + cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} --in {fn}.dlc --out {fn}_badweights.thneed --binary --optimize" lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"]) kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels") diff --git a/selfdrive/modeld/runners/snpemodel.cc b/selfdrive/modeld/runners/snpemodel.cc index 44fa6ce298c389..d8ed708fa867e9 100644 --- a/selfdrive/modeld/runners/snpemodel.cc +++ b/selfdrive/modeld/runners/snpemodel.cc @@ -123,6 +123,12 @@ SNPEModel::SNPEModel(const char *path, float *loutput, size_t loutput_size, int outputBuffer = ubFactory.createUserBuffer(output, output_size * sizeof(float), outputStrides, &userBufferEncodingFloat); outputMap.add(output_tensor_name, outputBuffer.get()); } + +#ifdef USE_THNEED + if 
(Runtime == zdl::DlSystem::Runtime_t::GPU) { + thneed.reset(new Thneed()); + } +#endif } void SNPEModel::addRecurrent(float *state, int state_size) { @@ -176,7 +182,7 @@ std::unique_ptr<zdl::DlSystem::IUserBuffer> SNPEModel::addExtra(float *state, in void SNPEModel::execute() { #ifdef USE_THNEED if (Runtime == zdl::DlSystem::Runtime_t::GPU) { - if (thneed == NULL) { + if (!thneed_recorded) { bool ret = inputBuffer->setBufferAddress(input); assert(ret == true); if (use_extra) { @@ -188,7 +194,7 @@ void SNPEModel::execute() { PrintErrorStringAndExit(); } memset(recurrent, 0, recurrent_size*sizeof(float)); - thneed = new Thneed(); + thneed->record = true; if (!snpe->execute(inputMap, outputMap)) { PrintErrorStringAndExit(); } @@ -220,6 +226,7 @@ void SNPEModel::execute() { assert(false); } free(outputs_golden); + thneed_recorded = true; } else { if (use_extra) { float *inputs[5] = {recurrent, trafficConvention, desire, extra, input}; diff --git a/selfdrive/modeld/runners/snpemodel.h b/selfdrive/modeld/runners/snpemodel.h index 6e9c33f89c8f6d..ee5381d6a29c1c 100644 --- a/selfdrive/modeld/runners/snpemodel.h +++ b/selfdrive/modeld/runners/snpemodel.h @@ -32,7 +32,8 @@ class SNPEModel : public RunModel { void execute(); #ifdef USE_THNEED - Thneed *thneed = NULL; + std::unique_ptr<Thneed> thneed; + bool thneed_recorded = false; #endif private: diff --git a/selfdrive/modeld/runners/thneedmodel.cc b/selfdrive/modeld/runners/thneedmodel.cc index edc091bda9ca38..dbe80a9463b1d2 100644 --- a/selfdrive/modeld/runners/thneedmodel.cc +++ b/selfdrive/modeld/runners/thneedmodel.cc @@ -4,7 +4,6 @@ ThneedModel::ThneedModel(const char *path, float *loutput, size_t loutput_size, int runtime, bool luse_extra) { thneed = new Thneed(true); - thneed->record = 0; thneed->load(path); thneed->clexec(); thneed->find_inputs_outputs(); diff --git a/selfdrive/modeld/thneed/compile.cc b/selfdrive/modeld/thneed/compile.cc index c22156d2c9c2a0..8698ce482e224b 100644 --- a/selfdrive/modeld/thneed/compile.cc +++ 
b/selfdrive/modeld/thneed/compile.cc @@ -1,4 +1,5 @@ #include <cstring> +#include <getopt.h> #include "selfdrive/modeld/runners/snpemodel.h" #include "selfdrive/modeld/thneed/thneed.h" @@ -10,10 +11,36 @@ // TODO: This should probably use SNPE directly. int main(int argc, char* argv[]) { + bool run_optimizer = false, save_binaries = false; + const char *input_file = NULL, *output_file = NULL; + static struct option long_options[] = { + {"in", required_argument, 0, 'i' }, + {"out", required_argument, 0, 'o' }, + {"binary", no_argument, 0, 'b' }, + {"optimize", no_argument, 0, 'f' }, + {0, 0, 0, 0 } + }; + int long_index = 0, opt = 0; + while ((opt = getopt_long_only(argc, argv,"", long_options, &long_index)) != -1) { + switch (opt) { + case 'i': input_file = optarg; break; + case 'o': output_file = optarg; break; + case 'b': save_binaries = true; break; + case 'f': run_optimizer = true; break; + } + } + + // no input? + if (!input_file) { + printf("usage: -i <dlc file> -o <thneed file> --binary --optimize\n"); + return -1; + } + #define OUTPUT_SIZE 0x10000 float *output = (float*)calloc(OUTPUT_SIZE, sizeof(float)); - SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME, true); + SNPEModel mdl(input_file, output, 0, USE_GPU_RUNTIME, true); + mdl.thneed->run_optimizer = run_optimizer; float state[TEMPORAL_SIZE] = {0}; float desire[DESIRE_LEN] = {0}; @@ -32,14 +59,20 @@ int main(int argc, char* argv[]) { memset(output, 0, OUTPUT_SIZE * sizeof(float)); mdl.execute(); + // don't save? 
+ if (!output_file) { + printf("no output file, exiting\n"); + return 0; + } + // save model - bool save_binaries = (argc > 3) && (strcmp(argv[3], "--binary") == 0); - mdl.thneed->save(argv[2], save_binaries); + printf("saving %s with binary %d\n", output_file, save_binaries); + mdl.thneed->save(output_file, save_binaries); // test model auto thneed = new Thneed(true); thneed->record = false; - thneed->load(argv[2]); + thneed->load(output_file); thneed->clexec(); thneed->find_inputs_outputs(); diff --git a/selfdrive/modeld/thneed/thneed.cc b/selfdrive/modeld/thneed/thneed.cc index 470ff219b0e431..90b1200a1aee57 100644 --- a/selfdrive/modeld/thneed/thneed.cc +++ b/selfdrive/modeld/thneed/thneed.cc @@ -11,8 +11,6 @@ #include "selfdrive/common/clutil.h" #include "selfdrive/common/timing.h" -//#define RUN_DISASSEMBLER -#define RUN_OPTIMIZER Thneed *g_thneed = NULL; int g_fd = -1; @@ -203,11 +201,6 @@ void CachedCommand::exec() { for (auto &it : kq) { it->debug_print(false); } - #ifdef RUN_DISASSEMBLER - // assuming 2 commands - disassemble(0); - disassemble(1); - #endif } assert(ret == 0); @@ -220,7 +213,6 @@ Thneed::Thneed(bool do_clinit) { assert(g_fd != -1); fd = g_fd; ram = make_unique<GPUMalloc>(0x80000, fd); - record = true; timestamp = -1; g_thneed = this; char *thneed_debug_env = getenv("THNEED_DEBUG"); @@ -230,7 +222,7 @@ void Thneed::stop() { find_inputs_outputs(); printf("Thneed::stop: recorded %lu commands\n", cmds.size()); - record = 0; + record = false; } void Thneed::find_inputs_outputs() { @@ -416,9 +408,7 @@ cl_int thneed_clFinish(cl_command_queue command_queue) { Thneed *thneed = g_thneed; if (thneed != NULL && thneed->record) { - #ifdef RUN_OPTIMIZER - thneed->optimize(); - #endif + if (thneed->run_optimizer) thneed->optimize(); return thneed->clexec(); } else { return clFinish(command_queue); diff --git a/selfdrive/modeld/thneed/thneed.h b/selfdrive/modeld/thneed/thneed.h index b09d32b0ef9e8a..0ccea59a3c3e8a 100644 --- 
a/selfdrive/modeld/thneed/thneed.h +++ b/selfdrive/modeld/thneed/thneed.h @@ -94,6 +94,7 @@ class Thneed { void execute(float **finputs, float *foutput, bool slow=false); void wait(); int optimize(); + bool run_optimizer = false; vector<cl_mem> input_clmem; vector<void *> inputs; @@ -106,7 +107,7 @@ int context_id; // protected? - bool record; + bool record = false; int debug; int timestamp; unique_ptr<GPUMalloc> ram;