AOT C Codegen Type Issue #8062

Closed

mehrdadh opened this issue May 17, 2021 · 12 comments

@mehrdadh
Member

I have a model that I tried to deploy using the AOT runtime. The model's final output has type int8, so I allocated a placeholder for the output like this:

int8_t output_data0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
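
Roughly, this buffer reaches the generated entry point as output_0 like in the sketch below. This is illustrative only: the input size and the type-code values are placeholders, and the entry function shown further down never checks the type codes.

#include <dlpack/dlpack.h>
#include <tvm/runtime/c_runtime_api.h>

/* Declaration matching the generated entry point shown below. */
extern int32_t tvm__run_func(void* args, void* arg_type_ids, int32_t num_args,
                             void* out_ret_value, void* out_ret_tcode, void* resource_handle);

int8_t input_data0[64] = {0};   /* hypothetical input buffer                */
int8_t output_data0[12] = {0};  /* the int8 placeholder described above     */

int run_model(void) {
  /* Only .data is populated; the generated code dereferences the handles as DLTensor*. */
  DLTensor input_tensor  = { .data = input_data0 };
  DLTensor output_tensor = { .data = output_data0 };

  TVMValue args[2];
  int32_t type_ids[2] = { kTVMDLTensorHandle, kTVMDLTensorHandle };  /* placeholder codes */
  args[0].v_handle = &input_tensor;
  args[1].v_handle = &output_tensor;  /* becomes output_0 inside tvm__run_func */

  return tvm__run_func(args, type_ids, 2, NULL, NULL, NULL);
}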

However, when I look at the C code generated for the AOT runtime library, here is what was generated:

TVM_DLL int32_t tvm__run_func(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* out_ret_tcode, void* resource_handle) {
  TVMValue stack[5];
  void* stack_tcode = stack;
  TVMValue stack1[9];
  void* stack_value = stack1;
  void* arg0 = (((TVMValue*)args)[0].v_handle);
  int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
  void* arg1 = (((TVMValue*)args)[1].v_handle);
  int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
  void* input_0 = arg0;
  void* output_0 = arg1;
...

TVMValue stack245[6];
  void* sid_59_value1 = stack245;
  (((DLTensor*)sid_59_value1)[0].data) = sid_59;
  TVMValue stack246[1];
  void* param_60_array = stack246;
  TVMValue stack247[1];
  void* ret_value110 = stack247;
  TVMValue stack248[1];
  void* ret_value111 = stack248;
  TVMValue stack249[1];
  void* param_60_value = stack249;
  (((TVMValue*)param_60_value)[0].v_int64) = 60;
  (void)_lookup_linked_param(param_60_value, 0, 0, ret_value111, ret_value110, 0);
  (((DLTensor*)param_60_array)[0].data) = (((TVMValue*)ret_value111)[0].v_handle);
  TVMValue stack250[1];
  void* param_61_array = stack250;
  TVMValue stack251[1];
  void* ret_value112 = stack251;
  TVMValue stack252[1];
  void* ret_value113 = stack252;
  TVMValue stack253[1];
  void* param_61_value = stack253;
  (((TVMValue*)param_61_value)[0].v_int64) = 61;
  (void)_lookup_linked_param(param_61_value, 0, 0, ret_value113, ret_value112, 0);
  (((DLTensor*)param_61_array)[0].data) = (((TVMValue*)ret_value113)[0].v_handle);
  (((TVMValue*)stack_value)[0].v_handle) = sid_59_value1;
  ((int32_t*)stack_tcode)[(0)] = 3;
  (((TVMValue*)stack_value)[1].v_handle) = param_60_array;
  ((int32_t*)stack_tcode)[(1)] = 3;
  (((TVMValue*)stack_value)[2].v_handle) = param_61_array;
  ((int32_t*)stack_tcode)[(2)] = 3;
  (((TVMValue*)stack_value)[3].v_handle) = output_0;
  ((int32_t*)stack_tcode)[(3)] = 3;
  TVMValue ret_val12;
  int ret_type_code12;
  if (fused_nn_contrib_dense_pack_add_fixed_point_multiply_add_clip_cast_cast_subtract_14669711146056581479_( (TVMValue*) stack_value , (int*) stack_tcode, 4, &ret_val12, &ret_type_code12, NULL) != 0){
    return -1;
  }
TVMValue stack254[6];
  void* sid_59_value2 = stack254;
  (((DLTensor*)sid_59_value2)[0].data) = sid_59;
  (((TVMValue*)stack_value)[0].v_handle) = output_0;
  ((int32_t*)stack_tcode)[(0)] = 3;
  (((TVMValue*)stack_value)[1].v_handle) = sid_59_value2;
  ((int32_t*)stack_tcode)[(1)] = 3;
  TVMValue ret_val13;
  int ret_type_code13;
  if (fused_nn_softmax( (TVMValue*) stack_value , (int*) stack_tcode, 2, &ret_val13, &ret_type_code13, NULL) != 0){
    return -1;
  }
  TVMValue stack255[6];
  void* sid_59_value3 = stack255;
  (((DLTensor*)sid_59_value3)[0].data) = sid_59;
  (((TVMValue*)stack_value)[0].v_handle) = sid_59_value3;
  ((int32_t*)stack_tcode)[(0)] = 3;
  (((TVMValue*)stack_value)[1].v_handle) = output_0;
  ((int32_t*)stack_tcode)[(1)] = 3;
  TVMValue ret_val14;
  int ret_type_code14;
  if (fused_divide_add_round_cast_clip_cast( (TVMValue*) stack_value , (int*) stack_tcode, 2, &ret_val14, &ret_type_code14, NULL) != 0){
    return -1;
  }

output_0 is the placeholder for the final output (output_data0) that we pass to tvm__run_func, and it has type int8. However, output_0 is also used by other intermediate functions and assigned other types such as float. For example, fused_nn_contrib_dense_pack_add_fixed_point_multiply_add_clip_cast_cast_subtract_14669711146056581479_ is defined here:

TVM_DLL int32_t fused_nn_contrib_dense_pack_add_fixed_point_multiply_add_clip_cast_cast_subtract_14669711146056581479_(void* args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* out_ret_tcode, void* resource_handle) {
  void* arg0 = (((TVMValue*)args)[0].v_handle);
  int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
  void* arg1 = (((TVMValue*)args)[1].v_handle);
  int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
  void* arg2 = (((TVMValue*)args)[2].v_handle);
  int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
  void* arg3 = (((TVMValue*)args)[3].v_handle);
  int32_t arg3_code = ((int32_t*)arg_type_ids)[(3)];
  void* placeholder = (((DLTensor*)arg0)[0].data);
  void* arg0_shape = (((DLTensor*)arg0)[0].shape);
  void* arg0_strides = (((DLTensor*)arg0)[0].strides);
  int32_t dev_id = (((DLTensor*)arg0)[0].device.device_id);
  void* placeholder1 = (((DLTensor*)arg1)[0].data);
  void* arg1_shape = (((DLTensor*)arg1)[0].shape);
  void* arg1_strides = (((DLTensor*)arg1)[0].strides);
  void* placeholder2 = (((DLTensor*)arg2)[0].data);
  void* arg2_shape = (((DLTensor*)arg2)[0].shape);
  void* arg2_strides = (((DLTensor*)arg2)[0].strides);
  void* T_multiply = (((DLTensor*)arg3)[0].data);
  void* arg3_shape = (((DLTensor*)arg3)[0].shape);
  void* arg3_strides = (((DLTensor*)arg3)[0].strides);
  if (!(arg0_strides == NULL)) {
  }
  if (!(arg1_strides == NULL)) {
  }
  if (!(arg2_strides == NULL)) {
  }
  if (!(arg3_strides == NULL)) {
  }
  void* compute_global = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)48, 0, 32);
  if (compute_global == NULL) {
    return -1;
  }
  for (int32_t x_c_init = 0; x_c_init < 12; ++x_c_init) {
    ((int32_t*)compute_global)[(x_c_init)] = 0;
  }
  for (int32_t k_outer = 0; k_outer < 64; ++k_outer) {
    for (int32_t x_c = 0; x_c < 12; ++x_c) {
      ((int32_t*)compute_global)[(x_c)] = (((int32_t*)compute_global)[(x_c)] + (((int32_t)((int16_t*)placeholder)[(k_outer)]) * ((int32_t)((int16_t*)placeholder1)[(((k_outer * 12) + x_c))])));
    }
  }
  for (int32_t ax1_inner_inner = 0; ax1_inner_inner < 12; ++ax1_inner_inner) {
    int32_t _1 = ((int32_t)(((((0 != 0) ? (((int64_t)(((int32_t*)compute_global)[(ax1_inner_inner)] + ((int32_t*)placeholder2)[(ax1_inner_inner)])) << ((int64_t)0)) : ((int64_t)(((int32_t*)compute_global)[(ax1_inner_inner)] + ((int32_t*)placeholder2)[(ax1_inner_inner)]))) * (int64_t)1278221461) + ((int64_t)1 << ((int64_t)((7 + 31) - 1)))) >> ((int64_t)(7 + 31)))) + 14;
    int32_t _2 = (_1) < (127) ? (_1) : (127);
    ((float*)T_multiply)[(ax1_inner_inner)] = (((float)(((int32_t)((int8_t)((_2) > (-128) ? (_2) : (-128)))) - 14)) * 1.446925e-01f);
  }
  if (TVMBackendFreeWorkspace(1, dev_id, compute_global) != 0) {
    return -1;
  }
  return 0;
}

Here T_multiply is output_0, which is interpreted as a float buffer, and this causes it to overwrite memory belonging to other variables.

One quick fix is to allocate the final output with the largest element size used in the graph (float32/float64) to avoid this problem, but we need a better way to fix it.
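
To make the failure mode concrete, here is a minimal sketch of the size mismatch (the 12-element shape comes from the snippet above; everything else is illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int8_t output_data0[12] = {0};           /* what the caller reserved: 12 bytes  */

  size_t reserved = sizeof(output_data0);  /* 12 bytes of int8                    */
  size_t written  = 12 * sizeof(float);    /* what the float store emits          */

  /* The fused kernel executes ((float*)T_multiply)[i] = ... for i in [0, 12),
   * so it writes 48 bytes through a pointer that only has 12 bytes behind it,
   * spilling 36 bytes into whatever the linker placed next to the buffer.      */
  printf("reserved %zu bytes, kernel writes %zu bytes (%zu-byte overrun)\n",
         reserved, written, written - reserved);
  return 0;
}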

@areusch
Contributor

areusch commented May 17, 2021

cc @Mousius @manupa-arm, I think graph memory planning is sharing the output buffer (here named output_data0) with other intermediates. We probably need to add logic to GraphPlanMemory to stop it from doing this when AOT is enabled, since AOT's API treats the output parameter as a user-supplied memory region.
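
Conceptually the change would be along these lines; this is a toy sketch of the reuse rule only, not GraphPlanMemory's actual data structures or API:

#include <stdbool.h>
#include <stddef.h>

typedef struct {
  size_t size;      /* bytes backing this storage slot                       */
  bool user_owned;  /* true for caller-supplied regions, e.g. the AOT output */
  bool in_use;      /* currently assigned to a live tensor                   */
} StorageSlot;

/* Reuse a free slot that is large enough, but never hand out a region the
 * caller owns; those must keep exactly the size and type the user declared. */
static StorageSlot* reuse_slot(StorageSlot* pool, size_t n, size_t needed) {
  for (size_t i = 0; i < n; ++i) {
    if (!pool[i].in_use && !pool[i].user_owned && pool[i].size >= needed) {
      pool[i].in_use = true;
      return &pool[i];
    }
  }
  return NULL;  /* no reusable slot: the planner would allocate a fresh one */
}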

@mehrdadh
Member Author

Thanks @gromero for helping to find this issue!

@manupak
Contributor

manupak commented May 18, 2021

@mehrdadh, can you confirm how you know the output type is int8?
Is this a quantized network?

I see symptoms of a different problem here, because there is a fused cast in the last operator. For argument's sake, can you use the last operator's arg type (post-cast, in fused_divide_add_round_cast_clip_cast) and see whether the issue persists? If my thinking is correct, the compiled model might expect the output type to be int16 rather than int8.
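
In other words, the experiment would be to size the placeholder for the suspected post-cast type; int16 here is only a guess, not something read from the model:

#include <stdint.h>

/* Hypothetical experiment: same element count, sized for the suspected int16 output. */
int16_t output_data0[12] = {0};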

cc @giuseros: I think there might be an int16 cast at the very end causing this.

@mehrdadh
Member Author

@manupa-arm it is a quantized network and the final output is expected to be int8. Here is the full AOT-generated library:
lib1.c

output_0 is reused in two other functions, which both interpret it as float, and that causes the issue. I have used a float model with the same setup and this issue doesn't happen, since reusing the output buffer in that case doesn't cause a type conflict.

@mehrdadh
Member Author

output_0 is used in fused_nn_contrib_dense_pack_add_fixed_point_multiply_add_clip_cast_cast_subtract_14669711146056581479_ as arg3, and also used in fused_divide_add_round_cast_clip_cast as arg1.

@manupak
Contributor

manupak commented May 20, 2021

Ack, yes, it's not what I thought.

giuseros pushed a commit to giuseros/incubator-tvm that referenced this issue May 20, 2021
In this PR we are decoupling AOT from the Graph Memory Planner. Since
AOT has the runner expressed in TIR we can get rid of the GMP in relay
and use the Storage Rewrite Pass to do memory planning on the runner
function. This also sorts out the issue mentioned in apache#8062

Change-Id: I6e33fadbf0462edf0366ee37e84ffde26123d3cb
@giuseros
Contributor

Hi all,
The main problem is that the Graph Memory Planner (in Relay) treats the output of the graph as just another temporary variable. When using the graph executor this temporary area is then copied into the user's real output, but in AOT it is the real output.

Why is this a problem? Because the GMP can expand temporaries so that they can be shared. Thinking the output is a temporary, it expands the output as well, but the output is not expandable in AOT (since it is provided by the user).

We were able to reproduce this with quantized MobileNet. The fix we came up with in #8096 is to remove the dependency on the GMP and use the TIR StorageRewrite pass on the runner function, which does not suffer from this problem.

The memory footprint is the same, so we basically killed two birds with one stone.
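
To make the "expansion" concrete, here is a toy version of the sizing rule (sizes are illustrative, not from the actual planner): a shared storage slot must be as large as the biggest tensor aliased onto it, which is fine for planner-owned temporaries but impossible when one of the aliased tensors is the caller's fixed-size output.

#include <stddef.h>

/* Toy sizing rule: a shared slot grows to the largest aliased tensor.
 * 12 x int8 = 12 bytes, 12 x float = 48 bytes, so the slot becomes 48 bytes. */
static size_t shared_slot_bytes(const size_t* tensor_bytes, size_t n) {
  size_t largest = 0;
  for (size_t i = 0; i < n; ++i) {
    if (tensor_bytes[i] > largest) {
      largest = tensor_bytes[i];
    }
  }
  return largest;
}

/* For a planner-owned temporary the 48 bytes are simply allocated; the
 * user-provided output_data0 is already fixed at 12 bytes, so aliasing a
 * 48-byte tensor onto it overruns the buffer. */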

@mehrdadh
Member Author

Thanks @giuseros!

@mehrdadh
Member Author

Will be solved by this PR: #8096

giuseros pushed a commit to giuseros/incubator-tvm that referenced this issue May 21, 2021
In this PR we are decoupling AOT from the Graph Memory Planner. Since
AOT has the runner expressed in TIR we can get rid of the GMP in relay
and use the Storage Rewrite Pass to do memory planning on the runner
function. This also sorts out the issue mentioned in apache#8062

Change-Id: I6e33fadbf0462edf0366ee37e84ffde26123d3cb
@areusch
Contributor

areusch commented Jun 10, 2021

Will close this when the PR or another solution merges.

areusch reopened this Jun 10, 2021
@Mousius
Member

Mousius commented Aug 6, 2021

@areusch / @mehrdadh - did the PR fix this issue in the end? Can we close this? 😸

@mehrdadh
Member Author

mehrdadh commented Aug 6, 2021

@mehrdadh yes, this issue is resolved.
Thanks for the reminder.

mehrdadh closed this as completed Aug 6, 2021