Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu - reduce memory usage in gen backends #1661

Merged
merged 1 commit into from
Sep 12, 2024
Merged

gpu - reduce memory usage in gen backends #1661

merged 1 commit into from
Sep 12, 2024

Conversation

jeremylt
Copy link
Member

Some memory usage reductions, and trimming out duplicated restriction applications.

@jeremylt
Copy link
Member Author

jeremylt commented Sep 11, 2024

Here's an example of an updated Ratel kernel. See CeedScalar *r_q_in_0 = r_e_in_0; (reused input buffer, skipped duplicate restriction) and CeedScalar *r_e_out_0 = r_e_in_1; (reused input buffer).

// Generated CUDA kernel: applies the "Mass" operator (read restriction -> basis action
// -> QFunction -> transpose basis action -> write restriction) over a batch of elements.
// One element is processed per threadIdx.z slab; 3D tensor-product basis, Q_1d = 3.
// Register-memory reductions demonstrated by this PR:
//   * r_q_in_0 aliases r_e_in_0 (EvalMode none needs no basis action, so no extra buffer)
//   * r_e_out_0 aliases r_e_in_1 (input 1's element buffer is dead after interpolation)
// NOTE(review): CeedScalar/CeedInt, SharedData_Cuda, the Fields_* structs, and the
// loadMatrix/readDofs*/Interp*/writeDofs* device helpers are declared elsewhere in
// libCEED; T_1D is presumably a JIT-time macro for the thread-block 1D extent — confirm
// against the cuda-gen backend source.
extern "C" __global__ void CeedKernelCudaGenOperator_Mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 3;
  const CeedInt Q_1d = 3;
  // Dynamically sized shared memory; each threadIdx.z slab gets its own T_1D*T_1D slice.
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 3;
  const CeedInt num_comp_in_0 = 10;
  // EvalMode: none (no basis matrix needed for this field)
  // -- Input field 1
  const CeedInt P_1d_in_1 = 3;
  const CeedInt num_comp_in_1 = 16;
  // EvalMode: interpolation
  // 1D interpolation matrix (P_1d_in_1 x Q_1d = 3x3) staged in shared memory.
  __shared__ CeedScalar s_B_in_1[9];
  loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 3;
  const CeedInt num_comp_out_0 = 16;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_out_0[9];
  loadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);

  // Element loop
  // Barrier: the basis matrices written to shared memory above must be visible to all
  // threads in the block before any basis action reads them.
  __syncthreads();
  // Grid-stride-style element loop: blockDim.z elements per block, strided by the total
  // number of z-slabs in the grid, so any grid size covers all num_elem elements.
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0];
    // Strides: {1, 729, 27}
    readDofsStrided3d<num_comp_in_0, P_1d_in_0,1,729,27>(data, elem, d_in_0, r_e_in_0);
    // EvalMode: none
    // No basis action: quadrature-point values equal the element values, so the
    // quadrature pointer simply aliases the element buffer instead of copying.
    CeedScalar *r_q_in_0 = r_e_in_0;
    // ---- Input field 1
    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
    const CeedInt l_size_in_1 = 5488;
    // CompStride: 1
    readDofsOffset3d<num_comp_in_1, 1, P_1d_in_1>(data, l_size_in_1, elem, indices.inputs[1], d_in_1, r_e_in_1);
    // EvalMode: interpolation
    CeedScalar r_q_in_1[num_comp_in_1*Q_1d];
    InterpTensor3d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, r_q_in_1);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1
      CeedScalar *r_s_in_1 = r_q_in_1;
      // -- Output fields
      // ---- Output field 0
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      Mass(ctx, Q_1d, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: interpolation
    // Reuse input 1's element buffer: r_e_in_1 is no longer needed after the forward
    // interpolation above, so the transpose result writes into it (memory reduction).
    CeedScalar *r_e_out_0 = r_e_in_1;
    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 5488;
    // CompStride: 1
    writeDofsOffset3d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}

@jeremylt jeremylt force-pushed the jeremy/gen-memory branch 2 times, most recently from 9ac67c4 to 56ed042 Compare September 11, 2024 18:58
@jeremylt
Copy link
Member Author

jeremylt commented Sep 11, 2024

Note to me - can improve using any input evec buffer or previously used output evec buffer that is at least as big as the required output buffer

@jeremylt
Copy link
Member Author

Ok, now in this PR for gen operators:

  1. duplicate restrictions are skipped
  2. all non-CEED_EVAL_NONE inputs share restriction scratch space
  3. all outputs share the same restriction scratch space as the inputs

@jeremylt
Copy link
Member Author

Here's the new generated code.

Of note

  1. CeedScalar *r_e_in_0 = r_e_scratch; - use of scratch buffer
  2. CeedScalar *r_e_in_1 = r_e_in_0; skipping restriction already done for field 0
// Generated CUDA kernel: applies the "SetupSurfaceGeometryBounded" operator over a batch
// of 2D surface elements, one element per threadIdx.z slab, Q_1d = 4.
// Memory reductions demonstrated by this PR:
//   * a single per-thread r_e_scratch buffer serves the element restriction for every
//     input field that needs a basis action
//   * input field 1 reuses input field 0's element data, skipping a duplicate
//     restriction read entirely (note d_in_1 is never read in the element loop)
// NOTE(review): CeedScalar/CeedInt, SharedData_Cuda, the Fields_* structs, and the
// loadMatrix/readDofs*/Interp*/Grad*/Weight*/writeDofs* device helpers are declared
// elsewhere in libCEED; T_1D is presumably a JIT-time macro — confirm against the
// cuda-gen backend source.
extern "C" __global__ void CeedKernelCudaGenOperator_SetupSurfaceGeometryBounded(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 2;
  const CeedInt Q_1d = 4;
  // Dynamically sized shared memory; each threadIdx.z slab gets its own T_1D*T_1D slice.
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 2;
  const CeedInt num_comp_in_0 = 3;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_0[8];
  loadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
  // -- Input field 1
  const CeedInt P_1d_in_1 = 2;
  const CeedInt num_comp_in_1 = 3;
  // EvalMode: gradient (needs both the interpolation and the gradient 1D matrices)
  __shared__ CeedScalar s_B_in_1[8];
  loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);
  __shared__ CeedScalar s_G_in_1[8];
  loadMatrix<P_1d_in_1, Q_1d>(data, G.inputs[1], s_G_in_1);
  // -- Input field 2
  // EvalMode: quadrature weights (no restriction or basis matrix; read from W)
  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 4;
  const CeedInt num_comp_out_0 = 4;
  // EvalMode: none (QFunction output is written back without a basis action)

  // Element loop
  // Barrier: basis matrices staged in shared memory above must be visible to all
  // threads in the block before any basis action reads them.
  __syncthreads();
  // Grid-stride-style element loop: blockDim.z elements per block, strided by the total
  // number of z-slabs in the grid.
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    // Single per-thread element buffer shared by all restrictions (this PR's reduction).
    CeedScalar r_e_scratch[64];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 192;
    // CompStride: 1
    readDofsOffset2d<num_comp_in_0, 1, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: interpolation
    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
    InterpTensor2d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
    // ---- Input field 1
    // Duplicate restriction skipped: field 1 aliases field 0's element data instead of
    // re-reading it (presumably both fields use the same restriction/l-vector — confirm
    // against the operator setup; d_in_1 is intentionally unused here).
    CeedScalar *r_e_in_1 = r_e_in_0;
    // EvalMode: gradient
    CeedScalar r_q_in_1[num_comp_in_1*dim*Q_1d];
    GradTensor2d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, s_G_in_1, r_q_in_1);
    // ---- Input field 2
    // EvalMode: quadrature weights
    CeedScalar r_q_in_2[Q_1d];
    WeightTensor2d<Q_1d>(data, W, r_q_in_2);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1
      CeedScalar *r_s_in_1 = r_q_in_1;
      // ---- Input field 2
      CeedScalar *r_s_in_2 = r_q_in_2;
      // -- Output fields
      // ---- Output field 0
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[3];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ------ Input field 2
      inputs[2] = r_s_in_2;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      SetupSurfaceGeometryBounded(ctx, 1, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: none
    // No transpose basis action needed: write the quadrature-point data directly.
    CeedScalar *r_e_out_0 = r_q_out_0;
    // Strides: {1, 144, 16}
    writeDofsStrided2d<num_comp_out_0, P_1d_out_0,1,144,16>(data, elem, r_e_out_0, d_out_0);
  }
}

also

  1. CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0]; - r_e_in_0 is aliased as r_q_in_0 and fed to the QFunction, so a dedicated buffer is needed (the shared scratch space cannot be recycled for it)
  2. CeedScalar *r_e_in_1 = r_e_scratch; - using scratch for input
  3. CeedScalar *r_e_out_0 = r_e_scratch; - and recycling same scratch for output
// Generated CUDA kernel: applies the "Mass" operator over a batch of 2D elements, one
// element per threadIdx.z slab, Q_1d = 4.
// Memory reductions demonstrated by this PR:
//   * input field 0 (EvalMode none) keeps a dedicated r_e_in_0 buffer, because it is
//     aliased as the QFunction input r_q_in_0 and must survive past later restrictions
//   * input field 1 reads its restriction into the shared r_e_scratch buffer
//   * output field 0 recycles the same r_e_scratch once input 1's element data is dead
// NOTE(review): CeedScalar/CeedInt, SharedData_Cuda, the Fields_* structs, and the
// loadMatrix/readDofs*/Interp*/writeDofs* device helpers are declared elsewhere in
// libCEED; T_1D is presumably a JIT-time macro — confirm against the cuda-gen backend.
extern "C" __global__ void CeedKernelCudaGenOperator_Mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 2;
  const CeedInt Q_1d = 4;
  // Dynamically sized shared memory; each threadIdx.z slab gets its own T_1D*T_1D slice.
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 4;
  const CeedInt num_comp_in_0 = 4;
  // EvalMode: none (no basis matrix needed for this field)
  // -- Input field 1
  const CeedInt P_1d_in_1 = 2;
  const CeedInt num_comp_in_1 = 3;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_1[8];
  loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 2;
  const CeedInt num_comp_out_0 = 3;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_out_0[8];
  loadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);

  // Element loop
  // Barrier: basis matrices staged in shared memory above must be visible to all
  // threads in the block before any basis action reads them.
  __syncthreads();
  // Grid-stride-style element loop: blockDim.z elements per block, strided by the total
  // number of z-slabs in the grid.
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    // Per-thread element buffer shared by the input-1 and output-0 restrictions.
    CeedScalar r_e_scratch[64];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    // Dedicated buffer (not scratch): this data is aliased as r_q_in_0 below and must
    // remain live through the QFunction call while scratch is reused by other fields.
    CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0];
    // Strides: {1, 144, 16}
    readDofsStrided2d<num_comp_in_0, P_1d_in_0,1,144,16>(data, elem, d_in_0, r_e_in_0);
    // EvalMode: none
    // No basis action: quadrature pointer aliases the element buffer.
    CeedScalar *r_q_in_0 = r_e_in_0;
    // ---- Input field 1
    // Restriction read lands in the shared scratch buffer (memory reduction).
    CeedScalar *r_e_in_1 = r_e_scratch;
    const CeedInt l_size_in_1 = 192;
    // CompStride: 1
    readDofsOffset2d<num_comp_in_1, 1, P_1d_in_1>(data, l_size_in_1, elem, indices.inputs[1], d_in_1, r_e_in_1);
    // EvalMode: interpolation
    CeedScalar r_q_in_1[num_comp_in_1*Q_1d];
    InterpTensor2d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, r_q_in_1);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1
      CeedScalar *r_s_in_1 = r_q_in_1;
      // -- Output fields
      // ---- Output field 0
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      Mass(ctx, 1, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: interpolation
    // Scratch is free again (input 1's element data is dead after its interpolation),
    // so the transpose basis action writes its element result back into r_e_scratch.
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor2d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 192;
    // CompStride: 1
    writeDofsOffset2d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}

@zatkins-dev zatkins-dev self-requested a review September 12, 2024 20:38
Copy link
Collaborator

@zatkins-dev zatkins-dev left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good to me. I'm not sure what's up with the failing test though, especially since the changes seem pretty identical between the two files.

@jeremylt
Copy link
Member Author

Yeah, I think that's just the ROCm CI continuing to be flaky

@jeremylt jeremylt merged commit 8386266 into main Sep 12, 2024
23 of 24 checks passed
@jeremylt jeremylt deleted the jeremy/gen-memory branch September 12, 2024 20:44
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants