Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu - reduce memory usage in gen backends #1661

Merged
merged 1 commit into from
Sep 12, 2024
Merged

gpu - reduce memory usage in gen backends #1661

merged 1 commit into from
Sep 12, 2024

Conversation

jeremylt
Copy link
Member

Some memory usage reductions, and trimming out duplicated restriction applications.

@jeremylt
Copy link
Member Author

jeremylt commented Sep 11, 2024

Here's an example of an updated Ratel kernel. See CeedScalar *r_q_in_0 = r_e_in_0; (reused input buffer, skipped duplicate restriction) and CeedScalar *r_e_out_0 = r_e_in_1; (reused input buffer).

// Generated CUDA kernel: applies the "Mass" operator (read restriction -> basis action
// -> QFunction -> transpose basis action -> write restriction) over a batch of elements.
// One element is processed per threadIdx.z slab; 3D tensor-product basis, Q_1d = 3.
// Register-memory reductions demonstrated by this PR:
//   * r_q_in_0 aliases r_e_in_0 (EvalMode none needs no basis action, so no extra buffer)
//   * r_e_out_0 aliases r_e_in_1 (input 1's element buffer is dead after interpolation)
// NOTE(review): CeedScalar/CeedInt, SharedData_Cuda, the Fields_* structs, and the
// loadMatrix/readDofs*/Interp*/writeDofs* device helpers are declared elsewhere in
// libCEED; T_1D is presumably a JIT-time macro for the thread-block 1D extent — confirm
// against the cuda-gen backend source.
extern "C" __global__ void CeedKernelCudaGenOperator_Mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 3;
  const CeedInt Q_1d = 3;
  // Dynamically sized shared memory; each threadIdx.z slab gets its own T_1D*T_1D slice.
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 3;
  const CeedInt num_comp_in_0 = 10;
  // EvalMode: none (no basis matrix needed for this field)
  // -- Input field 1
  const CeedInt P_1d_in_1 = 3;
  const CeedInt num_comp_in_1 = 16;
  // EvalMode: interpolation
  // 1D interpolation matrix (P_1d_in_1 x Q_1d = 3x3) staged in shared memory.
  __shared__ CeedScalar s_B_in_1[9];
  loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 3;
  const CeedInt num_comp_out_0 = 16;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_out_0[9];
  loadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);

  // Element loop
  // Barrier: the basis matrices written to shared memory above must be visible to all
  // threads in the block before any basis action reads them.
  __syncthreads();
  // Grid-stride-style element loop: blockDim.z elements per block, strided by the total
  // number of z-slabs in the grid, so any grid size covers all num_elem elements.
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0];
    // Strides: {1, 729, 27}
    readDofsStrided3d<num_comp_in_0, P_1d_in_0,1,729,27>(data, elem, d_in_0, r_e_in_0);
    // EvalMode: none
    // No basis action: quadrature-point values equal the element values, so the
    // quadrature pointer simply aliases the element buffer instead of copying.
    CeedScalar *r_q_in_0 = r_e_in_0;
    // ---- Input field 1
    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
    const CeedInt l_size_in_1 = 5488;
    // CompStride: 1
    readDofsOffset3d<num_comp_in_1, 1, P_1d_in_1>(data, l_size_in_1, elem, indices.inputs[1], d_in_1, r_e_in_1);
    // EvalMode: interpolation
    CeedScalar r_q_in_1[num_comp_in_1*Q_1d];
    InterpTensor3d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, r_q_in_1);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1
      CeedScalar *r_s_in_1 = r_q_in_1;
      // -- Output fields
      // ---- Output field 0
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      Mass(ctx, Q_1d, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: interpolation
    // Reuse input 1's element buffer: r_e_in_1 is no longer needed after the forward
    // interpolation above, so the transpose result writes into it (memory reduction).
    CeedScalar *r_e_out_0 = r_e_in_1;
    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 5488;
    // CompStride: 1
    writeDofsOffset3d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}

@jeremylt jeremylt force-pushed the jeremy/gen-memory branch 2 times, most recently from 9ac67c4 to 56ed042 Compare September 11, 2024 18:58
@jeremylt
Copy link
Member Author

jeremylt commented Sep 11, 2024

Note to me - can improve using any input evec buffer or previously used output evec buffer that is at least as big as the required output buffer

@jeremylt
Copy link
Member Author

Ok, now in this PR for gen operators:

  1. duplicate restrictions are skipped
  2. all non-CEED_EVAL_NONE inputs share restriction scratch space
  3. all outputs share the same restriction scratch space as the inputs

@jeremylt
Copy link
Member Author

Here's the new generated code.

Of note

  1. CeedScalar *r_e_in_0 = r_e_scratch; - use of scratch buffer
  2. CeedScalar *r_e_in_1 = r_e_in_0; skipping restriction already done for field 0
// Generated CUDA kernel: applies the "SetupSurfaceGeometryBounded" operator over a batch
// of 2D surface elements, one element per threadIdx.z slab, Q_1d = 4.
// Memory reductions demonstrated by this PR:
//   * a single per-thread r_e_scratch buffer serves the element restriction for every
//     input field that needs a basis action
//   * input field 1 reuses input field 0's element data, skipping a duplicate
//     restriction read entirely (note d_in_1 is never read in the element loop)
// NOTE(review): CeedScalar/CeedInt, SharedData_Cuda, the Fields_* structs, and the
// loadMatrix/readDofs*/Interp*/Grad*/Weight*/writeDofs* device helpers are declared
// elsewhere in libCEED; T_1D is presumably a JIT-time macro — confirm against the
// cuda-gen backend source.
extern "C" __global__ void CeedKernelCudaGenOperator_SetupSurfaceGeometryBounded(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 2;
  const CeedInt Q_1d = 4;
  // Dynamically sized shared memory; each threadIdx.z slab gets its own T_1D*T_1D slice.
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 2;
  const CeedInt num_comp_in_0 = 3;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_0[8];
  loadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
  // -- Input field 1
  const CeedInt P_1d_in_1 = 2;
  const CeedInt num_comp_in_1 = 3;
  // EvalMode: gradient (needs both the interpolation and the gradient 1D matrices)
  __shared__ CeedScalar s_B_in_1[8];
  loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);
  __shared__ CeedScalar s_G_in_1[8];
  loadMatrix<P_1d_in_1, Q_1d>(data, G.inputs[1], s_G_in_1);
  // -- Input field 2
  // EvalMode: quadrature weights (no restriction or basis matrix; read from W)
  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 4;
  const CeedInt num_comp_out_0 = 4;
  // EvalMode: none (QFunction output is written back without a basis action)

  // Element loop
  // Barrier: basis matrices staged in shared memory above must be visible to all
  // threads in the block before any basis action reads them.
  __syncthreads();
  // Grid-stride-style element loop: blockDim.z elements per block, strided by the total
  // number of z-slabs in the grid.
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    // Single per-thread element buffer shared by all restrictions (this PR's reduction).
    CeedScalar r_e_scratch[64];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 192;
    // CompStride: 1
    readDofsOffset2d<num_comp_in_0, 1, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: interpolation
    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
    InterpTensor2d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
    // ---- Input field 1
    // Duplicate restriction skipped: field 1 aliases field 0's element data instead of
    // re-reading it (presumably both fields use the same restriction/l-vector — confirm
    // against the operator setup; d_in_1 is intentionally unused here).
    CeedScalar *r_e_in_1 = r_e_in_0;
    // EvalMode: gradient
    CeedScalar r_q_in_1[num_comp_in_1*dim*Q_1d];
    GradTensor2d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, s_G_in_1, r_q_in_1);
    // ---- Input field 2
    // EvalMode: quadrature weights
    CeedScalar r_q_in_2[Q_1d];
    WeightTensor2d<Q_1d>(data, W, r_q_in_2);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1
      CeedScalar *r_s_in_1 = r_q_in_1;
      // ---- Input field 2
      CeedScalar *r_s_in_2 = r_q_in_2;
      // -- Output fields
      // ---- Output field 0
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[3];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ------ Input field 2
      inputs[2] = r_s_in_2;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      SetupSurfaceGeometryBounded(ctx, 1, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: none
    // No transpose basis action needed: write the quadrature-point data directly.
    CeedScalar *r_e_out_0 = r_q_out_0;
    // Strides: {1, 144, 16}
    writeDofsStrided2d<num_comp_out_0, P_1d_out_0,1,144,16>(data, elem, r_e_out_0, d_out_0);
  }
}

also

  1. CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0]; - r_e_in_0 is aliased as r_q_in_0 and fed to the QFunction, so a dedicated buffer is needed (the shared scratch space cannot be recycled for it)
  2. CeedScalar *r_e_in_1 = r_e_scratch; - using scratch for input
  3. CeedScalar *r_e_out_0 = r_e_scratch; - and recycling same scratch for output
// Generated CUDA kernel: applies the "Mass" operator over a batch of 2D elements, one
// element per threadIdx.z slab, Q_1d = 4.
// Memory reductions demonstrated by this PR:
//   * input field 0 (EvalMode none) keeps a dedicated r_e_in_0 buffer, because it is
//     aliased as the QFunction input r_q_in_0 and must survive past later restrictions
//   * input field 1 reads its restriction into the shared r_e_scratch buffer
//   * output field 0 recycles the same r_e_scratch once input 1's element data is dead
// NOTE(review): CeedScalar/CeedInt, SharedData_Cuda, the Fields_* structs, and the
// loadMatrix/readDofs*/Interp*/writeDofs* device helpers are declared elsewhere in
// libCEED; T_1D is presumably a JIT-time macro — confirm against the cuda-gen backend.
extern "C" __global__ void CeedKernelCudaGenOperator_Mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 2;
  const CeedInt Q_1d = 4;
  // Dynamically sized shared memory; each threadIdx.z slab gets its own T_1D*T_1D slice.
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 4;
  const CeedInt num_comp_in_0 = 4;
  // EvalMode: none (no basis matrix needed for this field)
  // -- Input field 1
  const CeedInt P_1d_in_1 = 2;
  const CeedInt num_comp_in_1 = 3;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_1[8];
  loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 2;
  const CeedInt num_comp_out_0 = 3;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_out_0[8];
  loadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);

  // Element loop
  // Barrier: basis matrices staged in shared memory above must be visible to all
  // threads in the block before any basis action reads them.
  __syncthreads();
  // Grid-stride-style element loop: blockDim.z elements per block, strided by the total
  // number of z-slabs in the grid.
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    // Per-thread element buffer shared by the input-1 and output-0 restrictions.
    CeedScalar r_e_scratch[64];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    // Dedicated buffer (not scratch): this data is aliased as r_q_in_0 below and must
    // remain live through the QFunction call while scratch is reused by other fields.
    CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0];
    // Strides: {1, 144, 16}
    readDofsStrided2d<num_comp_in_0, P_1d_in_0,1,144,16>(data, elem, d_in_0, r_e_in_0);
    // EvalMode: none
    // No basis action: quadrature pointer aliases the element buffer.
    CeedScalar *r_q_in_0 = r_e_in_0;
    // ---- Input field 1
    // Restriction read lands in the shared scratch buffer (memory reduction).
    CeedScalar *r_e_in_1 = r_e_scratch;
    const CeedInt l_size_in_1 = 192;
    // CompStride: 1
    readDofsOffset2d<num_comp_in_1, 1, P_1d_in_1>(data, l_size_in_1, elem, indices.inputs[1], d_in_1, r_e_in_1);
    // EvalMode: interpolation
    CeedScalar r_q_in_1[num_comp_in_1*Q_1d];
    InterpTensor2d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, r_q_in_1);

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1
      CeedScalar *r_s_in_1 = r_q_in_1;
      // -- Output fields
      // ---- Output field 0
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      Mass(ctx, 1, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: interpolation
    // Scratch is free again (input 1's element data is dead after its interpolation),
    // so the transpose basis action writes its element result back into r_e_scratch.
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor2d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 192;
    // CompStride: 1
    writeDofsOffset2d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}

@zatkins-dev zatkins-dev self-requested a review September 12, 2024 20:38
Copy link
Collaborator

@zatkins-dev zatkins-dev left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good to me. I'm not sure what's up with the failing test though, especially since the changes seem pretty identical between the two files.

@jeremylt
Copy link
Member Author

Yeah, I think that's just the ROCm CI continuing to be flaky

@jeremylt jeremylt merged commit 8386266 into main Sep 12, 2024
23 of 24 checks passed
@jeremylt jeremylt deleted the jeremy/gen-memory branch September 12, 2024 20:44
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants