-
Notifications
You must be signed in to change notification settings - Fork 47
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
gpu - reduce memory usage in gen backends #1661
Conversation
Here's an example of an updated Ratel kernel. See extern "C" __global__ void CeedKernelCudaGenOperator_Mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
// Generated libCEED "gen"-backend operator kernel (3D Mass): per element, apply
// the input restrictions and tensor-product basis actions, call the Mass
// QFunction, then apply the transpose basis action and the output restriction.
// Thread layout: threadIdx.x/y span the 1D tensor directions; each threadIdx.z
// slab processes its own element.
const CeedScalar *d_in_0 = fields.inputs[0];
const CeedScalar *d_in_1 = fields.inputs[1];
CeedScalar *d_out_0 = fields.outputs[0];
const CeedInt dim = 3;
const CeedInt Q_1d = 3;
// Dynamic shared memory, sized at launch; partitioned per z-slab below.
// T_1D is presumably a compile-time upper bound on the 1D thread extent --
// TODO confirm against the kernel launch configuration.
extern __shared__ CeedScalar slice[];
SharedData_Cuda data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
// Each z-slab gets its own T_1D*T_1D window of the shared slice
data.slice = slice + data.t_id_z*T_1D*T_1D;
// Input field constants and basis data
// -- Input field 0
const CeedInt P_1d_in_0 = 3;
const CeedInt num_comp_in_0 = 10;
// EvalMode: none
// -- Input field 1
const CeedInt P_1d_in_1 = 3;
const CeedInt num_comp_in_1 = 16;
// EvalMode: interpolation
__shared__ CeedScalar s_B_in_1[9];
// loadMatrix presumably loads the P_1d x Q_1d interp matrix into shared memory
// cooperatively -- confirm against the shared basis helper templates
loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);
// Output field constants and basis data
// -- Output field 0
const CeedInt P_1d_out_0 = 3;
const CeedInt num_comp_out_0 = 16;
// EvalMode: interpolation
__shared__ CeedScalar s_B_out_0[9];
loadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);
// Element loop
// Barrier: make the cooperatively loaded s_B_* matrices visible to all threads
// before they are read inside the element loop
__syncthreads();
// Grid-stride loop over elements: each block covers blockDim.z elements per pass
for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
// -- Input field restrictions and basis actions
// ---- Input field 0
CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0];
// Strides: {1, 729, 27}
readDofsStrided3d<num_comp_in_0, P_1d_in_0,1,729,27>(data, elem, d_in_0, r_e_in_0);
// EvalMode: none
// No basis action: quadrature-point values alias the restricted e-vector directly
CeedScalar *r_q_in_0 = r_e_in_0;
// ---- Input field 1
CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
const CeedInt l_size_in_1 = 5488;
// CompStride: 1
readDofsOffset3d<num_comp_in_1, 1, P_1d_in_1>(data, l_size_in_1, elem, indices.inputs[1], d_in_1, r_e_in_1);
// EvalMode: interpolation
CeedScalar r_q_in_1[num_comp_in_1*Q_1d];
InterpTensor3d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, r_q_in_1);
// -- Output field setup
// ---- Output field 0
CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
// Note: Using full elements
{
// -- Input fields
// ---- Input field 0
CeedScalar *r_s_in_0 = r_q_in_0;
// ---- Input field 1
CeedScalar *r_s_in_1 = r_q_in_1;
// -- Output fields
// ---- Output field 0
CeedScalar *r_s_out_0 = r_q_out_0;
// -- QFunction inputs and outputs
// ---- Inputs
CeedScalar *inputs[2];
// ------ Input field 0
inputs[0] = r_s_in_0;
// ------ Input field 1
inputs[1] = r_s_in_1;
// ---- Outputs
CeedScalar *outputs[1];
// ------ Output field 0
outputs[0] = r_s_out_0;
// -- Apply QFunction
Mass(ctx, Q_1d, inputs, outputs);
}
// -- Output field basis action and restrictions
// ---- Output field 0
// EvalMode: interpolation
// Memory saving: reuse input 1's e-vector buffer for the output e-vector -- both
// hold num_comp*P_1d = 16*3 = 48 scalars, and r_e_in_1 is dead past this point
CeedScalar *r_e_out_0 = r_e_in_1;
InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
const CeedInt l_size_out_0 = 5488;
// CompStride: 1
writeDofsOffset3d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
}
} |
9ac67c4
to
56ed042
Compare
Note to self - this can be improved by using any input e-vector buffer, or any previously used output e-vector buffer, that is at least as big as the required output buffer
Ok, now in this PR for the gen operators
|
722f3b0
to
e93651e
Compare
Here's the new generated code. Of note
extern "C" __global__ void CeedKernelCudaGenOperator_SetupSurfaceGeometryBounded(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
// Generated libCEED "gen"-backend operator kernel (2D surface geometry setup):
// per element, apply the input restrictions and basis actions (interp, grad,
// quadrature weights), call the QFunction, then write the strided output.
// Thread layout: threadIdx.x/y span the 1D tensor directions; each threadIdx.z
// slab processes its own element.
const CeedScalar *d_in_0 = fields.inputs[0];
// NOTE(review): d_in_1 is declared but never read in the element loop below --
// input field 1 evidently shares input field 0's restricted data (see r_e_in_1)
const CeedScalar *d_in_1 = fields.inputs[1];
CeedScalar *d_out_0 = fields.outputs[0];
const CeedInt dim = 2;
const CeedInt Q_1d = 4;
// Dynamic shared memory, sized at launch; partitioned per z-slab below
// (T_1D presumably bounds the 1D thread extent -- TODO confirm at launch)
extern __shared__ CeedScalar slice[];
SharedData_Cuda data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
// Each z-slab gets its own T_1D*T_1D window of the shared slice
data.slice = slice + data.t_id_z*T_1D*T_1D;
// Input field constants and basis data
// -- Input field 0
const CeedInt P_1d_in_0 = 2;
const CeedInt num_comp_in_0 = 3;
// EvalMode: interpolation
__shared__ CeedScalar s_B_in_0[8];
loadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
// -- Input field 1
const CeedInt P_1d_in_1 = 2;
const CeedInt num_comp_in_1 = 3;
// EvalMode: gradient
__shared__ CeedScalar s_B_in_1[8];
loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);
__shared__ CeedScalar s_G_in_1[8];
loadMatrix<P_1d_in_1, Q_1d>(data, G.inputs[1], s_G_in_1);
// -- Input field 2
// EvalMode: quadrature weights
// Output field constants and basis data
// -- Output field 0
const CeedInt P_1d_out_0 = 4;
const CeedInt num_comp_out_0 = 4;
// EvalMode: none
// Element loop
// Barrier: make the cooperatively loaded s_B_*/s_G_* matrices visible to all
// threads before they are read inside the element loop
__syncthreads();
// Grid-stride loop over elements: each block covers blockDim.z elements per pass
for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
// Scratch restriction buffer space
// Shared scratch for the per-element e-vector buffers; 64 entries is
// presumably the max e-vector size across fields -- TODO confirm in generator
CeedScalar r_e_scratch[64];
// -- Input field restrictions and basis actions
// ---- Input field 0
CeedScalar *r_e_in_0 = r_e_scratch;
const CeedInt l_size_in_0 = 192;
// CompStride: 1
readDofsOffset2d<num_comp_in_0, 1, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
// EvalMode: interpolation
CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
InterpTensor2d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
// ---- Input field 1
// Duplicated restriction trimmed: input 1 reuses input 0's already-restricted
// e-vector instead of re-reading the same dofs from global memory
CeedScalar *r_e_in_1 = r_e_in_0;
// EvalMode: gradient
CeedScalar r_q_in_1[num_comp_in_1*dim*Q_1d];
GradTensor2d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, s_G_in_1, r_q_in_1);
// ---- Input field 2
// EvalMode: quadrature weights
CeedScalar r_q_in_2[Q_1d];
WeightTensor2d<Q_1d>(data, W, r_q_in_2);
// -- Output field setup
// ---- Output field 0
CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
// Note: Using full elements
{
// -- Input fields
// ---- Input field 0
CeedScalar *r_s_in_0 = r_q_in_0;
// ---- Input field 1
CeedScalar *r_s_in_1 = r_q_in_1;
// ---- Input field 2
CeedScalar *r_s_in_2 = r_q_in_2;
// -- Output fields
// ---- Output field 0
CeedScalar *r_s_out_0 = r_q_out_0;
// -- QFunction inputs and outputs
// ---- Inputs
CeedScalar *inputs[3];
// ------ Input field 0
inputs[0] = r_s_in_0;
// ------ Input field 1
inputs[1] = r_s_in_1;
// ------ Input field 2
inputs[2] = r_s_in_2;
// ---- Outputs
CeedScalar *outputs[1];
// ------ Output field 0
outputs[0] = r_s_out_0;
// -- Apply QFunction
// Q argument is 1 here: each thread presumably handles one quadrature point
// at a time -- confirm against the QFunction calling convention
SetupSurfaceGeometryBounded(ctx, 1, inputs, outputs);
}
// -- Output field basis action and restrictions
// ---- Output field 0
// EvalMode: none
// No transpose basis action: the e-vector aliases the quadrature values directly
CeedScalar *r_e_out_0 = r_q_out_0;
// Strides: {1, 144, 16}
writeDofsStrided2d<num_comp_out_0, P_1d_out_0,1,144,16>(data, elem, r_e_out_0, d_out_0);
}
} also
extern "C" __global__ void CeedKernelCudaGenOperator_Mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {
// Generated libCEED "gen"-backend operator kernel (2D Mass): per element, apply
// the input restrictions and basis actions, call the Mass QFunction, then the
// transpose basis action and output restriction.
// Thread layout: threadIdx.x/y span the 1D tensor directions; each threadIdx.z
// slab processes its own element.
const CeedScalar *d_in_0 = fields.inputs[0];
const CeedScalar *d_in_1 = fields.inputs[1];
CeedScalar *d_out_0 = fields.outputs[0];
const CeedInt dim = 2;
const CeedInt Q_1d = 4;
// Dynamic shared memory, sized at launch; partitioned per z-slab below
// (T_1D presumably bounds the 1D thread extent -- TODO confirm at launch)
extern __shared__ CeedScalar slice[];
SharedData_Cuda data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
// Each z-slab gets its own T_1D*T_1D window of the shared slice
data.slice = slice + data.t_id_z*T_1D*T_1D;
// Input field constants and basis data
// -- Input field 0
const CeedInt P_1d_in_0 = 4;
const CeedInt num_comp_in_0 = 4;
// EvalMode: none
// -- Input field 1
const CeedInt P_1d_in_1 = 2;
const CeedInt num_comp_in_1 = 3;
// EvalMode: interpolation
__shared__ CeedScalar s_B_in_1[8];
loadMatrix<P_1d_in_1, Q_1d>(data, B.inputs[1], s_B_in_1);
// Output field constants and basis data
// -- Output field 0
const CeedInt P_1d_out_0 = 2;
const CeedInt num_comp_out_0 = 3;
// EvalMode: interpolation
__shared__ CeedScalar s_B_out_0[8];
loadMatrix<P_1d_out_0, Q_1d>(data, B.outputs[0], s_B_out_0);
// Element loop
// Barrier: make the cooperatively loaded s_B_* matrices visible to all threads
// before they are read inside the element loop
__syncthreads();
// Grid-stride loop over elements: each block covers blockDim.z elements per pass
for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
// Scratch restriction buffer space
// Shared scratch reused for input 1's e-vector and, later, the output e-vector
CeedScalar r_e_scratch[64];
// -- Input field restrictions and basis actions
// ---- Input field 0
CeedScalar r_e_in_0[num_comp_in_0*P_1d_in_0];
// Strides: {1, 144, 16}
readDofsStrided2d<num_comp_in_0, P_1d_in_0,1,144,16>(data, elem, d_in_0, r_e_in_0);
// EvalMode: none
// No basis action: quadrature-point values alias the restricted e-vector directly
CeedScalar *r_q_in_0 = r_e_in_0;
// ---- Input field 1
CeedScalar *r_e_in_1 = r_e_scratch;
const CeedInt l_size_in_1 = 192;
// CompStride: 1
readDofsOffset2d<num_comp_in_1, 1, P_1d_in_1>(data, l_size_in_1, elem, indices.inputs[1], d_in_1, r_e_in_1);
// EvalMode: interpolation
CeedScalar r_q_in_1[num_comp_in_1*Q_1d];
InterpTensor2d<num_comp_in_1, P_1d_in_1, Q_1d>(data, r_e_in_1, s_B_in_1, r_q_in_1);
// -- Output field setup
// ---- Output field 0
CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
// Note: Using full elements
{
// -- Input fields
// ---- Input field 0
CeedScalar *r_s_in_0 = r_q_in_0;
// ---- Input field 1
CeedScalar *r_s_in_1 = r_q_in_1;
// -- Output fields
// ---- Output field 0
CeedScalar *r_s_out_0 = r_q_out_0;
// -- QFunction inputs and outputs
// ---- Inputs
CeedScalar *inputs[2];
// ------ Input field 0
inputs[0] = r_s_in_0;
// ------ Input field 1
inputs[1] = r_s_in_1;
// ---- Outputs
CeedScalar *outputs[1];
// ------ Output field 0
outputs[0] = r_s_out_0;
// -- Apply QFunction
// Q argument is 1 here: each thread presumably handles one quadrature point
// at a time -- confirm against the QFunction calling convention
Mass(ctx, 1, inputs, outputs);
}
// -- Output field basis action and restrictions
// ---- Output field 0
// EvalMode: interpolation
// Memory saving: output e-vector (num_comp_out_0*P_1d_out_0 = 3*2 = 6 scalars)
// reuses the 64-entry scratch; input 1's e-vector there is dead past this point
CeedScalar *r_e_out_0 = r_e_scratch;
InterpTransposeTensor2d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
const CeedInt l_size_out_0 = 192;
// CompStride: 1
writeDofsOffset2d<num_comp_out_0, 1, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me. I'm not sure what's up with the failing test though, especially since the changes seem pretty identical between the two files.
Yeah I think that's just ROCM CI continuing to be flaky |
Some memory usage reductions, and trimming out duplicated restriction applications.