Skip to content

Commit

Permalink
gpu - reduce memory usage in gen backends
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Sep 11, 2024
1 parent bce9d45 commit 56ed042
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 26 deletions.
87 changes: 74 additions & 13 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
// Restriction
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
bool use_3d_slices) {
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
CeedInt Q_1d, bool is_input, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix;
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand All @@ -229,7 +229,11 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
// Restriction
if (is_input) {
// Input
if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
if (field_input_buffer[i] != i) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
bool is_strided;

code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
Expand Down Expand Up @@ -298,8 +302,8 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
// Basis
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
bool use_3d_slices) {
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d,
bool is_input, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand Down Expand Up @@ -367,12 +371,24 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
break; // No action
case CEED_EVAL_INTERP:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
if (field_input_buffer[i] != -1) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else {
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
}
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
break;
case CEED_EVAL_GRAD:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
if (field_input_buffer[i] != -1) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else {
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
}
if (use_3d_slices) {
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
Expand Down Expand Up @@ -433,7 +449,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
// We treat quadrature points per slice in 3d to save registers
if (use_3d_slices) {
code << "\n // Note: Using planes of 3D elements\n";
code << "#pragma unroll\n";
code << " #pragma unroll\n";
code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
code << " // -- Input fields\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -789,34 +805,79 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
code << " __syncthreads();\n";
code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";

// -- Mark duplicate input restrictions
CeedInt field_rstr_in_buffer[CEED_FIELD_MAX];

for (CeedInt i = 0; i < num_input_fields; i++) field_rstr_in_buffer[i] = -1;
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedVector vec_i;
CeedElemRestriction rstr_i;

if (field_rstr_in_buffer[i] != -1) continue;
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
field_rstr_in_buffer[i] = i;
if (vec_i == CEED_VECTOR_NONE) continue;
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
for (CeedInt j = num_input_fields - 1; j > i; j--) {
CeedVector vec_j;
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
if (rstr_i == rstr_j && vec_i == vec_j) {
field_rstr_in_buffer[j] = i;
}
}
}

// -- Input restriction and basis
code << " // -- Input field restrictions and basis actions\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
code << " // ---- Input field " << i << "\n";

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, field_rstr_in_buffer, op_input_fields[i], qf_input_fields[i],
Q_1d, true, use_3d_slices));

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(
CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, NULL, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
}

// -- Q function
CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));

// -- Reuse input restriction buffers where able
for (CeedInt i = 0; i < num_output_fields; i++) field_rstr_in_buffer[i] = -1;
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedElemRestriction rstr_i;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
if (rstr_i == CEED_ELEMRESTRICTION_NONE) continue;
for (CeedInt j = 0; j < num_input_fields; j++) {
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
if (rstr_i == rstr_j) {
field_rstr_in_buffer[i] = j;
break;
}
}
}

// -- Output basis and restriction
code << "\n // -- Output field basis action and restrictions\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
code << " // ---- Output field " << i << "\n";

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, field_rstr_in_buffer, op_output_fields[i], qf_output_fields[i], Q_1d,
false, use_3d_slices));

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
}

// Close loop and function
Expand Down
88 changes: 75 additions & 13 deletions backends/hip-gen/ceed-hip-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
// Restriction
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
bool use_3d_slices) {
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
CeedInt Q_1d, bool is_input, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix;
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand All @@ -256,7 +256,12 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
// Restriction
if (is_input) {
// Input
if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
// Input
if (field_input_buffer[i] != i) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
bool is_strided;

code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
Expand Down Expand Up @@ -325,8 +330,8 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
// Basis
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
bool use_3d_slices) {
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d,
bool is_input, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand Down Expand Up @@ -394,12 +399,24 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
break; // No action
case CEED_EVAL_INTERP:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
if (field_input_buffer[i] != -1) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else {
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
}
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
break;
case CEED_EVAL_GRAD:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
if (field_input_buffer[i] != -1) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else {
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
}
if (use_3d_slices) {
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
Expand Down Expand Up @@ -460,7 +477,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
// We treat quadrature points per slice in 3d to save registers
if (use_3d_slices) {
code << "\n // Note: Using planes of 3D elements\n";
code << "#pragma unroll\n";
code << " #pragma unroll\n";
code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
code << " // -- Input fields\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -797,34 +814,79 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
code << " __syncthreads();\n";
code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";

// -- Mark duplicate input restrictions
CeedInt field_rstr_in_buffer[CEED_FIELD_MAX];

for (CeedInt i = 0; i < num_input_fields; i++) field_rstr_in_buffer[i] = -1;
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedVector vec_i;
CeedElemRestriction rstr_i;

if (field_rstr_in_buffer[i] != -1) continue;
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
field_rstr_in_buffer[i] = i;
if (vec_i == CEED_VECTOR_NONE) continue;
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
for (CeedInt j = num_input_fields - 1; j > i; j--) {
CeedVector vec_j;
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
if (rstr_i == rstr_j && vec_i == vec_j) {
field_rstr_in_buffer[j] = i;
}
}
}

// -- Input restriction and basis
code << " // -- Input field restrictions and basis actions\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
code << " // ---- Input field " << i << "\n";

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, field_rstr_in_buffer, op_input_fields[i], qf_input_fields[i], Q_1d,
true, use_3d_slices));

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(
CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, NULL, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
}

// -- Q function
CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));

// -- Reuse input restriction buffers where able
for (CeedInt i = 0; i < num_output_fields; i++) field_rstr_in_buffer[i] = -1;
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedElemRestriction rstr_i;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr_i));
if (rstr_i == CEED_ELEMRESTRICTION_NONE) continue;
for (CeedInt j = 0; j < num_input_fields; j++) {
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
if (rstr_i == rstr_j) {
field_rstr_in_buffer[i] = j;
break;
}
}
}

// -- Output basis and restriction
code << "\n // -- Output field basis action and restrictions\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
code << " // ---- Output field " << i << "\n";

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, field_rstr_in_buffer, op_output_fields[i], qf_output_fields[i], Q_1d,
false, use_3d_slices));

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
}

// Close loop and function
Expand Down

0 comments on commit 56ed042

Please sign in to comment.