From 41655a2310ae869cbaf66471563d33865f525ec1 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Thu, 22 Aug 2024 15:52:41 -0600 Subject: [PATCH 1/2] gpu - reuse evecs where able --- backends/cuda-ref/ceed-cuda-ref-operator.c | 48 +++++++++++++++++++--- backends/hip-ref/ceed-hip-ref-operator.c | 48 +++++++++++++++++++--- 2 files changed, 86 insertions(+), 10 deletions(-) diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 748237dab8..0defda9772 100644 --- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -264,7 +264,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; - // Set up infield and outfield e_vecs and q_vecs + // Set up infield and outfield e-vecs and q-vecs // Infields CeedCallBackend( CeedOperatorSetupFields_Cuda(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); @@ -272,6 +272,44 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); + // Reuse active e-vecs where able + { + CeedInt num_used = 0; + CeedElemRestriction *rstr_used = NULL; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_used = false; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = 0; j < num_used; j++) { + if (rstr_i == rstr_used[i]) is_used = true; + } + if (is_used) continue; + num_used++; + if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used)); + else CeedCallBackend(CeedRealloc(num_used, &rstr_used)); + rstr_used[num_used - 1] = rstr_i; + for (CeedInt j = num_output_fields - 1; j >= 0; j--) { + CeedEvalMode eval_mode; + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + if (vec_j != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs])); + } + } + } + CeedCallBackend(CeedFree(&rstr_used)); + } CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -310,7 +348,7 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu uint64_t state; CeedCallBackend(CeedVectorGetState(vec, &state)); - if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) { + if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) { CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); } impl->input_states[i] = state; @@ -435,6 +473,9 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec // Q function CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); + // Restore input arrays + CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); + // Output basis apply if needed for (CeedInt i = 0; i < num_output_fields; i++) { CeedEvalMode eval_mode; @@ -490,9 +531,6 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); } - - // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index 6045d0ad96..268292b60c 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -263,7 +263,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; - // Set up infield and outfield e_vecs and q_vecs + // Set up infield and outfield e-vecs and q-vecs // Infields CeedCallBackend( CeedOperatorSetupFields_Hip(qf, op, true, false, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); @@ -271,6 +271,44 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, false, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); + // Reuse active e-vecs where able + { + CeedInt num_used = 0; + CeedElemRestriction *rstr_used = NULL; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_used = false; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = 0; j < num_used; j++) { + if (rstr_i == rstr_used[i]) is_used = true; + } + if (is_used) continue; + num_used++; + if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used)); + else CeedCallBackend(CeedRealloc(num_used, &rstr_used)); + rstr_used[num_used - 1] = rstr_i; + for (CeedInt j = num_output_fields - 1; j >= 0; j--) { + CeedEvalMode eval_mode; + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + if (vec_j != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs])); + } + } + } + CeedCallBackend(CeedFree(&rstr_used)); + } CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -309,7 +347,7 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun uint64_t state; CeedCallBackend(CeedVectorGetState(vec, &state)); - if (state != impl->input_states[i] && !impl->skip_rstr_in[i]) { + if ((state != impl->input_states[i] || vec == in_vec) && !impl->skip_rstr_in[i]) { CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); } impl->input_states[i] = state; @@ -434,6 +472,9 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect // Q function CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); + // Restore input arrays + CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); + // Output basis apply if needed for (CeedInt i = 0; i < num_output_fields; i++) { CeedEvalMode eval_mode; @@ -489,9 +530,6 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); } - - // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); return CEED_ERROR_SUCCESS; } From 8a21357021cf95ebb58183dc35a646c90195a1f2 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Mon, 26 Aug 2024 12:59:59 -0600 Subject: [PATCH 2/2] gpu - reuse evecs for AtPoints where able --- backends/cuda-ref/ceed-cuda-ref-operator.c | 61 ++++++++++++++++++++-- backends/cuda-ref/ceed-cuda-ref.h | 2 +- backends/hip-ref/ceed-hip-ref-operator.c | 61 ++++++++++++++++++++-- backends/hip-ref/ceed-hip-ref.h | 2 +- 4 files changed, 114 insertions(+), 12 deletions(-) diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 0defda9772..04eb0fb0fd 100644 --- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -310,6 +310,7 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { } CeedCallBackend(CeedFree(&rstr_used)); } + impl->has_shared_e_vecs = true; CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -574,7 +575,7 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) { impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; - // Set up infield and outfield e_vecs and q_vecs + // Set up infield and outfield e-vecs and q-vecs // Infields CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem)); @@ -582,6 +583,45 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, max_num_points, num_elem)); + // Reuse active e-vecs where able + { + CeedInt num_used = 0; + CeedElemRestriction *rstr_used = NULL; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_used = false; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = 0; j < num_used; j++) { + if (rstr_i == rstr_used[i]) is_used = true; + } + if (is_used) continue; + num_used++; + if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used)); + else CeedCallBackend(CeedRealloc(num_used, &rstr_used)); + rstr_used[num_used - 1] = rstr_i; + for (CeedInt j = num_output_fields - 1; j >= 0; j--) { + CeedEvalMode eval_mode; + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + if (vec_j != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs])); + } + } + } + CeedCallBackend(CeedFree(&rstr_used)); + } + impl->has_shared_e_vecs = true; CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -684,6 +724,9 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, // Q function CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out)); + // Restore input arrays + CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); + // Output basis apply if needed for (CeedInt i = 0; i < num_output_fields; i++) { CeedEvalMode eval_mode; @@ -741,9 +784,6 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); } - - // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); return CEED_ERROR_SUCCESS; } @@ -868,7 +908,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); } - // Un-set output q_vecs to prevent accidental overwrite of Assembled + // Un-set output q-vecs to prevent accidental overwrite of Assembled for (CeedInt out = 0; out < num_output_fields; out++) { CeedVector vec; @@ -1595,6 +1635,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C max_num_points = impl->max_num_points; for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points; + // Create separate output e-vecs + if (impl->has_shared_e_vecs) { + for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i])); + } + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out, + num_input_fields, num_output_fields, max_num_points, num_elem)); + } + impl->has_shared_e_vecs = false; + // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index ff0bbaf349..c664a38ed7 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -128,7 +128,7 @@ typedef struct { } CeedOperatorAssemble_Cuda; typedef struct { - bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out; + bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs; uint64_t *input_states; // State tracking for passive inputs CeedVector *e_vecs; // E-vectors, inputs followed by outputs CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index 268292b60c..509555a375 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -309,6 +309,7 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { } CeedCallBackend(CeedFree(&rstr_used)); } + impl->has_shared_e_vecs = true; CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -573,7 +574,7 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) { impl->num_inputs = num_input_fields; impl->num_outputs = num_output_fields; - // Set up infield and outfield e_vecs and q_vecs + // Set up infield and outfield e-vecs and q-vecs // Infields CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, true, impl->skip_rstr_in, NULL, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, max_num_points, num_elem)); @@ -581,6 +582,45 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, max_num_points, num_elem)); + // Reuse active e-vecs where able + { + CeedInt num_used = 0; + CeedElemRestriction *rstr_used = NULL; + + for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_used = false; + CeedVector vec_i; + CeedElemRestriction rstr_i; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); + if (vec_i != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); + for (CeedInt j = 0; j < num_used; j++) { + if (rstr_i == rstr_used[i]) is_used = true; + } + if (is_used) continue; + num_used++; + if (num_used == 1) CeedCallBackend(CeedCalloc(num_used, &rstr_used)); + else CeedCallBackend(CeedRealloc(num_used, &rstr_used)); + rstr_used[num_used - 1] = rstr_i; + for (CeedInt j = num_output_fields - 1; j >= 0; j--) { + CeedEvalMode eval_mode; + CeedVector vec_j; + CeedElemRestriction rstr_j; + + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[j], &vec_j)); + if (vec_j != CEED_VECTOR_ACTIVE) continue; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode)); + if (eval_mode == CEED_EVAL_NONE) continue; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[j], &rstr_j)); + if (rstr_i == rstr_j) { + CeedCallBackend(CeedVectorReferenceCopy(impl->e_vecs[i], &impl->e_vecs[j + impl->num_inputs])); + } + } + } + CeedCallBackend(CeedFree(&rstr_used)); + } + impl->has_shared_e_vecs = true; CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; } @@ -683,6 +723,9 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, // Q function CeedCallBackend(CeedQFunctionApply(qf, num_elem * max_num_points, impl->q_vecs_in, impl->q_vecs_out)); + // Restore input arrays + CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); + // Output basis apply if needed for (CeedInt i = 0; i < num_output_fields; i++) { CeedEvalMode eval_mode; @@ -740,9 +783,6 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); } - - // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); return CEED_ERROR_SUCCESS; } @@ -867,7 +907,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); } - // Un-set output q_vecs to prevent accidental overwrite of Assembled + // Un-set output q-vecs to prevent accidental overwrite of Assembled for (CeedInt out = 0; out < num_output_fields; out++) { CeedVector vec; @@ -1592,6 +1632,17 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce max_num_points = impl->max_num_points; for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points; + // Create separate output e-vecs + if (impl->has_shared_e_vecs) { + for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[impl->num_inputs + i])); + } + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, true, impl->skip_rstr_out, impl->apply_add_basis_out, impl->e_vecs, impl->q_vecs_out, + num_input_fields, num_output_fields, max_num_points, num_elem)); + } + impl->has_shared_e_vecs = false; + // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index 59f8d809c2..38c91060b4 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -132,7 +132,7 @@ typedef struct { } CeedOperatorAssemble_Hip; typedef struct { - bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out; + bool *skip_rstr_in, *skip_rstr_out, *apply_add_basis_out, has_shared_e_vecs; uint64_t *input_states; // State tracking for passive inputs CeedVector *e_vecs; // E-vectors, inputs followed by outputs CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator