From 43ba2af1bd4550200f125605ff0002f804565b53 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Wed, 4 Sep 2024 09:54:50 -0600 Subject: [PATCH] AtPoints - fix transpose basis apply on GPU --- backends/cuda-ref/ceed-cuda-ref-basis.c | 51 +++++++++++++++---- backends/cuda-ref/ceed-cuda-ref-operator.c | 24 +++++---- backends/cuda-ref/ceed-cuda-ref.h | 4 ++ backends/cuda-shared/ceed-cuda-shared-basis.c | 51 +++++++++++++++---- backends/cuda-shared/ceed-cuda-shared.h | 3 ++ backends/hip-ref/ceed-hip-ref-basis.c | 42 ++++++++++++--- backends/hip-ref/ceed-hip-ref-operator.c | 24 +++++---- backends/hip-ref/ceed-hip-ref.h | 4 ++ backends/hip-shared/ceed-hip-shared-basis.c | 42 ++++++++++++--- backends/hip-shared/ceed-hip-shared.h | 3 ++ .../cuda/cuda-ref-basis-tensor-at-points.h | 8 ++- .../hip/hip-ref-basis-tensor-at-points.h | 8 ++- 12 files changed, 210 insertions(+), 54 deletions(-) diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index 5efaeee456..738d0c4834 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "../cuda/ceed-cuda-common.h" #include "../cuda/ceed-cuda-compile.h" @@ -115,18 +116,46 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - // Check uniform number of points per elem - for (CeedInt i = 1; i < num_elem; i++) { - CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND, - "BasisApplyAtPoints only supported for the same number of points in each element"); - } - // Weight handled separately if (eval_mode == CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorSetValue(v, 1.0)); return CEED_ERROR_SUCCESS; } + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt num_comp, q_comp; + CeedSize len, len_required; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." + " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); + } + } + // Build kernels if needed if (data->num_points != max_num_points) { CeedInt P_1d; @@ -186,14 +215,14 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); } break; @@ -343,6 +372,8 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) { CeedCallCuda(ceed, cuModuleUnload(data->module)); if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d)); diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 04eb0fb0fd..fcc7631e6c 100644 --- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data + CeedCallBackend(CeedFree(&impl->num_points)); CeedCallBackend(CeedFree(&impl->skip_rstr_in)); CeedCallBackend(CeedFree(&impl->skip_rstr_out)); CeedCallBackend(CeedFree(&impl->apply_add_basis_out)); @@ -557,10 +558,17 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) { CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); { - CeedElemRestriction elem_rstr = NULL; + CeedElemRestriction rstr_points = NULL; - CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL)); - CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points)); + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedCalloc(num_elem, &impl->num_points)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + impl->num_points[e] = num_points_elem; + } } impl->max_num_points = max_num_points; @@ -674,7 +682,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const Ce // Apply and add to output AtPoints //------------------------------------------------------------------------------ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { - CeedInt max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size; + CeedInt max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size; CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; @@ -686,12 +694,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedInt num_points[num_elem]; // Setup CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op)); + num_points = impl->num_points; max_num_points = impl->max_num_points; - for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points; // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); @@ -1616,7 +1623,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee // Assemble Linear Diagonal AtPoints //------------------------------------------------------------------------------ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) { - CeedInt max_num_points, num_elem, num_input_fields, num_output_fields; + CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields; CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; @@ -1628,12 +1635,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedInt num_points[num_elem]; // Setup CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op)); + num_points = impl->num_points; max_num_points = impl->max_num_points; - for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points; // Create separate output e-vecs if (impl->has_shared_e_vecs) { diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index c664a38ed7..9a63a5f4f4 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -75,6 +75,9 @@ typedef struct { CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; CeedScalar *d_chebyshev_interp_1d; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } CeedBasis_Cuda; typedef struct { @@ -136,6 +139,7 @@ typedef struct { CeedInt num_inputs, num_outputs; CeedInt num_active_in, num_active_out; CeedInt max_num_points; + CeedInt *num_points; CeedVector *qf_active_in, point_coords_elem; CeedOperatorDiag_Cuda *diag; CeedOperatorAssemble_Cuda *asmb; diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 8245149f6d..664694b859 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../cuda/ceed-cuda-common.h" #include "../cuda/ceed-cuda-compile.h" @@ -221,18 +222,46 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - // Check uniform number of points per elem - for (CeedInt i = 1; i < num_elem; i++) { - CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND, - "BasisApplyAtPoints only supported for the same number of points in each element"); - } - // Weight handled separately if (eval_mode == CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorSetValue(v, 1.0)); return CEED_ERROR_SUCCESS; } + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt num_comp, q_comp; + CeedSize len, len_required; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." + " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice)); + } + } + // Build kernels if needed if (data->num_points != max_num_points) { CeedInt P_1d; @@ -292,14 +321,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args)); } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args)); } break; @@ -345,6 +374,8 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { CeedCallCuda(ceed, cuModuleUnload(data->module)); if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem)); CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h index 96800a0a86..d70d75ab94 100644 --- a/backends/cuda-shared/ceed-cuda-shared.h +++ b/backends/cuda-shared/ceed-cuda-shared.h @@ -30,6 +30,9 @@ typedef struct { CeedScalar *d_chebyshev_interp_1d; CeedScalar *c_B; CeedScalar *c_G; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } CeedBasis_Cuda_shared; CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index ea3f4e6e3a..acd5ae41bc 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -113,18 +113,46 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - // Check uniform number of points per elem - for (CeedInt i = 1; i < num_elem; i++) { - CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND, - "BasisApplyAtPoints only supported for the same number of points in each element"); - } - // Weight handled separately if (eval_mode == CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorSetValue(v, 1.0)); return CEED_ERROR_SUCCESS; } + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt num_comp, q_comp; + CeedSize len, len_required; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." + " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); + } + } + // Build kernels if needed if (data->num_points != max_num_points) { CeedInt P_1d; @@ -341,6 +369,8 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) { CeedCallHip(ceed, hipModuleUnload(data->module)); if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); CeedCallHip(ceed, hipFree(data->d_interp_1d)); CeedCallHip(ceed, hipFree(data->d_grad_1d)); CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index 509555a375..8bc3f0bc35 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -26,6 +26,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data + CeedCallBackend(CeedFree(&impl->num_points)); CeedCallBackend(CeedFree(&impl->skip_rstr_in)); CeedCallBackend(CeedFree(&impl->skip_rstr_out)); CeedCallBackend(CeedFree(&impl->apply_add_basis_out)); @@ -556,10 +557,17 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) { CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); { - CeedElemRestriction elem_rstr = NULL; + CeedElemRestriction rstr_points = NULL; - CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL)); - CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points)); + CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); + CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); + CeedCallBackend(CeedCalloc(num_elem, &impl->num_points)); + for (CeedInt e = 0; e < num_elem; e++) { + CeedInt num_points_elem; + + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem)); + impl->num_points[e] = num_points_elem; + } } impl->max_num_points = max_num_points; @@ -673,7 +681,7 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedInt num_elem, const Cee // Apply and add to output AtPoints //------------------------------------------------------------------------------ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { - CeedInt max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size; + CeedInt max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size; CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; @@ -685,12 +693,11 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedInt num_points[num_elem]; // Setup CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op)); + num_points = impl->num_points; max_num_points = impl->max_num_points; - for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points; // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); @@ -1613,7 +1620,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, Ceed // Assemble Linear Diagonal AtPoints //------------------------------------------------------------------------------ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) { - CeedInt max_num_points, num_elem, num_input_fields, num_output_fields; + CeedInt max_num_points, *num_points, num_elem, num_input_fields, num_output_fields; CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedQFunction qf; @@ -1625,12 +1632,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedInt num_points[num_elem]; // Setup CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op)); + num_points = impl->num_points; max_num_points = impl->max_num_points; - for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points; // Create separate output e-vecs if (impl->has_shared_e_vecs) { diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index 38c91060b4..02fb567517 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -79,6 +79,9 @@ typedef struct { CeedScalar *d_grad_1d; CeedScalar *d_q_weight_1d; CeedScalar *d_chebyshev_interp_1d; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } CeedBasis_Hip; typedef struct { @@ -140,6 +143,7 @@ typedef struct { CeedInt num_inputs, num_outputs; CeedInt num_active_in, num_active_out; CeedInt max_num_points; + CeedInt *num_points; CeedVector *qf_active_in, point_coords_elem; CeedOperatorDiag_Hip *diag; CeedOperatorAssemble_Hip *asmb; diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index 959078e5b1..01d2251163 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -280,18 +280,46 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - // Check uniform number of points per elem - for (CeedInt i = 1; i < num_elem; i++) { - CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND, - "BasisApplyAtPoints only supported for the same number of points in each element"); - } - // Weight handled separately if (eval_mode == CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorSetValue(v, 1.0)); return CEED_ERROR_SUCCESS; } + // Check padded to uniform number of points per elem + for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); + { + CeedInt num_comp, q_comp; + CeedSize len, len_required; + + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); + CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); + len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; + CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, + "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." + " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, + len, len_required); + } + + // Move num_points array to device + if (is_transpose) { + const CeedInt num_bytes = num_elem * sizeof(CeedInt); + + if (num_elem != data->num_elem_at_points) { + data->num_elem_at_points = num_elem; + + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); + } + if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) { + memcpy(data->h_points_per_elem, num_points, num_bytes); + CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); + } + } + // Build kernels if needed if (data->num_points != max_num_points) { CeedInt P_1d; @@ -404,6 +432,8 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { CeedCallHip(ceed, hipModuleUnload(data->module)); if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); + CeedCallBackend(CeedFree(&data->h_points_per_elem)); + if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); CeedCallHip(ceed, hipFree(data->d_interp_1d)); CeedCallHip(ceed, hipFree(data->d_grad_1d)); CeedCallHip(ceed, hipFree(data->d_collo_grad_1d)); diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h index fe3384f55d..c000b7f873 100644 --- a/backends/hip-shared/ceed-hip-shared.h +++ b/backends/hip-shared/ceed-hip-shared.h @@ -29,6 +29,9 @@ typedef struct { CeedScalar *d_collo_grad_1d; CeedScalar *d_q_weight_1d; CeedScalar *d_chebyshev_interp_1d; + CeedInt num_elem_at_points; + CeedInt *h_points_per_elem; + CeedInt *d_points_per_elem; } CeedBasis_Hip_shared; CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h index 263d29338e..7355705660 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h @@ -42,7 +42,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar // Interp //------------------------------------------------------------------------------ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, - const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; @@ -80,6 +81,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Map from point __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; pre = 1; post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { @@ -196,7 +198,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Grad //------------------------------------------------------------------------------ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, - const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; @@ -235,6 +238,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is // Map from point __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h index 22d81bc30a..4744b17eb2 100644 --- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h +++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h @@ -42,7 +42,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar // Interp //------------------------------------------------------------------------------ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, - const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; @@ -80,6 +81,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Map from point __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; pre = 1; post = 1; for (CeedInt d = 0; d < BASIS_DIM; d++) { @@ -196,7 +198,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt // Grad //------------------------------------------------------------------------------ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, - const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { const CeedInt i = threadIdx.x; __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; @@ -235,6 +238,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is // Map from point __syncthreads(); for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { + if (p >= points_per_elem[elem]) continue; for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];