From 43ba2af1bd4550200f125605ff0002f804565b53 Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Wed, 4 Sep 2024 09:54:50 -0600
Subject: [PATCH] AtPoints - fix transpose basis apply on GPU

---
 backends/cuda-ref/ceed-cuda-ref-basis.c       | 51 +++++++++++++++----
 backends/cuda-ref/ceed-cuda-ref-operator.c    | 24 +++++----
 backends/cuda-ref/ceed-cuda-ref.h             |  4 ++
 backends/cuda-shared/ceed-cuda-shared-basis.c | 51 +++++++++++++++----
 backends/cuda-shared/ceed-cuda-shared.h       |  3 ++
 backends/hip-ref/ceed-hip-ref-basis.c         | 42 ++++++++++++---
 backends/hip-ref/ceed-hip-ref-operator.c      | 24 +++++----
 backends/hip-ref/ceed-hip-ref.h               |  4 ++
 backends/hip-shared/ceed-hip-shared-basis.c   | 42 ++++++++++++---
 backends/hip-shared/ceed-hip-shared.h         |  3 ++
 .../cuda/cuda-ref-basis-tensor-at-points.h    |  8 ++-
 .../hip/hip-ref-basis-tensor-at-points.h      |  8 ++-
 12 files changed, 210 insertions(+), 54 deletions(-)

diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c
index 5efaeee456..738d0c4834 100644
--- a/backends/cuda-ref/ceed-cuda-ref-basis.c
+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c
@@ -10,6 +10,7 @@
 #include <ceed/jit-tools.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -115,18 +116,46 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -186,14 +215,14 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -343,6 +372,8 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) {
   CeedCallCuda(ceed, cuModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_chebyshev_interp_1d));
diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c
index 04eb0fb0fd..fcc7631e6c 100644
--- a/backends/cuda-ref/ceed-cuda-ref-operator.c
+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c
@@ -27,6 +27,7 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
+  CeedCallBackend(CeedFree(&impl->num_points));
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
@@ -557,10 +558,17 @@ static int CeedOperatorSetupAtPoints_Cuda(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   {
-    CeedElemRestriction elem_rstr = NULL;
+    CeedElemRestriction rstr_points = NULL;
 
-    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
-    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt num_points_elem;
+
+      CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+      impl->num_points[e] = num_points_elem;
+    }
   }
   impl->max_num_points = max_num_points;
 
@@ -674,7 +682,7 @@ static inline int CeedOperatorInputBasisAtPoints_Cuda(CeedInt num_elem, const Ce
 // Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
+  CeedInt             max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -686,12 +694,11 @@ static int CeedOperatorApplyAddAtPoints_Cuda(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
@@ -1616,7 +1623,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Cuda(CeedOperator op, Cee
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -1628,12 +1635,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda(CeedOperator op, C
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Cuda(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Create separate output e-vecs
   if (impl->has_shared_e_vecs) {
diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h
index c664a38ed7..9a63a5f4f4 100644
--- a/backends/cuda-ref/ceed-cuda-ref.h
+++ b/backends/cuda-ref/ceed-cuda-ref.h
@@ -75,6 +75,9 @@ typedef struct {
   CeedScalar *d_grad_1d;
   CeedScalar *d_q_weight_1d;
   CeedScalar *d_chebyshev_interp_1d;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda;
 
 typedef struct {
@@ -136,6 +139,7 @@ typedef struct {
   CeedInt                    num_inputs, num_outputs;
   CeedInt                    num_active_in, num_active_out;
   CeedInt                    max_num_points;
+  CeedInt                   *num_points;
   CeedVector                *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Cuda     *diag;
   CeedOperatorAssemble_Cuda *asmb;
diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c
index 8245149f6d..664694b859 100644
--- a/backends/cuda-shared/ceed-cuda-shared-basis.c
+++ b/backends/cuda-shared/ceed-cuda-shared-basis.c
@@ -12,6 +12,7 @@
 #include <cuda_runtime.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "../cuda/ceed-cuda-common.h"
 #include "../cuda/ceed-cuda-compile.h"
@@ -221,18 +222,46 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallCuda(ceed, cudaMemcpy(data->d_points_per_elem, num_points, num_bytes, cudaMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -292,14 +321,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
   // Basis action
   switch (eval_mode) {
     case CEED_EVAL_INTERP: {
-      void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
     } break;
     case CEED_EVAL_GRAD: {
-      void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &d_x, &d_u, &d_v};
-      const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
+      void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
+      const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
 
       CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
     } break;
@@ -345,6 +374,8 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
   CeedCallCuda(ceed, cuModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallCuda(ceed, cudaFree(data->d_points_per_elem));
   CeedCallCuda(ceed, cudaFree(data->d_interp_1d));
   CeedCallCuda(ceed, cudaFree(data->d_grad_1d));
   CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d));
diff --git a/backends/cuda-shared/ceed-cuda-shared.h b/backends/cuda-shared/ceed-cuda-shared.h
index 96800a0a86..d70d75ab94 100644
--- a/backends/cuda-shared/ceed-cuda-shared.h
+++ b/backends/cuda-shared/ceed-cuda-shared.h
@@ -30,6 +30,9 @@ typedef struct {
   CeedScalar *d_chebyshev_interp_1d;
   CeedScalar *c_B;
   CeedScalar *c_G;
+  CeedInt     num_elem_at_points;
+  CeedInt    *h_points_per_elem;
+  CeedInt    *d_points_per_elem;
 } CeedBasis_Cuda_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c
index ea3f4e6e3a..acd5ae41bc 100644
--- a/backends/hip-ref/ceed-hip-ref-basis.c
+++ b/backends/hip-ref/ceed-hip-ref-basis.c
@@ -113,18 +113,46 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -341,6 +369,8 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) {
   CeedCallHip(ceed, hipModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c
index 509555a375..8bc3f0bc35 100644
--- a/backends/hip-ref/ceed-hip-ref-operator.c
+++ b/backends/hip-ref/ceed-hip-ref-operator.c
@@ -26,6 +26,7 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetData(op, &impl));
 
   // Apply data
+  CeedCallBackend(CeedFree(&impl->num_points));
   CeedCallBackend(CeedFree(&impl->skip_rstr_in));
   CeedCallBackend(CeedFree(&impl->skip_rstr_out));
   CeedCallBackend(CeedFree(&impl->apply_add_basis_out));
@@ -556,10 +557,17 @@ static int CeedOperatorSetupAtPoints_Hip(CeedOperator op) {
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
   {
-    CeedElemRestriction elem_rstr = NULL;
+    CeedElemRestriction rstr_points = NULL;
 
-    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &elem_rstr, NULL));
-    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(elem_rstr, &max_num_points));
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedCalloc(num_elem, &impl->num_points));
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt num_points_elem;
+
+      CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+      impl->num_points[e] = num_points_elem;
+    }
   }
   impl->max_num_points = max_num_points;
 
@@ -673,7 +681,7 @@ static inline int CeedOperatorInputBasisAtPoints_Hip(CeedInt num_elem, const Cee
 // Apply and add to output AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
+  CeedInt             max_num_points, *num_points, num_elem, elem_size, num_input_fields, num_output_fields, size;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -685,12 +693,11 @@ static int CeedOperatorApplyAddAtPoints_Hip(CeedOperator op, CeedVector in_vec,
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Input Evecs and Restriction
   CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request));
@@ -1613,7 +1620,7 @@ static int CeedOperatorLinearAssembleQFunctionAtPoints_Hip(CeedOperator op, Ceed
 // Assemble Linear Diagonal AtPoints
 //------------------------------------------------------------------------------
 static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request) {
-  CeedInt             max_num_points, num_elem, num_input_fields, num_output_fields;
+  CeedInt             max_num_points, *num_points, num_elem, num_input_fields, num_output_fields;
   CeedScalar         *e_data[2 * CEED_FIELD_MAX] = {NULL};
   CeedQFunctionField *qf_input_fields, *qf_output_fields;
   CeedQFunction       qf;
@@ -1625,12 +1632,11 @@ static int CeedOperatorLinearAssembleAddDiagonalAtPoints_Hip(CeedOperator op, Ce
   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
-  CeedInt num_points[num_elem];
 
   // Setup
   CeedCallBackend(CeedOperatorSetupAtPoints_Hip(op));
+  num_points     = impl->num_points;
   max_num_points = impl->max_num_points;
-  for (CeedInt i = 0; i < num_elem; i++) num_points[i] = max_num_points;
 
   // Create separate output e-vecs
   if (impl->has_shared_e_vecs) {
diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h
index 38c91060b4..02fb567517 100644
--- a/backends/hip-ref/ceed-hip-ref.h
+++ b/backends/hip-ref/ceed-hip-ref.h
@@ -79,6 +79,9 @@ typedef struct {
   CeedScalar   *d_grad_1d;
   CeedScalar   *d_q_weight_1d;
   CeedScalar   *d_chebyshev_interp_1d;
+  CeedInt       num_elem_at_points;
+  CeedInt      *h_points_per_elem;
+  CeedInt      *d_points_per_elem;
 } CeedBasis_Hip;
 
 typedef struct {
@@ -140,6 +143,7 @@ typedef struct {
   CeedInt                   num_inputs, num_outputs;
   CeedInt                   num_active_in, num_active_out;
   CeedInt                   max_num_points;
+  CeedInt                  *num_points;
   CeedVector               *qf_active_in, point_coords_elem;
   CeedOperatorDiag_Hip     *diag;
   CeedOperatorAssemble_Hip *asmb;
diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c
index 959078e5b1..01d2251163 100644
--- a/backends/hip-shared/ceed-hip-shared-basis.c
+++ b/backends/hip-shared/ceed-hip-shared-basis.c
@@ -280,18 +280,46 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
 
-  // Check uniform number of points per elem
-  for (CeedInt i = 1; i < num_elem; i++) {
-    CeedCheck(max_num_points == num_points[i], ceed, CEED_ERROR_BACKEND,
-              "BasisApplyAtPoints only supported for the same number of points in each element");
-  }
-
   // Weight handled separately
   if (eval_mode == CEED_EVAL_WEIGHT) {
     CeedCallBackend(CeedVectorSetValue(v, 1.0));
     return CEED_ERROR_SUCCESS;
   }
 
+  // Check padded to uniform number of points per elem
+  for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
+  {
+    CeedInt  num_comp, q_comp;
+    CeedSize len, len_required;
+
+    CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
+    CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
+    len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
+    CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
+              "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
+              " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
+              len, len_required);
+  }
+
+  // Move num_points array to device
+  if (is_transpose) {
+    const CeedInt num_bytes = num_elem * sizeof(CeedInt);
+
+    if (num_elem != data->num_elem_at_points) {
+      data->num_elem_at_points = num_elem;
+
+      if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
+      CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
+      CeedCallBackend(CeedFree(&data->h_points_per_elem));
+      CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
+    }
+    if (!memcmp(data->h_points_per_elem, num_points, num_bytes)) {
+      memcpy(data->h_points_per_elem, num_points, num_bytes);
+      CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
+    }
+  }
+
   // Build kernels if needed
   if (data->num_points != max_num_points) {
     CeedInt P_1d;
@@ -404,6 +432,8 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
   CeedCallHip(ceed, hipModuleUnload(data->module));
   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
+  CeedCallBackend(CeedFree(&data->h_points_per_elem));
+  if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
   CeedCallHip(ceed, hipFree(data->d_interp_1d));
   CeedCallHip(ceed, hipFree(data->d_grad_1d));
   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
diff --git a/backends/hip-shared/ceed-hip-shared.h b/backends/hip-shared/ceed-hip-shared.h
index fe3384f55d..c000b7f873 100644
--- a/backends/hip-shared/ceed-hip-shared.h
+++ b/backends/hip-shared/ceed-hip-shared.h
@@ -29,6 +29,9 @@ typedef struct {
   CeedScalar   *d_collo_grad_1d;
   CeedScalar   *d_q_weight_1d;
   CeedScalar   *d_chebyshev_interp_1d;
+  CeedInt       num_elem_at_points;
+  CeedInt      *h_points_per_elem;
+  CeedInt      *d_points_per_elem;
 } CeedBasis_Hip_shared;
 
 CEED_INTERN int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
index 263d29338e..7355705660 100644
--- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h
@@ -42,7 +42,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 // Interp
 //------------------------------------------------------------------------------
 extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                          const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -80,6 +81,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           pre  = 1;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -196,7 +198,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 // Grad
 //------------------------------------------------------------------------------
 extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -235,6 +238,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
 
diff --git a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
index 22d81bc30a..4744b17eb2 100644
--- a/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
+++ b/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h
@@ -42,7 +42,8 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 // Interp
 //------------------------------------------------------------------------------
 extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                          const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -80,6 +81,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           pre  = 1;
           post = 1;
           for (CeedInt d = 0; d < BASIS_DIM; d++) {
@@ -196,7 +198,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt
 // Grad
 //------------------------------------------------------------------------------
 extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
-                                        const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
+                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
+                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
   const CeedInt i = threadIdx.x;
 
   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
@@ -235,6 +238,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is
         // Map from point
         __syncthreads();
         for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
+          if (p >= points_per_elem[elem]) continue;
           for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
             const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];