diff --git a/src/parcsr_ls/HYPRE_parcsr_ls.h b/src/parcsr_ls/HYPRE_parcsr_ls.h
index efc325cee1..0ce70ae004 100644
--- a/src/parcsr_ls/HYPRE_parcsr_ls.h
+++ b/src/parcsr_ls/HYPRE_parcsr_ls.h
@@ -886,6 +886,11 @@ HYPRE_Int HYPRE_BoomerAMGSetChebyScale (HYPRE_Solver solver,
 /**
  * (Optional) Defines which polynomial variant should be used.
  *  The default is 0 (i.e., scaled).
+ *  If variant=1, the polynomial is defined as:
+ *     T(t)* f(t) where f(t) = (1-b/t)
+ *  If variant=2, use standard Chebyshev polynomial with $D^{-1}A$, not $D^{-1/2}AD^{-1/2}$.
+ *  If variant=3, use 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830.
+ *  If variant=4, use optimized 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830.
  **/
 HYPRE_Int HYPRE_BoomerAMGSetChebyVariant (HYPRE_Solver solver,
                                           HYPRE_Int   variant);
diff --git a/src/parcsr_ls/par_cheby.c b/src/parcsr_ls/par_cheby.c
index 626ee69f69..9d0b3c8bbd 100644
--- a/src/parcsr_ls/par_cheby.c
+++ b/src/parcsr_ls/par_cheby.c
@@ -32,12 +32,205 @@ this is rlx 11 if scale = 0, and 16 if scale == 1
 variant 1: modified cheby: T(t)* f(t) where f(t) = (1-b/t)
 this is rlx 15 if scale = 0, and 17 if scale == 1
 
+variant=2: standard chebyshev, with $D^{-1}A$ instead of $D^{-1/2}AD^{-1/2}$
+
+variant=3: use 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830.
+
+variant=4: use optimized 4th-kind Chebyshev polynomial from https://arxiv.org/abs/2202.08830.
+
 ratio indicates the percentage of the whole spectrum to use (so .5
 means half, and .1 means 10percent)
 
 
 *******************************************************************************/
 
+static void optimalWeightsImpl (HYPRE_Real* betas, const HYPRE_Int order)
+{
+  if (order == 1){
+    betas[0] = 1.12500000000000;
+  }
+
+  if (order == 2){
+    betas[0] = 1.02387287570313;
+    betas[1] = 1.26408905371085;
+  }
+
+  if (order == 3){
+    betas[0] = 1.00842544782028;
+    betas[1] = 1.08867839208730;
+    betas[2] = 1.33753125909618;
+  }
+
+  if (order == 4){
+    betas[0] = 1.00391310427285;
+    betas[1] = 1.04035811188593;
+    betas[2] = 1.14863498546254;
+    betas[3] = 1.38268869241000;
+  }
+
+  if (order == 5){
+    betas[0] = 1.00212930146164;
+    betas[1] = 1.02173711549260;
+    betas[2] = 1.07872433192603;
+    betas[3] = 1.19810065292663;
+    betas[4] = 1.41322542791682;
+  }
+
+  if (order == 6){
+    betas[0] = 1.00128517255940;
+    betas[1] = 1.01304293035233;
+    betas[2] = 1.04678215124113;
+    betas[3] = 1.11616489419675;
+    betas[4] = 1.23829020218444;
+    betas[5] = 1.43524297106744;
+  }
+
+  if (order == 7){
+    betas[0] = 1.00083464397912;
+    betas[1] = 1.00843949430122;
+    betas[2] = 1.03008707768713;
+    betas[3] = 1.07408384092003;
+    betas[4] = 1.15036186707366;
+    betas[5] = 1.27116474046139;
+    betas[6] = 1.45186658649364;
+  }
+
+  if (order == 8){
+    betas[0] = 1.00057246631197;
+    betas[1] = 1.00577427662415;
+    betas[2] = 1.02050187922941;
+    betas[3] = 1.05019803444565;
+    betas[4] = 1.10115572984941;
+    betas[5] = 1.18086042806856;
+    betas[6] = 1.29838585382576;
+    betas[7] = 1.46486073151099;
+  }
+
+  if (order == 9){
+    betas[0] = 1.00040960072832;
+    betas[1] = 1.00412439506106;
+    betas[2] = 1.01460212148266;
+    betas[3] = 1.03561113626671;
+    betas[4] = 1.07139972529194;
+    betas[5] = 1.12688273710962;
+    betas[6] = 1.20785219140729;
+    betas[7] = 1.32121930716746;
+    betas[8] = 1.47529642820699;
+  }
+
+  if (order == 10){
+    betas[0] = 1.00030312229652;
+    betas[1] = 1.00304840660796;
+    betas[2] = 1.01077022715387;
+    betas[3] = 1.02619011597640;
+    betas[4] = 1.05231724933755;
+    betas[5] = 1.09255743207549;
+    betas[6] = 1.15083376663972;
+    betas[7] = 1.23172250870894;
+    betas[8] = 1.34060802024460;
+    betas[9] = 1.48386124407011;
+  }
+
+  if (order == 11){
+    betas[0] = 1.00023058595209;
+    betas[1] = 1.00231675024028;
+    betas[2] = 1.00817245396304;
+    betas[3] = 1.01982986566342;
+    betas[4] = 1.03950210235324;
+    betas[5] = 1.06965042700541;
+    betas[6] = 1.11305754295742;
+    betas[7] = 1.17290876275564;
+    betas[8] = 1.25288300576792;
+    betas[9] = 1.35725579919519;
+    betas[10] = 1.49101672564139;
+  }
+
+  if (order == 12){
+    betas[0] = 1.00017947200828;
+    betas[1] = 1.00180189139619;
+    betas[2] = 1.00634861907307;
+    betas[3] = 1.01537864566306;
+    betas[4] = 1.03056942830760;
+    betas[5] = 1.05376019693943;
+    betas[6] = 1.08699862592072;
+    betas[7] = 1.13259183097913;
+    betas[8] = 1.19316273358172;
+    betas[9] = 1.27171293675110;
+    betas[10] = 1.37169337969799;
+    betas[11] = 1.49708418575562;
+  }
+
+  if (order == 13){
+    betas[0] = 1.00014241921559;
+    betas[1] = 1.00142906932629;
+    betas[2] = 1.00503028986298;
+    betas[3] = 1.01216910518495;
+    betas[4] = 1.02414874342792;
+    betas[5] = 1.04238158880820;
+    betas[6] = 1.06842008128700;
+    betas[7] = 1.10399010936759;
+    betas[8] = 1.15102748242645;
+    betas[9] = 1.21171811910125;
+    betas[10] = 1.28854264865128;
+    betas[11] = 1.38432619380991;
+    betas[12] = 1.50229418757368;
+  }
+
+  if (order == 14){
+    betas[0] = 1.00011490538261;
+    betas[1] = 1.00115246376914;
+    betas[2] = 1.00405357333264;
+    betas[3] = 1.00979590573153;
+    betas[4] = 1.01941300472994;
+    betas[5] = 1.03401425035436;
+    betas[6] = 1.05480599606629;
+    betas[7] = 1.08311420301813;
+    betas[8] = 1.12040891660892;
+    betas[9] = 1.16833095655446;
+    betas[10] = 1.22872122288238;
+    betas[11] = 1.30365305707817;
+    betas[12] = 1.39546814053678;
+    betas[13] = 1.50681646209583;
+  }
+
+  if (order == 15){
+    betas[0] = 1.00009404750752;
+    betas[1] = 1.00094291696343;
+    betas[2] = 1.00331449056444;
+    betas[3] = 1.00800294833816;
+    betas[4] = 1.01584236259140;
+    betas[5] = 1.02772083317705;
+    betas[6] = 1.04459535422831;
+    betas[7] = 1.06750761206125;
+    betas[8] = 1.09760092545889;
+    betas[9] = 1.13613855366157;
+    betas[10] = 1.18452361426236;
+    betas[11] = 1.24432087304475;
+    betas[12] = 1.31728069083392;
+    betas[13] = 1.40536543893560;
+    betas[14] = 1.51077872501845;
+  }
+
+  if (order == 16){
+    betas[0] = 1.00007794828179;
+    betas[1] = 1.00078126847253;
+    betas[2] = 1.00274487974401;
+    betas[3] = 1.00662291017015;
+    betas[4] = 1.01309858836971;
+    betas[5] = 1.02289448329337;
+    betas[6] = 1.03678321409983;
+    betas[7] = 1.05559875719896;
+    betas[8] = 1.08024848405560;
+    betas[9] = 1.11172607131497;
+    betas[10] = 1.15112543431072;
+    betas[11] = 1.19965584614973;
+    betas[12] = 1.25865841744946;
+    betas[13] = 1.32962412656664;
+    betas[14] = 1.41421360695576;
+    betas[15] = 1.51427891730346;
+  }
+}
+
 /**
  * @brief Setups of coefficients (and optional diagonal scaling elements) for
  * Chebyshev relaxation
@@ -75,14 +268,20 @@ hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A,         /* matrix to relax
    HYPRE_Real      *ds_data = NULL;
 
    /* u = u + p(A)r */
-   if (order > 4)
+   if (order < 1)
    {
-      order = 4;
+      order = 1;
    }
 
-   if (order < 1)
+   HYPRE_Int maxOrder = 4;
+   if(variant == 2 || variant == 3){
+      maxOrder = INT_MAX;
+   }
+   if(variant == 4) maxOrder = 16; // due to writing out coefficients
+
+   if (order > maxOrder)
    {
-      order = 1;
+      order = maxOrder;
    }
 
    coefs = hypre_CTAlloc(HYPRE_Real, order + 1, HYPRE_MEMORY_HOST);
@@ -154,7 +353,7 @@ hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A,         /* matrix to relax
       }
    }
 
-   else /* standard chebyshev */
+   else if (variant == 0) /* standard chebyshev */
    {
 
       switch (cheby_order) /* these are the corresponding cheby polynomials: u = u_o + s(A)r_0  - so order is
@@ -192,13 +391,40 @@ hypre_ParCSRRelax_Cheby_Setup(hypre_ParCSRMatrix *A,         /* matrix to relax
             break;
       }
    }
+   else {
+
+      // for variants 3, 4 allocate additional space for coefficients
+      HYPRE_Int nCoeffs = 2;
+      if(variant == 3 || variant == 4){
+         nCoeffs += order;
+      }
+
+      hypre_TFree(coefs, HYPRE_MEMORY_HOST);
+      coefs = hypre_CTAlloc(HYPRE_Real, nCoeffs, HYPRE_MEMORY_HOST);
+      coefs[0] = upper_bound;
+      coefs[1] = lower_bound; // not actually used for variants 3, 4
+
+      if(variant == 4){
+        optimalWeightsImpl(coefs + 2, order);
+      }
+      if(variant == 3){
+         for(HYPRE_Int i = 0; i < order; ++i){
+            coefs[2 + i] = 1.0;
+         }
+      }
+   }
    *coefs_ptr = coefs;
 
    if (scale)
    {
       /*grab 1/hypre_sqrt(abs(diagonal)) */
       ds_data = hypre_CTAlloc(HYPRE_Real, num_rows, hypre_ParCSRMatrixMemoryLocation(A));
-      hypre_CSRMatrixExtractDiagonal(hypre_ParCSRMatrixDiag(A), ds_data, 4);
+      if(variant == 0 || variant == 1){
+        hypre_CSRMatrixExtractDiagonal(hypre_ParCSRMatrixDiag(A), ds_data, 4);
+      } else {
+        // 1/diagonal
+        hypre_CSRMatrixExtractDiagonal(hypre_ParCSRMatrixDiag(A), ds_data, 2);
+      }
    } /* end of scaling code */
    *ds_ptr = ds_data;
 
@@ -254,130 +480,259 @@ hypre_ParCSRRelax_Cheby_SolveHost(hypre_ParCSRMatrix *A, /* matrix to relax with
 
 
    /* u = u + p(A)r */
-
-   if (order > 4)
-   {
-      order = 4;
-   }
    if (order < 1)
    {
       order = 1;
    }
 
+   HYPRE_Int maxOrder = 4;
+   if(variant == 2 || variant == 3){
+      maxOrder = 9999;
+   }
+   if(variant == 4) maxOrder = 16; // due to writing out coefficients
+
+   if (order > maxOrder)
+   {
+      order = maxOrder;
+   }
+
    /* we are using the order of p(A) */
    cheby_order = order - 1;
 
    hypre_assert(hypre_VectorSize(hypre_ParVectorLocalVector(orig_u_vec)) >= num_rows);
    orig_u = hypre_VectorData(hypre_ParVectorLocalVector(orig_u_vec));
 
-   if (!scale)
-   {
-      /* get residual: r = f - A*u */
-      hypre_ParVectorCopy(f, r);
-      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
+   if (variant == 0 || variant == 1){
+     if (!scale)
+     {
+        /* get residual: r = f - A*u */
+        hypre_ParVectorCopy(f, r);
+        hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
+
+        /* o = u; u = r .* coef */
+        for ( i = 0; i < num_rows; i++ )
+        {
+           orig_u[i] = u_data[i];
+           u_data[i] = r_data[i] * coefs[cheby_order];
+        }
+        for (i = cheby_order - 1; i >= 0; i-- )
+        {
+           hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v);
+           mult = coefs[i];
+           /* u = mult * r + v */
+#ifdef HYPRE_USING_OPENMP
+           #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+#endif
+           for ( j = 0; j < num_rows; j++ )
+           {
+              u_data[j] = mult * r_data[j] + v_data[j];
+           }
+        }
 
-      /* o = u; u = r .* coef */
-      for ( i = 0; i < num_rows; i++ )
-      {
-         orig_u[i] = u_data[i];
-         u_data[i] = r_data[i] * coefs[cheby_order];
-      }
-      for (i = cheby_order - 1; i >= 0; i-- )
-      {
-         hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v);
-         mult = coefs[i];
-         /* u = mult * r + v */
+        /* u = o + u */
 #ifdef HYPRE_USING_OPENMP
-         #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+        #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
 #endif
-         for ( j = 0; j < num_rows; j++ )
-         {
-            u_data[j] = mult * r_data[j] + v_data[j];
-         }
-      }
+        for ( i = 0; i < num_rows; i++ )
+        {
+           u_data[i] = orig_u[i] + u_data[i];
+        }
+     }
+     else /* scaling! */
+     {
+
+        /*grab 1/sqrt(diagonal) */
+        tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
+
+        /* get ds_data and get scaled residual: r = D^(-1/2)f -
+           * D^(-1/2)A*u */
+
+        hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec);
+        /* r = ds .* (f + tmp) */
+#ifdef HYPRE_USING_OPENMP
+        #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+#endif
+        for ( j = 0; j < num_rows; j++ )
+        {
+           r_data[j] = ds_data[j] * (f_data[j] + tmp_data[j]);
+        }
+
+        /* save original u, then start
+           the iteration by multiplying r by the cheby coef.*/
 
-      /* u = o + u */
+        /* o = u;  u = r * coef */
 #ifdef HYPRE_USING_OPENMP
-      #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
+        #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
 #endif
-      for ( i = 0; i < num_rows; i++ )
-      {
-         u_data[i] = orig_u[i] + u_data[i];
-      }
+        for ( j = 0; j < num_rows; j++ )
+        {
+           orig_u[j] = u_data[j]; /* orig, unscaled u */
+
+           u_data[j] = r_data[j] * coefs[cheby_order];
+        }
+
+        /* now do the other coefficients */
+        for (i = cheby_order - 1; i >= 0; i-- )
+        {
+           /* v = D^(-1/2)AD^(-1/2)u */
+           /* tmp = ds .* u */
+#ifdef HYPRE_USING_OPENMP
+           #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+#endif
+           for ( j = 0; j < num_rows; j++ )
+           {
+              tmp_data[j]  =  ds_data[j] * u_data[j];
+           }
+           hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v);
+
+           /* u_new = coef*r + v*/
+           mult = coefs[i];
+
+           /* u = coef * r + ds .* v */
+#ifdef HYPRE_USING_OPENMP
+           #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+#endif
+           for ( j = 0; j < num_rows; j++ )
+           {
+              u_data[j] = mult * r_data[j] + ds_data[j] * v_data[j];
+           }
+
+        } /* end of cheby_order loop */
+
+        /* now we have to scale u_data before adding it to u_orig*/
+
+        /* u = orig_u + ds .* u */
+#ifdef HYPRE_USING_OPENMP
+        #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+#endif
+        for ( j = 0; j < num_rows; j++ )
+        {
+           u_data[j] = orig_u[j] + ds_data[j] * u_data[j];
+        }
+
+     }/* end of scaling code */
    }
-   else /* scaling! */
-   {
+   else if (variant == 2){
+      const HYPRE_Real lambda_max = coefs[0];
+      const HYPRE_Real lambda_min = coefs[1];
+
+      const HYPRE_Real theta = 0.5 * (lambda_max + lambda_min);
+      const HYPRE_Real delta = 0.5 * (lambda_max - lambda_min);
+      const HYPRE_Real sigma = theta / delta;
+      const HYPRE_Real invTheta = 1.0 / theta;
+      HYPRE_Real rho = 1.0 / sigma;
 
-      /*grab 1/hypre_sqrt(diagonal) */
       tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
 
-      /* get ds_data and get scaled residual: r = D^(-1/2)f -
-         * D^(-1/2)A*u */
+      // r := f - A*u
+      hypre_ParVectorCopy(f, r);
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
+
+      // r = D^{-1} r
+      // v := 1/theta r
+#ifdef HYPRE_USING_OPENMP
+        #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+#endif
+        for ( j = 0; j < num_rows; j++ )
+        {
+           const HYPRE_Real r = ds_data[j] * r_data[j];
+           r_data[j] = r;
+           v_data[j] = r * invTheta;
+        }
+      
+      for(int i = 0; i < cheby_order; ++i){
+        // tmp = Av
+        hypre_ParCSRMatrixMatvec(1.0, A, v, 0.0, tmp_vec);
+
+        const HYPRE_Real rhoSave = rho;
+        rho = 1.0 / (2 * sigma - rho);
+
+        const HYPRE_Real vcoef = rho * rhoSave;
+        const HYPRE_Real rcoef = 2.0 * rho / delta;
+
+#ifdef HYPRE_USING_OPENMP
+          #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+#endif
+          for ( j = 0; j < num_rows; j++ )
+          {
+             const HYPRE_Real v = v_data[j];
+             u_data[j] += v;
+
+             const HYPRE_Real r = r_data[j] - ds_data[j] * tmp_data[j];
+             r_data[j] = r;
+             v_data[j] = vcoef * v + rcoef * r;
+          }
+      }
 
-      hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec);
-      /* r = ds .* (f + tmp) */
+      // u += v;
 #ifdef HYPRE_USING_OPENMP
       #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
 #endif
       for ( j = 0; j < num_rows; j++ )
       {
-         r_data[j] = ds_data[j] * (f_data[j] + tmp_data[j]);
+         u_data[j] += v_data[j];
       }
 
-      /* save original u, then start
-         the iteration by multiplying r by the cheby coef.*/
+   }
+   else if(variant == 3 || variant == 4)
+   {
+      const HYPRE_Real lambda_max = coefs[0];
+      const HYPRE_Int coeffOffset = 2;
+
+      tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
+
+      // r := f - A*u
+      hypre_ParVectorCopy(f, r);
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
 
-      /* o = u;  u = r * coef */
+      // v := \dfrac{4}{3} \dfrac{1}{\rho(D^{-1}A)} D^{-1} r
+      HYPRE_Real coef = 4.0 / (3.0 * lambda_max);
 #ifdef HYPRE_USING_OPENMP
       #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
 #endif
       for ( j = 0; j < num_rows; j++ )
       {
-         orig_u[j] = u_data[j]; /* orig, unscaled u */
-
-         u_data[j] = r_data[j] * coefs[cheby_order];
+         v_data[j] = coef * ds_data[j] * r_data[j];
       }
+      
+      for(int i = 0; i < cheby_order; ++i){
+        const HYPRE_Real beta_i = coefs[coeffOffset + i];
+        // r = r - Av
+        hypre_ParCSRMatrixMatvec(-1.0, A, v, 1.0, r);
+
+        // + 2 offset is due to two issues:
+        // + 1 is from https://arxiv.org/pdf/2202.08830.pdf being written in 1-based indexing
+        // + 1 is from pre-computing z_1 _outside_ of the loop
+        // u += \beta_i v
+        // v = \dfrac{(2i-3)}{(2i+1)} v + \dfrac{(8i-4)}{(2i+1)} \dfrac{1}{\rho(SA)} S r
+        const HYPRE_Int id = i + 2;
+        const HYPRE_Real vScale = (2.0 * id - 3.0) / (2.0 * id + 1.0);
+        const HYPRE_Real rScale = (8.0 * id - 4.0) / ((2.0 * id + 1.0) * lambda_max);
 
-      /* now do the other coefficients */
-      for (i = cheby_order - 1; i >= 0; i-- )
-      {
-         /* v = D^(-1/2)AD^(-1/2)u */
-         /* tmp = ds .* u */
-#ifdef HYPRE_USING_OPENMP
-         #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
-#endif
-         for ( j = 0; j < num_rows; j++ )
-         {
-            tmp_data[j]  =  ds_data[j] * u_data[j];
-         }
-         hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v);
-
-         /* u_new = coef*r + v*/
-         mult = coefs[i];
-
-         /* u = coef * r + ds .* v */
 #ifdef HYPRE_USING_OPENMP
-         #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
+        #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
 #endif
-         for ( j = 0; j < num_rows; j++ )
-         {
-            u_data[j] = mult * r_data[j] + ds_data[j] * v_data[j];
-         }
+        for ( j = 0; j < num_rows; j++ )
+        {
+           const HYPRE_Real v = v_data[j];
+           u_data[j] += beta_i * v;
+           v_data[j] = vScale * v + rScale * ds_data[j] * r_data[j];
+        }
 
-      } /* end of cheby_order loop */
+      }
 
-      /* now we have to scale u_data before adding it to u_orig*/
+      const HYPRE_Real beta_order = coefs[coeffOffset + cheby_order];
 
-      /* u = orig_u + ds .* u */
+      // u += \beta v;
 #ifdef HYPRE_USING_OPENMP
       #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
 #endif
       for ( j = 0; j < num_rows; j++ )
       {
-         u_data[j] = orig_u[j] + ds_data[j] * u_data[j];
+         u_data[j] += beta_order * v_data[j];
       }
 
-   }/* end of scaling code */
+   }
 
    return hypre_error_flag;
 }
diff --git a/src/parcsr_ls/par_cheby_device.c b/src/parcsr_ls/par_cheby_device.c
index e2ab368462..af7df44e12 100644
--- a/src/parcsr_ls/par_cheby_device.c
+++ b/src/parcsr_ls/par_cheby_device.c
@@ -17,6 +17,7 @@
 
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
 #include "_hypre_utilities.hpp"
+#include <algorithm>
 
 /**
  * @brief waxpyz
@@ -97,6 +98,171 @@ struct xpyz
    }
 };
 
+/**
+ * @brief scale
+ *
+ * Performs
+ * x = d .* x
+ * For scalars x, d
+ */
+template <typename T>
+struct scaleInPlace
+{
+   typedef thrust::tuple<T, T&> Tuple;
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<1>(t) = thrust::get<0>(t) * thrust::get<1>(t);
+   }
+};
+
+/**
+ * @brief add
+ *
+ * Performs
+ * x = x + coef * y
+ * For scalars x, d
+ */
+template <typename T>
+struct add
+{
+   typedef thrust::tuple<T, T&> Tuple;
+
+   const T coef;
+   add(T _coef = 1.0) : coef(_coef) {}
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<1>(t) = coef * thrust::get<0>(t) + thrust::get<1>(t);
+   }
+};
+
+/**
+ * @brief add
+ *
+ * Performs
+ * x = x + coef * d.*y
+ * For scalars x, d
+ */
+template <typename T>
+struct scaledAdd
+{
+   typedef thrust::tuple<T, T, T&> Tuple;
+   const T scale;
+   scaledAdd(T _scale) : scale(_scale) {}
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<2>(t) = thrust::get<2>(t) + scale * thrust::get<0>(t) * thrust::get<1>(t);
+   }
+};
+
+/**
+ * @brief add
+ *
+ * Performs
+ * r = r - D .* tmp
+ * v = coef0 * r + coef1 * v
+ * 
+ */
+template <typename T>
+struct updateRAndV
+{
+   typedef thrust::tuple<T, T, T&, T&> Tuple;
+   const T coef0;
+   const T coef1;
+   updateRAndV(T _coef0, T _coef1) : coef0(_coef0), coef1(_coef1) {}
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<2>(t) = thrust::get<2>(t) - thrust::get<0>(t) * thrust::get<1>(t);
+      thrust::get<3>(t) = coef0 * thrust::get<2>(t) + coef1 * thrust::get<3>(t);
+   }
+};
+
+/**
+ * @brief scale
+ *
+ * Performs
+ * y = coef * d .* x
+ * For scalars x, d, y
+ */
+template <typename T>
+struct applySmoother
+{
+   typedef thrust::tuple<T, T, T&> Tuple;
+
+   const T coef;
+   applySmoother(T _coef = 1.0) : coef(_coef) {}
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<2>(t) = coef * thrust::get<0>(t) * thrust::get<1>(t);
+   }
+};
+
+/**
+ * @brief waxpyz
+ *
+ * Performs
+ * y = a * x
+ * constant a
+ */
+template <typename T>
+struct scaleConstant
+{
+   typedef thrust::tuple<T, T&> Tuple;
+
+   const T scale;
+   scaleConstant(T _scale) : scale(_scale) {}
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<1>(t) = scale * thrust::get<0>(t);
+   }
+};
+
+/**
+ * @brief update
+ *
+ * Performs
+ * d = scale0 * r + scale1 * d
+ */
+template <typename T>
+struct update
+{
+   typedef thrust::tuple<T, T&> Tuple;
+
+   const T scale0;
+   const T scale1;
+   update(T _scale0, T _scale1) : scale0(_scale0), scale1(_scale1) {}
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<1>(t) = scale1 * thrust::get<1>(t) + scale0 * thrust::get<0>(t);
+   }
+};
+
+/**
+ * @brief updateCol
+ *
+ * Performs
+ * d = scale0 * x.*r + scale1 * d
+ */
+template <typename T>
+struct updateCol
+{
+   typedef thrust::tuple<T, T, T&> Tuple;
+
+   const T scale0;
+   const T scale1;
+   updateCol(T _scale0, T _scale1) : scale0(_scale0), scale1(_scale1) {}
+
+   __host__ __device__ void operator()(Tuple t)
+   {
+      thrust::get<2>(t) = scale1 * thrust::get<2>(t) + scale0 * thrust::get<1>(t) * thrust::get<0>(t);
+   }
+};
 /**
  * @brief Solve using a chebyshev polynomial on the device
  *
@@ -142,9 +308,21 @@ hypre_ParCSRRelax_Cheby_SolveDevice(hypre_ParCSRMatrix *A, /* matrix to relax wi
    HYPRE_Real *tmp_data;
 
    /* u = u + p(A)r */
+   if (order < 1)
+   {
+      order = 1;
+   }
+
+   HYPRE_Int maxOrder = 4;
+   if(variant == 2 || variant == 3){
+      maxOrder = 9999;
+   }
+   if(variant == 4) maxOrder = 16; // due to writing out coefficients
 
-   if (order > 4) { order = 4; }
-   if (order < 1) { order = 1; }
+   if (order > maxOrder)
+   {
+      order = maxOrder;
+   }
 
    /* we are using the order of p(A) */
    cheby_order = order - 1;
@@ -152,92 +330,204 @@ hypre_ParCSRRelax_Cheby_SolveDevice(hypre_ParCSRMatrix *A, /* matrix to relax wi
    hypre_assert(hypre_VectorSize(hypre_ParVectorLocalVector(orig_u_vec)) >= num_rows);
    HYPRE_Real *orig_u = hypre_VectorData(hypre_ParVectorLocalVector(orig_u_vec));
 
-   if (!scale)
-   {
-      /* get residual: r = f - A*u */
-      hypre_ParVectorCopy(f, r);
-      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
-
-      /* o = u; u = r .* coef */
-      HYPRE_THRUST_CALL(
-         for_each,
-         thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)),
-         thrust::make_zip_iterator(thrust::make_tuple(orig_u + num_rows, u_data + num_rows,
-                                                      r_data + num_rows)),
-         save_and_scale<HYPRE_Real>(coefs[cheby_order]));
-
-      for (i = cheby_order - 1; i >= 0; i--)
-      {
-         hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v);
-         mult = coefs[i];
-
-         /* u = mult * r + v */
-         hypreDevice_ComplexAxpyn( r_data, num_rows, v_data, u_data, mult );
-      }
-
-      /* u = o + u */
-      hypreDevice_ComplexAxpyn( orig_u, num_rows, u_data, u_data, 1.0);
+   if(variant == 0 || variant == 1){
+
+     if (!scale)
+     {
+        /* get residual: r = f - A*u */
+        hypre_ParVectorCopy(f, r);
+        hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
+
+        /* o = u; u = r .* coef */
+        HYPRE_THRUST_CALL(
+           for_each,
+           thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)),
+           thrust::make_zip_iterator(thrust::make_tuple(orig_u + num_rows, u_data + num_rows,
+                                                        r_data + num_rows)),
+           save_and_scale<HYPRE_Real>(coefs[cheby_order]));
+
+        for (i = cheby_order - 1; i >= 0; i--)
+        {
+           hypre_ParCSRMatrixMatvec(1.0, A, u, 0.0, v);
+           mult = coefs[i];
+
+           /* u = mult * r + v */
+           hypreDevice_ComplexAxpyn( r_data, num_rows, v_data, u_data, mult );
+        }
+
+        /* u = o + u */
+        hypreDevice_ComplexAxpyn( orig_u, num_rows, u_data, u_data, 1.0);
+     }
+     else /* scaling! */
+     {
+
+        /*grab 1/sqrt(diagonal) */
+
+        tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
+
+        /* get ds_data and get scaled residual: r = D^(-1/2)f -
+         * D^(-1/2)A*u */
+
+        hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec);
+        /* r = ds .* (f + tmp) */
+
+        /* TODO: It might be possible to merge this and the next call to:
+         * r[j] = ds_data[j] * (f_data[j] + tmp_data[j]); o[j] = u[j]; u[j] = r[j] * coef */
+        HYPRE_THRUST_CALL(for_each,
+                          thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)),
+                          thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)) + num_rows,
+                          wxypz<HYPRE_Real>());
+
+        /* save original u, then start
+           the iteration by multiplying r by the cheby coef.*/
+
+        /* o = u;  u = r * coef */
+        HYPRE_THRUST_CALL(for_each,
+                          thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)),
+                          thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)) + num_rows,
+                          save_and_scale<HYPRE_Real>(coefs[cheby_order]));
+
+        /* now do the other coefficients */
+        for (i = cheby_order - 1; i >= 0; i--)
+        {
+           /* v = D^(-1/2)AD^(-1/2)u */
+           /* tmp = ds .* u */
+           HYPRE_THRUST_CALL( transform, ds_data, ds_data + num_rows, u_data, tmp_data, _1 * _2 );
+
+           hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v);
+
+           /* u_new = coef*r + v*/
+           mult = coefs[i];
+
+           /* u = coef * r + ds .* v */
+           HYPRE_THRUST_CALL(for_each,
+                             thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)),
+                             thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)) + num_rows,
+                             waxpyz<HYPRE_Real>(mult));
+        } /* end of cheby_order loop */
+
+        /* now we have to scale u_data before adding it to u_orig*/
+
+        /* u = orig_u + ds .* u */
+        HYPRE_THRUST_CALL(
+           for_each,
+           thrust::make_zip_iterator(thrust::make_tuple(u_data, orig_u, ds_data)),
+           thrust::make_zip_iterator(thrust::make_tuple(u_data + num_rows, orig_u + num_rows,
+                                                        ds_data + num_rows)),
+           xpyz<HYPRE_Real>());
+
+
+     } /* end of scaling code */
    }
-   else /* scaling! */
+   else if(variant == 2)
    {
+      const auto lambda_max = coefs[0];
+      const auto lambda_min = coefs[1];
 
-      /*grab 1/hypre_sqrt(diagonal) */
+      const auto theta = 0.5 * (lambda_max + lambda_min);
+      const auto delta = 0.5 * (lambda_max - lambda_min);
+      const auto sigma = theta / delta;
+      auto rho = 1.0 / sigma;
 
       tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
 
-      /* get ds_data and get scaled residual: r = D^(-1/2)f -
-       * D^(-1/2)A*u */
+      // r := f - A*u
+      hypre_ParVectorCopy(f, r);
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
 
-      hypre_ParCSRMatrixMatvec(-1.0, A, u, 0.0, tmp_vec);
-      /* r = ds .* (f + tmp) */
+      // TODO: consolidate two calls below
 
-      /* TODO: It might be possible to merge this and the next call to:
-       * r[j] = ds_data[j] * (f_data[j] + tmp_data[j]); o[j] = u[j]; u[j] = r[j] * coef */
+      // r = D^{-1} r
       HYPRE_THRUST_CALL(for_each,
-                        thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)),
-                        thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, f_data, tmp_data)) + num_rows,
-                        wxypz<HYPRE_Real>());
-
-      /* save original u, then start
-         the iteration by multiplying r by the cheby coef.*/
-
-      /* o = u;  u = r * coef */
+                        thrust::make_zip_iterator(thrust::make_tuple(ds_data, r_data)),
+                        thrust::make_zip_iterator(thrust::make_tuple(ds_data, r_data)) + num_rows,
+                        scaleInPlace<HYPRE_Real>());
+      
+      // v := 1/theta r
       HYPRE_THRUST_CALL(for_each,
-                        thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)),
-                        thrust::make_zip_iterator(thrust::make_tuple(orig_u, u_data, r_data)) + num_rows,
-                        save_and_scale<HYPRE_Real>(coefs[cheby_order]));
+                        thrust::make_zip_iterator(thrust::make_tuple(r_data, v_data)),
+                        thrust::make_zip_iterator(thrust::make_tuple(r_data, v_data)) + num_rows,
+                        scaleConstant<HYPRE_Real>(1.0 / theta));
+      
+      for(int i = 0; i < cheby_order; ++i){
+        // u += v
+        HYPRE_THRUST_CALL(for_each,
+                          thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)),
+                          thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows,
+                          add<HYPRE_Real>());
+        // tmp = Av
+        hypre_ParCSRMatrixMatvec(1.0, A, v, 0.0, tmp_vec);
+
+        const auto rhoSave = rho;
+        rho = 1.0 / (2 * sigma - rho);
+
+        const auto vcoef = rho * rhoSave;
+        const auto rcoef = 2.0 * rho / delta;
+
+        // r = r - D^{-1} Av
+        // v = rho_{k+1} rho_k * v + 2 rho_{k+1} / delta r
+        HYPRE_THRUST_CALL(for_each,
+                          thrust::make_zip_iterator(thrust::make_tuple(ds_data, tmp_data, r_data, v_data)),
+                          thrust::make_zip_iterator(thrust::make_tuple(ds_data, tmp_data, r_data, v_data)) + num_rows,
+                          updateRAndV<HYPRE_Real>(rcoef, vcoef));
+      }
 
-      /* now do the other coefficients */
-      for (i = cheby_order - 1; i >= 0; i--)
-      {
-         /* v = D^(-1/2)AD^(-1/2)u */
-         /* tmp = ds .* u */
-         HYPRE_THRUST_CALL( transform, ds_data, ds_data + num_rows, u_data, tmp_data, _1 * _2 );
+      // u += v;
+      HYPRE_THRUST_CALL(for_each,
+                        thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)),
+                        thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows,
+                        add<HYPRE_Real>());
 
-         hypre_ParCSRMatrixMatvec(1.0, A, tmp_vec, 0.0, v);
+   }
+   else if(variant == 3 || variant == 4)
+   {
+      const auto lambda_max = coefs[0];
+      const auto coeffOffset = 2;
 
-         /* u_new = coef*r + v*/
-         mult = coefs[i];
+      tmp_data = hypre_VectorData(hypre_ParVectorLocalVector(tmp_vec));
 
-         /* u = coef * r + ds .* v */
-         HYPRE_THRUST_CALL(for_each,
-                           thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)),
-                           thrust::make_zip_iterator(thrust::make_tuple(u_data, r_data, ds_data, v_data)) + num_rows,
-                           waxpyz<HYPRE_Real>(mult));
-      } /* end of cheby_order loop */
+      // r := f - A*u
+      hypre_ParVectorCopy(f, r);
+      hypre_ParCSRMatrixMatvec(-1.0, A, u, 1.0, r);
 
-      /* now we have to scale u_data before adding it to u_orig*/
+      // v := \dfrac{4}{3} \dfrac{1}{\rho(D^{-1}A)} D^{-1} r
+      HYPRE_THRUST_CALL(for_each,
+                        thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)),
+                        thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)) + num_rows,
+                        applySmoother<HYPRE_Real>(4.0 / 3.0 / lambda_max));
+      
+      for(int i = 0; i < cheby_order; ++i){
+        // u += \beta_k v
+        // since this is _not_ the optimized variant, \beta := 1.0
+        HYPRE_THRUST_CALL(for_each,
+                          thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)),
+                          thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows,
+                          add<HYPRE_Real>(coefs[coeffOffset + i]));
+        // r = r - Av
+        hypre_ParCSRMatrixMatvec(-1.0, A, v, 1.0, r);
+
+        // + 2 offset is due to two issues:
+        // + 1 is from https://arxiv.org/pdf/2202.08830.pdf being written in 1-based indexing
+        // + 1 is from pre-computing z_1 _outside_ of the loop
+        // v = \dfrac{(2i-3)}{(2i+1)} v + \dfrac{(8i-4)}{(2i+1)} \dfrac{1}{\rho(SA)} S r
+        const auto id = i + 2;
+        const auto vScale = (2.0 * id - 3.0) / (2.0 * id + 1.0);
+        const auto rScale = (8.0 * id - 4.0) / (2.0 * id + 1.0) / lambda_max;
+
+        HYPRE_THRUST_CALL(for_each,
+                          thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)),
+                          thrust::make_zip_iterator(thrust::make_tuple(r_data, ds_data, v_data)) + num_rows,
+                          updateCol<HYPRE_Real>(rScale, vScale));
 
-      /* u = orig_u + ds .* u */
-      HYPRE_THRUST_CALL(
-         for_each,
-         thrust::make_zip_iterator(thrust::make_tuple(u_data, orig_u, ds_data)),
-         thrust::make_zip_iterator(thrust::make_tuple(u_data + num_rows, orig_u + num_rows,
-                                                      ds_data + num_rows)),
-         xpyz<HYPRE_Real>());
+      }
 
+      // u += \beta v;
+      HYPRE_THRUST_CALL(for_each,
+                        thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)),
+                        thrust::make_zip_iterator(thrust::make_tuple(v_data, u_data)) + num_rows,
+                        add<HYPRE_Real>(coefs[coeffOffset + cheby_order]));
 
-   } /* end of scaling code */
+   }
 
    return hypre_error_flag;
 }
diff --git a/src/parcsr_ls/par_cycle.c b/src/parcsr_ls/par_cycle.c
index 9a8f79a96a..4baf601a24 100644
--- a/src/parcsr_ls/par_cycle.c
+++ b/src/parcsr_ls/par_cycle.c
@@ -549,6 +549,10 @@ hypre_BoomerAMGCycle( void              *amg_vdata,
                                                 cheby_order, scale,
                                                 variant, Aux_U, Vtemp, Ztemp, Ptemp, Rtemp );
                }
+               else if (relax_type == 19)
+               {
+                  // no-op
+               }
                else if (relax_type == 17)
                {
                   if (level == num_levels - 1)