diff --git a/ff/batch_inversion.hpp b/ff/batch_inversion.hpp
index 1c3eceb..64b9b99 100644
--- a/ff/batch_inversion.hpp
+++ b/ff/batch_inversion.hpp
@@ -8,8 +8,8 @@
 /*
  * Since the batch inversion requires twice the storage, on GPU there
  * is incentive to use the shared memory. If deemed beneficial, the
- * suggestion is to derive S from T adding operator[] that would address
- * the shared memory and offload the input.
+ * suggestion is to have the caller wrap T[] in S with a custom operator[]
+ * that would address the shared memory and offload the input.
  */
 template<class T, size_t N, typename S = T[N]>
 #ifdef __CUDACC__