diff --git a/ff/batch_inversion.hpp b/ff/batch_inversion.hpp index 1c3eceb..64b9b99 100644 --- a/ff/batch_inversion.hpp +++ b/ff/batch_inversion.hpp @@ -8,8 +8,8 @@ /* * Since the batch inversion requires twice the storage, on GPU there * is incentive to use the shared memory. If deemed beneficial, the - * suggestion is to derive S from T adding operator[] that would address - * the shared memory and offload the input. + * suggestion is to have the caller wrap T[] in S with a custom operator[] + * that would address the shared memory and offload the input. */ template #ifdef __CUDACC__