Merge pull request #5 from jlparkI/develop
0.2.0.5 merge
jlparkI authored Jan 29, 2024
2 parents 18bf228 + c6f7515 commit cb473c9
Showing 14 changed files with 67 additions and 154 deletions.
9 changes: 9 additions & 0 deletions HISTORY.md
@@ -165,3 +165,12 @@ only intermittently useful.

Fixed a bug in automatic preconditioner rank selection for
classification.

### Version 0.2.0.2

Altered the definition of averaging for conv kernels.

### Version 0.2.0.5

Fixed a bug in random feature generation for RBF-based kernels in
situations where the number of input features is very large.
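
The 0.2.0.2 entry above corresponds to the `scalingTerm` change visible in the convolution wrappers below: when conv-kernel features are averaged over convolution windows, the scaling constant is now divided by the number of windows rather than by its square root. A minimal NumPy sketch of that constant, using paraphrased names (`num_rffs` standing in for `chiArr.shape[0]`, `num_windows` for `reshapedX.shape[1]`); illustrative only, not code from the package:

```python
import numpy as np

def conv_scaling_term(num_rffs: int, num_windows: int,
                      average_features: bool, new_definition: bool = True) -> float:
    """Sketch of the scaling constant applied to RBF-type conv kernel features."""
    scaling = np.sqrt(2 / num_rffs)
    if average_features:
        if new_definition:
            # 0.2.0.2 and later: divide by the number of windows (a true mean).
            scaling /= num_windows
        else:
            # Earlier behavior: divide by sqrt(number of windows).
            scaling /= np.sqrt(num_windows)
    return scaling
```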
9 changes: 3 additions & 6 deletions test/fht_operations_tests/rbf_fht.py
@@ -11,12 +11,9 @@

from cpu_rf_gen_module import cpuSORFTransform as cSORF

try:
from cuda_rf_gen_module import cudaRBFFeatureGen as cudaRBF
from cuda_rf_gen_module import cudaRBFGrad as cudaRBFGrad
import cupy as cp
except:
pass
from cuda_rf_gen_module import cudaRBFFeatureGen as cudaRBF
from cuda_rf_gen_module import cudaRBFGrad as cudaRBFGrad
import cupy as cp


class TestRBFFeatureGen(unittest.TestCase):
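
Dropping the `try`/`except` around the CUDA imports means this test module now fails at import time on machines without the compiled CUDA extension and CuPy. If optional GPU test runs were wanted instead, one common alternative (not what this commit does) is an availability flag plus `unittest.skipUnless`; a sketch under that assumption, with an illustrative class name:

```python
import unittest

try:
    import cupy as cp
    from cuda_rf_gen_module import cudaRBFFeatureGen as cudaRBF
    from cuda_rf_gen_module import cudaRBFGrad as cudaRBFGrad
    HAS_CUDA = True
except ImportError:
    HAS_CUDA = False


@unittest.skipUnless(HAS_CUDA, "CUDA extension / CuPy not available")
class TestRBFFeatureGenGPU(unittest.TestCase):
    """GPU-specific checks; collected but skipped when CUDA is unavailable."""

    def test_placeholder(self):
        self.assertTrue(HAS_CUDA)
```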
2 changes: 1 addition & 1 deletion xGPR/__init__.py
@@ -1,6 +1,6 @@
#Version number. Updated if generating a new release.
#Otherwise, do not change.
__version__ = "0.2.0.1"
__version__ = "0.2.0.5"

#Key imports.
from .xgp_regression import xGPRegression
6 changes: 3 additions & 3 deletions xGPR/random_feature_generation/cpu_rf_gen/cpu_convolution.pyx
@@ -219,7 +219,7 @@ def cpuConv1dFGen(np.ndarray[floating, ndim=3] reshapedX,
scalingTerm = np.sqrt(2 / <double>(chiArr.shape[0]))

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if chiArr.dtype == "float32" and reshapedX.dtype == "float32":
errCode = convRBFFeatureGen_[float](&radem[0,0,0], <float*>addr_input,
@@ -324,7 +324,7 @@ def cpuConvGrad(np.ndarray[floating, ndim=3] reshapedX,
else:
scalingTerm = np.sqrt(2 / <double>(chiArr.shape[0]))
if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]


if chiArr.dtype == "float32" and reshapedX.dtype == "float32":
@@ -432,7 +432,7 @@ def cpuConv1dArcCosFGen(np.ndarray[floating, ndim=3] reshapedX,
else:
scalingTerm = np.sqrt(1 / <double>(chiArr.shape[0]))
if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if chiArr.dtype == "float32" and reshapedX.dtype == "float32":
errCode = convArcCosFeatureGen_[float](&radem[0,0,0], <float*>addr_input,
@@ -109,9 +109,10 @@ __global__ void levelNTransform(T cArray[], int arrsize,
//Equivalent to pos mod spacing IF spacing is a power of 2.
int lo = (pos & (spacing - 1));
int id = lo + ((pos - lo) << 1);
T y, *cPtr = cArray + id;

if (id < arrsize){
T y, *cPtr = cArray + id;

y = cPtr[spacing];
cPtr[spacing] = *cPtr - y;
*cPtr += y;
@@ -132,9 +133,11 @@ __global__ void multiplyByDiagonalRademacherMat(T cArray[], int8_t *rademArray,
int rVal, position;

position = tid % numElementsPerRow;
rVal = rademArray[position];
if (tid < numElements)

if (tid < numElements){
rVal = rademArray[position];
cArray[tid] = cArray[tid] * rVal * normConstant;
}
}

//We perform the transform over the last dimension
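
The change in this file shows a pattern repeated across the CUDA kernels in this commit: per-thread loads and pointer setup that used to execute before the `if (tid < arrsize)` / `if (tid < numElements)` bounds check now sit inside it, so threads whose id falls past the end of the data touch no memory at all. Such threads always exist because the launch grid is rounded up to whole blocks. A host-side NumPy mock-up of the thread indexing with the corrected guard order (illustrative only; names paraphrased from `multiplyByDiagonalRademacherMat`):

```python
import numpy as np

THREADS_PER_BLOCK = 256  # assumed launch configuration, for illustration only

def mock_diag_rademacher(c_array: np.ndarray, radem_row: np.ndarray,
                         norm_constant: float) -> np.ndarray:
    """Host-side mock of a diagonal Rademacher multiply with the fixed guard order."""
    num_elements = c_array.size
    num_per_row = radem_row.size
    blocks = (num_elements + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
    out = c_array.copy()
    for tid in range(blocks * THREADS_PER_BLOCK):
        # Bounds check first: out-of-range thread ids issue no reads or writes.
        if tid < num_elements:
            position = tid % num_per_row
            out[tid] = out[tid] * radem_row[position] * norm_constant
    return out

rng = np.random.default_rng(0)
x = rng.standard_normal(1000)
radem = rng.choice(np.array([-1, 1], dtype=np.int8), size=100)
scaled = mock_diag_rademacher(x, radem, 1.0 / np.sqrt(100))
```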
@@ -67,10 +67,11 @@ __global__ void convArcCosPostProcessKernelOrder1(const T featureArray[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * numFreqs + column + startPosition;
T *chiVal = chiArr + startPosition + column;
T chiProd, rollingSum = 0;

if (tid < numElements){
T *chiVal = chiArr + startPosition + column;

for (i=0; i < dim1; i++){
chiProd = *chiVal * featureArray[inputLoc];
rollingSum += max(chiProd, 0.0);
@@ -96,10 +97,11 @@ __global__ void convArcCosPostProcessKernelOrder2(const T featureArray[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * numFreqs + column + startPosition;
T *chiVal = chiArr + startPosition + column;
T chiProd, rollingSum = 0;

if (tid < numElements){
T *chiVal = chiArr + startPosition + column;

for (i=0; i < dim1; i++){
chiProd = *chiVal * featureArray[inputLoc];
chiProd = max(chiProd, 0.0);
@@ -43,14 +43,15 @@ __global__ void ardConvGradSetup(double *gradientArray,
int precompWRow = (tid % numFreqs);
int gradRow = tid / numFreqs;

T *precompWElement = precomputedWeights + precompWRow * dim2;
T *inputXElement = inputX + gradRow * dim1 * dim2;
double *gradientElement = gradientArray + 2 * (gradRow * numFreqs + precompWRow) * numLengthscales;
double *randomFeature = randomFeatures + 2 * gradRow * numFreqs + 2 * precompWRow;
double *bufferElement = copyBuffer + (gradRow * numFreqs + precompWRow) * numLengthscales;
double rfVal = 0, outVal, sinVal, cosVal;

if (tid < numSetupElements){
T *precompWElement = precomputedWeights + precompWRow * dim2;
T *inputXElement = inputX + gradRow * dim1 * dim2;
double *gradientElement = gradientArray + 2 * (gradRow * numFreqs + precompWRow) * numLengthscales;
double *randomFeature = randomFeatures + 2 * gradRow * numFreqs + 2 * precompWRow;
double *bufferElement = copyBuffer + (gradRow * numFreqs + precompWRow) * numLengthscales;

for (i=0; i < dim1; i++){
rfVal = 0;
for (j=0; j < dim2; j++){
@@ -28,10 +28,11 @@ __global__ void conv1dMultiplyByRadem(T cArray[], int8_t *rademArray,
int dim2, int startPosition, int numElements, float normConstant)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
int8_t *rVal = rademArray + startPosition + (tid & (dim2 - 1));

if (tid < numElements)
if (tid < numElements){
int8_t *rVal = rademArray + startPosition + (tid & (dim2 - 1));
cArray[tid] = cArray[tid] * *rVal * normConstant;
}
}


@@ -64,10 +64,11 @@ __global__ void convRBFPostProcessKernel(T featureArray[], T chiArr[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * 2 * numFreqs + 2 * column + 2 * startPosition;
T chiVal = chiArr[startPosition + column];
double chiProd, sinSum = 0, cosSum = 0;

if (tid < numElements){
T chiVal = chiArr[startPosition + column];

for (i=0; i < dim1; i++){
chiProd = chiVal * featureArray[inputLoc];
cosSum += cos(chiProd);
@@ -97,11 +98,12 @@ __global__ void convRBFGradProcessKernel(T featureArray[], T chiArr[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * 2 * numFreqs + 2 * column + 2 * startPosition;
T chiVal = chiArr[startPosition + column];
double chiProd, sinSum = 0, cosSum = 0, sinVal, cosVal;
double gradSinVal = 0, gradCosVal = 0;

if (tid < numElements){
T chiVal = chiArr[startPosition + column];

for (i=0; i < dim1; i++){
chiProd = chiVal * featureArray[inputLoc];
cosVal = cos(chiProd * sigma);
@@ -244,7 +244,7 @@ def gpuConv1dFGen(reshapedX, radem, outputArray, chiArr,
scalingTerm = np.sqrt(2 / <double>chiArr.shape[0])

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if outputArray.dtype == "float64" and reshapedX.dtype == "float32" and \
chiArr.dtype == "float32":
@@ -364,7 +364,7 @@ def gpuConvGrad(reshapedX, radem, outputArray, chiArr,
scalingTerm = np.sqrt(2 / <double>chiArr.shape[0])

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if outputArray.dtype == "float64" and reshapedX.dtype == "float32" and \
chiArr.dtype == "float32":
@@ -484,7 +484,7 @@ def gpuConv1dArcCosFGen(reshapedX, radem, outputArray, chiArr,
scalingTerm = np.sqrt(1 / <double>chiArr.shape[0])

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if outputArray.dtype == "float64" and reshapedX.dtype == "float32" and \
chiArr.dtype == "float32":
58 changes: 0 additions & 58 deletions xGPR/random_feature_generation/gpu_rf_gen/cuda_polynomial.pyx
@@ -24,8 +24,6 @@ cdef extern from "convolution_ops/convolution.h" nogil:


cdef extern from "poly_ops/polynomial_operations.h" nogil:
const char *cudaExactQuadratic_[T](T inArray[], double *outArray,
int inDim0, int inDim1)
const char *approxPolynomial_[T](int8_t *radem, T reshapedX[],
T copyBuffer[], T chiArr[], double *outArray,
int polydegree, int reshapedDim0, int reshapedDim1,
@@ -248,59 +246,3 @@ def gpuPolyFHT(reshapedX, radem, chiArr, outputArray, int polydegree,

else:
raise ValueError("Inconsistent array types passed to wrapped C++ function.")



@cython.boundscheck(False)
@cython.wraparound(False)
def cudaExactQuadratic(inputArray, outputArray,
int numThreads):
"""Wraps C++ operations for generating features for an exact
quadratic.
Args:
inputArray (ndarray): The input data. This is not modified.
outputArray (ndarray): The output array. Must have the appropriate
shape such that all of the quadratic polynomial features can
be written to it. The last column is assumed to be saved for 1
for a y-intercept term.
num_threads (int): Number of threads to use for FHT. Not used for gpu,
merely kept here for consistency with CPU version.
Raises:
ValueError: A ValueError is raised if unexpected or invalid inputs are supplied.
"""
cdef const char *errCode
cdef uintptr_t addr_output = outputArray.data.ptr
cdef uintptr_t addr_input = inputArray.data.ptr
cdef int numExpectedFeats = int( inputArray.shape[1] * (inputArray.shape[1] - 1) / 2)
numExpectedFeats += 2 * inputArray.shape[1] + 1

if len(inputArray.shape) != 2 or len(outputArray.shape) != 2:
raise ValueError("Both inputArray and outputArray for the exact quadratic "
"must be 2d arrays.")

if inputArray.shape[0] == 0:
raise ValueError("There must be at least one datapoint.")
if inputArray.shape[0] != outputArray.shape[0]:
raise ValueError("The number of datapoints in the outputs and the inputs do "
"not agree.")
if outputArray.shape[1] != numExpectedFeats:
raise ValueError("The shape of the output array is incorrect for a quadratic.")

if not outputArray.flags["C_CONTIGUOUS"] or not inputArray.flags["C_CONTIGUOUS"]:
raise ValueError("One or more arguments is not C contiguous.")

if inputArray.dtype == "float32":
errCode = cudaExactQuadratic_[float](<float*>addr_input, <double*>addr_output,
inputArray.shape[0], inputArray.shape[1])

elif inputArray.dtype == "float64":
errCode = cudaExactQuadratic_[double](<double*>addr_input, <double*>addr_output,
inputArray.shape[0], inputArray.shape[1])

else:
raise ValueError("Unexpected types passed to wrapped C++ function.")

if errCode.decode("UTF-8") != "no_error":
raise Exception("Fatal error encountered while performing graph convolution.")
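
For context on what was removed: the deleted `cudaExactQuadratic` wrapper sized its output as d(d-1)/2 interaction features plus 2d linear-and-squared features plus one intercept column for d input features (see `numExpectedFeats` above). That is exactly the number of monomials of degree at most two, which a direct enumeration confirms; illustrative Python, not part of the package:

```python
from itertools import combinations_with_replacement

def wrapper_feat_count(d: int) -> int:
    # Count expected by the removed wrapper: interactions + linear + squared + intercept.
    return d * (d - 1) // 2 + 2 * d + 1

def enumerated_feat_count(d: int) -> int:
    # All monomials of degree 1 or 2 in d variables, plus the constant term.
    monomials = [c for k in (1, 2) for c in combinations_with_replacement(range(d), k)]
    return len(monomials) + 1

for d in (1, 5, 64):
    assert wrapper_feat_count(d) == enumerated_feat_count(d)
```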
@@ -72,7 +72,6 @@ def cudaRBFFeatureGen(inputArray, outputArray, radem,

cdef uintptr_t addr_radem = radem.data.ptr


if not radem.dtype == "int8":
raise ValueError("radem must be of type int8.")
if not inputArray.flags["C_CONTIGUOUS"] or not radem.flags["C_CONTIGUOUS"] or not \
@@ -1,6 +1,5 @@
/*
* Contains functions needed to generate exact quadratic polynomial
* features and approximate polynomial kernel features on GPU.
* Contains functions needed to generate approximate polynomial kernel features on GPU.
*/

#include <cuda.h>
@@ -25,9 +24,11 @@ __global__ void polyMultByDiagRademMat(T cArray[], int8_t *rademArray,
int rVal, position;

position = tid % numElementsPerRow;
rVal = rademArray[position];
if (tid < numElements)

if (tid < numElements){
rVal = rademArray[position];
cArray[tid] = cArray[tid] * rVal * normConstant;
}
}


@@ -44,9 +45,11 @@ __global__ void polyMultAndCopyDiagRademMat(T cArray[], T copyBuffer[],
int rVal, position;

position = tid % numElementsPerRow;
rVal = rademArray[position];
if (tid < numElements)

if (tid < numElements){
rVal = rademArray[position];
copyBuffer[tid] = cArray[tid] * rVal * normConstant;
}
}


@@ -66,6 +69,7 @@ __global__ void polyOutArrayCopyTransfer(T copyBuffer[],
numRowsTraversed = tid / numFreqs;
numExcess = tid % numFreqs;
cBuffPosition = numRowsTraversed * numPerRow + numExcess;

if (tid < numElements)
outArray[tid] = chiArr[numExcess] * copyBuffer[cBuffPosition];
}
@@ -86,64 +90,13 @@ __global__ void polyOutArrayMultTransfer(T copyBuffer[],
numExcess = tid % numFreqs;
cBuffPosition = numRowsTraversed * numPerRow + numExcess;
chiPosition = repeatNum * numPerRow + numExcess;

if (tid < numElements)
outArray[tid] *= chiArr[chiPosition] * copyBuffer[cBuffPosition];
}



//Generates the features for the exact quadratic.
template <typename T>
__global__ void genExactQuadFeatures(T inArray[], double *outArray,
int inDim1, int outDim1, int numElements){
int pos = blockDim.x * blockIdx.x + threadIdx.x;
int rowNum = pos / inDim1;
int positionInRow = pos % inDim1;
T inVal1 = inArray[pos];
T *inPtr = inArray + pos;
double *outPtr = outArray + rowNum * outDim1;
for (int i=0; i < positionInRow; i++)
outPtr += inDim1 + 1 - i;

if (pos < numElements){
*outPtr = inVal1;
outPtr++;
for (int i=positionInRow; i < inDim1; i++){
*outPtr = inVal1 * *inPtr;
outPtr++;
inPtr++;
}
}
}



//Performs feature generation for an exact quadratic (i.e. polynomial
//regression that is exact, not approximated).
//
//Note that all of these arrays are already expected to "live" on GPU.
template <typename T>
const char *cudaExactQuadratic_(T inArray[], double *outArray,
int inDim0, int inDim1){
int numInteractions = (inDim1 * (inDim1 - 1)) / 2;
int outDim1 = numInteractions + 1 + 2 * inDim1;
int numElements = inDim1 * inDim0;
int blocksPerGrid = (numElements + DEFAULT_THREADS_PER_BLOCK - 1) / DEFAULT_THREADS_PER_BLOCK;
//cudaProfilerStart();

//Multiply by D1.
genExactQuadFeatures<T><<<blocksPerGrid, DEFAULT_THREADS_PER_BLOCK>>>(inArray, outArray,
inDim1, outDim1, numElements);


//cudaProfilerStop();
return "no_error";
}
//Instantiate templates explicitly so wrapper can use.
template const char *cudaExactQuadratic_<float>(float inArray[], double *outArray,
int inDim0, int inDim1);
template const char *cudaExactQuadratic_<double>(double inArray[], double *outArray,
int inDim0, int inDim1);


