Merge pull request #5 from jlparkI/develop
0.2.0.5 merge
jlparkI authored Jan 29, 2024
2 parents 18bf228 + c6f7515 commit cb473c9
Showing 14 changed files with 67 additions and 154 deletions.
9 changes: 9 additions & 0 deletions HISTORY.md
@@ -165,3 +165,12 @@ only intermittently useful.

Fixed a bug in automatic preconditioner rank selection for
classification.

### Version 0.2.0.2

Altered the definition of averaging for conv kernels.

### Version 0.2.0.5

Fixed a bug in random feature generation for RBF-based kernels in
situations where the number of input features is very large.
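
The 0.2.0.2 entry above corresponds to the `scalingTerm` change visible in the convolution wrappers below: when conv-kernel features are averaged over convolution windows, the scaling constant is now divided by the number of windows rather than by its square root. A minimal NumPy sketch of that constant, using paraphrased names (`num_rffs` standing in for `chiArr.shape[0]`, `num_windows` for `reshapedX.shape[1]`); illustrative only, not code from the package:

```python
import numpy as np

def conv_scaling_term(num_rffs: int, num_windows: int,
                      average_features: bool, new_definition: bool = True) -> float:
    """Sketch of the scaling constant applied to RBF-type conv kernel features."""
    scaling = np.sqrt(2 / num_rffs)
    if average_features:
        if new_definition:
            # 0.2.0.2 and later: divide by the number of windows (a true mean).
            scaling /= num_windows
        else:
            # Earlier behavior: divide by sqrt(number of windows).
            scaling /= np.sqrt(num_windows)
    return scaling
```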
9 changes: 3 additions & 6 deletions test/fht_operations_tests/rbf_fht.py
@@ -11,12 +11,9 @@

from cpu_rf_gen_module import cpuSORFTransform as cSORF

try:
from cuda_rf_gen_module import cudaRBFFeatureGen as cudaRBF
from cuda_rf_gen_module import cudaRBFGrad as cudaRBFGrad
import cupy as cp
except:
pass
from cuda_rf_gen_module import cudaRBFFeatureGen as cudaRBF
from cuda_rf_gen_module import cudaRBFGrad as cudaRBFGrad
import cupy as cp


class TestRBFFeatureGen(unittest.TestCase):
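
Dropping the `try`/`except` around the CUDA imports means this test module now fails at import time on machines without the compiled CUDA extension and CuPy. If optional GPU test runs were wanted instead, one common alternative (not what this commit does) is an availability flag plus `unittest.skipUnless`; a sketch under that assumption, with an illustrative class name:

```python
import unittest

try:
    import cupy as cp
    from cuda_rf_gen_module import cudaRBFFeatureGen as cudaRBF
    from cuda_rf_gen_module import cudaRBFGrad as cudaRBFGrad
    HAS_CUDA = True
except ImportError:
    HAS_CUDA = False


@unittest.skipUnless(HAS_CUDA, "CUDA extension / CuPy not available")
class TestRBFFeatureGenGPU(unittest.TestCase):
    """GPU-specific checks; collected but skipped when CUDA is unavailable."""

    def test_placeholder(self):
        self.assertTrue(HAS_CUDA)
```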
2 changes: 1 addition & 1 deletion xGPR/__init__.py
@@ -1,6 +1,6 @@
#Version number. Updated if generating a new release.
#Otherwise, do not change.
__version__ = "0.2.0.1"
__version__ = "0.2.0.5"

#Key imports.
from .xgp_regression import xGPRegression
6 changes: 3 additions & 3 deletions xGPR/random_feature_generation/cpu_rf_gen/cpu_convolution.pyx
@@ -219,7 +219,7 @@ def cpuConv1dFGen(np.ndarray[floating, ndim=3] reshapedX,
scalingTerm = np.sqrt(2 / <double>(chiArr.shape[0]))

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if chiArr.dtype == "float32" and reshapedX.dtype == "float32":
errCode = convRBFFeatureGen_[float](&radem[0,0,0], <float*>addr_input,
@@ -324,7 +324,7 @@ def cpuConvGrad(np.ndarray[floating, ndim=3] reshapedX,
else:
scalingTerm = np.sqrt(2 / <double>(chiArr.shape[0]))
if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]


if chiArr.dtype == "float32" and reshapedX.dtype == "float32":
@@ -432,7 +432,7 @@ def cpuConv1dArcCosFGen(np.ndarray[floating, ndim=3] reshapedX,
else:
scalingTerm = np.sqrt(1 / <double>(chiArr.shape[0]))
if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if chiArr.dtype == "float32" and reshapedX.dtype == "float32":
errCode = convArcCosFeatureGen_[float](&radem[0,0,0], <float*>addr_input,
@@ -109,9 +109,10 @@ __global__ void levelNTransform(T cArray[], int arrsize,
//Equivalent to pos mod spacing IF spacing is a power of 2.
int lo = (pos & (spacing - 1));
int id = lo + ((pos - lo) << 1);
T y, *cPtr = cArray + id;

if (id < arrsize){
T y, *cPtr = cArray + id;

y = cPtr[spacing];
cPtr[spacing] = *cPtr - y;
*cPtr += y;
@@ -132,9 +133,11 @@ __global__ void multiplyByDiagonalRademacherMat(T cArray[], int8_t *rademArray,
int rVal, position;

position = tid % numElementsPerRow;
rVal = rademArray[position];
if (tid < numElements)

if (tid < numElements){
rVal = rademArray[position];
cArray[tid] = cArray[tid] * rVal * normConstant;
}
}

//We perform the transform over the last dimension
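
The change in this file shows a pattern repeated across the CUDA kernels in this commit: per-thread loads and pointer setup that used to execute before the `if (tid < arrsize)` / `if (tid < numElements)` bounds check now sit inside it, so threads whose id falls past the end of the data touch no memory at all. Such threads always exist because the launch grid is rounded up to whole blocks. A host-side NumPy mock-up of the thread indexing with the corrected guard order (illustrative only; names paraphrased from `multiplyByDiagonalRademacherMat`):

```python
import numpy as np

THREADS_PER_BLOCK = 256  # assumed launch configuration, for illustration only

def mock_diag_rademacher(c_array: np.ndarray, radem_row: np.ndarray,
                         norm_constant: float) -> np.ndarray:
    """Host-side mock of a diagonal Rademacher multiply with the fixed guard order."""
    num_elements = c_array.size
    num_per_row = radem_row.size
    blocks = (num_elements + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
    out = c_array.copy()
    for tid in range(blocks * THREADS_PER_BLOCK):
        # Bounds check first: out-of-range thread ids issue no reads or writes.
        if tid < num_elements:
            position = tid % num_per_row
            out[tid] = out[tid] * radem_row[position] * norm_constant
    return out

rng = np.random.default_rng(0)
x = rng.standard_normal(1000)
radem = rng.choice(np.array([-1, 1], dtype=np.int8), size=100)
scaled = mock_diag_rademacher(x, radem, 1.0 / np.sqrt(100))
```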
@@ -67,10 +67,11 @@ __global__ void convArcCosPostProcessKernelOrder1(const T featureArray[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * numFreqs + column + startPosition;
T *chiVal = chiArr + startPosition + column;
T chiProd, rollingSum = 0;

if (tid < numElements){
T *chiVal = chiArr + startPosition + column;

for (i=0; i < dim1; i++){
chiProd = *chiVal * featureArray[inputLoc];
rollingSum += max(chiProd, 0.0);
@@ -96,10 +97,11 @@ __global__ void convArcCosPostProcessKernelOrder2(const T featureArray[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * numFreqs + column + startPosition;
T *chiVal = chiArr + startPosition + column;
T chiProd, rollingSum = 0;

if (tid < numElements){
T *chiVal = chiArr + startPosition + column;

for (i=0; i < dim1; i++){
chiProd = *chiVal * featureArray[inputLoc];
chiProd = max(chiProd, 0.0);
@@ -43,14 +43,15 @@ __global__ void ardConvGradSetup(double *gradientArray,
int precompWRow = (tid % numFreqs);
int gradRow = tid / numFreqs;

T *precompWElement = precomputedWeights + precompWRow * dim2;
T *inputXElement = inputX + gradRow * dim1 * dim2;
double *gradientElement = gradientArray + 2 * (gradRow * numFreqs + precompWRow) * numLengthscales;
double *randomFeature = randomFeatures + 2 * gradRow * numFreqs + 2 * precompWRow;
double *bufferElement = copyBuffer + (gradRow * numFreqs + precompWRow) * numLengthscales;
double rfVal = 0, outVal, sinVal, cosVal;

if (tid < numSetupElements){
T *precompWElement = precomputedWeights + precompWRow * dim2;
T *inputXElement = inputX + gradRow * dim1 * dim2;
double *gradientElement = gradientArray + 2 * (gradRow * numFreqs + precompWRow) * numLengthscales;
double *randomFeature = randomFeatures + 2 * gradRow * numFreqs + 2 * precompWRow;
double *bufferElement = copyBuffer + (gradRow * numFreqs + precompWRow) * numLengthscales;

for (i=0; i < dim1; i++){
rfVal = 0;
for (j=0; j < dim2; j++){
@@ -28,10 +28,11 @@ __global__ void conv1dMultiplyByRadem(T cArray[], int8_t *rademArray,
int dim2, int startPosition, int numElements, float normConstant)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
int8_t *rVal = rademArray + startPosition + (tid & (dim2 - 1));

if (tid < numElements)
if (tid < numElements){
int8_t *rVal = rademArray + startPosition + (tid & (dim2 - 1));
cArray[tid] = cArray[tid] * *rVal * normConstant;
}
}


@@ -64,10 +64,11 @@ __global__ void convRBFPostProcessKernel(T featureArray[], T chiArr[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * 2 * numFreqs + 2 * column + 2 * startPosition;
T chiVal = chiArr[startPosition + column];
double chiProd, sinSum = 0, cosSum = 0;

if (tid < numElements){
T chiVal = chiArr[startPosition + column];

for (i=0; i < dim1; i++){
chiProd = chiVal * featureArray[inputLoc];
cosSum += cos(chiProd);
@@ -97,11 +98,12 @@ __global__ void convRBFGradProcessKernel(T featureArray[], T chiArr[],
int row = tid / endPosition;
int inputLoc = row * dim1 * dim2 + column;
int outputLoc = row * 2 * numFreqs + 2 * column + 2 * startPosition;
T chiVal = chiArr[startPosition + column];
double chiProd, sinSum = 0, cosSum = 0, sinVal, cosVal;
double gradSinVal = 0, gradCosVal = 0;

if (tid < numElements){
T chiVal = chiArr[startPosition + column];

for (i=0; i < dim1; i++){
chiProd = chiVal * featureArray[inputLoc];
cosVal = cos(chiProd * sigma);
@@ -244,7 +244,7 @@ def gpuConv1dFGen(reshapedX, radem, outputArray, chiArr,
scalingTerm = np.sqrt(2 / <double>chiArr.shape[0])

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if outputArray.dtype == "float64" and reshapedX.dtype == "float32" and \
chiArr.dtype == "float32":
@@ -364,7 +364,7 @@ def gpuConvGrad(reshapedX, radem, outputArray, chiArr,
scalingTerm = np.sqrt(2 / <double>chiArr.shape[0])

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if outputArray.dtype == "float64" and reshapedX.dtype == "float32" and \
chiArr.dtype == "float32":
@@ -484,7 +484,7 @@ def gpuConv1dArcCosFGen(reshapedX, radem, outputArray, chiArr,
scalingTerm = np.sqrt(1 / <double>chiArr.shape[0])

if averageFeatures:
scalingTerm /= np.sqrt(<double>reshapedX.shape[1])
scalingTerm /= <double>reshapedX.shape[1]

if outputArray.dtype == "float64" and reshapedX.dtype == "float32" and \
chiArr.dtype == "float32":
58 changes: 0 additions & 58 deletions xGPR/random_feature_generation/gpu_rf_gen/cuda_polynomial.pyx
@@ -24,8 +24,6 @@ cdef extern from "convolution_ops/convolution.h" nogil:


cdef extern from "poly_ops/polynomial_operations.h" nogil:
const char *cudaExactQuadratic_[T](T inArray[], double *outArray,
int inDim0, int inDim1)
const char *approxPolynomial_[T](int8_t *radem, T reshapedX[],
T copyBuffer[], T chiArr[], double *outArray,
int polydegree, int reshapedDim0, int reshapedDim1,
@@ -248,59 +246,3 @@ def gpuPolyFHT(reshapedX, radem, chiArr, outputArray, int polydegree,

else:
raise ValueError("Inconsistent array types passed to wrapped C++ function.")



@cython.boundscheck(False)
@cython.wraparound(False)
def cudaExactQuadratic(inputArray, outputArray,
int numThreads):
"""Wraps C++ operations for generating features for an exact
quadratic.
Args:
inputArray (ndarray): The input data. This is not modified.
outputArray (ndarray): The output array. Must have the appropriate
shape such that all of the quadratic polynomial features can
be written to it. The last column is assumed to be saved for 1
for a y-intercept term.
num_threads (int): Number of threads to use for FHT. Not used for gpu,
merely kept here for consistency with CPU version.
Raises:
ValueError: A ValueError is raised if unexpected or invalid inputs are supplied.
"""
cdef const char *errCode
cdef uintptr_t addr_output = outputArray.data.ptr
cdef uintptr_t addr_input = inputArray.data.ptr
cdef int numExpectedFeats = int( inputArray.shape[1] * (inputArray.shape[1] - 1) / 2)
numExpectedFeats += 2 * inputArray.shape[1] + 1

if len(inputArray.shape) != 2 or len(outputArray.shape) != 2:
raise ValueError("Both inputArray and outputArray for the exact quadratic "
"must be 2d arrays.")

if inputArray.shape[0] == 0:
raise ValueError("There must be at least one datapoint.")
if inputArray.shape[0] != outputArray.shape[0]:
raise ValueError("The number of datapoints in the outputs and the inputs do "
"not agree.")
if outputArray.shape[1] != numExpectedFeats:
raise ValueError("The shape of the output array is incorrect for a quadratic.")

if not outputArray.flags["C_CONTIGUOUS"] or not inputArray.flags["C_CONTIGUOUS"]:
raise ValueError("One or more arguments is not C contiguous.")

if inputArray.dtype == "float32":
errCode = cudaExactQuadratic_[float](<float*>addr_input, <double*>addr_output,
inputArray.shape[0], inputArray.shape[1])

elif inputArray.dtype == "float64":
errCode = cudaExactQuadratic_[double](<double*>addr_input, <double*>addr_output,
inputArray.shape[0], inputArray.shape[1])

else:
raise ValueError("Unexpected types passed to wrapped C++ function.")

if errCode.decode("UTF-8") != "no_error":
raise Exception("Fatal error encountered while performing graph convolution.")
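
For context on what was removed: the deleted `cudaExactQuadratic` wrapper sized its output as d(d-1)/2 interaction features plus 2d linear-and-squared features plus one intercept column for d input features (see `numExpectedFeats` above). That is exactly the number of monomials of degree at most two, which a direct enumeration confirms; illustrative Python, not part of the package:

```python
from itertools import combinations_with_replacement

def wrapper_feat_count(d: int) -> int:
    # Count expected by the removed wrapper: interactions + linear + squared + intercept.
    return d * (d - 1) // 2 + 2 * d + 1

def enumerated_feat_count(d: int) -> int:
    # All monomials of degree 1 or 2 in d variables, plus the constant term.
    monomials = [c for k in (1, 2) for c in combinations_with_replacement(range(d), k)]
    return len(monomials) + 1

for d in (1, 5, 64):
    assert wrapper_feat_count(d) == enumerated_feat_count(d)
```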
@@ -72,7 +72,6 @@ def cudaRBFFeatureGen(inputArray, outputArray, radem,

cdef uintptr_t addr_radem = radem.data.ptr


if not radem.dtype == "int8":
raise ValueError("radem must be of type int8.")
if not inputArray.flags["C_CONTIGUOUS"] or not radem.flags["C_CONTIGUOUS"] or not \
@@ -1,6 +1,5 @@
/*
* Contains functions needed to generate exact quadratic polynomial
* features and approximate polynomial kernel features on GPU.
* Contains functions needed to generate approximate polynomial kernel features on GPU.
*/

#include <cuda.h>
@@ -25,9 +24,11 @@ __global__ void polyMultByDiagRademMat(T cArray[], int8_t *rademArray,
int rVal, position;

position = tid % numElementsPerRow;
rVal = rademArray[position];
if (tid < numElements)

if (tid < numElements){
rVal = rademArray[position];
cArray[tid] = cArray[tid] * rVal * normConstant;
}
}


@@ -44,9 +45,11 @@ __global__ void polyMultAndCopyDiagRademMat(T cArray[], T copyBuffer[],
int rVal, position;

position = tid % numElementsPerRow;
rVal = rademArray[position];
if (tid < numElements)

if (tid < numElements){
rVal = rademArray[position];
copyBuffer[tid] = cArray[tid] * rVal * normConstant;
}
}


@@ -66,6 +69,7 @@ __global__ void polyOutArrayCopyTransfer(T copyBuffer[],
numRowsTraversed = tid / numFreqs;
numExcess = tid % numFreqs;
cBuffPosition = numRowsTraversed * numPerRow + numExcess;

if (tid < numElements)
outArray[tid] = chiArr[numExcess] * copyBuffer[cBuffPosition];
}
@@ -86,64 +90,13 @@ __global__ void polyOutArrayMultTransfer(T copyBuffer[],
numExcess = tid % numFreqs;
cBuffPosition = numRowsTraversed * numPerRow + numExcess;
chiPosition = repeatNum * numPerRow + numExcess;

if (tid < numElements)
outArray[tid] *= chiArr[chiPosition] * copyBuffer[cBuffPosition];
}



//Generates the features for the exact quadratic.
template <typename T>
__global__ void genExactQuadFeatures(T inArray[], double *outArray,
int inDim1, int outDim1, int numElements){
int pos = blockDim.x * blockIdx.x + threadIdx.x;
int rowNum = pos / inDim1;
int positionInRow = pos % inDim1;
T inVal1 = inArray[pos];
T *inPtr = inArray + pos;
double *outPtr = outArray + rowNum * outDim1;
for (int i=0; i < positionInRow; i++)
outPtr += inDim1 + 1 - i;

if (pos < numElements){
*outPtr = inVal1;
outPtr++;
for (int i=positionInRow; i < inDim1; i++){
*outPtr = inVal1 * *inPtr;
outPtr++;
inPtr++;
}
}
}



//Performs feature generation for an exact quadratic (i.e. polynomial
//regression that is exact, not approximated).
//
//Note that all of these arrays are already expected to "live" on GPU.
template <typename T>
const char *cudaExactQuadratic_(T inArray[], double *outArray,
int inDim0, int inDim1){
int numInteractions = (inDim1 * (inDim1 - 1)) / 2;
int outDim1 = numInteractions + 1 + 2 * inDim1;
int numElements = inDim1 * inDim0;
int blocksPerGrid = (numElements + DEFAULT_THREADS_PER_BLOCK - 1) / DEFAULT_THREADS_PER_BLOCK;
//cudaProfilerStart();

//Multiply by D1.
genExactQuadFeatures<T><<<blocksPerGrid, DEFAULT_THREADS_PER_BLOCK>>>(inArray, outArray,
inDim1, outDim1, numElements);


//cudaProfilerStop();
return "no_error";
}
//Instantiate templates explicitly so wrapper can use.
template const char *cudaExactQuadratic_<float>(float inArray[], double *outArray,
int inDim0, int inDim1);
template const char *cudaExactQuadratic_<double>(double inArray[], double *outArray,
int inDim0, int inDim1);


