Skip to content

Commit

Permalink
runq - remove blas & optimize
Browse files Browse the repository at this point in the history
runq - optimize matmul and quantization functions with OpenMP
  • Loading branch information
trholding committed Jul 20, 2024
1 parent 8458b68 commit 036d7cb
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 39 deletions.
20 changes: 10 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -90,55 +90,55 @@ run_cc_openmp: ## - OpenMP accelerated build

.PHONY: runq_cc_openmp
runq_cc_openmp: ## - Same for quantized build
$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run
$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run

.PHONY: run_cc_openacc
run_cc_openacc: ## - OpenACC accelerated build
$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native run.c $(BOLT) -lm -o run

.PHONY: runq_cc_openacc
runq_cc_openacc: ## - Same for quantized build
$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run
$(CC) -D OPENACC -D CAT -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run

.PHONY: run_cc_omp_gnu
run_cc_omp_gnu: ## - Generic linux distro + OpenMP build
$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 run.c $(BOLT) -lm -o run

.PHONY: runq_cc_omp_gnu
runq_cc_omp_gnu: ## - Same for quantized build
$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run
$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run

.PHONY: run_cc_clblast
run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build
$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -lclblast -o run

.PHONY: runq_cc_clblast
runq_cc_clblast: ## - Same for quantized build
$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
$(CC) -D OPENMP -D CAT -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run

.PHONY: run_cc_openblas
run_cc_openblas: ## - OpenBLAS CBLAS accelerated build
$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) run.c $(BOLT) -lm -lopenblas -o run

.PHONY: runq_cc_openblas
runq_cc_openblas: ## - Same for quantized build
$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
$(CC) -D OPENMP -D CAT -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run

.PHONY: run_cc_cblas
run_cc_cblas: ## - Generic CBLAS accelerated build
$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -lcblas -o run

.PHONY: runq_cc_cblas
runq_cc_cblas: ## - Same for quantized build
$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
$(CC) -D OPENMP -D CAT -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run

.PHONY: run_cc_blis
run_cc_blis: ## - BLIS accelerated build
$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) run.c $(BOLT) -lm -lblis -o run

.PHONY: runq_cc_blis
runq_cc_blis: ## - Same for quantized build
$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
$(CC) -D OPENMP -D CAT -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run

##@ Special Builds
##@ ---> x86_64
Expand All @@ -149,7 +149,7 @@ run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)

.PHONY: runq_cc_mkl
runq_cc_mkl: ## - Same for quantized build
$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run
$(CC) -D MKL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run

##@ ---> ARM64 / aarch64
.PHONY: run_cc_armpl
Expand All @@ -158,7 +158,7 @@ run_cc_armpl: ## - ARM PL BLAS accelerated build (aarch64)

.PHONY: runq_cc_armpl
runq_cc_armpl: ## - Same for quantized build
$(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
$(CC) -D ARMPL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run

##@ ---> Macintosh
.PHONY: run_cc_mac_accel
Expand All @@ -167,7 +167,7 @@ run_cc_mac_accel: ## - Mac OS OPENMP + CBLAS via Accelerate Framework build (WI

.PHONY: runq_cc_mac_accel
runq_cc_mac_accel: ## - Same for quantized build
$(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
$(CC) -D AAF -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run

##@ ---> Windows
.PHONY: run_win64
Expand Down
99 changes: 70 additions & 29 deletions runq.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,10 @@ __static_yoink("zipos");

// Portable OpenMP and OpenACC pragma macros
#ifdef OPENMP
#define ACCELS() MK_PRAGMA(omp parallel for)
#define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
#elif defined(OPENACC)
#define ACCELS() MK_PRAGMA(acc parallel loop)
#define ACCEL(...) MK_PRAGMA(acc parallel loop private(__VA_ARGS__))
#endif

Expand All @@ -154,7 +156,13 @@ __static_yoink("zipos");
#endif
// ----------------------------------------------------------------------------
// Globals
// L2E Addition
#if defined CAT
const int GS = 64; // group size 64 for Cheap Acceleration Tech :)
#else
int GS = 0; // group size global for quantization of the weights
#endif
// END L2E Addition

// ----------------------------------------------------------------------------
// Transformer model
Expand Down Expand Up @@ -275,6 +283,11 @@ void free_run_state(RunState* s) {
// Quantization functions

void dequantize(QuantizedTensor *qx, float* x, int n) {
// L2E Addition
#ifdef ACCEL
ACCELS() // OMP/OACC Macro
#endif
// END L2E Addition
for (int i = 0; i < n; i++) {
x[i] = qx->q[i] * qx->s[i / GS];
}
Expand All @@ -284,6 +297,11 @@ void quantize(QuantizedTensor *qx, float* x, int n) {
int num_groups = n / GS;
float Q_MAX = 127.0f;

// L2E Addition
#ifdef ACCEL
ACCELS() // OMP/OACC Macro
#endif
// END L2E Addition
for (int group = 0; group < num_groups; group++) {

// find the max absolute value in the current group
Expand Down Expand Up @@ -391,7 +409,11 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
int group_size = *(int*) ptr;
ptr += sizeof(int);

// L2E Addition
#ifndef CAT
GS = group_size; // set as global, as it will be used in many places
#endif
// END L2E Addition

void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
memory_map_weights(weights, config, weights_ptr, shared_classifier);
Expand Down Expand Up @@ -419,7 +441,13 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
if (fread(&shared_classifier, sizeof(uint8_t), 1, file) != 1) { exit(EXIT_FAILURE); }
int group_size; // the group size used in quantization
if (fread(&group_size, sizeof(int), 1, file) != 1) { exit(EXIT_FAILURE); }

// L2E Addition
#ifndef CAT
GS = group_size; // set as global, as it will be used in many places
#endif
// END L2E Addition

// figure out the file size
fseek(file, 0, SEEK_END); // move file pointer to end of file
*file_size = ftell(file); // get the file size, in bytes
Expand Down Expand Up @@ -508,64 +536,77 @@ void softmax(float* x, int size) {
}
}

// L2E Addition
#ifdef CAT

void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
// W (d,n) @ x (n,) -> xout (d,)
// by far the most amount of time is spent inside this little function
// inputs to this function are both quantized

// L2E Addition

#ifdef BLAS
int i;
int j;

// Convert quantized tensors to floating point
float* w_fp = malloc(d * n * sizeof(float));
float* x_fp = malloc(n * sizeof(float));

#ifdef ACCEL
ACCEL(i, j) // OMP/OACC Macro
#endif
ACCEL(i) // OMP/OACC Macro
#endif
for (i = 0; i < d; i++) {
for (j = 0; j < n; j++) {
w_fp[i * n + j] = ((float) w->q[i * n + j]) * w->s[i / GS];

float val = 0.0f;
int32_t ival = 0;
int in = i * n;

// do the matmul in groups of GS
int j;
for (j = 0; j <= n - GS; j += GS) {
// unroll the inner loop by a factor of 4
for (int k = 0; k < GS; k += 4) {
ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
ival += ((int32_t) x->q[j + k + 1]) * ((int32_t) w->q[in + j + k + 1]);
ival += ((int32_t) x->q[j + k + 2]) * ((int32_t) w->q[in + j + k + 2]);
ival += ((int32_t) x->q[j + k + 3]) * ((int32_t) w->q[in + j + k + 3]);
}
val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
ival = 0;
}
}

#ifdef ACCEL
ACCEL(j) // OMP/OACC Macro
#endif
for (j = 0; j < n; j++) {
x_fp[j] = ((float) x->q[j]) * x->s[j / GS];
xout[i] = val;
}
}

cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w_fp, n, x_fp, 1, 0.0f, xout, 1);

// Free memory
free(w_fp);
free(x_fp);
#else
// END L2E Addition
void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
// W (d,n) @ x (n,) -> xout (d,)
// by far the most amount of time is spent inside this little function
// inputs to this function are both quantized

#else
int i;
// L2E Addition
#ifdef ACCEL
ACCEL(i) // OMP/OACC Macro
#endif
// END L2E Addition
for (int i = 0; i < d; i++) {
for (i = 0; i < d; i++) {

float val = 0.0f;
int32_t ival = 0;
int in = i * n;

// do the matmul in groups of GS
for (int j = 0; j <= n - GS; j += GS) {
int j;
for (j = 0; j <= n - GS; j += GS) {
for (int k = 0; k < GS; k++) {
ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
}
val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
ival = 0;
}

xout[i] = val;
}
}
// L2E Addition
#endif
#endif
// END L2E Addition
}

float* forward(Transformer* transformer, int token, int pos) {

Expand Down

0 comments on commit 036d7cb

Please sign in to comment.