Add support for GELU and approximate activation functions (#8224)
FEATURE
* add docker configs for isolated testing

* implement gelu and gelu_new as separate activations

* Update activations.ts

* Update activations_test.ts

* Update activations_test.ts

* remove docker files

* fix activation tests

* fix lint errors

* remove extra blank line

* fix gelu_new calc

* fix 1D test

---------

Co-authored-by: Ping Yu <4018+pyu10055@users.noreply.github.com>
Co-authored-by: Matthew Soulanille <msoulanille@google.com>
3 people committed Apr 12, 2024
1 parent baf2364 commit c027d6a
Showing 4 changed files with 166 additions and 12 deletions.
74 changes: 67 additions & 7 deletions tfjs-layers/src/activations.ts
@@ -209,23 +209,64 @@ export class LogSoftmax extends Activation {
serialization.registerClass(LogSoftmax);

/**
 * Gelu activation function
 */
export class Gelu extends Activation {
  /** @nocollapse */
  static readonly className = 'gelu';
  /**
   * Calculate the activation function.
   *
   * @param x Tensor.
   * @returns a Tensor of the same shape as x
   */
  apply(x: Tensor): Tensor {
    return tidy(() => {
      return tfc.tidy(() => {
        const sqrtTwo = Math.sqrt(2);
        // Compute Φ(x) using the erf function
        const cdf = tfc.mul(0.5, tfc.add(1, tfc.erf(tfc.div(x, sqrtTwo))));
        // Compute GELU(x) = x * Φ(x)
        return tfc.mul(x, cdf);
      });
    });
  }
}
serialization.registerClass(Gelu);

/**
 * GeluNew activation function
 */
export class GeluNew extends Activation {
  /** @nocollapse */
  static readonly className = 'gelu_new';
  /**
   * Calculate the activation function.
   *
   * @param x Tensor.
   * @returns a Tensor of the same shape as x
   */
  apply(x: Tensor): Tensor {
    return tidy(() => {
      return tfc.mul(
          0.5,
          tfc.mul(
              x,
              tfc.add(
                  1,
                  tfc.tanh(
                      tfc.mul(
                          tfc.sqrt(tfc.div(2, Math.PI)),
                          tfc.add(x, tfc.mul(0.044715, tfc.pow(x, 3)))
                      )
                  )
              )
          )
      );
    });
  }
}
serialization.registerClass(GeluNew);

/**
 * Mish activation function
@@ -245,6 +286,25 @@ export class Mish extends Activation {
}
serialization.registerClass(Mish);

/**
 * Swish activation function
 */
export class Swish extends Activation {
  /** @nocollapse */
  static readonly className = 'swish';
  /**
   * Calculate the activation function.
   *
   * @param x Tensor.
   * @param alpha Scaling factor for the sigmoid function.
   * @returns a Tensor of the same shape as x
   */
  apply(x: Tensor, alpha = 1): Tensor {
    return tidy(() => tfc.mul(tfc.sigmoid(tfc.mul(x, alpha)), x));
  }
}
serialization.registerClass(Swish);

export function serializeActivation(activation: Activation): string {
  return activation.getClassName();
}
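For reference, a minimal standalone sketch (not part of the commit) that mirrors the two formulas above with the same tfjs-core ops, showing how closely the tanh approximation tracks the exact erf-based GELU; the sample input matches the values used in the tests below:

import * as tfc from '@tensorflow/tfjs-core';

const x = tfc.tensor1d([0, 1, 3, 9]);

// Exact GELU: x * Φ(x) = x * 0.5 * (1 + erf(x / sqrt(2)))
const geluExact = tfc.tidy(
    () => tfc.mul(x, tfc.mul(0.5, tfc.add(1, tfc.erf(tfc.div(x, Math.sqrt(2)))))));

// Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
const geluApprox = tfc.tidy(
    () => tfc.mul(0.5, tfc.mul(x, tfc.add(1, tfc.tanh(tfc.mul(
        Math.sqrt(2 / Math.PI),
        tfc.add(x, tfc.mul(0.044715, tfc.pow(x, 3)))))))));

geluExact.print();   // ~[0, 0.8413447, 2.9959502, 9]
geluApprox.print();  // ~[0, 0.8411920, 2.9963627, 9]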
97 changes: 96 additions & 1 deletion tfjs-layers/src/activations_test.ts
@@ -13,7 +13,7 @@
*/
import {scalar, tensor1d, tensor2d, tensor3d} from '@tensorflow/tfjs-core';

import {Elu, HardSigmoid, Linear, LogSoftmax, Relu, Relu6, Selu, Sigmoid, Softmax, Softplus, Softsign, Tanh, Swish, Mish, Gelu, GeluNew} from './activations';
import {describeMathCPUAndGPU, expectNoLeakedTensors, expectTensorsClose} from './utils/test_utils';

describeMathCPUAndGPU('linear activation', () => {
@@ -366,3 +366,98 @@ describeMathCPUAndGPU('mish activation', () => {
    expectNoLeakedTensors(() => mish(initX), 1);
  });
});

describeMathCPUAndGPU('gelu activation', () => {
  const gelu = new Gelu().apply;
  // Setup: Array with initial values.
  // Execute: Gelu on the last dimension.
  // Expect: Output array matches size and approximate expected values.
  it('1D', () => {
    const initX = tensor1d([0, 1, 3, 9]);
    const expectedVals = tensor1d([
      0, 0.8413447141647339, 2.995950222015381, 9
    ]);
    expectTensorsClose(gelu(initX), expectedVals);
  });
  it('1D all equal', () => {
    const initX = tensor1d([-1, -1, -1, -1]);
    const expectedVals = tensor1d([
      -0.15865525603294373, -0.15865525603294373, -0.15865525603294373,
      -0.15865525603294373
    ]);
    expectTensorsClose(gelu(initX), expectedVals);
  });
  it('2D', () => {
    const initX = tensor2d([[0, 1, 3, 9], [0, 1, 3, 9]]);
    const expectedVals = tensor2d([
      [0, 0.8413447141647339, 2.995950222015381, 9],
      [0, 0.8413447141647339, 2.995950222015381, 9]
    ]);
    expectTensorsClose(gelu(initX), expectedVals);
  });
  it('3D', () => {
    const initX = tensor3d([[[0, 1, 3, 9], [0, 1, 3, 9]]]);
    const expectedVals = tensor3d([[
      [0, 0.8413447141647339, 2.995950222015381, 9],
      [0, 0.8413447141647339, 2.995950222015381, 9]
    ]]);
    expectTensorsClose(gelu(initX), expectedVals);
  });
  it('Does not leak', () => {
    const initX = tensor1d([0, 1, 3, 9]);
    expectNoLeakedTensors(() => gelu(initX), 1);
  });
});

describeMathCPUAndGPU('gelu_new activation', () => {
  const geluNew = new GeluNew().apply;
  // Setup: Array with initial values.
  // Execute: GeluNew on the last dimension.
  // Expect: Output array matches size and approximate expected values.
  it('1D', () => {
    const initX = tensor1d([0, 1, 3, 9]);
    const expectedVals = tensor1d([
      0, 0.8411920070648193, 2.9963626861572266, 9
    ]);
    expectTensorsClose(geluNew(initX), expectedVals);
  });
  it('1D all equal', () => {
    const initX = tensor1d([-1, -1, -1, -1]);
    const expectedVals = tensor1d([
      -0.15880802273750305, -0.15880802273750305, -0.15880802273750305,
      -0.15880802273750305
    ]);
    expectTensorsClose(geluNew(initX), expectedVals);
  });
  it('2D', () => {
    const initX = tensor2d([[0, 1, 3, 9], [0, 1, 3, 9]]);
    const expectedVals = tensor2d([
      [0, 0.8411920070648193, 2.9963626861572266, 9],
      [0, 0.8411920070648193, 2.9963626861572266, 9]
    ]);
    expectTensorsClose(geluNew(initX), expectedVals);
  });
  it('3D', () => {
    const initX = tensor3d([[[0, 1, 3, 9], [0, 1, 3, 9]]]);
    const expectedVals = tensor3d([[
      [0, 0.8411920070648193, 2.9963626861572266, 9],
      [0, 0.8411920070648193, 2.9963626861572266, 9]
    ]]);
    expectTensorsClose(geluNew(initX), expectedVals);
  });
  it('Does not leak', () => {
    const initX = tensor1d([0, 1, 3, 9]);
    expectNoLeakedTensors(() => geluNew(initX), 1);
  });
});
4 changes: 2 additions & 2 deletions tfjs-layers/src/keras_format/activation_config.ts
@@ -15,7 +15,7 @@ import {stringLiteralArray} from './utils';
*/
export const activationOptions = stringLiteralArray([
  'elu', 'hard_sigmoid', 'linear', 'relu', 'relu6', 'selu', 'sigmoid',
  'softmax', 'softplus', 'softsign', 'tanh', 'swish', 'mish', 'gelu', 'gelu_new'
]);

/**
@@ -28,4 +28,4 @@ export type ActivationSerialization = typeof activationOptions[number];
// e.g. to src/common.ts. Maybe even duplicate *all* of these to be pedantic?
/** @docinline */
export type ActivationIdentifier = 'elu'|'hardSigmoid'|'linear'|'relu'|'relu6'|
    'selu'|'sigmoid'|'softmax'|'softplus'|'softsign'|'tanh'|'swish'|'mish'|
    'gelu'|'gelu_new';
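A hedged usage sketch (not part of this commit): with 'gelu' and 'gelu_new' in the identifier union, a layer can be configured with either name directly. The units, input shape, and input values below are illustrative, and it is assumed the layers API accepts the extended identifier union at runtime as well as in the types.

import * as tf from '@tensorflow/tfjs';

// Dense layers configured with the new string identifiers (illustrative sizes).
const exact = tf.layers.dense({units: 4, activation: 'gelu', inputShape: [3]});
const approx = tf.layers.dense({units: 4, activation: 'gelu_new', inputShape: [3]});

const x = tf.ones([1, 3]);
(exact.apply(x) as tf.Tensor).print();
(approx.apply(x) as tf.Tensor).print();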
3 changes: 1 addition & 2 deletions tfjs-layers/src/layers/nlp/models/gpt2/gpt2_backbone.ts
@@ -170,8 +170,7 @@ export class GPT2Backbone extends Backbone {
        numHeads: args.numHeads,
        dropout: args.dropout,
        layerNormEpsilon: 1e-05,
        // Previously: activation: getActivation('relu'), with a TODO to implement gelu.
        activation: getActivation('gelu'),
        kernelInitializer: gpt2KernelInitializer(0.02),
        normalizeFirst: true,
        name: `transformer_layer_${i}`,
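A minimal sketch of the effect of this change (the relative import path is assumed from this package's layout): getActivation resolves the string to the registered activation class, so the GPT-2 transformer layers now apply the erf-based GELU instead of ReLU.

import {tensor1d} from '@tensorflow/tfjs-core';
import {getActivation} from './activations';  // path assumed, relative to tfjs-layers/src

const act = getActivation('gelu');
act.apply(tensor1d([0, 1, 3, 9])).print();  // ~[0, 0.8413, 2.9960, 9]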
