
Commit

Merge branch 'bart' into master
HaiderSultanArc authored Aug 5, 2023
2 parents da91f37 + fc48cf8 commit 3fd2039
Showing 10 changed files with 4,150 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
{
    "python.linting.pylintEnabled": false,
    "python.linting.flake8Enabled": true,
    "python.linting.enabled": true
}
5 changes: 5 additions & 0 deletions ivy_models/__init__.py
@@ -17,5 +17,10 @@

from .squeezenet import *
from .densenet import *

from . import bart
from .bart import *

from . import bert
from .bert import *
from .vit import *
2 changes: 2 additions & 0 deletions ivy_models/bart/__init__.py
@@ -0,0 +1,2 @@
from . import bart
from .bart import *
230 changes: 230 additions & 0 deletions ivy_models/bart/activations.py
@@ -0,0 +1,230 @@
from collections import OrderedDict
import ivy


class IvyGELUTanh(ivy.Module):
    """
    A fast implementation of the tanh approximation of the GeLU activation function, delegated to the
    backend via ivy.gelu. See https://arxiv.org/abs/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact
    numerical match due to rounding errors.
    """

    def __init__(self):
        super().__init__()

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return ivy.gelu(input, approximate="tanh")


class NewGELUActivation(ivy.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return (
            0.5
            * input
            * (
                1.0
                + ivy.tanh(
                    ivy.sqrt(2.0 / ivy.pi) * (input + 0.044715 * ivy.pow(input, 3.0))
                )
            )
        )
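
The two tanh-based approximations above are claimed to match up to rounding. A minimal sanity check (a sketch: it assumes this file's classes are in scope and that a NumPy backend for ivy is installed):

import ivy

ivy.set_backend("numpy")  # assumption: any installed ivy backend should work

x = ivy.array([-2.0, -0.5, 0.0, 0.5, 2.0])
approx_tanh = IvyGELUTanh()
approx_new = NewGELUActivation()

# Both compute the tanh approximation of GELU, so the difference should be
# on the order of floating-point rounding error.
print(float(ivy.max(ivy.abs(approx_tanh(x) - approx_new(x)))))  # expected: ~1e-7 or smaller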


class GELUActivation(ivy.Module):
    """
    Original implementation of the GELU activation function from the Google BERT repo when initially created.
    For information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    ivy.tanh(ivy.sqrt(2 / ivy.pi) * (x + 0.044715 * ivy.pow(x, 3)))). The exact formulation is also available
    directly as ivy.gelu. Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = ivy.gelu

    def _gelu_python(self, input: ivy.Array) -> ivy.Array:
        return input * 0.5 * (1.0 + ivy.erf(input / ivy.sqrt(2.0)))

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return self.act(input)
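
Similarly, the erf-based Python formula and the library call should agree closely, assuming ivy.gelu defaults to the exact erf formulation (a sketch, under the same assumptions as the check above):

act_exact = GELUActivation()                       # delegates to ivy.gelu
act_python = GELUActivation(use_gelu_python=True)  # uses the explicit erf formula

x = ivy.array([-1.0, 0.0, 1.0])
print(float(ivy.max(ivy.abs(act_exact(x) - act_python(x)))))  # ~0 up to rounding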


class FastGELUActivation(ivy.Module):
    """
    Applies a GELU approximation that is slower than QuickGELU but more accurate. See:
    https://github.com/hendrycks/GELUs
    """

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return (
            0.5
            * input
            * (1.0 + ivy.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
        )


class QuickGELUActivation(ivy.Module):
    """
    Applies a GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    """

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return input * ivy.sigmoid(1.702 * input)


class ClippedGELUActivation(ivy.Module):
    """
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization
    purposes, as it allows mapping negative values in the GeLU spectrum. For more information on this trick,
    please refer to https://arxiv.org/abs/2004.09602.

    Gaussian Error Linear Unit. Original implementation of the gelu activation function from the Google BERT
    repo when initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly
    different results): 0.5 * x * (1 + ivy.tanh(ivy.sqrt(2 / ivy.pi) * (x + 0.044715 * ivy.pow(x, 3)))). See
    https://arxiv.org/abs/1606.08415
    """

    def __init__(self, min: float, max: float):
        if min > max:
            raise ValueError(f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min = min
        self.max = max

    def _forward(self, x: ivy.Array) -> ivy.Array:
        # `gelu` here is the module-level activation instance created at the bottom of this file.
        return ivy.clip(gelu(x), self.min, self.max)
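
A short usage sketch (it assumes the module has been fully imported, since _forward relies on the module-level gelu instance defined at the bottom of this file). Note that exact GELU output never drops below roughly -0.17, so with gelu_10 only large positive inputs are actually clipped:

clipped = ClippedGELUActivation(min=-10, max=10)

x = ivy.array([-3.0, 0.0, 30.0])
print(clipped(x))  # roughly [-0.004, 0.0, 10.0]; only the 30.0 hits the clip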


class AccurateGELUActivation(ivy.Module):
    """
    Applies a GELU approximation that is faster than the default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention).
    """

    def __init__(self):
        super().__init__()
        self.precomputed_constant = ivy.sqrt(2 / ivy.pi)

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return (
            0.5
            * input
            * (
                1
                + ivy.tanh(
                    self.precomputed_constant * (input + 0.044715 * ivy.pow(input, 3))
                )
            )
        )


class SiLUActivation(ivy.Module):
    """
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid
    Linear Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network
    Function Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish:
    a Self-Gated Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was
    experimented with later.
    """

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return ivy.silu(input)


class LinearActivation(ivy.Module):
    """
    Applies the linear activation function, i.e. forwarding input directly to output.
    """

    def _forward(self, input: ivy.Array) -> ivy.Array:
        return input


class LaplaceActivation(ivy.Module):
    """
    Applies an elementwise activation based on the Laplace function, introduced in MEGA as an attention
    activation. See https://arxiv.org/abs/2209.10655

    Inspired by squared ReLU, but with a bounded range and bounded gradient for better stability.
    """

    def _forward(self, input, mu=0.707107, sigma=0.282095):
        input = (input - mu).divide(sigma * ivy.sqrt(2.0))
        return 0.5 * (1.0 + ivy.erf(input))
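
The default mu and sigma are not arbitrary: they appear to be the closed-form constants mu = 1/sqrt(2) and sigma = 1/(2*sqrt(pi)) used by MEGA, which a quick check confirms to the quoted precision:

import math

assert abs(0.707107 - 1.0 / math.sqrt(2.0)) < 1e-6              # mu = 1/sqrt(2)
assert abs(0.282095 - 1.0 / (2.0 * math.sqrt(math.pi))) < 1e-6  # sigma = 1/(2*sqrt(pi))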


class ReLUSquaredActivation(ivy.Module):
    """
    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
    """

    def _forward(self, input):
        relu_applied = ivy.relu(input)
        squared = ivy.square(relu_applied)
        return squared


class ClassInstantier(OrderedDict):
    def __getitem__(self, key):
        content = super().__getitem__(key)
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)


ACT2CLS = {
    "gelu": ivy.GELU,
    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": FastGELUActivation,
    "gelu_new": NewGELUActivation,
    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
    "gelu_tanh": IvyGELUTanh,
    "gelu_accurate": AccurateGELUActivation,
    "laplace": LaplaceActivation,
    "linear": LinearActivation,
    "mish": ivy.Mish,
    "quick_gelu": QuickGELUActivation,
    "relu": ivy.ReLU,
    "relu2": ReLUSquaredActivation,
    "relu6": ivy.ReLU6,

Mr-Niraj-Kulkarni commented on Aug 7, 2023

@HaiderSultanArc I think you have made a typo on line 201; it should be ivy.ReLU.

HaiderSultanArc (Author, Contributor) replied on Aug 7, 2023

Hey @Mr-Niraj-Kulkarni, relu6 is a separate function from relu. Ivy provides both functions. That's why I used ivy.ReLU6 😄

"sigmoid": ivy.Sigmoid,
"silu": ivy.SiLU,
"swish": SiLUActivation,
"tanh": ivy.Tanh,
}
ACT2FN = ClassInstantier(ACT2CLS)
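
Because of ClassInstantier.__getitem__, ACT2FN builds a fresh activation instance on every key lookup, and tuple values carry constructor kwargs. A small usage sketch:

act = ACT2FN["gelu_python"]  # GELUActivation(use_gelu_python=True)
clip = ACT2FN["gelu_10"]     # ClippedGELUActivation(min=-10, max=10)
print(type(act).__name__, type(clip).__name__)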


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(
            f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
        )
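
get_activation is a checked lookup into ACT2FN; unknown names raise a KeyError that lists the supported keys. For example:

gelu_new = get_activation("gelu_new")  # a NewGELUActivation instance

try:
    get_activation("geluu")  # hypothetical misspelling
except KeyError as err:
    print(err)  # names the unknown key and lists all supported activations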


gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
relu = get_activation("relu")
relu6 = get_activation("relu6")
sigmoid = get_activation("sigmoid")
tanh = get_activation("tanh")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
