-
Notifications
You must be signed in to change notification settings - Fork 26.7k
/
tf_utils.py
294 lines (233 loc) · 11.1 KB
/
tf_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Union
import numpy as np
import tensorflow as tf
from .feature_extraction_utils import BatchFeature
from .tokenization_utils_base import BatchEncoding
from .utils import logging
logger = logging.get_logger(__name__)
def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]:
"""
Deal with dynamic shape in tensorflow cleanly.
Args:
tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of.
Returns:
`List[int]`: The shape of the tensor as a list.
"""
if isinstance(tensor, np.ndarray):
return list(tensor.shape)
dynamic = tf.shape(tensor)
if tensor.shape == tf.TensorShape(None):
return dynamic
static = tensor.shape.as_list()
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
def stable_softmax(logits: tf.Tensor, axis: Optional[int] = None, name: Optional[str] = None) -> tf.Tensor:
"""
Stable wrapper that returns the same output as `tf.nn.softmax`, but that works reliably with XLA on CPU. It is
meant as a workaround for the [following issue](https://github.com/tensorflow/tensorflow/issues/55682), and will be
removed after it gets fixed. The arguments and outputs are the same as `tf.nn.softmax`, and relies on the fact that
`softmax(x) = softmax(x + c)` (see https://ogunlao.github.io/2020/04/26/you_dont_really_know_softmax.html).
Args:
logits (`tf.Tensor`):
Must be one of the following types: half, float32, float64.
axis (`int`, *optional*):
The dimension softmax would be performed on. The default is -1 which indicates the last dimension.
name (`str`, *optional*):
A name for the operation.
Returns:
`tf.Tensor`:
A Tensor. Has the same type and shape as logits.
"""
# TODO: When the issue linked above gets sorted, add a check on TF version here and use the original function if
# it has the fix. After we drop the support for unfixed versions, remove this function.
return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name)
def functional_layernorm(inputs, weight, bias, epsilon=1e-5, axis=-1):
# This is a very simplified functional layernorm, designed to duplicate
# the functionality of PyTorch nn.functional.layer_norm when this is needed to port
# models in Transformers.
if weight.shape.rank != 1 or bias.shape.rank != 1 or not isinstance(axis, int):
raise NotImplementedError("Only 1D weight and bias tensors are supported for now, with only a single axis.")
# Get mean and variance on the axis to be normalized
mean, variance = tf.nn.moments(inputs, axes=[axis], keepdims=True)
if axis != -1:
# Reshape scale and weight to have the same rank as inputs, but with 1 dimensions
# on every dimension except axis
shape = [1] * inputs.shape.rank
shape[axis] = shape_list(inputs)[axis]
weight = tf.reshape(weight, shape)
bias = tf.reshape(bias, shape)
# Compute layer normalization using the batch_normalization
# function.
outputs = tf.nn.batch_normalization(
inputs,
mean,
variance,
offset=bias,
scale=weight,
variance_epsilon=epsilon,
)
return outputs
def scaled_dot_product_attention(
query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale: float = None
):
"""TF equivalent for torch's nn.functional.scaled_dot_product_attention"""
if dropout_p != 0.0:
raise ValueError(
"Dropout is not supported in this implementation - file an issue "
"with Transformers and ping @Rocketknight1 if you need it for a port!"
)
if is_causal and attn_mask is not None:
raise ValueError("You cannot specify an attn_mask and is_causal at the same time!")
if is_causal:
attn_mask = tf.ones((tf.shape(query)[-2], tf.shape(key)[-2]), dtype=tf.int32)
attn_mask = tf.experimental.numpy.tril(attn_mask, k=0)
if attn_mask is not None and (attn_mask.dtype.is_integer or attn_mask.dtype.is_bool):
# Convert boolean mask to a negative logit bias
attn_mask = tf.where(attn_mask > 0, tf.cast(0.0, query.dtype), tf.cast(-1000.0, query.dtype))
logits = tf.einsum("...qd, ...kd -> ...qk", query, key)
if scale is None:
scale = tf.cast(tf.shape(key)[-1], logits.dtype) ** -0.5
logits *= scale # scale by 1/sqrt(key_dim)
if attn_mask is not None:
logits += attn_mask
probs = tf.nn.softmax(logits)
return probs @ value
def flatten(input, start_dim=0, end_dim=-1):
# Replicates the behavior of torch.flatten in TF
# If end_dim or start_dim is negative, count them from the end
if end_dim < 0:
end_dim += input.shape.rank
if start_dim < 0:
start_dim += input.shape.rank
if start_dim == end_dim:
return input
in_shape = tf.shape(input)
flattened_dim = tf.math.reduce_prod(in_shape[start_dim : end_dim + 1])
out_shape = tf.concat([in_shape[:start_dim], [flattened_dim], in_shape[end_dim + 1 :]], axis=0)
return tf.reshape(input, out_shape)
def invert_attention_mask(encoder_attention_mask: tf.Tensor) -> tf.Tensor:
"""
Invert an attention mask (e.g., switches 0. and 1.).
Args:
encoder_attention_mask (`torch.Tensor`): An attention mask.
Returns:
`tf.Tensor`: The inverted attention mask.
"""
if not isinstance(encoder_attention_mask, tf.Tensor):
encoder_attention_mask = tf.convert_to_tensor(encoder_attention_mask) # Catches stray NumPy inputs
if encoder_attention_mask.shape.rank == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
if encoder_attention_mask.shape.rank == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
# /transformer/transformer_layers.py#L270
# encoder_extended_attention_mask = (encoder_extended_attention_mask ==
# encoder_extended_attention_mask.transpose(-1, -2))
encoder_extended_attention_mask = (
tf.cast(1, encoder_attention_mask.dtype) - encoder_extended_attention_mask
) * encoder_extended_attention_mask.dtype.min
return encoder_extended_attention_mask
def check_embeddings_within_bounds(tensor: tf.Tensor, embed_dim: int, tensor_name: str = "input_ids") -> None:
"""
`tf.gather`, on which TF embedding layers are based, won't check positive out of bound indices on GPU, returning
zeros instead. This function adds a check against that dangerous silent behavior.
Args:
tensor (`tf.Tensor`): The tensor of indices to check.
embed_dim (`int`): The embedding dimension.
tensor_name (`str`, *optional*): The name of the tensor to use in the error message.
"""
tf.debugging.assert_less(
tensor,
tf.cast(embed_dim, dtype=tensor.dtype),
message=(
f"The maximum value of {tensor_name} ({tf.math.reduce_max(tensor)}) must be smaller than the embedding "
f"layer's input dimension ({embed_dim}). The likely cause is some problem at tokenization time."
),
)
def save_attributes_to_hdf5_group(group, name, data):
"""Saves attributes (data) of the specified name into the HDF5 group.
This method deals with an inherent problem of HDF5 file which is not able to store data larger than
HDF5_OBJECT_HEADER_LIMIT bytes.
Args:
group: A pointer to a HDF5 group.
name: A name of the attributes to save.
data: Attributes data to store.
Raises:
RuntimeError: If any single attribute is too large to be saved.
Copied from Keras to Transformers to avoid versioning issues.
"""
HDF5_OBJECT_HEADER_LIMIT = 64512
# Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
# because in that case even chunking the array would not make the saving
# possible.
bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
# Expecting this to never be true.
if bad_attributes:
raise RuntimeError(
"The following attributes cannot be saved to HDF5 file because "
f"they are larger than {HDF5_OBJECT_HEADER_LIMIT} "
f"bytes: {bad_attributes}"
)
data_npy = np.asarray(data)
num_chunks = 1
chunked_data = np.array_split(data_npy, num_chunks)
# This will never loop forever thanks to the test above.
while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
num_chunks += 1
chunked_data = np.array_split(data_npy, num_chunks)
if num_chunks > 1:
for chunk_id, chunk_data in enumerate(chunked_data):
group.attrs["%s%d" % (name, chunk_id)] = chunk_data
else:
group.attrs[name] = data
def load_attributes_from_hdf5_group(group, name):
"""Loads attributes of the specified name from the HDF5 group.
This method deals with an inherent problem of HDF5 file which is not able to store data larger than
HDF5_OBJECT_HEADER_LIMIT bytes.
Args:
group: A pointer to a HDF5 group.
name: A name of the attributes to load.
Returns:
data: Attributes data.
Copied from Keras to Transformers to avoid versioning issues.
"""
if name in group.attrs:
data = [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs[name]]
else:
data = []
chunk_id = 0
while "%s%d" % (name, chunk_id) in group.attrs:
data.extend(
[n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs["%s%d" % (name, chunk_id)]]
)
chunk_id += 1
return data
def expand_1d(data):
"""Expands 1-dimensional `Tensor`s into 2-dimensional `Tensor`s.
Copied from Keras to here to avoid versioning issues."""
def _expand_single_1d_tensor(t):
if isinstance(t, tf.Tensor) and t.shape.rank == 1:
return tf.expand_dims(t, axis=-1)
return t
return tf.nest.map_structure(_expand_single_1d_tensor, data)
def convert_batch_encoding(*args, **kwargs):
# Convert HF BatchEncoding/BatchFeature objects in the inputs to dicts that Keras understands
if args and isinstance(args[0], (BatchEncoding, BatchFeature)):
args = list(args)
args[0] = dict(args[0])
elif "x" in kwargs and isinstance(kwargs["x"], (BatchEncoding, BatchFeature)):
kwargs["x"] = dict(kwargs["x"])
return args, kwargs