From a81757787ea70cbec49728c8a19b5a98a14fc386 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 21 Jun 2022 16:10:29 +0200
Subject: [PATCH 1/6] add first generation tutorial

---
 docs/source/en/generation.mdx | 254 ++++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 docs/source/en/generation.mdx

diff --git a/docs/source/en/generation.mdx b/docs/source/en/generation.mdx
new file mode 100644
index 00000000000000..b5f1ac54652f2c
--- /dev/null
+++ b/docs/source/en/generation.mdx
@@ -0,0 +1,254 @@

# How to generate text with 🤗 Transformers

Generating text with transformer models is done via *auto-regressive* language generation:
a sequence of tokens is iteratively forwarded through the model and the next token in the sequence
is sampled from the model's output distribution until a stopping criterion is met.

This section serves as a practical guide on how to use 🤗 Transformers' text-generation method
[`~generation_utils.GenerationMixin.generate`] for different generation methods, model architectures,
and generation configurations.

Before diving into the practical examples, the reader is strongly advised to go over the more
theoretical blog post on [text generation](https://huggingface.co/blog/how-to-generate) to understand
how the different generation methods work.

The most common **generation methods** are:

1. Greedy search: [`~generation_utils.GenerationMixin.greedy_search`],
2. Sampling: [`~generation_utils.GenerationMixin.sample`], and
3. Beam search: [`~generation_utils.GenerationMixin.beam_search`].

All model architectures can be divided broadly into **decoder-only** and **encoder-decoder** models.

**Decoder-only** models include architectures such as [GPT-2](), [OPT](), and [BLOOM]() and
can be loaded via the [`AutoModelForCausalLM`] class. Decoder-only models are mainly used for
open-ended text generation, but can essentially be used to solve any text-to-text task via
prompting, as introduced in the [GPT-3 paper]( ).

**Encoder-decoder** models include architectures such as [T5](), [BART](), and [Marian]() and
can be loaded via the [`AutoModelForSeq2SeqLM`] class. Encoder-decoder models are mainly used for
translation and summarization, but can also be used to solve any text-to-text task once the task
is framed as text-to-text, as shown in the [T5 paper]( ).

## Greedy search

The simplest and default generation method is [`~generation_utils.GenerationMixin.greedy_search`].

In greedy search, the model picks the most probable token from the predicted logit distribution at
every step: the token id with the highest logit is appended to the passed token ids, and the next
most probable token is predicted again. Since taking the *most probable* token corresponds to a
simple `argmax`, greedy search is a *deterministic* generation method.

It is very easy to write such a loop yourself for a *decoder-only* model, such as
[opt-125m](facebook/opt-125m).

First, we load the model and the tokenizer.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
```

Next, we define a prompt the model will be conditioned on to predict the next tokens.

```py
prompt = "In the winter, it is cold."

prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
```
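To make the `argmax` step concrete before writing the full loop, here is a minimal sketch of a
*single* greedy step. It is only an illustration: it assumes PyTorch is installed and reuses the
`model`, `tokenizer`, and `prompt_ids` defined above, and the decoded token is simply whatever the
model happens to predict.

```py
import torch

with torch.no_grad():
    # logits has shape (batch_size, sequence_length, vocab_size)
    logits = model(prompt_ids).logits

# greedy search keeps only the most probable token id at the last position
next_token_id = torch.argmax(logits[:, -1, :], dim=-1)

# decode the single predicted token id back to text
print(tokenizer.decode(next_token_id))
```

Appending this predicted id to the input and repeating the step is exactly what the loop below does.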
Now we can write a short iterative greedy search loop that generates 10 new tokens.

While the generation parameter [`max_length`]( ) is widely used and always set by default,
we strongly recommend switching to `max_new_tokens` instead.
`max_length` counts tokens **up to** `max_length`, including the prompt, whereas `max_new_tokens`
generates exactly `max_new_tokens` new tokens independent of the input length.
Unexpected behavior can occur when `max_length` is used with `input_ids` that are already longer
than `max_length`, which is why we recommend using `max_new_tokens` instead.

```py
import torch

max_new_tokens = 10

input_ids = prompt_ids
for _ in range(max_new_tokens):
    with torch.no_grad():
        logits = model(input_ids).logits

    # greedily pick the most probable next token id at the last position
    next_token_id = torch.argmax(logits[:, -1:], dim=-1)

    # append the predicted token id to the running sequence
    input_ids = torch.concat([input_ids, next_token_id], dim=-1)
```

After having generated 10 new tokens, let's take a look at the prediction.

```py
tokenizer.batch_decode(input_ids)
```

Instead of writing this greedy search loop yourself every time, you can make use of
[`~generation_utils.GenerationMixin.generate`].

Let's see how to replicate the above example with [`~generation_utils.GenerationMixin.generate`].

Greedy search is activated by default or when passing `num_beams=1` and `do_sample=False` to
[`~generation_utils.GenerationMixin.generate`].

```py
sequences = model.generate(prompt_ids, do_sample=False, num_beams=1, max_new_tokens=10)
tokenizer.batch_decode(sequences)
```

For **encoder-decoder** models, we first need to encode the prompt with the encoder and then
iteratively forward the `input_ids` through the decoder, just like in the example above.
Let's load an encoder-decoder model such as [t5-small]( ):

```py
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-small")
```

Let's use the same prompt, but now tokenize it with the T5 tokenizer and pass it once through the encoder.

```py
prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids

with torch.no_grad():
    encoder_hidden_states = model.encoder(prompt_ids).last_hidden_state
```

The decoder of an encoder-decoder model contains so-called *cross-attention* layers which are
conditioned on the encoded hidden states. Therefore, we pass the `encoder_hidden_states` at every
iteration step below.

To better understand how *encoder-decoder* models work from a theoretical point of view, please
take a look at the [Encoder-decoder blog post]( ).

```py
input_ids = torch.tensor([[model.config.decoder_start_token_id]])

max_new_tokens = 10
for _ in range(max_new_tokens):
    with torch.no_grad():
        logits = model(decoder_input_ids=input_ids, encoder_outputs=(encoder_hidden_states,)).logits

    next_token_id = torch.argmax(logits[:, -1:], dim=-1)

    # append the predicted token id to the decoder sequence
    input_ids = torch.concat([input_ids, next_token_id], dim=-1)
```

🤗 Transformers' `generate` method automatically detects whether the model is an encoder-decoder
model and correctly prepares the `encoder_hidden_states` for you.

```py
sequences = model.generate(prompt_ids, do_sample=False, num_beams=1, max_new_tokens=10)
tokenizer.batch_decode(sequences)
```

When generating longer texts, e.g. with `max_new_tokens=200`, you will notice that
[`~generation_utils.GenerationMixin.greedy_search`] automatically finishes the generation loop
early once the model predicts a so-called *end-of-sequence* (EOS) token.

Stopping at the EOS token, among many other features such as cached generation and batched
generation, is missing from the hand-written loop above. Therefore, we strongly recommend using
🤗 Transformers' `generate` method instead of writing your own generation loop.
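As an illustration of what `generate` handles for you, below is a minimal sketch of *batched*
greedy generation. It is an assumption-laden example rather than part of the original tutorial:
it reloads the `facebook/opt-125m` checkpoint used earlier, uses left padding (a common convention
for batched generation with decoder-only models), and the second prompt is made up purely for
demonstration.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# pad on the left so that newly generated tokens directly continue each prompt
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

prompts = ["In the winter, it is cold.", "In the summer, it is"]
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

# greedy search over the whole batch; finished sequences stop at the EOS token
sequences = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    do_sample=False,
    num_beams=1,
    max_new_tokens=10,
)
print(tokenizer.batch_decode(sequences, skip_special_tokens=True))
```

With the hand-written loop from above, the padding, attention mask, and end-of-sequence handling
would all have to be implemented manually.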
Other important generation features that can be set when doing `greedy_search` include:

-

## Sample

it again into the

simply takes the index of the most probably

## Greedy search generate
sampling methods

The simplest form of sampling is to t

The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers)
library. The tokenizers obtained from the 🤗 Tokenizers library can be loaded very simply into
🤗 Transformers.

Before getting into the specifics, let's first start by creating a dummy tokenizer in a few lines:

```python
>>> from tokenizers import Tokenizer
>>> from tokenizers.models import BPE
>>> from tokenizers.trainers import BpeTrainer
>>> from tokenizers.pre_tokenizers import Whitespace

>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

>>> tokenizer.pre_tokenizer = Whitespace()
>>> files = [...]
>>> tokenizer.train(files, trainer)
```

We now have a tokenizer trained on the files we defined. We can either continue using it in that
runtime, or save it to a JSON file for future re-use.

## Loading directly from the tokenizer object

Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
[`PreTrainedTokenizerFast`] class allows for easy instantiation, by accepting the instantiated
*tokenizer* object as an argument:

```python
>>> from transformers import PreTrainedTokenizerFast

>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
```

This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to
[the tokenizer page](main_classes/tokenizer) for more information.

## Loading from a JSON file

In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:

```python
>>> tokenizer.save("tokenizer.json")
```

The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization
method using the `tokenizer_file` parameter:

```python
>>> from transformers import PreTrainedTokenizerFast

>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
```

This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to
[the tokenizer page](main_classes/tokenizer) for more information.

From aae900dba6d345ada3d22b20daa80b97b0bdbaa6 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 24 Aug 2022 16:56:06 +0200
Subject: [PATCH 2/6] remove generation

---
 docs/source/en/generation.mdx                 | 254 ------------------
 .../test_processor_wav2vec2_with_lm.py        |  22 +-
 2 files changed, 12 insertions(+), 264 deletions(-)
 delete mode 100644 docs/source/en/generation.mdx

diff --git a/docs/source/en/generation.mdx b/docs/source/en/generation.mdx
deleted file mode 100644
index b5f1ac54652f2c..00000000000000
--- a/docs/source/en/generation.mdx
+++ /dev/null
@@ -1,254 +0,0 @@
diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index d66a5923868dc5..df06d2ee27a785 100644
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -435,21 +435,23 @@ def test_word_time_stamp_integration(self):
         self.assertEqual(" ".join(self.get_from_offsets(word_time_stamps, "word")), output.text)
 
         # output times
-        start_times = [round(x, 2) for x in self.get_from_offsets(word_time_stamps, "start_time")]
-        end_times = [round(x, 2) for x in self.get_from_offsets(word_time_stamps, "end_time")]
+        start_times = torch.tensor(self.get_from_offsets(word_time_stamps, "start_time"))
+        end_times = torch.tensor(self.get_from_offsets(word_time_stamps, "end_time"))
 
         # fmt: off
-        self.assertListEqual(
+        self.assertTrue(torch.allclose(
             start_times,
-            [
+            torch.tensor([
                 1.42, 1.64, 2.12, 2.26, 2.54, 3.0, 3.24, 3.6, 3.8, 4.1, 4.26, 4.94, 5.28, 5.66, 5.78, 5.94, 6.32, 6.54, 6.66,
-            ],
-        )
+            ]),
+            atol=0.01
+        ))
 
-        self.assertListEqual(
+        self.assertTrue(torch.allclose(
             end_times,
-            [
+            torch.tensor([
                 1.54, 1.88, 2.14, 2.46, 2.9, 3.18, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94,
-            ],
-        )
+            ]),
+            atol=0.01
+        ))
         # fmt: on

From 00a2250cfefa497ec02b7159120f33bdb4f50cab Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 24 Aug 2022 19:05:21 +0200
Subject: [PATCH 3/6] make version dependent expected values

---
 .../test_processor_wav2vec2_with_lm.py        | 29 +++++++++----------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index df06d2ee27a785..9fa31997c0c14a 100644
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -23,6 +23,7 @@
 import datasets
 import numpy as np
 from datasets import load_dataset
+from packaging import version
 
 from transformers import AutoProcessor
 from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
@@ -411,7 +412,9 @@ def test_word_time_stamp_integration(self):
         model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
 
         # compare to filename `common_voice_en_100038.mp3` of dataset viewer on https://huggingface.co/datasets/common_voice/viewer/en/train
+        print("Out", np.sum(np.abs(sample["audio"]["array"])))
         input_values = processor(sample["audio"]["array"], return_tensors="pt").input_values
+        print("Out PT", input_values.abs().sum())
 
         with torch.no_grad():
             logits = model(input_values).logits.cpu().numpy()
@@ -439,19 +442,15 @@ def test_word_time_stamp_integration(self):
         end_times = torch.tensor(self.get_from_offsets(word_time_stamps, "end_time"))
 
         # fmt: off
-        self.assertTrue(torch.allclose(
-            start_times,
-            torch.tensor([
-                1.42, 1.64, 2.12, 2.26, 2.54, 3.0, 3.24, 3.6, 3.8, 4.1, 4.26, 4.94, 5.28, 5.66, 5.78, 5.94, 6.32, 6.54, 6.66,
-            ]),
-            atol=0.01
-        ))
-
-        self.assertTrue(torch.allclose(
-            end_times,
-            torch.tensor([
-                1.54, 1.88, 2.14, 2.46, 2.9, 3.18, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94,
-            ]),
-            atol=0.01
-        ))
+        expected_start_tensor = torch.tensor([1.42, 1.64, 2.12, 2.26, 2.54, 3.0, 3.24, 3.6, 3.8, 4.1, 4.26, 4.94, 5.28, 5.66, 5.78, 5.94, 6.32, 6.54, 6.66])
+
+        # TODO(Patrick): This if-else version statement should be removed once
+        # https://github.com/huggingface/datasets/issues/4889 is resolved
+        if version.parse(torch.__version__) >= version.parse("1.12.0"):
+            expected_end_tensor = torch.tensor([1.54, 1.88, 2.14, 2.46, 2.9, 3.16, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94])
+        else:
+            expected_end_tensor = torch.tensor([1.54, 1.88, 2.14, 2.46, 2.9, 3.18, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94])
         # fmt: on
+
+        self.assertTrue(torch.allclose(start_times, expected_start_tensor, atol=0.01))
+        self.assertTrue(torch.allclose(end_times, expected_end_tensor, atol=0.01))

From c3dfca23e6eece1c628f48fa5bdcc52bbd92b31d Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 24 Aug 2022 19:06:00 +0200
Subject: [PATCH 4/6] Apply suggestions from code review

---
 .../models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index 9fa31997c0c14a..9448af35e5038a 100644
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -412,9 +412,7 @@ def test_word_time_stamp_integration(self):
         model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
 
         # compare to filename `common_voice_en_100038.mp3` of dataset viewer on https://huggingface.co/datasets/common_voice/viewer/en/train
-        print("Out", np.sum(np.abs(sample["audio"]["array"])))
         input_values = processor(sample["audio"]["array"], return_tensors="pt").input_values
-        print("Out PT", input_values.abs().sum())
 
         with torch.no_grad():
             logits = model(input_values).logits.cpu().numpy()

From b9f32f4298de35a7371e03b533c4ee968ceab96a Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 26 Aug 2022 10:27:02 +0200
Subject: [PATCH 5/6] Update tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py

---
 .../models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index 9448af35e5038a..7a2af1c76a2631 100644
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -444,7 +444,7 @@ def test_word_time_stamp_integration(self):
 
         # TODO(Patrick): This if-else version statement should be removed once
         # https://github.com/huggingface/datasets/issues/4889 is resolved
-        if version.parse(torch.__version__) >= version.parse("1.12.0"):
+        if version.parse(version.parse(torch.__version__).base_version >= version.parse("1.12.0"):
             expected_end_tensor = torch.tensor([1.54, 1.88, 2.14, 2.46, 2.9, 3.16, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94])
         else:
             expected_end_tensor = torch.tensor([1.54, 1.88, 2.14, 2.46, 2.9, 3.18, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94])

From 236c7af2551da6b81297ed94c19079925a5c8c8c Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Fri, 26 Aug 2022 13:04:02 +0200
Subject: [PATCH 6/6] fix typo

---
 .../models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index 7a2af1c76a2631..6bf52d3e1b1bc9 100644
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -444,7 +444,7 @@ def test_word_time_stamp_integration(self):
 
         # TODO(Patrick): This if-else version statement should be removed once
         # https://github.com/huggingface/datasets/issues/4889 is resolved
-        if version.parse(version.parse(torch.__version__).base_version >= version.parse("1.12.0"):
+        if version.parse(version.parse(torch.__version__).base_version) >= version.parse("1.12.0"):
             expected_end_tensor = torch.tensor([1.54, 1.88, 2.14, 2.46, 2.9, 3.16, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94])
         else:
             expected_end_tensor = torch.tensor([1.54, 1.88, 2.14, 2.46, 2.9, 3.18, 3.54, 3.72, 4.02, 4.18, 4.76, 5.16, 5.56, 5.7, 5.86, 6.2, 6.38, 6.62, 6.94])