diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 640a694..7e9d01a 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7"]
+        python-version: ["3.8"]
 
     steps:
       - uses: actions/checkout@v3
diff --git a/README.md b/README.md
index e78dcf7..6b76cd9 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ To learn more about how these measures work, have a look at [Jannis' blog post](
 
 ## Installation
 
-- Requires Python >= 3.7 and PyTorch
+- Requires Python >= 3.8 and PyTorch
 - `pip install nmtscore`
 - Extra requirements for the Prism model: `pip install nmtscore[prism]`
 
@@ -25,7 +25,7 @@ from nmtscore import NMTScorer
 scorer = NMTScorer()
 
 scorer.score("This is a sentence.", "This is another sentence.")
-# 0.5025776988808766
+# 0.4677300455046415
 ```
 
 #### Different similarity measures
@@ -52,7 +52,7 @@ scorer.score(
     ["This is a sentence.", "This is a sentence.", "This is another sentence."],
     ["This is another sentence.", "This sentence is completely unrelated.", "This is another sentence."],
 )
-# [0.5025777998113548, 0.1640727324003354, 1.0000000000000049]
+# [0.46772973967003206, 0.15306852595255185, 1.0]
 ```
 
 The sentences in the first list are compared element-wise to the sentences in the second list.
@@ -132,7 +132,7 @@ model.translate("de", ["This is a test."])
 # ["Das ist ein Test."]
 
 model.score("de", ["This is a test."], ["Das ist ein Test."])
-# [0.7708902359008789]
+# [0.8293135166168213]
 ```
 
 ## Experiments
diff --git a/setup.cfg b/setup.cfg
index a1e7073..ee0a675 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -18,9 +18,9 @@ classifiers =
 package_dir =
     = src
 packages = find:
-python_requires = >=3.7
+python_requires = >=3.8
 install_requires =
-    transformers
+    transformers<4.34  # https://github.com/ZurichNLP/nmtscore/issues/7
     sentencepiece
     tqdm
     sqlitedict
diff --git a/src/nmtscore/models/m2m100.py b/src/nmtscore/models/m2m100.py
index 69e9c03..2de1faa 100644
--- a/src/nmtscore/models/m2m100.py
+++ b/src/nmtscore/models/m2m100.py
@@ -94,8 +94,6 @@ def _score(self,
             batch(hypothesis_sentences, batch_size),
         )
         for src_sentences, tgt_sentences in batch_iterator:
-            # Hack: Append a second EOS token to make sure that one EOS is still there after shift_tokens_right
-            tgt_sentences = [f"{sentence} {self.tokenizer.eos_token}" for sentence in tgt_sentences]
             inputs = self.tokenizer(
                 src_sentences,
                 text_target=tgt_sentences,
diff --git a/tests/test_readme.py b/tests/test_readme.py
index ddf095b..0583595 100644
--- a/tests/test_readme.py
+++ b/tests/test_readme.py
@@ -24,7 +24,7 @@ def tearDownClass(cls) -> None:
     def test_nmtscorer(self):
         scorer = NMTScorer()
         score = scorer.score("This is a sentence.", "This is another sentence.")
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
 
     def test_batch_processing(self):
         scorer = NMTScorer()
@@ -33,20 +33,20 @@ def test_batch_processing(self):
             ["This is another sentence.", "This sentence is completely unrelated.", "This is another sentence."],
         )
         self.assertEqual(3, len(scores))
-        self.assertAlmostEqual(0.5025777998113548, scores[0], places=4)
-        self.assertAlmostEqual(0.1640727324003354, scores[1], places=4)
-        self.assertAlmostEqual(1.0000000000000049, scores[2], places=4)
+        self.assertAlmostEqual(0.46772973967003206, scores[0], places=4)
+        self.assertAlmostEqual(0.15306852595255185, scores[1], places=4)
+        self.assertAlmostEqual(1.0, scores[2], places=4)
 
     def test_different_similarity_measures(self):
         scorer = NMTScorer()
         a = "This is a sentence."
         b = "This is another sentence."
         score = scorer.score_cross_likelihood(a, b, tgt_lang="en", normalize=True, both_directions=True)
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
         score = scorer.score_direct(a, b, a_lang="en", b_lang="en", normalize=True, both_directions=True)
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
         score = scorer.score_pivot(a, b, a_lang="en", b_lang="en", pivot_lang="en", normalize=True, both_directions=True)
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
 
     @unittest.skipIf(os.getenv("SKIP_SLOW_TESTS", False), "Slow")
     def test_different_nmt_models(self):
@@ -59,18 +59,18 @@ def test_batch_size(self):
         a = "This is a sentence."
         b = "This is another sentence."
         score = scorer.score_cross_likelihood(a, b, translate_kwargs={"batch_size": 16}, score_kwargs={"batch_size": 16})
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
         score = scorer.score_direct(a, b, a_lang="en", b_lang="en", score_kwargs={"batch_size": 16})
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
 
     def test_caching(self):
         scorer = NMTScorer()
         a = "This is a sentence."
         b = "This is another sentence."
         score = scorer.score_cross_likelihood(a, b, translate_kwargs={"use_cache": True}, score_kwargs={"use_cache": True})
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
         score = scorer.score_direct(a, b, a_lang="en", b_lang="en", score_kwargs={"use_cache": True})
-        self.assertAlmostEqual(0.5025776988808766, score, places=4)
+        self.assertAlmostEqual(0.4677300455046415, score, places=4)
 
     @mock.patch('sys.stdout', new_callable=io.StringIO)
     def test_version_signature(self, mock_stdout):
@@ -85,4 +85,4 @@ def test_nmt_models(self):
         translations = model.translate("de", ["This is a test."], src_lang="en")
         self.assertEqual(["Das ist ein Test."], translations)
         scores = model.score("de", ["This is a test."], ["Das ist ein Test."], src_lang="en")
-        self.assertAlmostEqual(0.7708902359008789, scores[0], places=4)
+        self.assertAlmostEqual(0.8293135166168213, scores[0], places=4)