Merge pull request huggingface#3 from DaryaTereshchenko/changes_pr2
Changes pr2
DaryaTereshchenko authored Nov 5, 2024
2 parents 8350215 + 77ccf60 commit 2603cf8
Showing 4 changed files with 65 additions and 9 deletions.
8 changes: 1 addition & 7 deletions docs/source/en/model_doc/prism.md
@@ -18,9 +18,7 @@ rendered properly in your Markdown viewer.

## Overview

-The `Prism` model, a state-of-the-art multilingual neural machine translation (NMT) system developed for translation. The model supports translation across 39 languages, leveraging a zero-shot paraphrasing approach that does not require human judgments for training.
-
-The `Prism` model was designed to be a lexically/syntactically unbiased paraphraser. The core idea is to treat paraphrasing as a zero-shot translation task, which allows the model to cover a wide range of languages effectively.
+The Prism model is a multilingual neural machine translation (NMT) system. The model supports translation across 39 languages, leveraging a zero-shot paraphrasing approach as a machine translation evaluation technique.

The model was proposed in [Automatic Machine Translation Evaluation in Many Languages via Zero-Shot Paraphrasing](https://aclanthology.org/2020.emnlp-main.8.pdf) by Brian Thompson and Matt Post.
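To make the evaluation idea from the paper concrete, here is a toy sketch (not the actual Prism implementation): the reference is treated as the "source" and the hypothesis is force-decoded, so its length-normalized log-probability serves as a quality score. The probability values below are invented for illustration.

```python
import math


def paraphrase_score(token_probs):
    """Length-normalized log-probability of a force-decoded hypothesis.

    `token_probs` stands in for the model's probability of each hypothesis
    token when conditioned on the reference sentence.
    """
    return sum(math.log(p) for p in token_probs) / len(token_probs)


# A hypothesis close to the reference gets high per-token probabilities;
# a distant one gets low probabilities, hence a lower score.
close_hyp = [0.9, 0.8, 0.85]
distant_hyp = [0.2, 0.1, 0.3]

print(paraphrase_score(close_hyp) > paraphrase_score(distant_hyp))  # True
```

In the real system these probabilities come from the multilingual model itself, which is what makes the metric work without human judgments.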

@@ -65,10 +63,6 @@ print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
# => 'Life is like a box of chocolate.'
```

-## Languages Covered
-Albanian (sq), Arabic (ar), Bengali (bn), Bulgarian (bg), Catalan; Valencian (ca), Chinese (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Esperanto (eo), Estonian (et), Finnish (fi), French (fr), German (de), Greek, Modern (el), Hebrew (modern) (he), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Kazakh (kk), Latvian (lv), Lithuanian (lt), Macedonian (mk), Norwegian (no), Polish (pl), Portuguese (pt), Romanian, Moldovan (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovene (sl), Spanish; Castilian (es), Swedish (sv), Turkish (tr), Ukrainian (uk), Vietnamese (vi).
-
-
## Resources

- [Translation task guide](../tasks/translation)
62 changes: 62 additions & 0 deletions src/transformers/models/prism/tokenization_prism.py
@@ -79,6 +79,68 @@


class PrismTokenizer(PreTrainedTokenizer):
"""
Construct a Prism tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
spm_file (`str`):
Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
contains the vocabulary.
src_lang (`str`, *optional*):
A string representing the source language.
tgt_lang (`str`, *optional*):
A string representing the target language.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
language_codes (`str`, *optional*, defaults to `"prism"`):
What language codes to use. Should be one of `"prism"`.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
num_madeup_words (`int`, *optional*, defaults to 8):
Number of made-up words to use for translation.
Examples:
```python
>>> from transformers import PrismForConditionalGeneration, PrismTokenizer
>>> model = PrismForConditionalGeneration.from_pretrained("dariast/prism")
>>> tokenizer = PrismTokenizer.from_pretrained("dariast/prism", src_lang="en", tgt_lang="ro")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
>>> outputs = model(**model_inputs) # should work
```
"""

vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
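As a rough illustration of this tokenizer's job, below is a minimal sketch of an M2M100-style special-token layout, where the source-language code is prepended and `eos` appended. The token ids are invented, and whether Prism uses exactly this layout is an assumption based on the M2M100-family tokenizers it resembles.

```python
# Hypothetical ids purely for illustration; real ids come from the vocab/spm files.
EOS_ID = 2
LANG_CODE_TO_ID = {"en": 512, "ro": 513}


def build_inputs_with_special_tokens(token_ids, src_lang="en"):
    """Prepend the source-language code and append end-of-sequence."""
    return [LANG_CODE_TO_ID[src_lang]] + list(token_ids) + [EOS_ID]


print(build_inputs_with_special_tokens([5, 6, 7], src_lang="ro"))
# [513, 5, 6, 7, 2]
```

The language-code prefix is what lets a single multilingual model route a sentence to the right language pair at inference time.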

2 changes: 1 addition & 1 deletion tests/models/m2m_100/test_tokenization_m2m_100.py
@@ -87,7 +87,7 @@ def test_convert_token_and_id(self):
def test_get_vocab(self):
tokenizer = self.get_tokenizer()
vocab_keys = list(tokenizer.get_vocab().keys())
-
+print(vocab_keys)
self.assertEqual(vocab_keys[0], "</s>")
self.assertEqual(vocab_keys[1], "<unk>")
self.assertEqual(vocab_keys[-1], "<s>")
2 changes: 1 addition & 1 deletion tests/models/prism/test_modeling_prism.py
@@ -89,7 +89,7 @@ def __init__(
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=4,
-hidden_act="relu",
+hidden_act="relu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
encoder_layerdrop=0.0,
