0.3.0 (#73)
* Update dependencies

* Fix deprecation warning

* Fix param name

* Update minimum versions

* Remove TorchScript refs

* version bump

* Bump PL version

* dev Dockerfile

* Update dependencies

* Transformer 4 fix

* Fix transformer 4 import for TF conversion

* Fix model training for lightning 1.0.0

* Add back GPU memory printing

* TPU fixes

* Assert descriptions

* Generation tweaks (remove pad message)

* Ignore .DS_Store

* Handle generation by prompt more canonically

* Set refresh default to 20 to avoid Colab warning

* Fix gen warning while training

* Fix model loading from config + generation

* FP16 warning (#70)

* Fix tokenizer for latest tokenizers

* Set default learning rate to 1e-3

* Set CPU config to match tokenizer default
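
A few of the bullets above change user-facing defaults, notably the 1e-3 learning rate and the more canonical prompt-based generation. The following is a minimal usage sketch of how those defaults surface in aitextgen 0.3.0; the input file name, step count, and prompt are illustrative assumptions, not part of this commit.

from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset

# Load the default 124M GPT-2 model.
ai = aitextgen()

# Tokenize a line-delimited text file; "input.txt" is a placeholder.
data = TokenDataset("input.txt", line_by_line=True)

# Train; learning_rate now defaults to 1e-3, written out here for clarity.
ai.train(data, learning_rate=1e-3, num_steps=500)

# Generate from a prompt.
ai.generate(prompt="The meaning of life is", max_length=64)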
minimaxir authored Dec 1, 2020
1 parent d889163 commit f7278bf
Showing 10 changed files with 156 additions and 116 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ test_notebooks/
/dist
*.egg-info
.vscode/settings.json
/site
/site
.DS_Store
14 changes: 14 additions & 0 deletions Dockerfile
@@ -0,0 +1,14 @@
FROM python:3.8.6-slim

RUN apt-get -y update && apt-get -y install gcc

WORKDIR /

COPY requirements.txt .

# install dependencies
RUN pip --no-cache-dir install -r requirements.txt
COPY * /

# Clean up APT when done.
RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
1 change: 0 additions & 1 deletion ROADMAP.md
@@ -7,7 +7,6 @@ A rough roadmap for implementing new features. **All is subject to change at a m
- Training using pytorch-lightning, with support for fp16 and Colab TPUs.
- Training a GPT-2 model from scratch w/ parametricized context window sizes and parameters
- PyTorch support for training/generating
- Export to static Torchscript trace.
- Generation from Transformer's native generate() function
- Actual documentation
- Examples
41 changes: 33 additions & 8 deletions aitextgen/TokenDataset.py
@@ -113,7 +113,9 @@ def __init__(
# if a file is specified, and it's line-delimited,
# the text must be processed line-by-line into a single bulk file
elif line_by_line:
assert os.path.isfile(file_path)
assert os.path.isfile(
file_path
), f"{file_path} is not present in the current directory."

text_delim = None
self.file_path = file_path
@@ -122,7 +124,9 @@
# if a file is specified, and it's not line-delimited,
# the texts must be parsed as a single bulk file.
else:
assert os.path.isfile(file_path)
assert os.path.isfile(
file_path
), f"{file_path} is not present in the current directory."
if file_path.endswith(".csv"):
logger.warning(
"You are tokenizing a CSV file, but you did not "
@@ -256,7 +260,12 @@ def encode_tokens_from_file(
else:
num_texts = get_lines_in_file(file_path, newline)

pbar = tqdm(total=num_texts, smoothing=0, leave=True, dynamic_ncols=True,)
pbar = tqdm(
total=num_texts,
smoothing=0,
leave=True,
dynamic_ncols=True,
)
tokens = np.full((num_texts, 1), -1, dtype=a_dtype)
num_batches = 0

@@ -291,7 +300,7 @@ def encode_tokens_from_file(
batch,
add_special_tokens=False,
return_token_type_ids=False,
return_attention_masks=False,
return_attention_mask=False,
)["input_ids"]

for i, encoded_text in enumerate(encoded_texts):
@@ -300,7 +309,11 @@
tokens = np.concatenate(
(
tokens,
np.full((num_texts, cols_to_add), -1, dtype=a_dtype,),
np.full(
(num_texts, cols_to_add),
-1,
dtype=a_dtype,
),
),
axis=1,
)
@@ -335,7 +348,12 @@ def encode_tokens_from_list(
a_dtype = get_dtype(tokenizer.vocab_size)
logger.info(f"Encoding {num_texts:,} texts.")

pbar = tqdm(total=num_texts, smoothing=0, leave=True, dynamic_ncols=True,)
pbar = tqdm(
total=num_texts,
smoothing=0,
leave=True,
dynamic_ncols=True,
)
tokens = np.full((len(texts), 1), -1, dtype=a_dtype)

for i_start in range(num_texts // batch_size + 1):
@@ -350,14 +368,21 @@
batch,
add_special_tokens=False,
return_token_type_ids=False,
return_attention_masks=False,
return_attention_mask=False,
)["input_ids"]

for i, encoded_text in enumerate(encoded_texts):
if len(encoded_text) > tokens.shape[1]:
cols_to_add = len(encoded_text) - tokens.shape[1]
tokens = np.concatenate(
(tokens, np.full((num_texts, cols_to_add), -1, dtype=a_dtype,),),
(
tokens,
np.full(
(num_texts, cols_to_add),
-1,
dtype=a_dtype,
),
),
axis=1,
)
tokens[(i_start * batch_size) + i, : len(encoded_text)] = encoded_text
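
For context on the return_attention_masks -> return_attention_mask rename in the hunks above, here is a minimal sketch of the equivalent tokenizer call against transformers 4; the GPT-2 tokenizer and sample batch are illustrative assumptions, not code from this commit.

from transformers import AutoTokenizer

# Any GPT-2-style tokenizer works for this illustration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

batch = ["first example text", "second example text"]

# transformers 4 expects the singular keyword return_attention_mask;
# the plural spelling used with older versions is no longer recognized.
encoded_texts = tokenizer(
    batch,
    add_special_tokens=False,
    return_token_type_ids=False,
    return_attention_mask=False,
)["input_ids"]

print(encoded_texts)  # one list of token ids per input text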
