0.3.0 (#73)
* Update dependencies

* Fix deprecation warning

* Fix param name

* Update minimum versions

* Remove TorchScript refs

* version bump

* Bump PL version

* dev Dockerfile

* Update dependencies

* Transformer 4 fix

* Fix transformer 4 import for TF conversion

* Fix model training for lightning 1.0.0

* Add back GPU memory printing

* TPU fixes

* Assert descriptions

* Generation tweaks (remove pad message)

* Ignore .DS_Store

* Handle generation by prompt more canonically

* Set refresh default to 20 to avoid Colab warning

* Fix gen warning while training

* Fix model loading from config + generation

* FP16 warning (#70)

* Fix tokenizer for latest tokenizers

* Set default learning rate to 1e-3

* Set CPU config to match tokenizer default
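
A few of the bullets above change user-facing defaults, notably the 1e-3 learning rate and the more canonical prompt-based generation. The following is a minimal usage sketch of how those defaults surface in aitextgen 0.3.0; the input file name, step count, and prompt are illustrative assumptions, not part of this commit.

from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset

# Load the default 124M GPT-2 model.
ai = aitextgen()

# Tokenize a line-delimited text file; "input.txt" is a placeholder.
data = TokenDataset("input.txt", line_by_line=True)

# Train; learning_rate now defaults to 1e-3, written out here for clarity.
ai.train(data, learning_rate=1e-3, num_steps=500)

# Generate from a prompt.
ai.generate(prompt="The meaning of life is", max_length=64)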
minimaxir authored Dec 1, 2020
1 parent d889163 commit f7278bf
Showing 10 changed files with 156 additions and 116 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ test_notebooks/
/dist
*.egg-info
.vscode/settings.json
/site
/site
.DS_Store
14 changes: 14 additions & 0 deletions Dockerfile
@@ -0,0 +1,14 @@
FROM python:3.8.6-slim

RUN apt-get -y update && apt-get -y install gcc

WORKDIR /

COPY requirements.txt .

# install dependencies
RUN pip --no-cache-dir install -r requirements.txt
COPY * /

# Clean up APT when done.
RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
1 change: 0 additions & 1 deletion ROADMAP.md
@@ -7,7 +7,6 @@ A rough roadmap for implementing new features. **All is subject to change at a m
- Training using pytorch-lightning, with support for fp16 and Colab TPUs.
- Training a GPT-2 model from scratch w/ parametricized context window sizes and parameters
- PyTorch support for training/generating
- Export to static Torchscript trace.
- Generation from Transformer's native generate() function
- Actual documentation
- Examples
41 changes: 33 additions & 8 deletions aitextgen/TokenDataset.py
@@ -113,7 +113,9 @@ def __init__(
# if a file is specified, and it's line-delimited,
# the text must be processed line-by-line into a single bulk file
elif line_by_line:
assert os.path.isfile(file_path)
assert os.path.isfile(
file_path
), f"{file_path} is not present in the current directory."

text_delim = None
self.file_path = file_path
@@ -122,7 +124,9 @@
# if a file is specified, and it's not line-delimited,
# the texts must be parsed as a single bulk file.
else:
assert os.path.isfile(file_path)
assert os.path.isfile(
file_path
), f"{file_path} is not present in the current directory."
if file_path.endswith(".csv"):
logger.warning(
"You are tokenizing a CSV file, but you did not "
@@ -256,7 +260,12 @@ def encode_tokens_from_file(
else:
num_texts = get_lines_in_file(file_path, newline)

pbar = tqdm(total=num_texts, smoothing=0, leave=True, dynamic_ncols=True,)
pbar = tqdm(
total=num_texts,
smoothing=0,
leave=True,
dynamic_ncols=True,
)
tokens = np.full((num_texts, 1), -1, dtype=a_dtype)
num_batches = 0

@@ -291,7 +300,7 @@ def encode_tokens_from_file(
batch,
add_special_tokens=False,
return_token_type_ids=False,
return_attention_masks=False,
return_attention_mask=False,
)["input_ids"]

for i, encoded_text in enumerate(encoded_texts):
@@ -300,7 +309,11 @@
tokens = np.concatenate(
(
tokens,
np.full((num_texts, cols_to_add), -1, dtype=a_dtype,),
np.full(
(num_texts, cols_to_add),
-1,
dtype=a_dtype,
),
),
axis=1,
)
@@ -335,7 +348,12 @@ def encode_tokens_from_list(
a_dtype = get_dtype(tokenizer.vocab_size)
logger.info(f"Encoding {num_texts:,} texts.")

pbar = tqdm(total=num_texts, smoothing=0, leave=True, dynamic_ncols=True,)
pbar = tqdm(
total=num_texts,
smoothing=0,
leave=True,
dynamic_ncols=True,
)
tokens = np.full((len(texts), 1), -1, dtype=a_dtype)

for i_start in range(num_texts // batch_size + 1):
@@ -350,14 +368,21 @@
batch,
add_special_tokens=False,
return_token_type_ids=False,
return_attention_masks=False,
return_attention_mask=False,
)["input_ids"]

for i, encoded_text in enumerate(encoded_texts):
if len(encoded_text) > tokens.shape[1]:
cols_to_add = len(encoded_text) - tokens.shape[1]
tokens = np.concatenate(
(tokens, np.full((num_texts, cols_to_add), -1, dtype=a_dtype,),),
(
tokens,
np.full(
(num_texts, cols_to_add),
-1,
dtype=a_dtype,
),
),
axis=1,
)
tokens[(i_start * batch_size) + i, : len(encoded_text)] = encoded_text
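
For context on the return_attention_masks -> return_attention_mask rename in the hunks above, here is a minimal sketch of the equivalent tokenizer call against transformers 4; the GPT-2 tokenizer and sample batch are illustrative assumptions, not code from this commit.

from transformers import AutoTokenizer

# Any GPT-2-style tokenizer works for this illustration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

batch = ["first example text", "second example text"]

# transformers 4 expects the singular keyword return_attention_mask;
# the plural spelling used with older versions is no longer recognized.
encoded_texts = tokenizer(
    batch,
    add_special_tokens=False,
    return_token_type_ids=False,
    return_attention_mask=False,
)["input_ids"]

print(encoded_texts)  # one list of token ids per input text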
