Merge pull request #277 from cg123/dataset-name
Allow non-default dataset configurations
winglian authored Jul 16, 2023
2 parents 168a7a0 + 3cdd8e4 commit 334af62
Showing 2 changed files with 20 additions and 14 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -262,6 +262,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
- path: vicgalle/alpaca-gpt4
type: alpaca # format from earlier
# huggingface repo with specific configuration/subset
datasets:
- path: EleutherAI/pile
name: enron_emails
type: completion # format from earlier
# local
datasets:
- path: json
@@ -344,6 +350,7 @@ datasets:
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
data_files: # path to source data files
shards: # number of shards to split data into
name: # name of dataset configuration to load
# axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
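
For reference, the new `name` key maps directly onto the `name` argument of Hugging Face's `datasets.load_dataset`, which selects one configuration (subset) of a multi-configuration dataset. A minimal sketch of the equivalent direct call, reusing the `EleutherAI/pile` example from the diff above:

```python
from datasets import load_dataset

# Select only the "enron_emails" configuration of the multi-config
# EleutherAI/pile dataset; streaming avoids downloading the full corpus.
ds = load_dataset("EleutherAI/pile", name="enron_emails", streaming=True)
```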
27 changes: 13 additions & 14 deletions src/axolotl/utils/data.py
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
try:
load_dataset(
d.path,
name=d.name,
streaming=True,
use_auth_token=use_auth_token,
)
@@ -107,13 +108,15 @@ def load_tokenized_prepared_datasets(
if local_path.is_dir():
ds = load_dataset(
d.path,
name=d.name,
data_files=d.data_files,
streaming=False,
split=None,
)
elif local_path.is_file():
ds = load_dataset(
"json",
name=d.name,
data_files=d.path,
streaming=False,
split=None,
@@ -123,26 +126,22 @@
"unhandled dataset load: local path exists, but is neither a directory or a file"
)
elif ds_from_hub:
if d.data_files:
ds = load_dataset(
d.path,
streaming=False,
data_files=d.data_files,
use_auth_token=use_auth_token,
)
else:
ds = load_dataset(
d.path,
streaming=False,
use_auth_token=use_auth_token,
)
ds = load_dataset(
d.path,
name=d.name,
streaming=False,
data_files=d.data_files,
use_auth_token=use_auth_token,
)
else:
fp = hf_hub_download(
repo_id=d.path,
repo_type="dataset",
filename=d.data_files,
)
ds = load_dataset("json", data_files=fp, streaming=False, split=None)
ds = load_dataset(
"json", name=d.name, data_files=fp, streaming=False, split=None
)
if not ds:
raise ValueError("unhandled dataset load")
# support for using a subset of the data
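
The consolidation above works because `load_dataset` treats `name=None` and `data_files=None` as "use the defaults", so passing both unconditionally is equivalent to the removed `if d.data_files:` branch. A hedged sketch of that behavior, using an illustrative hub repo taken from the README diff:

```python
from datasets import load_dataset

# data_files=None loads the whole repo and name=None picks the default
# configuration, so one call now covers both former branches.
ds = load_dataset(
    "vicgalle/alpaca-gpt4",  # illustrative repo from the README example
    name=None,
    data_files=None,
    streaming=False,
)
```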
