Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve data prep #122

Merged
merged 20 commits into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions FINETUNE.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ Below are some of our scripts to help with assembling and cleaning instruct-type
For a higher quality dataset, run the following commands:
```bash
pytest -s create_data.py::test_download_useful_data_as_parquet # downloads ~ 4.2GB of open-source permissive data
pytest -s create_data.py::test_assemble_and_detox # ~ 3 minutes, 3.9M clean conversations
pytest -s create_data.py::test_chop_by_lengths # ~ 2 minutes, 2.9M clean and long enough conversations
pytest -s create_data.py::test_assemble_and_detox # ~ 3 minutes, 4.1M clean conversations
pytest -s create_data.py::test_chop_by_lengths # ~ 2 minutes, 2.8M clean and long enough conversations
pytest -s create_data.py::test_grade # ~ 3 hours, keeps only high quality data
pytest -s create_data.py::test_finalize_to_json
```
Expand Down
153 changes: 95 additions & 58 deletions create_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,15 @@ def get_sentences(blob, length):
my_sentences = []
my_string = ""
for sentence in sentences:
if len(my_string) < length:
my_string += " " + sentence
if len(my_string) + len(sentence) <= length:
if my_string:
my_string += " " + sentence
else:
my_string = sentence
else:
my_sentences.append(my_string)
my_string = ""
return my_sentences
return my_sentences or [my_string]


def test_scrape_dai_docs_all_pandoc():
Expand Down Expand Up @@ -255,15 +258,15 @@ def test_config_to_json():
[
{
'prompt_type': 'plain',
'instruction': f"<human>: What does {k} do? <bot>: {k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""),
'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace("\n", ""),
},
{
'prompt_type': 'plain',
'instruction': f"<human>: Explain {k}. <bot>: {k.replace('_', ' ')} config.toml: {comment or title}".replace("\n", ""),
'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml: {comment or title}\n<human>:".replace("\n", ""),
},
{
'prompt_type': 'plain',
'instruction': f"<human>: How can I do this: {title}. <bot>: Set the {k.replace('_', ' ')} config.toml".replace("\n", ""),
'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace("\n", ""),
} if title and comment else None,
{
'prompt_type': 'human_bot',
Expand Down Expand Up @@ -902,7 +905,8 @@ def test_assemble_and_detox():
# chop up into human/bot interactions of no more than 10kB per row
text_list = df[['text']].values.ravel().tolist()
new_text = []
max_len = 10000 # approx 2k tokens
max_len = 2048 # uber cutoff
MAX_LEN = 2048//2 - 30 # max len per question/answer
for text in tqdm(text_list):
human_starts = [m.start() for m in re.finditer('<human>: ', text)]
if len(human_starts) == 1:
Expand All @@ -911,11 +915,13 @@ def test_assemble_and_detox():
for i in range(len(human_starts) - 1):
interaction = text[human_starts[i]: human_starts[i+1]][:max_len]
blurb += interaction
if len(blurb) >= max_len:
new_text.append(blurb[:2*max_len])
if len(blurb) >= MAX_LEN:
blurb = get_sentences(blurb, length=MAX_LEN)[0]
new_text.append(blurb + "\n<human>:")
blurb = ''
if blurb:
new_text.append(blurb[:2*max_len])
blurb = get_sentences(blurb, length=MAX_LEN)[0]
new_text.append(blurb + "\n<human>:")

if len(new_text) > len(text_list):
print("Added %d new rows (before: %d)" % (len(new_text) - df.shape[0], df.shape[0]))
Expand All @@ -932,10 +938,10 @@ def test_assemble_and_detox():
print("Dropped %d rows out of %d due to alt-profanity-check" % (before_rows - after_rows, before_rows))
df_list.append(df)
print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)
print("So far have %d rows" % sum([len(x) for x in df_list]))
print("So far have %d rows" % sum([len(x) for x in df_list]))
df_final = pd.concat(df_list)
df_final = df_final.sample(frac=1, random_state=1234).reset_index(drop=True)
df_final.to_parquet('h2oGPT.cleaned.human_bot.parquet', index=False)
df_final.to_parquet('h2oGPT.cleaned.human_bot.shorter.parquet', index=False)


def test_basic_cleaning():
Expand Down Expand Up @@ -1070,7 +1076,7 @@ def get_answer(x):
)
start = 0
batch_size = 64 * 16
micro_batch = orig_micro_batch = 4
micro_batch = orig_micro_batch = 16
end = 0
import socket
checkpoint = "grades.%s.pkl" % socket.gethostname()
Expand Down Expand Up @@ -1112,7 +1118,7 @@ def get_answer(x):


def test_chop_by_lengths():
file = "h2oGPT.cleaned.human_bot.parquet"
file = "h2oGPT.cleaned.human_bot.shorter.parquet"
df = pd.read_parquet(file).reset_index(drop=True)
df = count_human_bot_lengths(df)
df['rand'] = np.random.rand(df.shape[0])
Expand All @@ -1132,7 +1138,7 @@ def test_chop_by_lengths():
after_rows = df.shape[0]
print("Chopped off %d out of %d rows due to length" % (before_rows - after_rows, before_rows))
print(df.describe())
df.to_parquet('h2oGPT.cleaned.chopped.human_bot.parquet', index=False)
df.to_parquet('h2oGPT.cleaned.chopped.human_bot.shorter.parquet', index=False)


def count_human_bot_lengths(df, human=None, bot=None):
Expand Down Expand Up @@ -1188,8 +1194,8 @@ def count_human_bot_lengths(df, human=None, bot=None):
def test_grade():
df = None

file = "h2oGPT.cleaned.chopped.human_bot.parquet"
output_file = "h2oGPT.cleaned.graded1.human_bot.parquet"
file = "h2oGPT.cleaned.chopped.human_bot.shorter.parquet"
output_file = "h2oGPT.cleaned.graded1.human_bot.shorter.parquet"
if not os.path.exists(output_file):
if df is None:
df = pd.read_parquet(file).reset_index(drop=True)
Expand All @@ -1203,7 +1209,7 @@ def test_grade():
df.to_parquet(output_file, index=False)

file = output_file
output_file = "h2oGPT.cleaned.graded2.human_bot.parquet"
output_file = "h2oGPT.cleaned.graded2.human_bot.shorter.parquet"
if not os.path.exists(output_file):
# slower than alt-profanity, do last, but do before deberta grading, since that's slower
if df is None:
Expand All @@ -1218,12 +1224,12 @@ def test_grade():
df.to_parquet(output_file, index=False)

file = output_file
output_file = 'h2oGPT.cleaned.graded3.human_bot.parquet'
output_file = 'h2oGPT.cleaned.graded3.human_bot.shorter.parquet'
if not os.path.exists(output_file):
if df is None:
df = pd.read_parquet(file).reset_index(drop=True)
df = add_deberta_grade(df)
min_grade = 0.2
min_grade = 0.3
max_grade = np.inf
before_rows = df.shape[0]
df = df[df['grade_deberta'] >= min_grade]
Expand All @@ -1235,7 +1241,7 @@ def test_grade():
df.to_parquet(output_file, index=False)

file = output_file
output_file = 'h2oGPT.cleaned.graded.human_bot.parquet'
output_file = 'h2oGPT.cleaned.graded.human_bot.shorter.parquet'
if df is None:
df = pd.read_parquet(file).reset_index(drop=True)
df.to_parquet(output_file, index=False)
Expand All @@ -1246,14 +1252,15 @@ def test_grade():
[
[False, False, False],
[True, True, False],
[True, False, False],
[True, False, True],
]
)
def test_add_open_assistant(fixup_personality, only_personality, deberta_grading, save_json=True):
"""
Flatten tree structure into one row per path from root to leaf
Also turn into human_bot prompting format:
<human>: question <bot>: answer <human>: question2 <bot>: answer2 Etc.
<human>: question\n<bot>: answer <human>: question2\n<bot>: answer2 Etc.
Also saves a .json locally as side-effect
returns list of dicts, containing input, prompt_type and source
"""
Expand Down Expand Up @@ -1357,9 +1364,11 @@ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading
conv2['message_id'] = None
conversations = [c for c in conversations if c['message_id']]
if only_personality:
all_rows.extend([dict(input=c['text'], prompt_type='plain', source=data_file) for c in conversations if 'h2oGPT' in c['text']])
all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if 'h2oGPT' in c['text']])
else:
all_rows.extend([dict(input=c['text'], prompt_type='plain', source=data_file) for c in conversations if "What is H2O.ai" not in c['text']])
all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if "What is H2O.ai" not in c['text']])
unhelpful = get_unhelpful_list()
all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)]
personality = create_personality_data()
all_rows.extend(personality * 10)
np.random.seed(123)
Expand All @@ -1370,9 +1379,9 @@ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading
df = df.rename(columns={'input': 'text'})
df = add_deberta_grade(df)
df = df.rename(columns={'text': 'input'})
drop = False
drop = True
if drop:
min_grade = 0.2
min_grade = 0.3
max_grade = np.inf
before_rows = df.shape[0]
df = df[df['grade_deberta'] >= min_grade]
Expand All @@ -1396,24 +1405,22 @@ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading
("_h2ogpt" if fixup_personality else "") + \
("_only" if only_personality else "") + \
("_graded" if deberta_grading else "")
for i in range(len(all_rows)):
all_rows[i]['id'] = i
with open(data_file.lower().replace("/", "_") + ".json", "w") as f:
f.write(json.dumps(all_rows, indent=2))
return all_rows


def test_finalize_to_json():
df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.parquet')
df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.shorter.parquet')
df = df.rename(columns={'text': 'input'})

print("Number of high-quality human_bot interactions: %s" % df.shape[0], flush=True)

print("Adding open assistant data")
open_assistant = test_add_open_assistant(
fixup_personality=True, # False was original version, but it's better to personalize, so now using True
only_personality=False,
save_json=True,
deberta_grading=False,
)
with open("openassistant_oasst1_h2ogpt_graded.json") as f:
open_assistant = json.loads(f.read())
df = pd.concat([df, pd.DataFrame(open_assistant)], axis=0)

def final_clean(df):
Expand Down Expand Up @@ -1441,7 +1448,11 @@ def final_clean(df):
)
np.random.seed(1234)
np.random.shuffle(row_list)
with open('h2ogpt-oig-oasst1-instruct-cleaned-v2.json', "w") as f:
unhelpful = get_unhelpful_list()
row_list = [x for x in row_list if not any(u in x['input'] for u in unhelpful)]
for i in range(len(row_list)):
row_list[i]['id'] = i
with open('h2ogpt-oig-oasst1-instruct-cleaned-v3.json', "w") as f:
f.write(json.dumps(row_list, indent=2))


Expand All @@ -1466,28 +1477,28 @@ def create_personality_data():
]
help = [
"",
"How can I help you?",
"How may I assist you?",
"Nice to meet you.",
" How can I help you?",
" How may I assist you?",
" Nice to meet you.",
]
import itertools
rows = []
for pair in itertools.product(questions, answers, help):
rows.append(
dict(input=f"<human>: {pair[0]} <bot>: {pair[1]} {pair[2]}", prompt_type='plain', source="H2O.ai")
dict(input=f"<human>: {pair[0]}\n<bot>: {pair[1]}{pair[2]}\n<human>:", prompt_type='plain', source="H2O.ai")
)
for row in [
"<human>: What is H2O.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
"<human>: What is h2o.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
"<human>: What is H2O? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
"<human>: Who is h2o.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
"<human>: who is h2o.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
"<human>: who is h2o? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
"<human>: What is H2O.ai? <bot>: H2O.ai is the visionary leader in democratizing AI.",
"<human>: Who is H2O.ai? <bot>: H2O.ai is the visionary leader in democratizing AI.",
"<human>: Who is H2O? <bot>: H2O.ai is the visionary leader in democratizing AI.",
"<human>: Who is h2o? <bot>: H2O.ai is the visionary leader in democratizing AI.",
"<human>: who is h2o? <bot>: H2O.ai is the visionary leader in democratizing AI.",
"<human>: What is H2O.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
"<human>: What is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
"<human>: What is H2O?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
"<human>: Who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
"<human>: who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
"<human>: who is h2o?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
"<human>: What is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
"<human>: Who is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
"<human>: Who is H2O?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
"<human>: Who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
"<human>: who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
]:
rows.append(dict(input=row, prompt_type='plain', source='H2O.ai'))
print(len(rows))
Expand All @@ -1497,7 +1508,7 @@ def create_personality_data():


def test_check_stats_data():
filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v3.json'
df = pd.read_json(filename)

# get word stats
Expand All @@ -1515,16 +1526,16 @@ def test_check_stats_data():
from finetune import get_loaders, get_tokenizer, generate_and_tokenize_prompt
from functools import partial

llama_type = True
tokenizer_base_model = base_model = 'decapoda-research/llama-7b-hf'
llama_type = False
tokenizer_base_model = base_model = 'h2oai/h2ogpt-oasst1-512-20b'
model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
local_files_only = False
resume_download = True
use_auth_token = False
tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
prompt_type = 'plain' # trained with data already in human bot form
train_on_inputs = True
add_eos_token = True
add_eos_token = False
cutoff_len = 512 # can choose 2048
generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
Expand All @@ -1549,11 +1560,7 @@ def test_check_stats_data():
plt.close()


def test_check_unhelpful():
# file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_graded.json'
file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json'
# file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'

def get_unhelpful_list():
# base versions
unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
"I'm sorry, but I don't understand your question. Could you please rephrase it?",
Expand Down Expand Up @@ -1651,6 +1658,15 @@ def test_check_unhelpful():
"etc. etc.",
"etc etc",
]
return unhelpful


def test_check_unhelpful():
# file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_graded.json'
file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json'
# file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'

unhelpful = get_unhelpful_list()
#data = json.load(open(file, 'rt'))
df = pd.read_json(file)

Expand Down Expand Up @@ -1721,3 +1737,24 @@ def get_bleu(actual, expected_list):

# assert len(bads) == 0, bads
assert len(bads_bots) == 0, bads_bots


def test_fortune2000_personalized():
    """
    Build fortune2000_personalized.json from local wikitext files plus personality rows.

    Each wikitext/*.txt file is read and split via get_sentences(blob, 512 * 4);
    every returned chunk becomes a row with prompt_type 'plain' and the file's basename as source.
    Ten copies of the rows from create_personality_data() are appended, the rows are
    shuffled with a fixed numpy seed, numbered with a sequential 'id' and written as JSON.

    Raises RuntimeError if the "wikitext" directory does not exist.
    """
    import glob
    if not os.path.isdir("wikitext"):
        raise RuntimeError("download https://github.com/h2oai/h2ogpt/files/11423008/wikitext.zip and unzip")
    chunk_len = 512 * 4  # loop-invariant, so computed once
    row_list = []
    # glob order depends on the file system; sort so the seeded shuffle below is reproducible
    for path in sorted(glob.glob("wikitext/*.txt")):
        with open(path, "r", encoding="utf-8") as f:
            blob = f.read()
        source = os.path.basename(path)
        row_list.extend({'input': s, 'prompt_type': 'plain', 'source': source}
                        for s in get_sentences(blob, chunk_len))
    personality = create_personality_data()
    # Copy each personality row instead of using `personality * 10`: list repetition
    # repeats references to the same dicts, so the id assignment below would overwrite
    # the id on shared objects and all ten copies would end up with the same id.
    # NOTE(review): assumes create_personality_data() returns a list of dicts - confirm.
    row_list.extend(dict(row) for _ in range(10) for row in personality)
    np.random.seed(123)
    np.random.shuffle(row_list)
    for i, row in enumerate(row_list):
        row['id'] = i
    with open("fortune2000_personalized.json", "w") as ff:
        ff.write(json.dumps(row_list, indent=2))
4 changes: 2 additions & 2 deletions finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,8 +590,8 @@ def compute_metrics(eval_preds):
tokenizer=tokenizer,
train_dataset=train_data,
eval_dataset=valid_data,
# NOTE: CausalLM is not supporting Seq2SeqTrainingArguments arguments, but not incompatible
args=transformers.Seq2SeqTrainingArguments(
# FIXME: might need Seq2SeqTrainingArguments for some models
args=transformers.TrainingArguments(
per_device_train_batch_size=micro_batch_size,
per_device_eval_batch_size=1,
eval_accumulation_steps=10,
Expand Down