h2oai · arnocandel · May 9, 2023 · May 8, 2023 · May 8, 2023 · May 8, 2023
diff --git a/FINETUNE.md b/FINETUNE.md
@@ -60,8 +60,8 @@ Below are some of our scripts to help with assembling and cleaning instruct-type
 For a higher quality dataset, run the following commands:
 ```bash
 pytest -s create_data.py::test_download_useful_data_as_parquet  # downloads ~ 4.2GB of open-source permissive data
-pytest -s create_data.py::test_assemble_and_detox               # ~ 3 minutes, 3.9M clean conversations
-pytest -s create_data.py::test_chop_by_lengths                  # ~ 2 minutes, 2.9M clean and long enough conversations
+pytest -s create_data.py::test_assemble_and_detox               # ~ 3 minutes, 4.1M clean conversations
+pytest -s create_data.py::test_chop_by_lengths                  # ~ 2 minutes, 2.8M clean and long enough conversations
 pytest -s create_data.py::test_grade                            # ~ 3 hours, keeps only high quality data
 pytest -s create_data.py::test_finalize_to_json
 ```

diff --git a/create_data.py b/create_data.py
@@ -113,12 +113,15 @@ def get_sentences(blob, length):
     my_sentences = []
     my_string = ""
     for sentence in sentences:
-        if len(my_string) < length:
-            my_string += " " + sentence
+        if len(my_string) + len(sentence) <= length:
+            if my_string:
+                my_string += " " + sentence
+            else:
+                my_string = sentence
         else:
             my_sentences.append(my_string)
             my_string = ""
-    return my_sentences
+    return my_sentences or [my_string]
 
 
 def test_scrape_dai_docs_all_pandoc():
@@ -255,15 +258,15 @@ def test_config_to_json():
                 [
                     {
                         'prompt_type': 'plain',
-                        'instruction': f"<human>: What does {k} do? <bot>: {k.replace('_', ' ')} config.toml:  {comment or title}".replace("\n", ""),
+                        'instruction': f"<human>: What does {k} do?\n<bot>: {k.replace('_', ' ')} config.toml:  {comment or title}\n<human>:".replace("\n", ""),
                     },
                     {
                         'prompt_type': 'plain',
-                        'instruction': f"<human>: Explain {k}. <bot>: {k.replace('_', ' ')} config.toml:  {comment or title}".replace("\n", ""),
+                        'instruction': f"<human>: Explain {k}.\n<bot>: {k.replace('_', ' ')} config.toml:  {comment or title}\n<human>:".replace("\n", ""),
                     },
                     {
                         'prompt_type': 'plain',
-                        'instruction': f"<human>: How can I do this: {title}. <bot>: Set the {k.replace('_', ' ')} config.toml".replace("\n", ""),
+                        'instruction': f"<human>: How can I do this: {title}.\n<bot>: Set the {k.replace('_', ' ')} config.toml\n<human>:".replace("\n", ""),
                     } if title and comment else None,
                     {
                         'prompt_type': 'human_bot',
@@ -902,7 +905,8 @@ def test_assemble_and_detox():
         # chop up into human/bot interactions of no more than 10kB per row
         text_list = df[['text']].values.ravel().tolist()
         new_text = []
-        max_len = 10000   # approx 2k tokens
+        max_len = 2048  # uber cutoff
+        MAX_LEN = 2048//2 - 30  # max len per question/answer
         for text in tqdm(text_list):
             human_starts = [m.start() for m in re.finditer('<human>: ', text)]
             if len(human_starts) == 1:
@@ -911,11 +915,13 @@ def test_assemble_and_detox():
             for i in range(len(human_starts) - 1):
                 interaction = text[human_starts[i]: human_starts[i+1]][:max_len]
                 blurb += interaction
-                if len(blurb) >= max_len:
-                    new_text.append(blurb[:2*max_len])
+                if len(blurb) >= MAX_LEN:
+                    blurb = get_sentences(blurb, length=MAX_LEN)[0]
+                    new_text.append(blurb + "\n<human>:")
                     blurb = ''
             if blurb:
-                new_text.append(blurb[:2*max_len])
+                blurb = get_sentences(blurb, length=MAX_LEN)[0]
+                new_text.append(blurb + "\n<human>:")
 
         if len(new_text) > len(text_list):
             print("Added %d new rows (before: %d)" % (len(new_text) - df.shape[0], df.shape[0]))
@@ -932,10 +938,10 @@ def test_assemble_and_detox():
         print("Dropped %d rows out of %d due to alt-profanity-check" % (before_rows - after_rows, before_rows))
         df_list.append(df)
         print("Done processing %s -> %s rows" % (data, df.shape[0]), flush=True)
-        print("So far have %d rows" % sum([len(x) for x  in df_list]))
+        print("So far have %d rows" % sum([len(x) for x in df_list]))
     df_final = pd.concat(df_list)
     df_final = df_final.sample(frac=1, random_state=1234).reset_index(drop=True)
-    df_final.to_parquet('h2oGPT.cleaned.human_bot.parquet', index=False)
+    df_final.to_parquet('h2oGPT.cleaned.human_bot.shorter.parquet', index=False)
 
 
 def test_basic_cleaning():
@@ -1070,7 +1076,7 @@ def get_answer(x):
     )
     start = 0
     batch_size = 64 * 16
-    micro_batch = orig_micro_batch = 4
+    micro_batch = orig_micro_batch = 16
     end = 0
     import socket
     checkpoint = "grades.%s.pkl" % socket.gethostname()
@@ -1112,7 +1118,7 @@ def get_answer(x):
 
 
 def test_chop_by_lengths():
-    file = "h2oGPT.cleaned.human_bot.parquet"
+    file = "h2oGPT.cleaned.human_bot.shorter.parquet"
     df = pd.read_parquet(file).reset_index(drop=True)
     df = count_human_bot_lengths(df)
     df['rand'] = np.random.rand(df.shape[0])
@@ -1132,7 +1138,7 @@ def test_chop_by_lengths():
     after_rows = df.shape[0]
     print("Chopped off %d out of %d rows due to length" % (before_rows - after_rows, before_rows))
     print(df.describe())
-    df.to_parquet('h2oGPT.cleaned.chopped.human_bot.parquet', index=False)
+    df.to_parquet('h2oGPT.cleaned.chopped.human_bot.shorter.parquet', index=False)
 
 
 def count_human_bot_lengths(df, human=None, bot=None):
@@ -1188,8 +1194,8 @@ def count_human_bot_lengths(df, human=None, bot=None):
 def test_grade():
     df = None
 
-    file = "h2oGPT.cleaned.chopped.human_bot.parquet"
-    output_file = "h2oGPT.cleaned.graded1.human_bot.parquet"
+    file = "h2oGPT.cleaned.chopped.human_bot.shorter.parquet"
+    output_file = "h2oGPT.cleaned.graded1.human_bot.shorter.parquet"
     if not os.path.exists(output_file):
         if df is None:
             df = pd.read_parquet(file).reset_index(drop=True)
@@ -1203,7 +1209,7 @@ def test_grade():
         df.to_parquet(output_file, index=False)
 
     file = output_file
-    output_file = "h2oGPT.cleaned.graded2.human_bot.parquet"
+    output_file = "h2oGPT.cleaned.graded2.human_bot.shorter.parquet"
     if not os.path.exists(output_file):
         # slower than alt-profanity, do last, but do before deberta grading, since that's slower
         if df is None:
@@ -1218,12 +1224,12 @@ def test_grade():
         df.to_parquet(output_file, index=False)
 
     file = output_file
-    output_file = 'h2oGPT.cleaned.graded3.human_bot.parquet'
+    output_file = 'h2oGPT.cleaned.graded3.human_bot.shorter.parquet'
     if not os.path.exists(output_file):
         if df is None:
             df = pd.read_parquet(file).reset_index(drop=True)
         df = add_deberta_grade(df)
-        min_grade = 0.2
+        min_grade = 0.3
         max_grade = np.inf
         before_rows = df.shape[0]
         df = df[df['grade_deberta'] >= min_grade]
@@ -1235,7 +1241,7 @@ def test_grade():
         df.to_parquet(output_file, index=False)
 
     file = output_file
-    output_file = 'h2oGPT.cleaned.graded.human_bot.parquet'
+    output_file = 'h2oGPT.cleaned.graded.human_bot.shorter.parquet'
     if df is None:
         df = pd.read_parquet(file).reset_index(drop=True)
     df.to_parquet(output_file, index=False)
@@ -1246,14 +1252,15 @@ def test_grade():
     [
         [False, False, False],
         [True, True, False],
+        [True, False, False],
         [True, False, True],
     ]
 )
 def test_add_open_assistant(fixup_personality, only_personality, deberta_grading, save_json=True):
     """
     Flatten tree structure into one row per path from root to leaf
     Also turn into human_bot prompting format:
-        <human>: question <bot>: answer <human>: question2 <bot>: answer2 Etc.
+        <human>: question\n<bot>: answer <human>: question2\n<bot>: answer2 Etc.
     Also saves a .json locally as side-effect
     returns list of dicts, containing intput, prompt_type and source
     """
@@ -1357,9 +1364,11 @@ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading
                         conv2['message_id'] = None
         conversations = [c for c in conversations if c['message_id']]
         if only_personality:
-            all_rows.extend([dict(input=c['text'], prompt_type='plain', source=data_file) for c in conversations if 'h2oGPT' in c['text']])
+            all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if 'h2oGPT' in c['text']])
         else:
-            all_rows.extend([dict(input=c['text'], prompt_type='plain', source=data_file) for c in conversations if "What is H2O.ai" not in c['text']])
+            all_rows.extend([dict(input=c['text'] + "\n<human>:", prompt_type='plain', source=data_file) for c in conversations if "What is H2O.ai" not in c['text']])
+    unhelpful = get_unhelpful_list()
+    all_rows = [x for x in all_rows if not any(u in x['input'] for u in unhelpful)]
     personality = create_personality_data()
     all_rows.extend(personality * 10)
     np.random.seed(123)
@@ -1370,9 +1379,9 @@ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading
         df = df.rename(columns={'input': 'text'})
         df = add_deberta_grade(df)
         df = df.rename(columns={'text': 'input'})
-        drop = False
+        drop = True
         if drop:
-            min_grade = 0.2
+            min_grade = 0.3
             max_grade = np.inf
             before_rows = df.shape[0]
             df = df[df['grade_deberta'] >= min_grade]
@@ -1396,24 +1405,22 @@ def test_add_open_assistant(fixup_personality, only_personality, deberta_grading
                     ("_h2ogpt" if fixup_personality else "") + \
                     ("_only" if only_personality else "") + \
                     ("_graded" if deberta_grading else "")
+        for i in range(len(all_rows)):
+            all_rows[i]['id'] = i
         with open(data_file.lower().replace("/", "_") + ".json", "w") as f:
             f.write(json.dumps(all_rows, indent=2))
     return all_rows
 
 
 def test_finalize_to_json():
-    df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.parquet')
+    df = pd.read_parquet('h2oGPT.cleaned.graded.human_bot.shorter.parquet')
     df = df.rename(columns={'text': 'input'})
 
     print("Number of high-quality human_bot interactions: %s" % df.shape[0], flush=True)
 
     print("Adding open assistant data")
-    open_assistant = test_add_open_assistant(
-        fixup_personality=True,  # False was original version, but it's better to personalize, so now using True
-        only_personality=False,
-        save_json=True,
-        deberta_grading=False,
-    )
+    with open("openassistant_oasst1_h2ogpt_graded.json") as f:
+        open_assistant = json.loads(f.read())
     df = pd.concat([df, pd.DataFrame(open_assistant)], axis=0)
 
     def final_clean(df):
@@ -1441,7 +1448,11 @@ def final_clean(df):
         )
     np.random.seed(1234)
     np.random.shuffle(row_list)
-    with open('h2ogpt-oig-oasst1-instruct-cleaned-v2.json', "w") as f:
+    unhelpful = get_unhelpful_list()
+    row_list = [x for x in row_list if not any(u in x['input'] for u in unhelpful)]
+    for i in range(len(row_list)):
+        row_list[i]['id'] = i
+    with open('h2ogpt-oig-oasst1-instruct-cleaned-v3.json', "w") as f:
         f.write(json.dumps(row_list, indent=2))
 
 
@@ -1466,28 +1477,28 @@ def create_personality_data():
     ]
     help = [
         "",
-        "How can I help you?",
-        "How may I assist you?",
-        "Nice to meet you.",
+        " How can I help you?",
+        " How may I assist you?",
+        " Nice to meet you.",
     ]
     import itertools
     rows = []
     for pair in itertools.product(questions, answers, help):
         rows.append(
-            dict(input=f"<human>: {pair[0]} <bot>: {pair[1]} {pair[2]}", prompt_type='plain', source="H2O.ai")
+            dict(input=f"<human>: {pair[0]}\n<bot>: {pair[1]}{pair[2]}\n<human>:", prompt_type='plain', source="H2O.ai")
         )
     for row in [
-        "<human>: What is H2O.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
-        "<human>: What is h2o.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
-        "<human>: What is H2O? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
-        "<human>: Who is h2o.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
-        "<human>: who is h2o.ai? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
-        "<human>: who is h2o? <bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.",
-        "<human>: What is H2O.ai? <bot>: H2O.ai is the visionary leader in democratizing AI.",
-        "<human>: Who is H2O.ai? <bot>: H2O.ai is the visionary leader in democratizing AI.",
-        "<human>: Who is H2O? <bot>: H2O.ai is the visionary leader in democratizing AI.",
-        "<human>: Who is h2o? <bot>: H2O.ai is the visionary leader in democratizing AI.",
-        "<human>: who is h2o? <bot>: H2O.ai is the visionary leader in democratizing AI.",
+        "<human>: What is H2O.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: What is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: What is H2O?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: Who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: who is h2o.ai?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: who is h2o?\n<bot>: H2O.ai is a technology company that aims to democratize AI and make it accessible to a broader audience by simplifying the process of creating and deploying machine learning models.\n<human>:",
+        "<human>: What is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: Who is H2O.ai?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: Who is H2O?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: Who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
+        "<human>: who is h2o?\n<bot>: H2O.ai is the visionary leader in democratizing AI.\n<human>:",
     ]:
         rows.append(dict(input=row, prompt_type='plain', source='H2O.ai'))
     print(len(rows))
@@ -1497,7 +1508,7 @@ def create_personality_data():
 
 
 def test_check_stats_data():
-    filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
+    filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v3.json'
     df = pd.read_json(filename)
 
     # get word stats
@@ -1515,16 +1526,16 @@ def test_check_stats_data():
     from finetune import get_loaders, get_tokenizer, generate_and_tokenize_prompt
     from functools import partial
 
-    llama_type = True
-    tokenizer_base_model = base_model = 'decapoda-research/llama-7b-hf'
+    llama_type = False
+    tokenizer_base_model = base_model = 'h2oai/h2ogpt-oasst1-512-20b'
     model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
     local_files_only = False
     resume_download = True
     use_auth_token = False
     tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
     prompt_type = 'plain'  # trained with data already in human bot form
     train_on_inputs = True
-    add_eos_token = True
+    add_eos_token = False
     cutoff_len = 512  # can choose 2048
     generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
                                                train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
@@ -1549,11 +1560,7 @@ def test_check_stats_data():
     plt.close()
 
 
-def test_check_unhelpful():
-    # file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_graded.json'
-    file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json'
-    # file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
-
+def get_unhelpful_list():
     # base versions
     unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
                  "I'm sorry, but I don't understand your question. Could you please rephrase it?",
@@ -1651,6 +1658,15 @@ def test_check_unhelpful():
                   "etc. etc.",
                   "etc etc",
                   ]
+    return unhelpful
+
+
+def test_check_unhelpful():
+    # file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_graded.json'
+    file = '/home/jon/Downloads/openassistant_oasst1_h2ogpt_grades.json'
+    # file = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
+
+    unhelpful = get_unhelpful_list()
     #data = json.load(open(file, 'rt'))
     df = pd.read_json(file)
 
@@ -1721,3 +1737,24 @@ def get_bleu(actual, expected_list):
 
     # assert len(bads) == 0, bads
     assert len(bads_bots) == 0, bads_bots
+
+
+def test_fortune2000_personalized():
+    """Chunk wikitext/*.txt into 'plain' rows, add personality rows, write JSON with unique ids."""
+    import glob
+    if not os.path.isdir("wikitext"):
+        raise RuntimeError("download https://github.com/h2oai/h2ogpt/files/11423008/wikitext.zip and unzip")
+    max_chars = 512 * 4  # length budget handed to get_sentences for each chunk
+    row_list = []
+    for file in glob.glob("wikitext/*.txt"):
+        with open(file, "r") as f:
+            blob = f.read()
+        row_list.extend({'input': s, 'prompt_type': 'plain', 'source': os.path.basename(file)}
+                        for s in get_sentences(blob, max_chars))
+    row_list.extend(create_personality_data() * 10)  # personality rows are repeated 10x
+    np.random.seed(123)  # fixed seed keeps the shuffle reproducible
+    np.random.shuffle(row_list)
+    # personality * 10 repeats the same dict objects; copy each row so every id stays unique
+    row_list = [dict(row, id=i) for i, row in enumerate(row_list)]
+    with open("fortune2000_personalized.json", "w") as ff:
+        ff.write(json.dumps(row_list, indent=2))
diff --git a/finetune.py b/finetune.py
@@ -590,8 +590,8 @@ def compute_metrics(eval_preds):
         tokenizer=tokenizer,
         train_dataset=train_data,
         eval_dataset=valid_data,
-        # NOTE: CausalLM is not supporting Seq2SeqTrainingArguments arguments, but not incompatible
-        args=transformers.Seq2SeqTrainingArguments(
+        # FIXME: might need Seq2SeqTrainingArguments for some models
+        args=transformers.TrainingArguments(
             per_device_train_batch_size=micro_batch_size,
             per_device_eval_batch_size=1,
             eval_accumulation_steps=10,