Skip to content

Commit

Permalink
Merge pull request #27 from stanford-oval/auto-sync-2024-08-23-12-04-56
Browse files Browse the repository at this point in the history
Improve documentation
  • Loading branch information
s-jse authored Aug 23, 2024
2 parents 9220e7a + 4b563c6 commit b0442a0
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 87 deletions.
164 changes: 106 additions & 58 deletions README.md

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions benchmark/user_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,18 +113,18 @@ async def simulate_dialog(dialogue_inputs, args) -> list[DialogueTurn]:
return dialogue_state


def repeat_dialogue_inputs(dialogue_inputs, target_num_dialogs):
def repeat_dialogue_inputs(dialogue_inputs, target_num_dialogues):
"""
repeats dialogue_inputs if we don't have enough of them, truncates if there are too many
"""
if target_num_dialogs == -1:
target_num_dialogs = len(dialogue_inputs)
full_rounds = target_num_dialogs // len(dialogue_inputs)
if target_num_dialogues == -1:
target_num_dialogues = len(dialogue_inputs)
full_rounds = target_num_dialogues // len(dialogue_inputs)
dialogue_inputs = (
dialogue_inputs * full_rounds
+ dialogue_inputs[: target_num_dialogs % len(dialogue_inputs)]
+ dialogue_inputs[: target_num_dialogues % len(dialogue_inputs)]
)
assert len(dialogue_inputs) == target_num_dialogs
assert len(dialogue_inputs) == target_num_dialogues
return dialogue_inputs


Expand All @@ -138,7 +138,7 @@ def main(args):
if len(line) > 0:
dialogue_inputs.append(line)

dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogs)
dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogues)
topics = dialogue_inputs
elif args.mode == "passage":
with open(args.input_file) as input_file:
Expand All @@ -149,12 +149,12 @@ def main(args):
for title, passage in dialogue_inputs.items()
]

dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogs)
dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogues)
topics = [tp[0] for tp in dialogue_inputs]
elif args.mode == "multihop":
with open(args.input_file) as input_file:
dialogue_inputs = json.load(input_file)
dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogs)
dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogues)
topics = [m["title_1"] + " and " + m["title_2"] for m in dialogue_inputs]
else:
raise ValueError("Unknown mode: %s" % args.mode)
Expand Down Expand Up @@ -226,10 +226,10 @@ def main(args):
"--output_file", type=str, required=True, help="Where to write the outputs"
)
parser.add_argument(
"--num_dialogs",
"--num_dialogues",
type=int,
required=True,
help="The number of dialogs to generate. -1 means all topics.",
help="The number of dialogues to generate. -1 means all topics.",
)
parser.add_argument(
"--num_turns",
Expand Down
20 changes: 20 additions & 0 deletions retrieval/upload_folder_to_hf_hub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import argparse
from huggingface_hub import upload_folder

def main(repo_id, folder_path):
    """Upload a local folder to a dataset repository on the HuggingFace Hub.

    Args:
        repo_id: Identifier of the destination repository on the Hub.
        folder_path: Path of the local folder whose contents are uploaded.
    """
    # The repository type is fixed to "dataset". The two multi_commits flags
    # are forwarded unchanged; presumably they make the upload proceed in
    # several commits with progress output -- confirm against the installed
    # huggingface_hub version.
    upload_options = {
        "folder_path": folder_path,
        "repo_id": repo_id,
        "repo_type": "dataset",
        "multi_commits": True,
        "multi_commits_verbose": True,
    }
    upload_folder(**upload_options)

if __name__ == "__main__":
    # Command-line entry point: parse the two options and hand them to main().
    parser = argparse.ArgumentParser(description="Upload a folder to HuggingFace Hub")
    # Both options are mandatory. Without required=True a missing option would
    # be parsed as None and only fail later, inside the upload call, with a
    # much less helpful error than argparse's usage message.
    parser.add_argument(
        "--folder_path",
        type=str,
        required=True,
        help="The path to the folder to upload",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        required=True,
        help="The repository ID on HuggingFace Hub",
    )

    args = parser.parse_args()

    main(args.repo_id, args.folder_path)
9 changes: 0 additions & 9 deletions retrieval/upload_to_hf_hub.py

This file was deleted.

4 changes: 2 additions & 2 deletions tasks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
@task(pre=[load_api_keys])
def simulate_users(
c,
num_dialogs, # -1 to simulate all
num_dialogues, # -1 to simulate all available topics
num_turns: int,
simulation_mode: str, # passage
subset: str, # head, recent, tail
Expand Down Expand Up @@ -89,7 +89,7 @@ def simulate_users(

c.run(
f"python benchmark/user_simulator.py {pipeline_flags} "
f"--num_dialogs {num_dialogs} "
f"--num_dialogues {num_dialogues} "
f"--user_engine {user_simulator_engine} "
f"--user_temperature {user_temperature} "
f"--mode {simulation_mode} "
Expand Down
8 changes: 1 addition & 7 deletions wikipedia_preprocessing/upload_collections_to_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,6 @@
path_in_repo=f"{date}/{language}/{file}",
repo_id="stanford-oval/wikipedia",
repo_type="dataset",
run_as_future=True,
)

# Remove the extracted files
for date, language in [
(date, lang) for date in args.dates for lang in args.languages
]:
extracted_file = f"workdir/{language}/wikipedia_{date}/collection.jsonl"
# Remove the extracted file now
os.remove(extracted_file)

0 comments on commit b0442a0

Please sign in to comment.