From 4b563c628b6e1e43cfb642075efa28b49f460c86 Mon Sep 17 00:00:00 2001
From: s-jse
Date: Fri, 23 Aug 2024 19:04:57 +0000
Subject: [PATCH] Improve documentation

---
 README.md                                 | 164 +++++++++++-------
 benchmark/user_simulator.py               |  22 +--
 retrieval/upload_folder_to_hf_hub.py      |  20 +++
 retrieval/upload_to_hf_hub.py             |   9 -
 tasks/benchmark.py                        |   4 +-
 .../upload_collections_to_hf_hub.py       |   8 +-
 6 files changed, 140 insertions(+), 87 deletions(-)
 create mode 100644 retrieval/upload_folder_to_hf_hub.py
 delete mode 100644 retrieval/upload_to_hf_hub.py

diff --git a/README.md b/README.md
index 62ae972..6afb931 100644
--- a/README.md
+++ b/README.md
@@ -44,17 +44,20 @@ Check out our paper for more details:
 Sina J. Semnani, Violet Z. Yao*, Heidi C. Zhang*, and Monica S. Lam. 2023. [WikiChat: Stopping the Hallucination of Large Language Model Chatbots by Few-Shot Grounding on Wikipedia](https://arxiv.org/abs/2305.14292). In Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore. Association for Computational Linguistics.
 
 ## 🚨 **Announcements**
-- (August 22, 2024) WikiChat 2.0 is now available! Highlights:
-  - WikiChat now supports retrieval from structured data like tables, infoboxes and lists, in addition to text.
-  - WikiChat is now multilingual. By default, it retrieves information from 10 Wikipedias ( :us: English, 🇨🇳 Chinese, 🇪🇸 Spanish, 🇵🇹 Portuguese, 🇷🇺 Russian, 🇩🇪 German, 🇮🇷 Farsi, 🇯🇵 Japanese, 🇫🇷 French, 🇮🇹 Italian)
-  - Supports 100+ LLMs through a unified interface, thanks to [LiteLLM](https://github.com/BerriAI/litellm).
-  - Uses the state-of-the-art multilingual retrieval model [BGE-M3](https://huggingface.co/BAAI/bge-m3).
-  - Uses [Qdrant](https://github.com/qdrant/qdrant) for scalable vector search. We also provide a high-quality free (but rate-limited) search API for access to 10 Wikipedias, over 250M vector embeddings.
-  - Option for faster and cheaper pipeline by merging the "generate" and "extract claim" stages.
-  - Has the highest quality public Wikipedia preprocessing scripts (event better than what is used to pre-train LLMs, see below).
-  - Uses and is compatible with LangChain 🦜️🔗.
-  - Uses [RankGPT](https://github.com/sunnweiwei/RankGPT) for more relevant results.
-  - Lots more!
+- (August 22, 2024) WikiChat 2.0 is now available! Key updates include:
+  - **Multilingual Support**: By default, retrieves information from 10 different Wikipedias: 🇺🇸 English, 🇨🇳 Chinese, 🇪🇸 Spanish, 🇵🇹 Portuguese, 🇷🇺 Russian, 🇩🇪 German, 🇮🇷 Farsi, 🇯🇵 Japanese, 🇫🇷 French, and 🇮🇹 Italian.
+  - **Improved Information Retrieval**
+    - Now supports retrieval from structured data such as tables, infoboxes, and lists, in addition to text.
+    - Has the highest-quality public Wikipedia preprocessing scripts.
+    - Uses the state-of-the-art multilingual retrieval model [BGE-M3](https://huggingface.co/BAAI/bge-m3).
+    - Uses [Qdrant](https://github.com/qdrant/qdrant) for scalable vector search.
+    - Uses [RankGPT](https://github.com/sunnweiwei/RankGPT) to rerank search results.
+  - **Free Multilingual Wikipedia Search API**: We offer a high-quality, free (but rate-limited) search API for access to 10 Wikipedias, encompassing over 180M vector embeddings.
+  - **Expanded LLM Compatibility**: Supports 100+ LLMs through a unified interface, thanks to [LiteLLM](https://github.com/BerriAI/litellm).
+  - **Optimized Pipeline**: Option for a faster and more cost-effective pipeline by merging the "generate" and "extract claim" stages of WikiChat.
+  - **LangChain Compatibility**: Fully compatible with LangChain 🦜️🔗.
+  - **And Much More!**
 - (June 20, 2024) WikiChat won the 2024 Wikimedia Research Award!
 
 > The @Wikimedia Research Award of the Year 2024 goes to "WikiChat: Stopping the hallucination of large language model chatbots by few-shot grounding on Wikipedia" ⚡
 >
 > 📜 https://t.co/d2M8Qrarkw pic.twitter.com/P2Sh47vkyi
 >
 > — Wiki Workshop 2024 (@wikiworkshop) June 20, 2024
@@ -70,91 +73,121 @@ Sina J. Semnani, Violet Z. Yao*, Heidi C. Zhang*, and Monica S. Lam. 2023. [Wiki
 Installing WikiChat involves the following steps:
 
 1. Install dependencies
-2. Configure the LLM of your choice. WikiChat supports over 100 LLMs, including models from OpenAI, Azure, Anthropic, Mistral, HuggingFace, Together.ai, and Groq.
-3. Select an information retrieval source. This can be any HTTP endpoint that conforms to the interface defined in `retrieval/retriever_server.py`. We provide instructions and scripts for the following options:
+1. Configure the LLM of your choice. WikiChat supports over 100 LLMs, including models from OpenAI, Azure, Anthropic, Mistral, HuggingFace, Together.ai, and Groq.
+1. Select an information retrieval source. This can be any HTTP endpoint that conforms to the interface defined in `retrieval/retriever_server.py`. We provide instructions and scripts for the following options:
     1. Use our free, rate-limited API for Wikipedia in 10 languages.
     1. Download and host our provided Wikipedia index yourself.
     1. Create and run a new custom index from your own documents.
-4. Run WikiChat with your desired configuration.
-5. [Optional] Deploy WikiChat for multi-user access. We provide code to deploy a simple front-end and backend, as well as instructions to connect to an Azure Cosmos DB database for storing conversations.
+1. Run WikiChat with your desired configuration.
+1. [Optional] Deploy WikiChat for multi-user access. We provide code to deploy a simple front-end and backend, as well as instructions to connect to an Azure Cosmos DB database for storing conversations.
 
 ## System Requirements
 This project has been tested with Python 3.10 on Ubuntu 20.04 LTS (Focal Fossa), but it should be compatible with many other Linux distributions. If you plan to use this on Windows WSL or macOS, or with a different Python version, be prepared for potential troubleshooting during installation.
 
-Running WikiChat using LLM APIs and our Wikipedia search API does not have specific hardware requirements and can be performed on most systems. However, if you intend to host a search index locally, ensure you have sufficient disk space for the index. For large indices, retrieval latency is heavily dependant on disk speed, so we recommend using SSDs and preferably NVMe drives. For example, storage-optimized VMs like [`Standard_L8s_v3`](https://learn.microsoft.com/en-us/azure/virtual-machines/lsv3-series) on Azure are suitable for this.
+Hardware requirements vary based on your intended use:
 
-If you plan to use WikiChat with a local LLM, a GPU is necessary to host the model.
+1. **Basic Usage**: Running WikiChat with LLM APIs and our Wikipedia search API has minimal hardware requirements and should work on most systems.
+
+1. **Local Search Index**: If you intend to host a search index locally, ensure you have sufficient disk space for the index. For large indices, retrieval latency is heavily dependent on disk speed, so we recommend using SSDs, preferably NVMe drives. For example, storage-optimized VMs like [`Standard_L8s_v3`](https://learn.microsoft.com/en-us/azure/virtual-machines/lsv3-series) on Azure are suitable for this.
+
+1. **Local LLM**: If you plan to use WikiChat with a local LLM, a GPU is necessary to host the model.
+
+1. **Creating a New Retrieval Index**: If you want to index a collection, you need a GPU to embed documents into vectors. The default embedding model (`BAAI/BGE-M3`) requires at least 13GB of GPU memory to run.
 
 ## Install Dependencies
 
-Clone the repo:
+First, clone the repository:
 ```
 git clone https://github.com/stanford-oval/WikiChat.git
 cd WikiChat
 ```
 
+We recommend using the conda environment specified in `conda_env.yaml`. This environment includes [Python 3.10](https://docs.python.org/3.10/), [pip](https://pip.pypa.io/en/stable/), [gcc](https://gcc.gnu.org/onlinedocs/), [g++](https://gcc.gnu.org/onlinedocs/gcc-11.2.0/libstdc++/manual/), [make](https://www.gnu.org/software/make/manual/make.html), [Redis](https://redis.io/documentation), and all required Python packages.
-We recommend using the conda environment in `conda_env.yaml`. This will create a conda environment with [Python 3.10](https://docs.python.org/3.10/), and install [pip](https://pip.pypa.io/en/stable/), [gcc](https://gcc.gnu.org/onlinedocs/), [g++](https://gcc.gnu.org/onlinedocs/gcc-11.2.0/libstdc++/manual/), [make](https://www.gnu.org/software/make/manual/make.html), [Redis](https://redis.io/documentation), and all required Python packages.
-
-Make sure you have one of [Conda](https://docs.conda.io/en/latest/), [Anaconda](https://docs.anaconda.com/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) installed.
-
-Create and activate the conda environment by running:
+Ensure you have either [Conda](https://docs.conda.io/en/latest/), [Anaconda](https://docs.anaconda.com/), or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) installed. Then create and activate the conda environment:
 
 ```bash
 conda env create --file conda_env.yaml
 conda activate wikichat
-python -m spacy download en_core_web_sm # Spacy is only needed for certain configurations of WikiChat
+python -m spacy download en_core_web_sm # Spacy is only needed for certain WikiChat configurations
+```
+
+Keep this environment activated for all subsequent commands.
+
+Install Docker for your operating system by following the instructions at https://docs.docker.com/engine/install/. WikiChat uses Docker primarily for creating and serving vector databases for retrieval, specifically [🤗 Text Embedding Inference](https://github.com/huggingface/text-embeddings-inference) and [Qdrant](https://github.com/qdrant/qdrant). On recent Ubuntu versions, you can try running `inv install-docker`. For other operating systems, follow the instructions on the Docker website.
+
+WikiChat uses `invoke` (https://www.pyinvoke.org/) to add custom commands for various purposes. To see all available commands and their descriptions, run:
+```
+invoke --list
+```
+or the shorthand:
+```
+inv -l
 ```
-Make sure this environment is activated whenever you run any of the following commands.
 
-Install Docker for your operating system by following the instructions at https://docs.docker.com/engine/install/. This project uses Docker primarily for creating and serving vector databases for retrieval, specifically [🤗 Text Embedding Inference](https://github.com/huggingface/text-embeddings-inference) and [Qdrant](https://github.com/qdrant/qdrant). If you are using a recent Ubuntu, you can try running `inv install-docker`
+For more details about a specific command, use:
+```
+inv [command name] --help
+```
 
-WikiChat uses `invoke` (https://www.pyinvoke.org/) to add custom commands to WikiChat for various purposes.
-You can run `invoke --list`, or the shorthand `inv -l` to list all available commands and a short description of what they do. You can also run `inv [command name] --help` to see more details about each of the available commands.
 These commands are implemented in the `tasks/` folder.
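+
+If you later want to add your own commands, they follow the standard `invoke` task pattern used throughout `tasks/` (for example, `tasks/benchmark.py`). The sketch below is illustrative only; the task name and the shell command are hypothetical and not part of WikiChat:
+
+```python
+# A minimal invoke task, assuming it lives in a module under tasks/.
+# The task name and the echo command are placeholders for illustration.
+from invoke import task
+
+
+@task
+def say_hello(c, name="WikiChat"):
+    """Run with: inv say-hello --name <name>"""
+    c.run(f"echo Hello, {name}")
+```
+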
 ## Configure the LLM of Your Choice
 
-WikiChat is compatible with various LLMs including models from OpenAI, Azure, Anthropic, Mistral, Together.ai, and Groq.
+WikiChat is compatible with various LLMs, including models from OpenAI, Azure, Anthropic, Mistral, Together.ai, and Groq. You can also use WikiChat with many locally hosted models via HuggingFace.
 
-To configure the LLM you want to use, fill out the appropriate fields in `llm_config.yaml`.
-Then create a file named `API_KEYS` (which is included in `.gitignore`), and set the API key for the LLM endpoint you want to use.
-The name of the API key in this file should match the name you provide in `llm_config.yaml` under `api_key`.
-For example, if you are using OpenAI models via openai.com and Mistral endpoints in your code, your `API_KEYS` file might look like this:
+To configure your LLM:
+1. Fill out the appropriate fields in `llm_config.yaml`.
+2. Create a file named `API_KEYS` (which is included in `.gitignore`).
+3. In the `API_KEYS` file, set the API key for the LLM endpoint you want to use. The name of the API key should match the name you provided in `llm_config.yaml` under `api_key`.
+
+For example, if you're using OpenAI models via openai.com and Mistral endpoints, your `API_KEYS` file might look like this:
 
 ```bash
-# Fill in the following values with your own keys for the API you are using. Make sure there is not extra space after the key.
-# Changes to this file are ignored by git, so that you can safely store your keys here during development.
+# Fill in the following values with your API keys. Make sure there is no extra space after the key.
+# Changes to this file are ignored by git, so you can safely store your keys here during development.
 OPENAI_API_KEY=[Your OpenAI API key from https://platform.openai.com/api-keys]
 MISTRAL_API_KEY=[Your Mistral API key from https://console.mistral.ai/api-keys/]
 ```
 
-Note that locally hosted models do NOT need an api key, but you need to provide an OpenAI-compatible endpoint in `api_base`. The code has been tested with [🤗 Text Generation Inference](https://github.com/huggingface/text-generation-inference/) endpoints, but you can try other similar endpoints like [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), etc.
+Note that locally hosted models do NOT need an API key, but you need to provide an OpenAI-compatible endpoint in `api_base`. The code has been tested with [🤗 Text Generation Inference](https://github.com/huggingface/text-generation-inference/) endpoints, but you can try other similar endpoints like [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), etc.
 
-## Configure an Information Retrieval Source
+## Configure Information Retrieval
 
 ### Option 1 (default): Use our free rate-limited Wikipedia search API
 By default, WikiChat retrieves information from 10 Wikipedias via the endpoint at https://wikichat.genie.stanford.edu/search/. If you want to just try WikiChat, you do not need to modify anything.
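+
+For instance, a client can send an HTTP POST request to this endpoint. The snippet below is only a sketch: the request field names (`query`, `num_blocks`) are assumptions, so check https://wikichat.genie.stanford.edu/search/redoc for the authoritative request and response schema.
+
+```python
+# Illustrative sketch of querying the rate-limited Wikipedia search API.
+# The field names used here are assumptions; consult the /search/redoc page
+# for the actual request and response schema before relying on them.
+import requests
+
+response = requests.post(
+    "https://wikichat.genie.stanford.edu/search/",
+    json={"query": ["Who founded Wikipedia?"], "num_blocks": 3},
+    timeout=30,
+)
+response.raise_for_status()
+print(response.json())  # retrieved text/table/infobox blocks and their metadata
+```
+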
 ### Option 2: Download and host our Wikipedia index
 
-Run `inv download-wikipedia-index --workdir ./workdir` to download the index from [stanford-oval/wikipedia_10-languages_bge-m3_qdrant_index](🤗 Hub) and extract it.
+1. Download the [index](https://huggingface.co/datasets/stanford-oval/wikipedia_10-languages_bge-m3_qdrant_index) from 🤗 Hub and extract it:
+```bash
+inv download-wikipedia-index --workdir ./workdir
+```
 
 Note that this index contains ~180M vector embeddings and therefore requires at least 800 GB of empty disk space. It uses [Qdrant's binary quantization](https://qdrant.tech/articles/binary-quantization/) to reduce RAM requirements to 55 GB without sacrificing accuracy or latency.
 
-This command will start a FastAPI server similar to option 1 that responds to HTTP POST requests. Note that this server and even its embedding model runs on CPU, and does not require GPU. For better performance, on compatible systems you can add `--user-onnx` to use the ONNX version of the embedding model, for lower latency.
-`inv start-retriever --embedding-model BAAI/bge-m3 --use-onnx --retriever-port <port number>`
+2. Start a FastAPI server similar to option 1 that responds to HTTP POST requests:
+```bash
+inv start-retriever --embedding-model BAAI/bge-m3 --retriever-port <port number>
+```
+
+Note that this server and its embedding model run on CPU, and do not require a GPU. For better performance, on compatible systems you can add `--use-onnx` to use the ONNX version of the embedding model, which significantly lowers the embedding latency.
 
-### Option 3: Build your own index using your own Documents
+### Option 3: Build your own index
 
+#### To build a Wikipedia index
+
 The following command will download, preprocess, and index the latest HTML dump of the [Kurdish Wikipedia](ku.wikipedia.org), which we use in this example for its relatively small size.
 
-`inv index-wikipedia-dump --embedding-model BAAI/bge-m3 --workdir ./workdir --language ku`
+```bash
+inv index-wikipedia-dump --embedding-model BAAI/bge-m3 --workdir ./workdir --language ku
+```
+
+#### To index custom documents
 
-For indexing a different set of documents, you need to preprocess your data into a [JSON Lines](https://jsonlines.org/) file (with .jsonl or .jsonl.gz file extension) where each line has the following fields:
+1. Preprocess your data into a [JSON Lines](https://jsonlines.org/) file (with .jsonl or .jsonl.gz file extension) where each line has the following fields (see the sketch after these steps for one way to produce such a file):
 
 ```json
 {"content_string": "string", "article_title": "string", "full_section_title": "string", "block_type": "string", "language": "string", "last_edit_date": "string (optional)", "num_tokens": "integer (optional)"}
 ```
@@ -164,7 +197,23 @@ The script will feed `full_section_title` and `content_string` to the embedding
 
 See `wikipedia_preprocessing/preprocess_html_dump.py` for details on how this is implemented for Wikipedia HTML dumps.
 
-Then run `inv index-collection --collection-path <path> --collection-name <name>`
+2. Then run the indexing command:
+
+```bash
+inv index-collection --collection-path <path to the .jsonl collection> --collection-name <collection name>
+```
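+
+The sketch below shows one way to produce such a collection file from your own documents. The example document, output file name, and the section-title separator are hypothetical; only the field names come from the format described above.
+
+```python
+# Illustrative sketch: writing a custom collection in the JSON Lines format above.
+# The example document, file name, and "article > section" convention are
+# assumptions for illustration; only the field names are from the documentation.
+import json
+
+documents = [
+    {"title": "My Project", "section": "Overview", "text": "My Project is a tool for ..."},
+]
+
+with open("my_collection.jsonl", "w") as f:
+    for doc in documents:
+        block = {
+            "content_string": doc["text"],
+            "article_title": doc["title"],
+            "full_section_title": f"{doc['title']} > {doc['section']}",
+            "block_type": "text",
+            "language": "en",
+            # "last_edit_date" and "num_tokens" are optional and omitted here.
+        }
+        f.write(json.dumps(block, ensure_ascii=False) + "\n")
+```
+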
+#### To upload a Qdrant index to 🤗 Hub
+
+1. Split the index into smaller parts:
+```bash
+tar -cvf - <path to the Qdrant index folder> | pigz -p 14 | split --bytes=10GB --numeric-suffixes=0 --suffix-length=4 - <path to output folder>/qdrant_index.tar.gz.part-
+```
+
+2. Upload the resulting parts:
+```bash
+python retrieval/upload_folder_to_hf_hub.py --folder_path <path to output folder> --repo_id <HuggingFace repo ID>
+```
@@ -191,43 +240,42 @@ After creating an instance via Azure, obtain the connection string and add this
 COSMOS_CONNECTION_STRING=[Your Cosmos DB connection string]
 ```
 
+### Run Chainlit
+
 Running this will start the backend and front-end servers. You can then access the front-end at the specified port (5001 by default).
 
 `inv chainlit --backend-port 5001`
 
-# Other Commands
-
-## Using the free Rate-limited Wikipedia search API
-See https://wikichat.genie.stanford.edu/search/redoc
+# The free Rate-limited Wikipedia search API
+You can use this API endpoint for prototyping high-quality RAG systems.
+See https://wikichat.genie.stanford.edu/search/redoc for the full specification.
+Note that we do not provide any guarantees about this endpoint, and it is not suitable for production.
 
-## Upload an Index
-Split the index into smaller files:
-`tar -cvf - ./workdir/qdrant_index/ | pigz -p 14 | split --bytes=10GB --numeric-suffixes=0 --suffix-length=4 - /mnt/ephemeral_nvme/qdrant_index.tar.gz.part-`
-Then update the arguments in `retrieval/upload_to_hf_hub.py` and run it.
+# Wikipedia Preprocessing: Why is it difficult?
+Coming soon.
 
+# Other Commands
 
 ## Run a distilled model for lower latency and cost
 WikiChat 2.0 is not compatible with [fine-tuned LLaMA-2 checkpoints released](https://huggingface.co/collections/stanford-oval/wikichat-v10-66c580bf15e26b87d622498c). Please refer to v1.0 for now.
 
 ## Simulate Conversations
-In order to evaluate a chatbot, you can simulate conversations with a user simulator. `subset` can be one of `head`, `tail`, or `recent`, corresponding to the three subsets introduced in the WikiChat paper. We have also added the option to specify the language of the user (WikiChat always replies in the language of the user).
-This script will read the topic (i.e. a Wikipedia title and article) from the corresponding `benchmark/topics/(subset)_articles_(language).json.` file. `--num-dialogs` is the number of simulated dialogs to generate, and `--num-turns` is the number of turns in each dialog.
+To evaluate a chatbot, you can simulate conversations using a user simulator. The `subset` parameter can be one of `head`, `tail`, or `recent`, corresponding to the three subsets introduced in the WikiChat paper. You can also specify the language of the user (WikiChat always replies in the user's language).
+This script reads the topic (i.e., a Wikipedia title and article) from the corresponding `benchmark/topics/{subset}_articles_{language}.json` file. Use `--num-dialogues` to set the number of simulated dialogues to generate, and `--num-turns` to specify the number of turns in each dialogue.
 
 ```bash
-inv simulate-users --num-dialogs 1 --num-turns 2 --simulation-mode passage --language en --subset head
+inv simulate-users --num-dialogues 1 --num-turns 2 --simulation-mode passage --language en --subset head
 ```
 
-Depending on the engine you are using, this might take some time. The simulated dialogs and the log file will be saved in `benchmark/simulated_dialogs/`.
+Depending on the engine you are using, this might take some time. The simulated dialogues and log files will be saved in `benchmark/simulated_dialogs/`.
 You can also provide any of the pipeline parameters from above.
 You can experiment with different user characteristics by modifying `user_characteristics` in `benchmark/user_simulator.py`.
 
-# Wikipedia Preprocessing: Why is it difficult?
-Coming soon.
-
 # License
-WikiChat code and models are released under Apache-2.0 license.
+WikiChat code, models, and data are released under the Apache-2.0 license.
diff --git a/benchmark/user_simulator.py b/benchmark/user_simulator.py
index 7555214..a496456 100644
--- a/benchmark/user_simulator.py
+++ b/benchmark/user_simulator.py
@@ -113,18 +113,18 @@ async def simulate_dialog(dialogue_inputs, args) -> list[DialogueTurn]:
     return dialogue_state
 
 
-def repeat_dialogue_inputs(dialogue_inputs, target_num_dialogs):
+def repeat_dialogue_inputs(dialogue_inputs, target_num_dialogues):
     """
     repeats dialogue_inputs if we don't have enough of them, truncates if there are too many
     """
-    if target_num_dialogs == -1:
-        target_num_dialogs = len(dialogue_inputs)
-    full_rounds = target_num_dialogs // len(dialogue_inputs)
+    if target_num_dialogues == -1:
+        target_num_dialogues = len(dialogue_inputs)
+    full_rounds = target_num_dialogues // len(dialogue_inputs)
     dialogue_inputs = (
         dialogue_inputs * full_rounds
-        + dialogue_inputs[: target_num_dialogs % len(dialogue_inputs)]
+        + dialogue_inputs[: target_num_dialogues % len(dialogue_inputs)]
     )
-    assert len(dialogue_inputs) == target_num_dialogs
+    assert len(dialogue_inputs) == target_num_dialogues
 
     return dialogue_inputs
 
@@ -138,7 +138,7 @@ def main(args):
                 if len(line) > 0:
                     dialogue_inputs.append(line)
 
-        dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogs)
+        dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogues)
         topics = dialogue_inputs
     elif args.mode == "passage":
         with open(args.input_file) as input_file:
@@ -149,12 +149,12 @@
             for title, passage in dialogue_inputs.items()
         ]
 
-        dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogs)
+        dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogues)
         topics = [tp[0] for tp in dialogue_inputs]
     elif args.mode == "multihop":
         with open(args.input_file) as input_file:
             dialogue_inputs = json.load(input_file)
-        dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogs)
+        dialogue_inputs = repeat_dialogue_inputs(dialogue_inputs, args.num_dialogues)
         topics = [m["title_1"] + " and " + m["title_2"] for m in dialogue_inputs]
     else:
         raise ValueError("Unknown mode: %s" % args.mode)
@@ -226,10 +226,10 @@
         "--output_file", type=str, required=True, help="Where to write the outputs"
     )
     parser.add_argument(
-        "--num_dialogs",
+        "--num_dialogues",
         type=int,
        required=True,
-        help="The number of dialogs to generate. -1 means all topics.",
+        help="The number of dialogues to generate. -1 means all topics.",
     )
     parser.add_argument(
         "--num_turns",
diff --git a/retrieval/upload_folder_to_hf_hub.py b/retrieval/upload_folder_to_hf_hub.py
new file mode 100644
index 0000000..061517e
--- /dev/null
+++ b/retrieval/upload_folder_to_hf_hub.py
@@ -0,0 +1,20 @@
+import argparse
+from huggingface_hub import upload_folder
+
+def main(repo_id, folder_path):
+    upload_folder(
+        folder_path=folder_path,
+        repo_id=repo_id,
+        repo_type="dataset",
+        multi_commits=True,
+        multi_commits_verbose=True,
+    )
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Upload a folder to HuggingFace Hub")
+    parser.add_argument("--folder_path", type=str, help="The path to the folder to upload")
+    parser.add_argument("--repo_id", type=str, help="The repository ID on HuggingFace Hub")
+
+    args = parser.parse_args()
+
+    main(args.repo_id, args.folder_path)
\ No newline at end of file
diff --git a/retrieval/upload_to_hf_hub.py b/retrieval/upload_to_hf_hub.py
deleted file mode 100644
index 7e5a69b..0000000
--- a/retrieval/upload_to_hf_hub.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from huggingface_hub import upload_folder
-
-upload_folder(
-    folder_path="/mnt/ephemeral_nvme/",
-    repo_id="stanford-oval/wikipedia_10-languages_bge-m3_qdrant_index",
-    repo_type="dataset",
-    multi_commits=True,
-    multi_commits_verbose=True,
-)
diff --git a/tasks/benchmark.py b/tasks/benchmark.py
index f459f81..0a9e296 100644
--- a/tasks/benchmark.py
+++ b/tasks/benchmark.py
@@ -12,7 +12,7 @@
 @task(pre=[load_api_keys])
 def simulate_users(
     c,
-    num_dialogs,  # -1 to simulate all
+    num_dialogues,  # -1 to simulate all available topics
     num_turns: int,
     simulation_mode: str,  # passage
     subset: str,  # head, recent, tail
@@ -89,7 +89,7 @@ def simulate_users(
 
     c.run(
         f"python benchmark/user_simulator.py {pipeline_flags} "
-        f"--num_dialogs {num_dialogs} "
+        f"--num_dialogues {num_dialogues} "
         f"--user_engine {user_simulator_engine} "
         f"--user_temperature {user_temperature} "
         f"--mode {simulation_mode} "
diff --git a/wikipedia_preprocessing/upload_collections_to_hf_hub.py b/wikipedia_preprocessing/upload_collections_to_hf_hub.py
index 0c39b50..f2b7ab8 100644
--- a/wikipedia_preprocessing/upload_collections_to_hf_hub.py
+++ b/wikipedia_preprocessing/upload_collections_to_hf_hub.py
@@ -46,12 +46,6 @@
                 path_in_repo=f"{date}/{language}/{file}",
                 repo_id="stanford-oval/wikipedia",
                 repo_type="dataset",
-                run_as_future=True,
             )
-
-            # Remove the extracted files
-            for date, language in [
-                (date, lang) for date in args.dates for lang in args.languages
-            ]:
-                extracted_file = f"workdir/{language}/wikipedia_{date}/collection.jsonl"
+            # Remove the extracted file now
             os.remove(extracted_file)