From c5433b3a7c530b4328281529510b7630b4be3215 Mon Sep 17 00:00:00 2001 From: Christian Wielenberg Date: Tue, 14 May 2024 20:10:27 +0200 Subject: [PATCH] Changed upload format to json --- src/scripts/upload_to_huggingface.py | 34 +++++++++++++--------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/scripts/upload_to_huggingface.py b/src/scripts/upload_to_huggingface.py index b26478c..a6d3392 100644 --- a/src/scripts/upload_to_huggingface.py +++ b/src/scripts/upload_to_huggingface.py @@ -2,12 +2,13 @@ from tqdm import tqdm import subprocess import argparse +from huggingface_hub import HfApi, login os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" def upload_to_huggingface(directory_path, hf_token, hf_dataset_id): """ - Find markdown and PDF files in the directory and its subdirectories then upload them to the specified huggingface dataset. + Find json files in the directory and its subdirectories then upload them to the specified huggingface dataset. args: directory_path: Path to the local directory of the data @@ -17,24 +18,21 @@ def upload_to_huggingface(directory_path, hf_token, hf_dataset_id): returns: Nothing """ - markdown_files = [] - pdf_files = [] + json_files = [] for root, dirs, files in os.walk(directory_path): for file in files: - if file.endswith('.md'): - markdown_files.append(os.path.join(root, file)) - elif file.endswith('.pdf'): - pdf_files.append(os.path.join(root, file)) - - # Upload markdown files - for markdown_file in tqdm(markdown_files): - cmd = f"huggingface-cli upload --repo-type dataset --token {hf_token} {hf_dataset_id} {markdown_file}" - subprocess.run(cmd, shell=True) - - # Upload PDF files - for pdf_file in tqdm(pdf_files): - cmd = f"huggingface-cli upload --repo-type dataset --token {hf_token} {hf_dataset_id} {pdf_file}" - subprocess.run(cmd, shell=True) + if file.endswith('.json'): + json_files.append(os.path.join(root, file)) + api = HfApi() + login(hf_token, add_to_git_credential = True) + # Upload json files + for json_file in tqdm(json_files): + api.upload_file( + path_or_fileobj=json_file, + path_in_repo=json_file.split("/")[-1], + repo_id=hf_dataset_id, + repo_type="dataset", + ) def main(): """ @@ -44,9 +42,9 @@ def main(): Obviously use our own token and own directory etc. This is just an example that wont work :) """ parser = argparse.ArgumentParser(description="Upload markdown and PDF files to a dataset in Hugging Face Model Hub") - parser.add_argument("directory_path", type=str, help="Path to the directory containing markdown and PDF files") parser.add_argument("hf_token", type=str, help="Your Hugging Face authentication token") parser.add_argument("hf_dataset_id", type=str, help="ID of the dataset you want to upload files to") + parser.add_argument("--directory_path", type=str, default = "../../sources", help="Path to the directory containing markdown and PDF files") args = parser.parse_args() upload_to_huggingface(args.directory_path, args.hf_token, args.hf_dataset_id)