Merge pull request #26 from amosproj/9-load-data-to-Huggingface

Upload data to huggingface
amosproj · May 15, 2024 · f3e4813 · f3e4813
2 parents 807efa2 + c5433b3
commit f3e4813
Showing 1 changed file with 16 additions and 18 deletions.
diff --git a/src/scripts/upload_to_huggingface.py b/src/scripts/upload_to_huggingface.py
@@ -2,12 +2,13 @@
 from tqdm import tqdm
 import subprocess
 import argparse
+from huggingface_hub import HfApi, login
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 def upload_to_huggingface(directory_path, hf_token, hf_dataset_id):
     """
-    Find markdown and PDF files in the directory and its subdirectories then upload them to the specified huggingface dataset.
+    Find JSON files in the directory and its subdirectories, then upload them to the specified Hugging Face dataset.
 
     args:
         directory_path:     Path to the local directory of the data
@@ -17,24 +18,21 @@ def upload_to_huggingface(directory_path, hf_token, hf_dataset_id):
     returns:
         Nothing
     """
-    markdown_files = []
-    pdf_files = []
+    json_files = []
     for root, dirs, files in os.walk(directory_path):
         for file in files:
-            if file.endswith('.md'):
-                markdown_files.append(os.path.join(root, file))
-            elif file.endswith('.pdf'):
-                pdf_files.append(os.path.join(root, file))
-
-    # Upload markdown files
-    for markdown_file in tqdm(markdown_files):
-        cmd = f"huggingface-cli upload --repo-type dataset --token {hf_token} {hf_dataset_id} {markdown_file}"
-        subprocess.run(cmd, shell=True)
-
-    # Upload PDF files
-    for pdf_file in tqdm(pdf_files):
-        cmd = f"huggingface-cli upload --repo-type dataset --token {hf_token} {hf_dataset_id} {pdf_file}"
-        subprocess.run(cmd, shell=True)
+            if file.endswith('.json'):
+                json_files.append(os.path.join(root, file))
+    api = HfApi()
+    login(hf_token, add_to_git_credential = True)
+    # Upload json files
+    for json_file in tqdm(json_files):
+        api.upload_file(
+            path_or_fileobj=json_file,
+            path_in_repo=json_file.split("/")[-1],
+            repo_id=hf_dataset_id,
+            repo_type="dataset",
+        )
 
 def main():
     """
@@ -44,9 +42,9 @@ def main():
     Obviously use our own token and own directory etc. This is just an example that won't work :)
     """
     parser = argparse.ArgumentParser(description="Upload markdown and PDF files to a dataset in Hugging Face Model Hub")
-    parser.add_argument("directory_path", type=str, help="Path to the directory containing markdown and PDF files")
     parser.add_argument("hf_token", type=str, help="Your Hugging Face authentication token")
     parser.add_argument("hf_dataset_id", type=str, help="ID of the dataset you want to upload files to")
+    parser.add_argument("--directory_path", type=str, default = "../../sources", help="Path to the directory containing markdown and PDF files")
     args = parser.parse_args()
 
     upload_to_huggingface(args.directory_path, args.hf_token, args.hf_dataset_id)