Skip to content

Commit

Permalink
Merge pull request #26 from amosproj/9-load-data-to-Huggingface
Browse files (browse the repository at this point in the history)
Upload data to huggingface
  • Loading branch information
christianwielenberg authored May 15, 2024
2 parents 807efa2 + c5433b3 commit f3e4813
Showing 1 changed file with 16 additions and 18 deletions.
34 changes: 16 additions & 18 deletions src/scripts/upload_to_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
from tqdm import tqdm
import subprocess
import argparse
from huggingface_hub import HfApi, login

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

def upload_to_huggingface(directory_path, hf_token, hf_dataset_id):
"""
Find markdown and PDF files in the directory and its subdirectories then upload them to the specified huggingface dataset.
Find JSON files in the directory and its subdirectories, then upload them to the specified Hugging Face dataset.
args:
directory_path: Path to the local directory of the data
Expand All @@ -17,24 +18,21 @@ def upload_to_huggingface(directory_path, hf_token, hf_dataset_id):
returns:
Nothing
"""
markdown_files = []
pdf_files = []
json_files = []
for root, dirs, files in os.walk(directory_path):
for file in files:
if file.endswith('.md'):
markdown_files.append(os.path.join(root, file))
elif file.endswith('.pdf'):
pdf_files.append(os.path.join(root, file))

# Upload markdown files
for markdown_file in tqdm(markdown_files):
cmd = f"huggingface-cli upload --repo-type dataset --token {hf_token} {hf_dataset_id} {markdown_file}"
subprocess.run(cmd, shell=True)

# Upload PDF files
for pdf_file in tqdm(pdf_files):
cmd = f"huggingface-cli upload --repo-type dataset --token {hf_token} {hf_dataset_id} {pdf_file}"
subprocess.run(cmd, shell=True)
if file.endswith('.json'):
json_files.append(os.path.join(root, file))
api = HfApi()
login(hf_token, add_to_git_credential = True)
# Upload json files
for json_file in tqdm(json_files):
api.upload_file(
path_or_fileobj=json_file,
path_in_repo=json_file.split("/")[-1],
repo_id=hf_dataset_id,
repo_type="dataset",
)

def main():
"""
Expand All @@ -44,9 +42,9 @@ def main():
Obviously use our own token and own directory etc. This is just an example that won't work :)
"""
parser = argparse.ArgumentParser(description="Upload markdown and PDF files to a dataset in Hugging Face Model Hub")
parser.add_argument("directory_path", type=str, help="Path to the directory containing markdown and PDF files")
parser.add_argument("hf_token", type=str, help="Your Hugging Face authentication token")
parser.add_argument("hf_dataset_id", type=str, help="ID of the dataset you want to upload files to")
parser.add_argument("--directory_path", type=str, default = "../../sources", help="Path to the directory containing markdown and PDF files")
args = parser.parse_args()

upload_to_huggingface(args.directory_path, args.hf_token, args.hf_dataset_id)
Expand Down

0 comments on commit f3e4813

Please sign in to comment.