Skip to content

Commit

Permalink
feat: Create and push parquet to huggingface
Browse files Browse the repository at this point in the history
  • Loading branch information
getwithashish committed Aug 1, 2024
1 parent f8c1047 commit ea96d0f
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions parquet_dataset_generator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import pandas as pd
import json
import glob
from datasets import load_dataset
from huggingface_hub import login
from decouple import config


def login_huggingface():
    """Log in to the Hugging Face Hub with the token stored under HF_TOKEN."""
    # The token is looked up through decouple's ``config`` rather than being
    # hard-coded, so the secret stays out of the source file.
    hf_token = config("HF_TOKEN")
    login(token=hf_token)


def combine_json_arrays(directory_path: str) -> list:
Expand All @@ -27,6 +34,12 @@ def create_parquet_from_json(json_data: list):
df.to_parquet('internal_dataset.parquet', index=False)


def push_parquet(
    parquet_path: str = 'internal_dataset.parquet',
    dataset_repo_name: str = 'getwithashish/internal-dept-dataset',
) -> None:
    """Load a parquet file as a dataset and push it to the Hugging Face Hub.

    Both arguments default to the values that were previously hard-coded,
    so calling ``push_parquet()`` with no arguments behaves as before.

    Args:
        parquet_path: Path of the parquet file to upload. Defaults to
            'internal_dataset.parquet', the file name written by
            ``create_parquet_from_json``.
        dataset_repo_name: Repository id on the Hub that receives the
            dataset.
    """
    # NOTE(review): pushing presumably requires the caller to be
    # authenticated already (e.g. via login_huggingface); this function does
    # not log in itself -- confirm against the calling code.
    dataset = load_dataset('parquet', data_files=parquet_path)
    dataset.push_to_hub(dataset_repo_name)


if __name__ == "__main__":
    # Script entry point: pass "." as the directory to combine_json_arrays
    # and feed the combined result straight into create_parquet_from_json,
    # which writes the parquet file.
    create_parquet_from_json(combine_json_arrays("."))

0 comments on commit ea96d0f

Please sign in to comment.