added connector folder and HF file #313

Open
wants to merge 14 commits into base: main
Empty file added connectors/__init__.py
68 changes: 68 additions & 0 deletions connectors/huggingface_connector.py
@@ -0,0 +1,68 @@
from datasets import load_dataset, get_dataset_split_names
from nomic import AtlasDataset
import pyarrow as pa
import pyarrow.compute as pc

# Gets data from HF dataset
def get_hfdata(dataset_identifier, split="train", limit=100000):
    splits = get_dataset_split_names(dataset_identifier)
    dataset = load_dataset(dataset_identifier, split=split, streaming=True)
Contributor:
The current implementation does not handle the case where the specified split is not available in the dataset. Previously, there was a mechanism to check available splits and use an alternative if the specified one was not found. Consider reintroducing this functionality to avoid runtime errors.
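
A minimal sketch of that fallback, reusing the `splits` list the function already fetches (the printed message is illustrative):

```python
# Fall back to the first available split if the requested one is missing.
splits = get_dataset_split_names(dataset_identifier)
if split not in splits:
    print(f"Split '{split}' not found; using '{splits[0]}' instead.")
    split = splits[0]
dataset = load_dataset(dataset_identifier, split=split, streaming=True)
```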


    if not dataset:
        raise ValueError("No dataset was found for the provided identifier and split.")

    # Processes dataset entries using Arrow, stopping once the limit is reached
    id_counter = 0
    data = []
    for example in dataset:
        if id_counter >= limit:
            break
        # Adds a sequential ID
        example['id'] = str(id_counter)
        id_counter += 1
        data.append(example)

    # Convert the data list to an Arrow table
    table = pa.Table.from_pylist(data)
    return table

# Function to convert complex types to strings using Arrow
def process_table(table):
    # Converts columns with complex types to strings
    for col in table.schema.names:
        column = table[col]
        if pa.types.is_boolean(column.type):
            table = table.set_column(table.schema.get_field_index(col), col, pc.cast(column, pa.string()))
        elif pa.types.is_list(column.type):
Contributor:
I think if you flatten the column as a list and then cast to a string, the column will be too long and won't line up with the other columns of the table. We may need to refactor this slightly to make sure the column length stays the same and structs are handled on a row-by-row basis.
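
A possible shape for that refactor: build exactly one output string per input row so the new column stays the same length as the table, and tolerate nulls (a sketch, not the final fix):

```python
# Sketch: exactly one output string per input row keeps column lengths aligned.
new_column = []
for item in column:  # item is a pa.ListScalar (or a null scalar)
    if not item.is_valid:
        new_column.append(None)
    elif pa.types.is_struct(column.type.value_type):
        new_column.append(", ".join(str(v.as_py()) for v in item.values))
    else:
        new_column.append(str(item.as_py()))
```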

            new_column = []
            for item in column:
                if pa.types.is_struct(column.type.value_type):
                    # Flatten the struct and cast as string for each row
                    flattened = ", ".join(str(sub_item.as_py()) for sub_item in item.values)
                    new_column.append(flattened)
                else:
                    new_column.append(str(item))
            table = table.set_column(table.schema.get_field_index(col), col, pa.array(new_column, pa.string()))
        elif pa.types.is_dictionary(column.type):
            table = table.set_column(table.schema.get_field_index(col), col, pc.cast(column, pa.string()))
    return table

# Creates AtlasDataset from HF dataset
def hf_atlasdataset(dataset_identifier, split="train", limit=100000):
    dataset_identifier = dataset_identifier.strip()
    table = get_hfdata(dataset_identifier, split, limit)
    map_name = dataset_identifier.replace('/', '_')
    if table.num_rows == 0:
        raise ValueError("No data was found for the provided dataset.")

    dataset = AtlasDataset(
        map_name,
        unique_id_field="id",
    )

    # Process the table to ensure all complex types are converted to strings
    processed_table = process_table(table)

    # Add data to the AtlasDataset
    dataset.add_data(data=processed_table)

    return dataset


21 changes: 21 additions & 0 deletions examples/HF_example_usage.py
@@ -0,0 +1,21 @@
import argparse
from connectors.huggingface_connector import hf_atlasdataset

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Create an AtlasDataset from a Hugging Face dataset.')
    parser.add_argument('--dataset_identifier', type=str, required=True, help='The Hugging Face dataset identifier (e.g., "username/dataset_name")')
    parser.add_argument('--split', type=str, default="train", help='The dataset split to use (default: train)')
    parser.add_argument('--limit', type=int, default=100000, help='The maximum number of examples to load (default: 100000)')

    args = parser.parse_args()

    try:
        atlas_dataset = hf_atlasdataset(args.dataset_identifier, args.split, args.limit)
        print(f"AtlasDataset has been created for '{args.dataset_identifier}'")
    except ValueError as e:
        print(f"Error creating AtlasDataset: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")