Skip to content

Commit

Permalink
Merge pull request #40 from amosproj/fix_ymlFileIssueUnified
Browse files Browse the repository at this point in the history
Fix yml file issue unified
  • Loading branch information
YashodharPansuriya authored May 20, 2024
2 parents 1d93892 + b901191 commit dff2af2
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 12 deletions.
67 changes: 67 additions & 0 deletions src/scripts/download_from_huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import logging
from datasets import load_dataset, get_dataset_config_names
import pandas as pd

# Configure logging: root logger at INFO so progress messages are visible.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
# Raise the huggingface_hub repocard logger to ERROR to suppress its noisy warnings.
logging.getLogger("huggingface_hub.repocard").setLevel(logging.ERROR)

def load_dataset_from_files(dataset_name, data_files, config_name):
    """
    Load each requested split of a Hugging Face dataset from its own data file.

    Args:
        dataset_name (str): Hub name of the dataset to load.
        data_files (dict): Mapping of split name -> data file name inside the
            dataset repository.
        config_name (str): Name of the dataset configuration to use.

    Returns:
        dict: Mapping of split name -> loaded dataset. Splits that fail to
        load are logged and omitted, so the result may be partial or empty;
        this function never raises for individual load failures.
    """
    datasets = {}
    for split, file_name in data_files.items():
        # Catch per split so one bad file does not abort the remaining loads
        # (a single try around the whole loop would drop all later splits).
        try:
            datasets[split] = load_dataset(
                dataset_name, config_name, data_files={split: file_name}
            )
        except Exception as e:
            logger.error(f"Failed to load dataset split '{split}': {e}")
    return datasets

try:
    # Target dataset on the Hugging Face hub.
    dataset_name = "Kubermatic/cncf-raw-data-for-llm-training"

    # Discover which configurations the dataset exposes; fall back to an
    # empty list when the lookup fails so the script can keep going.
    try:
        configs = get_dataset_config_names(dataset_name)
        logger.info(f"Available configurations for {dataset_name}: {configs}")
    except Exception as e:
        logger.error(f"Failed to retrieve configurations for {dataset_name}: {e}")
        configs = []

    # Unified data files to download, keyed by an arbitrary split label.
    data_files = {
        "file1": "md_data.json",
        "file2": "pdf_data.json",
    }

    # The dataset only publishes a 'default' configuration, so use it.
    config_name = 'default'

    # Fetch the files; on total failure fall back to an empty mapping so
    # the display loop below simply does nothing.
    try:
        datasets = load_dataset_from_files(dataset_name, data_files, config_name)
    except Exception as e:
        logger.error(f"Failed to load dataset: {e}")
        datasets = {}

    # Render each loaded split as a pandas DataFrame and print it.
    for split, loaded in datasets.items():
        logger.info(f"Dataset for {split} split:")
        frame = loaded[split].to_pandas()
        print(frame)
except KeyboardInterrupt:
    logger.info("Operation interrupted by user.")
except Exception as e:
    logger.error(f"An unexpected error occurred: {e}")
12 changes: 0 additions & 12 deletions test/unit/Unified_format_conversation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,18 +91,6 @@ def test_convert_files_to_json(self):
self.assertEqual(len(pdf_data), 1)
self.assertEqual(pdf_data[0]['tag']['file_name'], 'sample.pdf')

def test_process_error_yaml_file(self):
self.error_file_list.append(self.error_yaml_file)
process_error_yaml_file(
self.error_file_list,
file_paths=self.test_dir,
json_file_path=self.json_dir
)

with open(os.path.join(self.json_dir, 'error_yaml_data.json'), 'r', encoding='utf-8') as f:
error_data = json.load(f)
self.assertEqual(len(error_data), 1)
self.assertEqual(error_data[0]['tag']['file_name'], 'error.yaml')

if __name__ == '__main__':
unittest.main()

0 comments on commit dff2af2

Please sign in to comment.