From b0313fda1dfef10ea249c2b66f7edbf7fadbdcd4 Mon Sep 17 00:00:00 2001 From: YashodharPansuriya Date: Mon, 20 May 2024 11:38:03 +0200 Subject: [PATCH 1/2] feat: download raw data from hugging face --- src/scripts/download_from_huggingface.py | 67 ++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 src/scripts/download_from_huggingface.py diff --git a/src/scripts/download_from_huggingface.py b/src/scripts/download_from_huggingface.py new file mode 100644 index 0000000..4c16dbb --- /dev/null +++ b/src/scripts/download_from_huggingface.py @@ -0,0 +1,67 @@ +import logging +from datasets import load_dataset, get_dataset_config_names +import pandas as pd + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +logging.getLogger("huggingface_hub.repocard").setLevel(logging.ERROR) + +def load_dataset_from_files(dataset_name, data_files, config_name): + """ + Load dataset from files with the specified configuration. + + Args: + dataset_name (str): The name of the dataset to load. + data_files (dict): A dictionary where keys are split names and values are file paths. + config_name (str): The name of the configuration to use. + + Returns: + dict: A dictionary containing loaded datasets for each split. 
+ """ + + + datasets = {} + try: + for split, file_name in data_files.items(): + dataset = load_dataset(dataset_name, config_name, data_files={split: file_name}) + datasets[split] = dataset + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + return datasets + +try: + # Get all configurations for the specific dataset + dataset_name = "Kubermatic/cncf-raw-data-for-llm-training" + try: + configs = get_dataset_config_names(dataset_name) + logger.info(f"Available configurations for {dataset_name}: {configs}") + except Exception as e: + logger.error(f"Failed to retrieve configurations for {dataset_name}: {e}") + configs = [] + + # Since only 'default' is available, we use it + config_name = 'default' + + # Define the unified data files name which will be used + data_files = { + "file1": "md_data.json", + "file2": "pdf_data.json", + } + + # Load the dataset + try: + datasets = load_dataset_from_files(dataset_name, data_files, config_name) + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + datasets = {} + + # Print the loaded datasets + for split, dataset in datasets.items(): + logger.info(f"Dataset for {split} split:") + df = dataset[split].to_pandas() + print(df) +except KeyboardInterrupt: + logger.info("Operation interrupted by user.") +except Exception as e: + logger.error(f"An unexpected error occurred: {e}") From b901191610ccb7f15cf5aadbfbddc1ed64908e82 Mon Sep 17 00:00:00 2001 From: YashodharPansuriya Date: Mon, 20 May 2024 12:53:52 +0200 Subject: [PATCH 2/2] feat: changes in test cases of unified --- test/unit/Unified_format_conversation_test.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/test/unit/Unified_format_conversation_test.py b/test/unit/Unified_format_conversation_test.py index 159de41..5528900 100644 --- a/test/unit/Unified_format_conversation_test.py +++ b/test/unit/Unified_format_conversation_test.py @@ -91,18 +91,6 @@ def test_convert_files_to_json(self): self.assertEqual(len(pdf_data), 1) 
self.assertEqual(pdf_data[0]['tag']['file_name'], 'sample.pdf') - def test_process_error_yaml_file(self): - self.error_file_list.append(self.error_yaml_file) - process_error_yaml_file( - self.error_file_list, - file_paths=self.test_dir, - json_file_path=self.json_dir - ) - - with open(os.path.join(self.json_dir, 'error_yaml_data.json'), 'r', encoding='utf-8') as f: - error_data = json.load(f) - self.assertEqual(len(error_data), 1) - self.assertEqual(error_data[0]['tag']['file_name'], 'error.yaml') if __name__ == '__main__': unittest.main()