From b0313fda1dfef10ea249c2b66f7edbf7fadbdcd4 Mon Sep 17 00:00:00 2001 From: YashodharPansuriya Date: Mon, 20 May 2024 11:38:03 +0200 Subject: [PATCH 1/2] feat: download raw data from hugging face --- src/scripts/download_from_huggingface.py | 67 ++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 src/scripts/download_from_huggingface.py diff --git a/src/scripts/download_from_huggingface.py b/src/scripts/download_from_huggingface.py new file mode 100644 index 0000000..4c16dbb --- /dev/null +++ b/src/scripts/download_from_huggingface.py @@ -0,0 +1,67 @@ +import logging +from datasets import load_dataset, get_dataset_config_names +import pandas as pd + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +logging.getLogger("huggingface_hub.repocard").setLevel(logging.ERROR) + +def load_dataset_from_files(dataset_name, data_files, config_name): + """ + Load dataset from files with the specified configuration. + + Args: + dataset_name (str): The name of the dataset to load. + data_files (dict): A dictionary where keys are split names and values are file paths. + config_name (str): The name of the configuration to use. + + Returns: + dict: A dictionary containing loaded datasets for each split. 
+ """ + + + datasets = {} + try: + for split, file_name in data_files.items(): + dataset = load_dataset(dataset_name, config_name, data_files={split: file_name}) + datasets[split] = dataset + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + return datasets + +try: + # Get all configurations for the specific dataset + dataset_name = "Kubermatic/cncf-raw-data-for-llm-training" + try: + configs = get_dataset_config_names(dataset_name) + logger.info(f"Available configurations for {dataset_name}: {configs}") + except Exception as e: + logger.error(f"Failed to retrieve configurations for {dataset_name}: {e}") + configs = [] + + # Since only 'default' is available, we use it + config_name = 'default' + + # Define the unified data files name which will be used + data_files = { + "file1": "md_data.json", + "file2": "pdf_data.json", + } + + # Load the dataset + try: + datasets = load_dataset_from_files(dataset_name, data_files, config_name) + except Exception as e: + logger.error(f"Failed to load dataset: {e}") + datasets = {} + + # Print the loaded datasets + for split, dataset in datasets.items(): + logger.info(f"Dataset for {split} split:") + df = dataset[split].to_pandas() + print(df) +except KeyboardInterrupt: + logger.info("Operation interrupted by user.") +except Exception as e: + logger.error(f"An unexpected error occurred: {e}") From b901191610ccb7f15cf5aadbfbddc1ed64908e82 Mon Sep 17 00:00:00 2001 From: YashodharPansuriya Date: Mon, 20 May 2024 12:53:52 +0200 Subject: [PATCH 2/2] feat: changes in test cases of unified --- test/unit/Unified_format_conversation_test.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/test/unit/Unified_format_conversation_test.py b/test/unit/Unified_format_conversation_test.py index 159de41..5528900 100644 --- a/test/unit/Unified_format_conversation_test.py +++ b/test/unit/Unified_format_conversation_test.py @@ -91,18 +91,6 @@ def test_convert_files_to_json(self): self.assertEqual(len(pdf_data), 1) 
self.assertEqual(pdf_data[0]['tag']['file_name'], 'sample.pdf') - def test_process_error_yaml_file(self): - self.error_file_list.append(self.error_yaml_file) - process_error_yaml_file( - self.error_file_list, - file_paths=self.test_dir, - json_file_path=self.json_dir - ) - - with open(os.path.join(self.json_dir, 'error_yaml_data.json'), 'r', encoding='utf-8') as f: - error_data = json.load(f) - self.assertEqual(len(error_data), 1) - self.assertEqual(error_data[0]['tag']['file_name'], 'error.yaml') if __name__ == '__main__': unittest.main()