feature: remove renaming column names and add constant keys for stats #13

Merged 5 commits on Aug 18, 2023
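This PR drops the column-renaming step and instead addresses stats columns through shared constants, and it merges `text_key_to_process` / `text_keys_to_load` into a single `text_keys` option. The actual `data_juicer/utils/constant.py` is not part of this diff, so the following sketch of the constant containers is an assumption for illustration only; the string values simply mirror the literals the old code used.

    # Hypothetical sketch of the constant containers referenced in the diffs below
    # (not taken from the PR; values are assumed to match the old string literals).
    class Fields:
        stats = 'stats'                            # assumed value; the diff only uses the name

    class StatsKeys:
        alpha_token_ratio = 'alpha_token_ratio'    # assumed values mirroring the old op_stats_dict
        alnum_ratio = 'alnum_ratio'
        avg_line_length = 'avg_line_length'
        char_rep_ratio = 'char_rep_ratio'
        flagged_words_ratio = 'flagged_words_ratio'
        lang = 'lang'
        lang_score = 'lang_score'
        max_line_length = 'max_line_length'
        perplexity = 'perplexity'
        special_char_ratio = 'special_char_ratio'
        stopwords_ratio = 'stopwords_ratio'
        text_len = 'text_len'
        num_words = 'num_words'
        word_rep_ratio = 'word_rep_ratio'

Code that previously hard-coded 'perplexity' or 'stats' now imports these names, so a typo becomes an AttributeError instead of a silently missing column.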
80 changes: 42 additions & 38 deletions app.py
@@ -21,20 +21,21 @@
from data_juicer.config import init_configs
from data_juicer.core import Analyser, Executor
from data_juicer.ops.base_op import OPERATORS
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.logger_utils import get_log_file_path


@st.cache_data
def convert_csv(df):
def convert_to_csv(df):
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_csv(encoding='utf_8_sig').encode('utf-8')
return df.to_csv().encode('utf_8_sig')


@st.cache_data
def convert_jsonl(df):
def convert_to_jsonl(df):
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_json(orient='records', lines=True,
force_ascii=False).encode('utf-8')
force_ascii=False).encode('utf_8_sig')


@st.cache_data
@@ -111,6 +112,11 @@ def parse_cfg():
try:
parsed_cfg = init_configs(args=args_in_cmd)
st.session_state.cfg = parsed_cfg
if isinstance(parsed_cfg.text_keys, list):
text_key = parsed_cfg.text_keys[0]
else:
text_key = parsed_cfg.text_keys
st.session_state.text_key = text_key
if del_cfg_file:
os.remove(cfg_f_name)
return pretty_out(parsed_cfg), pretty_out(specified_cfg), parsed_cfg
@@ -160,7 +166,6 @@ def process_and_show_res():

cfg_for_processed_data.export_path = os.path.dirname(
cfg.export_path) + '_processed/data.jsonl'
cfg_for_processed_data.text_keys_to_load = [cfg.text_key_to_process]
analyzer = Analyser(cfg_for_processed_data)
analyzer.analysis_path = os.path.dirname(
cfg_for_processed_data.export_path) + '/analysis'
@@ -194,43 +199,41 @@ def get_min_max_step(data):


op_stats_dict = {
'alphanumeric_filter': ['alpha_token_ratio', 'alnum_ratio'],
'average_line_length_filter': ['avg_line_length'],
'character_repetition_filter': ['char_rep_ratio'],
'flagged_words_filter': ['flagged_words_ratio'],
'language_id_score_filter': ['lang', 'lang_score'],
'maximum_line_length_filter': ['max_line_length'],
'perplexity_filter': ['perplexity'],
'special_characters_filter': ['special_char_ratio'],
'stopwords_filter': ['stopwords_ratio'],
'text_length_filter': ['text_len'],
'words_num_filter': ['num_words'],
'word_repetition_filter': ['word_rep_ratio'],
'alphanumeric_filter':
[StatsKeys.alpha_token_ratio, StatsKeys.alnum_ratio],
'average_line_length_filter': [StatsKeys.avg_line_length],
'character_repetition_filter': [StatsKeys.char_rep_ratio],
'flagged_words_filter': [StatsKeys.flagged_words_ratio],
'language_id_score_filter': [StatsKeys.lang, StatsKeys.lang_score],
'maximum_line_length_filter': [StatsKeys.max_line_length],
'perplexity_filter': [StatsKeys.perplexity],
'special_characters_filter': [StatsKeys.special_char_ratio],
'stopwords_filter': [StatsKeys.stopwords_ratio],
'text_length_filter': [StatsKeys.text_len],
'words_num_filter': [StatsKeys.num_words],
'word_repetition_filter': [StatsKeys.word_rep_ratio],
}


class Visualize:

@staticmethod
def filter_dataset(dataset):
text_key = st.session_state.get('text_key', 'text')
text = dataset[text_key]
stats = pd.DataFrame(dataset[Fields.stats])
stats[text_key] = text

text = dataset['text']
if 'stats' not in dataset.features:
stats = pd.DataFrame(dataset['stats.meta'])
else:
stats = pd.DataFrame(dataset['stats'])
stats['text'] = text

non_num_list = ['lang']
non_num_list = [StatsKeys.lang]
min_cutoff_list = [
'lang_score',
'stopwords_ratio',
StatsKeys.lang_score,
StatsKeys.stopwords_ratio,
]
max_cutoff_list = [
'flagged_words_ratio',
'max_ppl',
StatsKeys.flagged_words_ratio,
StatsKeys.perplexity,
]
mask_list = ['text']
mask_list = [text_key]

cfg = st.session_state.get('cfg', None)
if cfg is None:
@@ -372,12 +375,12 @@ def set_sliders(total_stats, ordered):
Visualize.display_dataset(ds, all_conds, show_num, 'Retained sampels',
'docs')
st.download_button('Download Retained data as JSONL',
data=convert_jsonl(ds.loc[all_conds]),
data=convert_to_jsonl(ds.loc[all_conds]),
file_name='retained.jsonl')
Visualize.display_dataset(ds, np.invert(all_conds), show_num,
'Discarded sampels', 'docs')
st.download_button('Download Discarded data as JSONL',
data=convert_jsonl(ds.loc[np.invert(all_conds)]),
data=convert_to_jsonl(ds.loc[np.invert(all_conds)]),
file_name='discarded.jsonl')
display_discarded_details = st.checkbox(
'Display discarded documents by filter details')
@@ -389,7 +392,7 @@ def set_sliders(total_stats, ordered):
for op_key, cond in item.items():
op_name, column_name = op_key
if column_name not in mask_list:
sub_stats = show_stats[[column_name, 'text']]
sub_stats = show_stats[[column_name, text_key]]
if display_discarded_details:
Visualize.display_dataset(
sub_stats,
@@ -421,6 +424,7 @@ def diversity():
with st.expander('Diversity for sft dataset', expanded=False):
dataset = st.session_state.get('dataset', None)
cfg = st.session_state.get('cfg', parse_cfg()[2])
text_key = st.session_state.get('text_key', 'text')
if dataset:

col1, col2, col3, col4 = st.columns(4)
@@ -444,19 +448,19 @@
max_value=100,
step=1)

disversity_btn = st.button('Analyse_diversity',
use_container_width=True)
diversity_btn = st.button('Analyse_diversity',
use_container_width=True)
output_path = os.path.join(os.path.dirname(cfg.export_path),
'analysis')
raw_df = None
if disversity_btn:
if diversity_btn:
try:
diversity_analysis = DiversityAnalysis(
dataset, output_path)
with st.spinner('Wait for analyze diversity...'):
raw_df = diversity_analysis.compute(
lang_or_model=get_diversity_model(lang_select),
column_name=cfg.text_key_to_process)
column_name=text_key)

st.session_state[f'diversity{lang_select}'] = raw_df

@@ -477,7 +481,7 @@ def diversity():

st.download_button(
label='Download diversity data as CSV',
data=convert_csv(df),
data=convert_to_csv(df),
file_name='diversity.csv',
mime='text/csv',
)
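To make the app.py refactor above concrete, here is a hedged sketch of how the reworked `Visualize.filter_dataset` path reads stats after this change: the stats column is addressed through `Fields.stats`, the processed text column through the session's `text_key`, and per-op cutoff columns through `StatsKeys` rather than string literals. The helper name below is illustrative, not part of app.py.

    import pandas as pd

    from data_juicer.utils.constant import Fields, StatsKeys

    def build_stats_frame(dataset, text_key='text'):
        """Per-sample stats plus the raw text in one DataFrame, as filter_dataset does."""
        stats = pd.DataFrame(dataset[Fields.stats])
        stats[text_key] = dataset[text_key]
        return stats

    # Cutoff columns are then looked up through constants, e.g. StatsKeys.perplexity
    # for perplexity_filter, or StatsKeys.lang / StatsKeys.lang_score for
    # language_id_score_filter, matching the op_stats_dict mapping above.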
14 changes: 6 additions & 8 deletions configs/config_all.yaml
@@ -9,10 +9,8 @@ dataset_path: '/path/to/your/dataset' # path to your datas
export_path: '/path/to/result/dataset.jsonl' # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
export_shard_size: 0 # Shard size of exported dataset in Byte. In default, it's 0, which means export the whole dataset into only one file. If it's set a positive number, the exported dataset will be split into several dataset shards, and the max size of each shard won't larger than the export_shard_size
np: 4 # number of subprocess to process your dataset
text_key_to_process: 'content' # the key name of field where the sample texts to be processed, e.g., `text`, `text.instruction`, `text.output`, ...'
# Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times
text_keys_to_load: # the key name of field where the sample texts stored in the original data
- 'text'
text_keys: 'content' # the key name of field where the sample texts to be processed, e.g., `text`, `instruction`, `output`, ...'
# Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys.
suffixes: [] # the suffix of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
use_cache: true # whether to use the cache management of hugging face datasets. It might take up lots of disk space when using cache
ds_cache_dir: '~/.cache/huggingface/datasets' # cache dir for hugging face datasets. In default it's the default cache dir "~/.cache/huggingface/datasets". If this argument is reset by users, it will override the default cache dir
@@ -116,10 +114,10 @@ process:
- suffix_filter: # filter to keep samples with specified suffix.
suffixes: [] # the suffix of text that will be keep. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
- specified_field_filter: # filter text with the specified field info out of specific range
text_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
target_value: [] # the range of specified field information corresponding to the samples that need to be retained
- specified_numeric_field_filter: # filter text with the specified numeric field info out of specific range
text_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
min_value: 0 # the min filter value in SpecifiedNumericField op
max_value: 10000 # the max filter value in SpecifiedNumericField op

@@ -137,12 +135,12 @@ process:

# Selector ops
- topk_specified_field_selector: # selector to select top samples based on the sorted specified field
text_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
top_ratio: # ratio of selected top samples
topk: # number of selected top sample
reverse: True # determine the sorting rule, if reverse=True, then sort in descending order
- frequency_specified_field_selector: # selector to select samples based on the sorted frequency of specified field value
text_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
top_ratio: # ratio of selected top specified field value
topk: # number of selected top specified field value
reverse: True # determine the sorting rule, if reverse=True, then sort in descending order
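As the `text_keys` comment above notes, only the first key is used when several are given. A minimal sketch of that normalization, mirroring the logic this PR adds to `parse_cfg()` in app.py (the helper name and the default fallback are illustrative):

    def resolve_text_key(text_keys, default='text'):
        # text_keys may be a single string or a list; ops process only the first key
        if isinstance(text_keys, list):
            return text_keys[0] if text_keys else default
        return text_keys or default

    resolve_text_key('content')                    # -> 'content'
    resolve_text_key(['content', 'instruction'])   # -> 'content'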
@@ -3,7 +3,7 @@ project_name: 'Data-Juicer-recipes-the-stack'
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
export_path: '/path/to/your/dataset.jsonl'

text_key: 'content'
text_keys: 'content'

np: 50 # number of subprocess to process your dataset
open_tracer: true
4 changes: 3 additions & 1 deletion data_juicer/analysis/column_wise_analysis.py
@@ -4,6 +4,8 @@
import matplotlib.pyplot as plt
import pandas as pd

from data_juicer.utils.constant import Fields

from .overall_analysis import OverallAnalysis


@@ -64,7 +66,7 @@ def __init__(self,
:param save_stats_in_one_file: whether save all analysis figures of all
stats into one image file
"""
self.stats = pd.DataFrame(dataset['stats'])
self.stats = pd.DataFrame(dataset[Fields.stats])
self.output_path = output_path
if not os.path.exists(self.output_path):
os.makedirs(self.output_path)
4 changes: 2 additions & 2 deletions data_juicer/analysis/overall_analysis.py
@@ -2,7 +2,7 @@

import pandas as pd


from data_juicer.utils.constant import Fields
class OverallAnalysis:
"""Apply analysis on the overall stats, including mean, std, quantiles,
etc."""
@@ -14,7 +14,7 @@ def __init__(self, dataset, output_path):
:param dataset: the dataset to be analysed
:param output_path: path to store the analysis results.
"""
self.stats = pd.DataFrame(dataset['stats'])
self.stats = pd.DataFrame(dataset[Fields.stats])
self.output_path = output_path
if not os.path.exists(self.output_path):
os.makedirs(self.output_path)
25 changes: 11 additions & 14 deletions data_juicer/config/config.py
@@ -69,20 +69,17 @@ def init_configs(args=None):
type=PositiveInt,
default=4,
help='Number of processes to process dataset.')
parser.add_argument('--text_key_to_process',
type=Optional[str],
default='text',
help='Key name of field where the sample '
'texts to be processed, e.g., '
'`text`, `text.instruction`, `text.output`, ...'
'Note: currently, we support specify only ONE key for '
'each op, for cases requiring multiple keys, users can'
' specify the op multiple times')
parser.add_argument('--text_keys_to_load',
type=Union[List[str], Tuple[str]],
default=['text'],
help='Key name of field where the sample '
'texts stored in the original data')
parser.add_argument(
'--text_keys',
type=Union[str, List[str]],
default='text',
help='Key name of field where the sample '
'texts to be processed, e.g., '
'`text`, `text.instruction`, `text.output`, ...'
'Note: currently, we support specify only ONE key for '
'each op, for cases requiring multiple keys, users can'
' specify the op multiple times. We will only use the '
'first key of `text_keys` when you set multiple keys.')
parser.add_argument('--suffixes',
type=Union[str, List[str], Tuple[str]],
default=[],
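A hedged usage sketch for the unified option: `--text_keys` comes straight from the argument added above, while the `--config` flag and the config path are assumptions about the surrounding CLI setup rather than part of this diff.

    from data_juicer.config import init_configs

    # Equivalent to setting `text_keys: 'content'` in config_all.yaml.
    cfg = init_configs(args=['--config', 'configs/config_all.yaml',
                             '--text_keys', 'content'])
    print(cfg.text_keys)   # expected: 'content'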
15 changes: 8 additions & 7 deletions data_juicer/core/analyser.py
@@ -6,6 +6,7 @@
from data_juicer.config import init_configs
from data_juicer.format import load_formatter
from data_juicer.ops import Filter, load_ops
from data_juicer.utils.constant import Fields

from .exporter import Exporter

@@ -35,8 +36,8 @@ def __init__(self, cfg=None):
# setup formatter
logger.info('Setting up data formatter...')
self.formatter = load_formatter(self.cfg.dataset_path,
self.cfg.text_keys_to_load,
self.cfg.suffixes, self.cfg.add_suffix)
self.cfg.text_keys, self.cfg.suffixes,
self.cfg.add_suffix)

# prepare exporter and check export path suffix
# NOTICE: no need to export dataset texts for analyser
@@ -65,23 +66,23 @@ def run(self, load_data_np=None):
logger.info('Loading dataset from data formatter...')
if load_data_np is None:
load_data_np = self.cfg.np
dataset = self.formatter.load_dataset(load_data_np, self.cfg)
dataset = self.formatter.load_dataset(load_data_np)

# extract processes
logger.info('Preparing process operators...')
self.ops = load_ops(self.cfg.process, self.cfg.text_key_to_process)
self.ops = load_ops(self.cfg.process, self.cfg.text_keys)

# 2. stats precompute only for filter ops
logger.info('Computing the stats of dataset...')
stats_collected = False
for op_cfg, op in zip(self.cfg.process, self.ops):
op_name = list(op_cfg.keys())[0]
if isinstance(op, Filter):
if 'stats' not in dataset.features:
if Fields.stats not in dataset.features:
# TODO:
# this is a temp solution,
# only add stats when calling filter op
dataset = dataset.add_column(name='stats',
dataset = dataset.add_column(name=Fields.stats,
column=[{}] *
dataset.num_rows)
dataset = dataset.map(op.compute_stats,
@@ -94,7 +95,7 @@ def run(self, load_data_np=None):
return dataset

# 3. analysis and output result to the export path
# 3.1. Only consider fields in 'stats'
# 3.1. Only consider fields in Fields.stats
# 3.2. For string fields, only consider its histogram
# 3.3. For numeric fields, consider its histogram and box
# 3.4. Otherwise, DO NOT analyse
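The run() hunk above also shows the bootstrap for the stats column: filter ops expect `Fields.stats` to be present, so an empty dict per row is added on first use. A standalone sketch with a toy dataset (the toy data is illustrative, not part of the PR):

    from datasets import Dataset

    from data_juicer.utils.constant import Fields

    dataset = Dataset.from_dict({'text': ['hello world', 'data juicer']})
    if Fields.stats not in dataset.features:
        # one empty dict per row; each filter's compute_stats fills it in later
        dataset = dataset.add_column(name=Fields.stats,
                                     column=[{}] * dataset.num_rows)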
2 changes: 1 addition & 1 deletion data_juicer/core/data.py
@@ -87,7 +87,7 @@ def __getitem__(self, key):
class NestedDatasetDict(DatasetDict):
"""Enhanced HuggingFace-DatasetDict for better usability and efficiency."""

def __init(self, *args, **kargs):
def __init__(self, *args, **kargs):
if len(args) == 1 and isinstance(args[0], Dataset):
# init from another DatasetDict instance
self.__dict__ = copy.copy(args[0].__dict__)