feature: remove renaming column names and add constant keys for stats #13

Merged 5 commits on Aug 18, 2023
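This PR drops the column-renaming step and instead addresses stats columns through shared constants, and it merges `text_key_to_process` / `text_keys_to_load` into a single `text_keys` option. The actual `data_juicer/utils/constant.py` is not part of this diff, so the following sketch of the constant containers is an assumption for illustration only; the string values simply mirror the literals the old code used.

    # Hypothetical sketch of the constant containers referenced in the diffs below
    # (not taken from the PR; values are assumed to match the old string literals).
    class Fields:
        stats = 'stats'                            # assumed value; the diff only uses the name

    class StatsKeys:
        alpha_token_ratio = 'alpha_token_ratio'    # assumed values mirroring the old op_stats_dict
        alnum_ratio = 'alnum_ratio'
        avg_line_length = 'avg_line_length'
        char_rep_ratio = 'char_rep_ratio'
        flagged_words_ratio = 'flagged_words_ratio'
        lang = 'lang'
        lang_score = 'lang_score'
        max_line_length = 'max_line_length'
        perplexity = 'perplexity'
        special_char_ratio = 'special_char_ratio'
        stopwords_ratio = 'stopwords_ratio'
        text_len = 'text_len'
        num_words = 'num_words'
        word_rep_ratio = 'word_rep_ratio'

Code that previously hard-coded 'perplexity' or 'stats' now imports these names, so a typo becomes an AttributeError instead of a silently missing column.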
80 changes: 42 additions & 38 deletions app.py
@@ -21,20 +21,21 @@
from data_juicer.config import init_configs
from data_juicer.core import Analyser, Executor
from data_juicer.ops.base_op import OPERATORS
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.logger_utils import get_log_file_path


@st.cache_data
def convert_csv(df):
def convert_to_csv(df):
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_csv(encoding='utf_8_sig').encode('utf-8')
return df.to_csv().encode('utf_8_sig')


@st.cache_data
def convert_jsonl(df):
def convert_to_jsonl(df):
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_json(orient='records', lines=True,
force_ascii=False).encode('utf-8')
force_ascii=False).encode('utf_8_sig')


@st.cache_data
@@ -111,6 +112,11 @@ def parse_cfg():
try:
parsed_cfg = init_configs(args=args_in_cmd)
st.session_state.cfg = parsed_cfg
if isinstance(parsed_cfg.text_keys, list):
text_key = parsed_cfg.text_keys[0]
else:
text_key = parsed_cfg.text_keys
st.session_state.text_key = text_key
if del_cfg_file:
os.remove(cfg_f_name)
return pretty_out(parsed_cfg), pretty_out(specified_cfg), parsed_cfg
@@ -160,7 +166,6 @@ def process_and_show_res():

cfg_for_processed_data.export_path = os.path.dirname(
cfg.export_path) + '_processed/data.jsonl'
cfg_for_processed_data.text_keys_to_load = [cfg.text_key_to_process]
analyzer = Analyser(cfg_for_processed_data)
analyzer.analysis_path = os.path.dirname(
cfg_for_processed_data.export_path) + '/analysis'
@@ -194,43 +199,41 @@ def get_min_max_step(data):


op_stats_dict = {
'alphanumeric_filter': ['alpha_token_ratio', 'alnum_ratio'],
'average_line_length_filter': ['avg_line_length'],
'character_repetition_filter': ['char_rep_ratio'],
'flagged_words_filter': ['flagged_words_ratio'],
'language_id_score_filter': ['lang', 'lang_score'],
'maximum_line_length_filter': ['max_line_length'],
'perplexity_filter': ['perplexity'],
'special_characters_filter': ['special_char_ratio'],
'stopwords_filter': ['stopwords_ratio'],
'text_length_filter': ['text_len'],
'words_num_filter': ['num_words'],
'word_repetition_filter': ['word_rep_ratio'],
'alphanumeric_filter':
[StatsKeys.alpha_token_ratio, StatsKeys.alnum_ratio],
'average_line_length_filter': [StatsKeys.avg_line_length],
'character_repetition_filter': [StatsKeys.char_rep_ratio],
'flagged_words_filter': [StatsKeys.flagged_words_ratio],
'language_id_score_filter': [StatsKeys.lang, StatsKeys.lang_score],
'maximum_line_length_filter': [StatsKeys.max_line_length],
'perplexity_filter': [StatsKeys.perplexity],
'special_characters_filter': [StatsKeys.special_char_ratio],
'stopwords_filter': [StatsKeys.stopwords_ratio],
'text_length_filter': [StatsKeys.text_len],
'words_num_filter': [StatsKeys.num_words],
'word_repetition_filter': [StatsKeys.word_rep_ratio],
}


class Visualize:

@staticmethod
def filter_dataset(dataset):
text_key = st.session_state.get('text_key', 'text')
text = dataset[text_key]
stats = pd.DataFrame(dataset[Fields.stats])
stats[text_key] = text

text = dataset['text']
if 'stats' not in dataset.features:
stats = pd.DataFrame(dataset['stats.meta'])
else:
stats = pd.DataFrame(dataset['stats'])
stats['text'] = text

non_num_list = ['lang']
non_num_list = [StatsKeys.lang]
min_cutoff_list = [
'lang_score',
'stopwords_ratio',
StatsKeys.lang_score,
StatsKeys.stopwords_ratio,
]
max_cutoff_list = [
'flagged_words_ratio',
'max_ppl',
StatsKeys.flagged_words_ratio,
StatsKeys.perplexity,
]
mask_list = ['text']
mask_list = [text_key]

cfg = st.session_state.get('cfg', None)
if cfg is None:
@@ -372,12 +375,12 @@ def set_sliders(total_stats, ordered):
Visualize.display_dataset(ds, all_conds, show_num, 'Retained sampels',
'docs')
st.download_button('Download Retained data as JSONL',
data=convert_jsonl(ds.loc[all_conds]),
data=convert_to_jsonl(ds.loc[all_conds]),
file_name='retained.jsonl')
Visualize.display_dataset(ds, np.invert(all_conds), show_num,
'Discarded sampels', 'docs')
st.download_button('Download Discarded data as JSONL',
data=convert_jsonl(ds.loc[np.invert(all_conds)]),
data=convert_to_jsonl(ds.loc[np.invert(all_conds)]),
file_name='discarded.jsonl')
display_discarded_details = st.checkbox(
'Display discarded documents by filter details')
@@ -389,7 +392,7 @@ def set_sliders(total_stats, ordered):
for op_key, cond in item.items():
op_name, column_name = op_key
if column_name not in mask_list:
sub_stats = show_stats[[column_name, 'text']]
sub_stats = show_stats[[column_name, text_key]]
if display_discarded_details:
Visualize.display_dataset(
sub_stats,
@@ -421,6 +424,7 @@ def diversity():
with st.expander('Diversity for sft dataset', expanded=False):
dataset = st.session_state.get('dataset', None)
cfg = st.session_state.get('cfg', parse_cfg()[2])
text_key = st.session_state.get('text_key', 'text')
if dataset:

col1, col2, col3, col4 = st.columns(4)
@@ -444,19 +448,19 @@
max_value=100,
step=1)

disversity_btn = st.button('Analyse_diversity',
use_container_width=True)
diversity_btn = st.button('Analyse_diversity',
use_container_width=True)
output_path = os.path.join(os.path.dirname(cfg.export_path),
'analysis')
raw_df = None
if disversity_btn:
if diversity_btn:
try:
diversity_analysis = DiversityAnalysis(
dataset, output_path)
with st.spinner('Wait for analyze diversity...'):
raw_df = diversity_analysis.compute(
lang_or_model=get_diversity_model(lang_select),
column_name=cfg.text_key_to_process)
column_name=text_key)

st.session_state[f'diversity{lang_select}'] = raw_df

@@ -477,7 +481,7 @@ def diversity():

st.download_button(
label='Download diversity data as CSV',
data=convert_csv(df),
data=convert_to_csv(df),
file_name='diversity.csv',
mime='text/csv',
)
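To make the app.py refactor above concrete, here is a hedged sketch of how the reworked `Visualize.filter_dataset` path reads stats after this change: the stats column is addressed through `Fields.stats`, the processed text column through the session's `text_key`, and per-op cutoff columns through `StatsKeys` rather than string literals. The helper name below is illustrative, not part of app.py.

    import pandas as pd

    from data_juicer.utils.constant import Fields, StatsKeys

    def build_stats_frame(dataset, text_key='text'):
        """Per-sample stats plus the raw text in one DataFrame, as filter_dataset does."""
        stats = pd.DataFrame(dataset[Fields.stats])
        stats[text_key] = dataset[text_key]
        return stats

    # Cutoff columns are then looked up through constants, e.g. StatsKeys.perplexity
    # for perplexity_filter, or StatsKeys.lang / StatsKeys.lang_score for
    # language_id_score_filter, matching the op_stats_dict mapping above.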
14 changes: 6 additions & 8 deletions configs/config_all.yaml
@@ -9,10 +9,8 @@ dataset_path: '/path/to/your/dataset' # path to your datas
export_path: '/path/to/result/dataset.jsonl' # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
export_shard_size: 0 # Shard size of exported dataset in Byte. In default, it's 0, which means export the whole dataset into only one file. If it's set a positive number, the exported dataset will be split into several dataset shards, and the max size of each shard won't larger than the export_shard_size
np: 4 # number of subprocess to process your dataset
text_key_to_process: 'content' # the key name of field where the sample texts to be processed, e.g., `text`, `text.instruction`, `text.output`, ...'
# Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times
text_keys_to_load: # the key name of field where the sample texts stored in the original data
- 'text'
text_keys: 'content' # the key name of field where the sample texts to be processed, e.g., `text`, `instruction`, `output`, ...'
# Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys.
suffixes: [] # the suffix of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
use_cache: true # whether to use the cache management of hugging face datasets. It might take up lots of disk space when using cache
ds_cache_dir: '~/.cache/huggingface/datasets' # cache dir for hugging face datasets. In default it's the default cache dir "~/.cache/huggingface/datasets". If this argument is reset by users, it will override the default cache dir
@@ -116,10 +114,10 @@ process:
- suffix_filter: # filter to keep samples with specified suffix.
suffixes: [] # the suffix of text that will be keep. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
- specified_field_filter: # filter text with the specified field info out of specific range
text_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
target_value: [] # the range of specified field information corresponding to the samples that need to be retained
- specified_numeric_field_filter: # filter text with the specified numeric field info out of specific range
text_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
min_value: 0 # the min filter value in SpecifiedNumericField op
max_value: 10000 # the max filter value in SpecifiedNumericField op

@@ -137,12 +135,12 @@ process:

# Selector ops
- topk_specified_field_selector: # selector to select top samples based on the sorted specified field
text_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
top_ratio: # ratio of selected top samples
topk: # number of selected top sample
reverse: True # determine the sorting rule, if reverse=True, then sort in descending order
- frequency_specified_field_selector: # selector to select samples based on the sorted frequency of specified field value
text_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
top_ratio: # ratio of selected top specified field value
topk: # number of selected top specified field value
reverse: True # determine the sorting rule, if reverse=True, then sort in descending order
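As the `text_keys` comment above notes, only the first key is used when several are given. A minimal sketch of that normalization, mirroring the logic this PR adds to `parse_cfg()` in app.py (the helper name and the default fallback are illustrative):

    def resolve_text_key(text_keys, default='text'):
        # text_keys may be a single string or a list; ops process only the first key
        if isinstance(text_keys, list):
            return text_keys[0] if text_keys else default
        return text_keys or default

    resolve_text_key('content')                    # -> 'content'
    resolve_text_key(['content', 'instruction'])   # -> 'content'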
@@ -3,7 +3,7 @@ project_name: 'Data-Juicer-recipes-the-stack'
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file
export_path: '/path/to/your/dataset.jsonl'

text_key: 'content'
text_keys: 'content'

np: 50 # number of subprocess to process your dataset
open_tracer: true
4 changes: 3 additions & 1 deletion data_juicer/analysis/column_wise_analysis.py
@@ -4,6 +4,8 @@
import matplotlib.pyplot as plt
import pandas as pd

from data_juicer.utils.constant import Fields

from .overall_analysis import OverallAnalysis


@@ -64,7 +66,7 @@ def __init__(self,
:param save_stats_in_one_file: whether save all analysis figures of all
stats into one image file
"""
self.stats = pd.DataFrame(dataset['stats'])
self.stats = pd.DataFrame(dataset[Fields.stats])
self.output_path = output_path
if not os.path.exists(self.output_path):
os.makedirs(self.output_path)
4 changes: 2 additions & 2 deletions data_juicer/analysis/overall_analysis.py
@@ -2,7 +2,7 @@

import pandas as pd


from data_juicer.utils.constant import Fields
class OverallAnalysis:
"""Apply analysis on the overall stats, including mean, std, quantiles,
etc."""
@@ -14,7 +14,7 @@ def __init__(self, dataset, output_path):
:param dataset: the dataset to be analysed
:param output_path: path to store the analysis results.
"""
self.stats = pd.DataFrame(dataset['stats'])
self.stats = pd.DataFrame(dataset[Fields.stats])
self.output_path = output_path
if not os.path.exists(self.output_path):
os.makedirs(self.output_path)
25 changes: 11 additions & 14 deletions data_juicer/config/config.py
@@ -69,20 +69,17 @@ def init_configs(args=None):
type=PositiveInt,
default=4,
help='Number of processes to process dataset.')
parser.add_argument('--text_key_to_process',
type=Optional[str],
default='text',
help='Key name of field where the sample '
'texts to be processed, e.g., '
'`text`, `text.instruction`, `text.output`, ...'
'Note: currently, we support specify only ONE key for '
'each op, for cases requiring multiple keys, users can'
' specify the op multiple times')
parser.add_argument('--text_keys_to_load',
type=Union[List[str], Tuple[str]],
default=['text'],
help='Key name of field where the sample '
'texts stored in the original data')
parser.add_argument(
'--text_keys',
type=Union[str, List[str]],
default='text',
help='Key name of field where the sample '
'texts to be processed, e.g., '
'`text`, `text.instruction`, `text.output`, ...'
'Note: currently, we support specify only ONE key for '
'each op, for cases requiring multiple keys, users can'
' specify the op multiple times. We will only use the '
'first key of `text_keys` when you set multiple keys.')
parser.add_argument('--suffixes',
type=Union[str, List[str], Tuple[str]],
default=[],
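A hedged usage sketch for the unified option: `--text_keys` comes straight from the argument added above, while the `--config` flag and the config path are assumptions about the surrounding CLI setup rather than part of this diff.

    from data_juicer.config import init_configs

    # Equivalent to setting `text_keys: 'content'` in config_all.yaml.
    cfg = init_configs(args=['--config', 'configs/config_all.yaml',
                             '--text_keys', 'content'])
    print(cfg.text_keys)   # expected: 'content'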
15 changes: 8 additions & 7 deletions data_juicer/core/analyser.py
@@ -6,6 +6,7 @@
from data_juicer.config import init_configs
from data_juicer.format import load_formatter
from data_juicer.ops import Filter, load_ops
from data_juicer.utils.constant import Fields

from .exporter import Exporter

@@ -35,8 +36,8 @@ def __init__(self, cfg=None):
# setup formatter
logger.info('Setting up data formatter...')
self.formatter = load_formatter(self.cfg.dataset_path,
self.cfg.text_keys_to_load,
self.cfg.suffixes, self.cfg.add_suffix)
self.cfg.text_keys, self.cfg.suffixes,
self.cfg.add_suffix)

# prepare exporter and check export path suffix
# NOTICE: no need to export dataset texts for analyser
@@ -65,23 +66,23 @@ def run(self, load_data_np=None):
logger.info('Loading dataset from data formatter...')
if load_data_np is None:
load_data_np = self.cfg.np
dataset = self.formatter.load_dataset(load_data_np, self.cfg)
dataset = self.formatter.load_dataset(load_data_np)

# extract processes
logger.info('Preparing process operators...')
self.ops = load_ops(self.cfg.process, self.cfg.text_key_to_process)
self.ops = load_ops(self.cfg.process, self.cfg.text_keys)

# 2. stats precompute only for filter ops
logger.info('Computing the stats of dataset...')
stats_collected = False
for op_cfg, op in zip(self.cfg.process, self.ops):
op_name = list(op_cfg.keys())[0]
if isinstance(op, Filter):
if 'stats' not in dataset.features:
if Fields.stats not in dataset.features:
# TODO:
# this is a temp solution,
# only add stats when calling filter op
dataset = dataset.add_column(name='stats',
dataset = dataset.add_column(name=Fields.stats,
column=[{}] *
dataset.num_rows)
dataset = dataset.map(op.compute_stats,
@@ -94,7 +95,7 @@ def run(self, load_data_np=None):
return dataset

# 3. analysis and output result to the export path
# 3.1. Only consider fields in 'stats'
# 3.1. Only consider fields in Fields.stats
# 3.2. For string fields, only consider its histogram
# 3.3. For numeric fields, consider its histogram and box
# 3.4. Otherwise, DO NOT analyse
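The run() hunk above also shows the bootstrap for the stats column: filter ops expect `Fields.stats` to be present, so an empty dict per row is added on first use. A standalone sketch with a toy dataset (the toy data is illustrative, not part of the PR):

    from datasets import Dataset

    from data_juicer.utils.constant import Fields

    dataset = Dataset.from_dict({'text': ['hello world', 'data juicer']})
    if Fields.stats not in dataset.features:
        # one empty dict per row; each filter's compute_stats fills it in later
        dataset = dataset.add_column(name=Fields.stats,
                                     column=[{}] * dataset.num_rows)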
2 changes: 1 addition & 1 deletion data_juicer/core/data.py
@@ -87,7 +87,7 @@ def __getitem__(self, key):
class NestedDatasetDict(DatasetDict):
"""Enhanced HuggingFace-DatasetDict for better usability and efficiency."""

def __init(self, *args, **kargs):
def __init__(self, *args, **kargs):
if len(args) == 1 and isinstance(args[0], Dataset):
# init from another DatasetDict instance
self.__dict__ = copy.copy(args[0].__dict__)