#!/bin/bash
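# xp3_jsonl_to_meg.slurm
# Merge the per-language xP3 JSONL shards, drop duplicates, and tokenize the
# "inputs" and "targets" fields into Megatron-DeepSpeed .bin/.idx datasets,
# first for the train split and then for the validation split.
# Submit with: sbatch xp3_jsonl_to_meg.slurm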
#SBATCH --job-name=xp3jsonl # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=40 # number of cores per task
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --partition=cpu_p1
#SBATCH --qos=qos_cpu-t3
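# Echo every command and abort on the first error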
set -x -e
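# Shared environment setup for the tr13f-6B3-ml-t0 training run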
source $six_ALL_CCFRWORK/start-tr13f-6B3-ml-t0
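# Use local Hugging Face caches only (compute nodes typically have no internet access)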
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
MEGATRON_DEEPSPEED_REPO=$six_ALL_CCFRWORK/code/tr13f-6B3-ml-t0/Megatron-DeepSpeed
TOKENIZER_PATH="bigscience/tokenizer"
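# The 46 xP3 training languages; one merged JSONL is built and tokenized per language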
LANGS=(
ak
ar
as
bm
bn
ca
code
en
es
eu
fon
fr
gu
hi
id
ig
ki
kn
lg
ln
ml
mr
ne
nso
ny
or
pa
pt
rn
rw
sn
st
sw
ta
te
tn
ts
tum
tw
ur
vi
wo
xh
yo
zh
zu
)
DATA_PATH=/gpfswork/rech/six/commun/bigscience-training/jsonls/xp3capped/train
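# Train split: merge shards, deduplicate, shuffle, and tokenize each language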
for LANG in "${LANGS[@]}"; do
cd $DATA_PATH/$LANG
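# Concatenate all JSONL shards for this language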
cat *.jsonl > merged_dups_$LANG.jsonl
# Drop duplicates (~1G / 37G for en) and shuffle the training examples
sort -u merged_dups_$LANG.jsonl | shuf > merged_$LANG.jsonl
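# Output prefix for the tokenized Megatron dataset files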
OUTPUT=/gpfswork/rech/six/commun/bigscience-training/xp3cappednew/train/xp3_train_$LANG
cd $MEGATRON_DEEPSPEED_REPO
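# Pass 1: tokenize the 'inputs' field (no EOD token, no prepended space)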
python tools/preprocess_data.py \
--input $DATA_PATH/$LANG/merged_$LANG.jsonl \
--output-prefix $OUTPUT \
--dataset-impl mmap \
--json-key inputs \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_PATH \
--workers 35
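# Pass 2: tokenize the 'targets' field, appending an end-of-document token and
# prepending a space (presumably so targets concatenate cleanly after their inputs at training time)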
python tools/preprocess_data.py \
--input $DATA_PATH/$LANG/merged_$LANG.jsonl \
--output-prefix $OUTPUT \
--dataset-impl mmap \
--json-key targets \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_PATH \
--append-eod \
--prepend-space \
--workers 35
done
# Validation data exists only for the following languages
LANGS=(
ar
bn
code
en
es
fr
hi
id
pt
sw
te
vi
zh
)
DATA_PATH=/gpfswork/rech/six/commun/bigscience-training/jsonls/xp3capped/validation
cd $DATA_PATH
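# Validation split: same merge/dedup/tokenize steps per language, but without shuffling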
for LANG in "${LANGS[@]}"; do
cd $DATA_PATH/$LANG
cat *.jsonl > merged_dups_$LANG.jsonl
# Drop duplicates (~1G / 37G for en)
sort -u merged_dups_$LANG.jsonl > merged_$LANG.jsonl
OUTPUT=/gpfswork/rech/six/commun/bigscience-training/xp3cappednew/validation/xp3_validation_$LANG
cd $MEGATRON_DEEPSPEED_REPO
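# Same two tokenization passes as for the train split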
python tools/preprocess_data.py \
--input $DATA_PATH/$LANG/merged_$LANG.jsonl \
--output-prefix $OUTPUT \
--dataset-impl mmap \
--json-key inputs \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_PATH \
--workers 35
python tools/preprocess_data.py \
--input $DATA_PATH/$LANG/merged_$LANG.jsonl \
--output-prefix $OUTPUT \
--dataset-impl mmap \
--json-key targets \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_PATH \
--append-eod \
--prepend-space \
--workers 35
done