forked from facebookresearch/XLM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-data-glue.sh
executable file
·188 lines (164 loc) · 7.38 KB
/
get-data-glue.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Abort as soon as any simple command exits with a non-zero status.
set -e

# data paths
# Everything is resolved relative to the current working directory, so the
# script must be launched from the directory that contains install-tools.sh.
MAIN_PATH="$PWD"
OUTPATH="$PWD/data/glue_test"                 # raw + cleaned GLUE files
PROCESSED_PATH="$PWD/data/processed/XLM15"    # BPE-encoded / binarized output
CODES_PATH="$MAIN_PATH/codes_xnli_15"         # BPE codes file
VOCAB_PATH="$MAIN_PATH/vocab_xnli_15"         # vocabulary file
# Common prefix of the archive URLs; each download appends
# "<TASK>.zip?alt=media&token=...".
URLPATH=https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2F

# tools paths
TOOLS_PATH="$PWD/tools"
TOKENIZE="$TOOLS_PATH/tokenize.sh"
MOSES="$TOOLS_PATH/mosesdecoder"
REPLACE_UNICODE_PUNCT="$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl"
NORM_PUNC="$MOSES/scripts/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="$MOSES/scripts/tokenizer/remove-non-printing-char.perl"
LOWER_REMOVE_ACCENT="$TOOLS_PATH/lowercase_and_remove_accent.py"
FASTBPE="$TOOLS_PATH/fastBPE/fast"

# install tools
./install-tools.sh

# create directories
# rm -r $OUTPATH
# Quoted so a working directory containing spaces or glob characters still
# yields a single directory argument.
mkdir -p "$OUTPATH"
# SST-2
# Single-sentence task: train/dev keep column 1 (text) and column 2 (label);
# test keeps only column 2 (text). The whole section is skipped when the
# extracted directory already exists.
if [ ! -d "$OUTPATH/SST-2" ]; then
  # NOTE(review): wget may store the archive under a URL-derived name (which
  # is presumably why the unzip/rm below use globs); in that case this check
  # never matches and `wget -c` is what avoids re-downloading - confirm.
  if [ ! -f "$OUTPATH/SST-2.zip" ]; then
    wget -c "${URLPATH}SST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8" -P "$OUTPATH"
  fi
  unzip "$OUTPATH"/*SST-2* -d "$OUTPATH"

  for split in train dev; do
    # sed '1d' drops the TSV header row.
    # NOTE(review): unlike the sentence-pair tasks, the text here goes through
    # the Moses punctuation/non-printing-char scripts rather than
    # $TOKENIZE + $LOWER_REMOVE_ACCENT - confirm this is intentional.
    sed '1d' "$OUTPATH/SST-2/${split}.tsv" | cut -f1 \
      | "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l en | "$REM_NON_PRINT_CHAR" \
      > "$OUTPATH/SST-2/${split}.x"
    sed '1d' "$OUTPATH/SST-2/${split}.tsv" | cut -f2 > "$OUTPATH/SST-2/${split}.y"
    # Re-join cleaned text and label into a two-column TSV, then drop the
    # intermediate files.
    paste "$OUTPATH/SST-2/${split}.x" "$OUTPATH/SST-2/${split}.y" > "$OUTPATH/SST-2/${split}.xlm.tsv"
    rm "$OUTPATH/SST-2/${split}.x" "$OUTPATH/SST-2/${split}.y"
  done

  # Test split: only the text column is written, no label.
  sed '1d' "$OUTPATH/SST-2/test.tsv" | cut -f2 \
    | "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l en | "$REM_NON_PRINT_CHAR" \
    > "$OUTPATH/SST-2/test.xlm.tsv"

  rm "$OUTPATH"/*SST-2.zip*
fi
# STS-B
# Sentence-pair task: columns 8 and 9 hold the two sentences, column 10 the
# score. train/dev produce "s1<TAB>s2<TAB>label"; test produces "s1<TAB>s2".
# The whole section is skipped when the extracted directory already exists.
if [ ! -d "$OUTPATH/STS-B" ]; then
  if [ ! -f "$OUTPATH/STS-B.zip" ]; then
    wget -c "${URLPATH}STS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5" -P "$OUTPATH"
  fi
  unzip "$OUTPATH"/*STS-B* -d "$OUTPATH"

  for split in train dev test; do
    # sed '1d' drops the TSV header row before the column is extracted.
    sed '1d' "$OUTPATH/STS-B/${split}.tsv" | cut -f8 | "$TOKENIZE" en \
      | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/STS-B/${split}.x1"
    sed '1d' "$OUTPATH/STS-B/${split}.tsv" | cut -f9 | "$TOKENIZE" en \
      | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/STS-B/${split}.x2"

    if [ "$split" != "test" ]; then
      # Labelled splits: append the untouched label column.
      sed '1d' "$OUTPATH/STS-B/${split}.tsv" | cut -f10 > "$OUTPATH/STS-B/${split}.y"
      paste "$OUTPATH/STS-B/${split}.x1" "$OUTPATH/STS-B/${split}.x2" "$OUTPATH/STS-B/${split}.y" \
        > "$OUTPATH/STS-B/${split}.xlm.tsv"
      rm "$OUTPATH/STS-B/${split}.x1" "$OUTPATH/STS-B/${split}.x2" "$OUTPATH/STS-B/${split}.y"
    else
      # Test split: sentences only.
      paste "$OUTPATH/STS-B/${split}.x1" "$OUTPATH/STS-B/${split}.x2" \
        > "$OUTPATH/STS-B/${split}.xlm.tsv"
      rm "$OUTPATH/STS-B/${split}.x1" "$OUTPATH/STS-B/${split}.x2"
    fi
  done

  rm "$OUTPATH"/*STS-B.zip*
fi
# MNLI
# Sentence-pair task: columns 9 and 10 hold the two sentences, column 12 is
# written as the label. Every split produces "s1<TAB>s2<TAB>label".
# The whole section is skipped when the extracted directory already exists.
if [ ! -d "$OUTPATH/MNLI" ]; then
  if [ ! -f "$OUTPATH/MNLI.zip" ]; then
    wget -c "${URLPATH}MNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce" -P "$OUTPATH"
  fi
  unzip "$OUTPATH"/*MNLI* -d "$OUTPATH"

  # Use the "matched" dev/test files under the generic names so the loop
  # below can treat all three splits uniformly.
  mv "$OUTPATH/MNLI/dev_matched.tsv" "$OUTPATH/MNLI/dev.tsv"
  mv "$OUTPATH/MNLI/test_matched.tsv" "$OUTPATH/MNLI/test.tsv"

  for split in train dev test; do
    # sed '1d' drops the TSV header row before the column is extracted.
    sed '1d' "$OUTPATH/MNLI/${split}.tsv" | cut -f9 | "$TOKENIZE" en \
      | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/MNLI/${split}.x1"
    sed '1d' "$OUTPATH/MNLI/${split}.tsv" | cut -f10 | "$TOKENIZE" en \
      | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/MNLI/${split}.x2"
    # NOTE(review): column 12 is extracted for the test split too - confirm
    # that test.tsv actually carries a label in that column.
    sed '1d' "$OUTPATH/MNLI/${split}.tsv" | cut -f12 > "$OUTPATH/MNLI/${split}.y"
    paste "$OUTPATH/MNLI/${split}.x1" "$OUTPATH/MNLI/${split}.x2" "$OUTPATH/MNLI/${split}.y" \
      > "$OUTPATH/MNLI/${split}.xlm.tsv"
    rm "$OUTPATH/MNLI/${split}.x1" "$OUTPATH/MNLI/${split}.x2" "$OUTPATH/MNLI/${split}.y"
  done

  rm "$OUTPATH"/*MNLI.zip*
fi
# QNLI
# Sentence-pair task: columns 2 and 3 hold the two texts, column 4 the label.
# train/dev produce "s1<TAB>s2<TAB>label"; test produces "s1<TAB>s2".
# The archive is named QNLIv2 but is expected to extract into $OUTPATH/QNLI.
# The whole section is skipped when that directory already exists.
if [ ! -d "$OUTPATH/QNLI" ]; then
  if [ ! -f "$OUTPATH/QNLIv2.zip" ]; then
    wget -c "${URLPATH}QNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601" -P "$OUTPATH"
  fi
  unzip "$OUTPATH"/*QNLIv2* -d "$OUTPATH"

  for split in train dev test; do
    # sed '1d' drops the TSV header row before the column is extracted.
    sed '1d' "$OUTPATH/QNLI/${split}.tsv" | cut -f2 | "$TOKENIZE" en \
      | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/QNLI/${split}.x1"
    sed '1d' "$OUTPATH/QNLI/${split}.tsv" | cut -f3 | "$TOKENIZE" en \
      | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/QNLI/${split}.x2"

    if [ "$split" != "test" ]; then
      # Labelled splits: append the untouched label column.
      sed '1d' "$OUTPATH/QNLI/${split}.tsv" | cut -f4 > "$OUTPATH/QNLI/${split}.y"
      paste "$OUTPATH/QNLI/${split}.x1" "$OUTPATH/QNLI/${split}.x2" "$OUTPATH/QNLI/${split}.y" \
        > "$OUTPATH/QNLI/${split}.xlm.tsv"
      rm "$OUTPATH/QNLI/${split}.x1" "$OUTPATH/QNLI/${split}.x2" "$OUTPATH/QNLI/${split}.y"
    else
      # Test split: texts only.
      paste "$OUTPATH/QNLI/${split}.x1" "$OUTPATH/QNLI/${split}.x2" \
        > "$OUTPATH/QNLI/${split}.xlm.tsv"
      rm "$OUTPATH/QNLI/${split}.x1" "$OUTPATH/QNLI/${split}.x2"
    fi
  done

  rm "$OUTPATH"/*QNLIv2.zip*
fi
# QQP
# Sentence-pair task. The column layout differs between splits:
#   train/dev: questions in columns 4 and 5, label in column 6
#   test:      questions in columns 2 and 3, no label written
# The whole section is skipped when the extracted directory already exists.
if [ ! -d "$OUTPATH/QQP" ]; then
  if [ ! -f "$OUTPATH/QQP.zip" ]; then
    wget -c "${URLPATH}QQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5" -P "$OUTPATH"
  fi
  unzip "$OUTPATH"/*QQP* -d "$OUTPATH"

  for split in train dev test; do
    # sed '1d' drops the TSV header row before the column is extracted.
    if [ "$split" != "test" ]; then
      sed '1d' "$OUTPATH/QQP/${split}.tsv" | cut -f4 | "$TOKENIZE" en \
        | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/QQP/${split}.x1"
      sed '1d' "$OUTPATH/QQP/${split}.tsv" | cut -f5 | "$TOKENIZE" en \
        | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/QQP/${split}.x2"
      sed '1d' "$OUTPATH/QQP/${split}.tsv" | cut -f6 > "$OUTPATH/QQP/${split}.y"
      paste "$OUTPATH/QQP/${split}.x1" "$OUTPATH/QQP/${split}.x2" "$OUTPATH/QQP/${split}.y" \
        > "$OUTPATH/QQP/${split}.xlm.tsv"
      rm "$OUTPATH/QQP/${split}.x1" "$OUTPATH/QQP/${split}.x2" "$OUTPATH/QQP/${split}.y"
    else
      sed '1d' "$OUTPATH/QQP/${split}.tsv" | cut -f2 | "$TOKENIZE" en \
        | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/QQP/${split}.x1"
      sed '1d' "$OUTPATH/QQP/${split}.tsv" | cut -f3 | "$TOKENIZE" en \
        | python "$LOWER_REMOVE_ACCENT" > "$OUTPATH/QQP/${split}.x2"
      paste "$OUTPATH/QQP/${split}.x1" "$OUTPATH/QQP/${split}.x2" \
        > "$OUTPATH/QQP/${split}.xlm.tsv"
      rm "$OUTPATH/QQP/${split}.x1" "$OUTPATH/QQP/${split}.x2"
    fi
  done

  rm "$OUTPATH"/*QQP.zip*
fi
# Get BPE codes and vocab
wget -c https://dl.fbaipublicfiles.com/XLM/codes_xnli_15 -P "$MAIN_PATH"
wget -c https://dl.fbaipublicfiles.com/XLM/vocab_xnli_15 -P "$MAIN_PATH"

# apply BPE codes and binarize the GLUE corpora
# For every task and split, each sentence column of <split>.xlm.tsv is run
# through fastBPE and then preprocess.py, producing <split>.s1 (and <split>.s2
# for sentence-pair tasks) under $PROCESSED_PATH/eval/<task>; the label column,
# when kept, is copied verbatim to <split>.label.
glue_tasks="MNLI QNLI QQP SST-2 STS-B" # TODO: missing MRPC
# $glue_tasks is intentionally unquoted: it must split into one word per task.
for task in $glue_tasks; do
  if [ ! -d "$PROCESSED_PATH/eval/$task" ]; then
    mkdir -p "$PROCESSED_PATH/eval/$task"
  else
    # Start from a clean output directory. -f keeps this from failing (and,
    # under `set -e`, aborting the script) when the directory is already
    # empty and the glob matches nothing; ${PROCESSED_PATH:?} refuses to run
    # if the variable is empty or unset.
    rm -rf -- "${PROCESSED_PATH:?}/eval/$task"/*
  fi

  for splt in train dev test; do
    FPATH="$OUTPATH/${task}/${splt}.xlm.tsv"

    # First sentence (column 1): extract to a temp file, BPE-encode, binarize.
    cut -f1 "$FPATH" > "${FPATH}.f1"
    "$FASTBPE" applybpe "$PROCESSED_PATH/eval/$task/${splt}.s1" "${FPATH}.f1" "$CODES_PATH"
    python preprocess.py "$VOCAB_PATH" "$PROCESSED_PATH/eval/$task/${splt}.s1"
    rm "${FPATH}.f1"

    if [ "$task" != "CoLA" ] && [ "$task" != "SST-2" ]; then
      # Sentence-pair task: column 2 is the second sentence, column 3 the label.
      cut -f2 "$FPATH" > "${FPATH}.f2"
      "$FASTBPE" applybpe "$PROCESSED_PATH/eval/$task/${splt}.s2" "${FPATH}.f2" "$CODES_PATH"
      python preprocess.py "$VOCAB_PATH" "$PROCESSED_PATH/eval/$task/${splt}.s2"
      rm "${FPATH}.f2"

      # Labels are written for train/dev only, except MRPC where the test
      # split gets one as well.
      if [ "$splt" != "test" ] || [ "$task" = "MRPC" ]; then
        cut -f3 "$FPATH" > "$PROCESSED_PATH/eval/$task/${splt}.label"
      fi
    else
      # Single-sentence task (CoLA, SST-2): column 2 is the label; none is
      # written for the test split.
      if [ "$splt" != "test" ]; then
        cut -f2 "$FPATH" > "$PROCESSED_PATH/eval/$task/${splt}.label"
      fi
    fi
  done
done