-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocess.sh
41 lines (32 loc) · 864 Bytes
/
preprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env bash
# Regenerates preprocessed data from scratch: sources ./config.sh, turns on
# fail-fast and command tracing, then deletes the previous output directories.
# NOTE(review): DATA_BIN, DATA_RAW and the other upper-case variables used in
# this script are presumably defined by config.sh (not visible here) — confirm.
source ./config.sh
# -e aborts on the first failing command. pipefail is needed as well: the
# preprocessing commands are piped into `tee`, and without it a pipeline's
# status is tee's, so a failed `python` run would go unnoticed.
set -eo pipefail
set -x
# Remove stale outputs. `${VAR:?}` aborts with a message when the variable is
# unset or empty, quoting keeps paths with spaces/globs intact, and `--` stops
# a path starting with '-' from being parsed as an option.
rm -rf -- "${DATA_BIN:?DATA_BIN is unset or empty}"
rm -rf -- "${DATA_RAW:?DATA_RAW is unset or empty}"
# Extra flag string enabling the copy/extended-dictionary option.
copy_params="--copy-ext-dict"
# Flags shared by the train/valid and test invocations. The value is one
# newline-separated string (each entry followed by a newline) that is meant
# to be expanded unquoted so the shell splits it into individual arguments.
printf -v common_params '%s\n' \
  '--source-lang src --target-lang tgt' \
  '--padding-factor 1' \
  "--srcdict ./dicts/${DICT}" \
  '--joined-dictionary'
# File prefixes of the three data splits, all located under $DATA.
trainpref="${DATA}/${TRAIN_PREF}"
validpref="${DATA}/${VALID_PREF}"
testpref="${DATA}/${TEST_PREF}"
# preprocess train/valid
# Runs preprocess.py on the train and valid prefixes, writing binary output
# to $DATA_BIN and using "<trainpref>.forward" as the alignment file. Output
# is shown on the terminal and also saved to $OUT/log/data_bin.log.
# Create the log directory first so tee cannot fail on a missing path.
mkdir -p -- "$OUT/log"
# $common_params and $copy_params are left unquoted on purpose: each holds
# several whitespace-separated flags that must be split into separate words.
# Every path expansion is quoted so spaces or glob characters survive intact.
# shellcheck disable=SC2086
python preprocess.py $common_params $copy_params \
  --trainpref "$trainpref" \
  --validpref "$validpref" \
  --destdir "$DATA_BIN" \
  --output-format binary \
  --alignfile "${trainpref}.forward" \
  | tee "$OUT/log/data_bin.log"
# preprocess test
# Runs preprocess.py on the test prefix, writing raw-format output to
# $DATA_RAW. Output is shown on the terminal and also saved to
# $OUT/log/data_raw.log.
# Create the log directory first so tee cannot fail on a missing path.
mkdir -p -- "$OUT/log"
# $common_params and $copy_params are left unquoted on purpose: each holds
# several whitespace-separated flags that must be split into separate words.
# Every path expansion is quoted so spaces or glob characters survive intact.
# shellcheck disable=SC2086
python preprocess.py $common_params $copy_params \
  --testpref "$testpref" \
  --destdir "$DATA_RAW" \
  --output-format raw \
  | tee "$OUT/log/data_raw.log"
# mv $DATA_RAW/test.src-tgt.src $DATA_RAW/test.src-tgt.src.old