-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path06.align
executable file
·40 lines (34 loc) · 1.28 KB
/
06.align
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/bash
#
# Run the alignment pipeline for one (source batch, reference batch) pair:
#
#   $DOCALIGN | tee <pairs file> | $DOCJOIN | parallel $BLEUALIGN | gzip
#
# The compressed result is first written under a temporary name and renamed to
#   <SRC_BATCH>/aligned-<REF_BATCH_ID>.gz
# only after the whole pipeline succeeded, so a file with the final name is
# never a partial result.
#
# Usage: <script> SLANG SRC_BATCH REF_BATCH
#   SLANG      language identifier of the source side (presumably; in this
#              script it is only used as a key in the timing record)
#   SRC_BATCH  source batch directory; the name of its parent directory is
#              reported as the shard id
#   REF_BATCH  reference batch directory
#
# Environment:
#   TARGET_LANG                     required; selects the tokenised_*/sentences_*
#                                   files and names the pairs file
#   DOCALIGN, DOCJOIN, BLEUALIGN    required; commands to run
#   THREADS                         job count, used when the specific override
#                                   below is unset or empty
#   DOCALIGN_THREADS, BLEUALIGN_THREADS
#                                   optional per-tool job counts
#   TMPDIR                          required; passed to parallel --tmpdir
#   JOB_ID                          optional; suffix of the temporary output
#                                   file (defaults to this shell's PID)
set -euo pipefail

if (( $# < 3 )); then
  printf 'Usage: %s SLANG SRC_BATCH REF_BATCH\n' "${0##*/}" >&2
  exit 1
fi

SLANG="$1"
SRC_BATCH="$2"
REF_BATCH="$3"

# Fail before anything is written if the environment is incomplete; otherwise
# the problem would only surface in the middle of the pipeline.
: "${TARGET_LANG:?TARGET_LANG must be set}"
: "${DOCALIGN:?DOCALIGN must be set}"
: "${DOCJOIN:?DOCJOIN must be set}"
: "${BLEUALIGN:?BLEUALIGN must be set}"
: "${TMPDIR?TMPDIR must be set}"

SHARD_ID=$(basename "$(dirname "$SRC_BATCH")")
SRC_BATCH_ID=$(basename "$SRC_BATCH")
REF_BATCH_ID=$(basename "$REF_BATCH")

# Language identifiers with the shortest trailing "~..." suffix removed
# (e.g. "xx~yy" -> "xx"); identifiers without "~" are used unchanged.
source_code=${SLANG%~*}
target_code=${TARGET_LANG%~*}

# JSON fragment identifying this batch pair in the timing records. The batch
# ids are inserted without quotes, so they are presumably numeric -- TODO confirm.
printf -v PAIR_FORMAT '{"shard": "%s", "%s":%s, "%s":%s}' \
  "$SHARD_ID" "$source_code" "$SRC_BATCH_ID" "$target_code" "$REF_BATCH_ID"

# Resource-usage fields for /usr/bin/time -f (single quotes keep the %X
# specifiers literal for time to expand).
TIME_FORMAT='{"user":"%U", "system":"%S", "elapsed":"%E", "cpu":"%P", "text":%X, "data":%D, "max":%M, "inputs":%I, "output":%O, "major":%F, "minor":%R, "swaps":%W}'

TMPSFX=${JOB_ID:-$$}
final_out="${SRC_BATCH}/aligned-${REF_BATCH_ID}.gz"
tmp_out="${final_out}.${TMPSFX}"

# Remove the partial output if the pipeline fails. After a successful rename
# the temporary name no longer exists and this is a no-op.
trap 'rm -f -- "$tmp_out"' EXIT

# Pipeline stages:
#   1. $DOCALIGN reads the tokenised_<target> file of BOTH batches.
#   2. tee keeps a copy of its output in pairs-<target>-<REF_BATCH_ID>.txt.
#   3. $DOCJOIN is given the sentence files of both batches.
#   4. parallel feeds the joined stream to $BLEUALIGN (--pipe, -l 1) and stops
#      on failure (--halt 2).
#   5. gzip compresses the result into the temporary output file.
# Both timed stages emit a JSON record through /usr/bin/time -f.
#
# DOCALIGN, DOCJOIN and BLEUALIGN are intentionally left unquoted so that they
# may hold a command followed by arguments, as before.
# shellcheck disable=SC2086
/usr/bin/time -f '{"task":"docalign", "pair":'"$PAIR_FORMAT"', "time":'"$TIME_FORMAT"'}' \
  ${DOCALIGN} -j "${DOCALIGN_THREADS:-$THREADS}" --threshold 0.1 \
    "${SRC_BATCH}/tokenised_${target_code}.gz" \
    "${REF_BATCH}/tokenised_${target_code}.gz" \
  | tee "${SRC_BATCH}/pairs-${target_code}-${REF_BATCH_ID}.txt" \
  | ${DOCJOIN} \
    -li \
    -ri \
    -l "${SRC_BATCH}/sentences.gz" \
    -r "${REF_BATCH}/sentences.gz" \
    -l "${SRC_BATCH}/sentences_${target_code}.gz" \
  | /usr/bin/time -f '{"task":"bleualign", "pair":'"$PAIR_FORMAT"', "time":'"$TIME_FORMAT"'}' \
    parallel \
      --tmpdir="$TMPDIR" \
      -j"${BLEUALIGN_THREADS:-$THREADS}" \
      --halt 2 \
      --pipe \
      --group \
      -l 1 \
      ${BLEUALIGN} --print-sent-hash --bleu-threshold 0.2 \
  | gzip -c \
  > "$tmp_out"

# Publish the finished file under its final name.
mv -- "$tmp_out" "$final_out"