# scripts/compute_isometric_slt_stat.sh

#!/bin/bash

# Script for MT evaluation (BERTScore, SacreBLEU, and LC / length-compliance statistics).
# Usage: compute_isometric_slt_stat.sh TGT MT_INPUT MT_OUTPUT REFERENCE
# For more detail on the metrics, please see ./scripts/README

# Stop at the first command that exits with a non-zero status.
set -o errexit

# Language pair: the source side is fixed, the target side is the first argument.
SRC='en'
TGT="${1}"

# Remaining positional arguments: MT input, MT output and reference files.
MT_INPUT="${2}"
MT_OUTPUT="${3}"
REFERENCE="${4}"

# The log file and the length-compliance script are both resolved against
# the current working directory.
PWD="${PWD}"
LOG="${PWD}/log.stat.${TGT}"  # alternative name: ${MT_OUTPUT}.stat
LC_STAT="${PWD}/isometric_slt_stat.py"

# bert-score settings. The model is the one reported as best for bert-score,
# considering a higher correlation with human evaluation
# (see https://github.com/Tiiiger/bert_score).
BATCH=32
BERTSCORE_MODEL="microsoft/deberta-xlarge-mnli"

#######################################
# Compute and log the evaluation statistics for one MT output.
# Runs the length-compliance script on (input, MT output), then sacrebleu and
# bert-score on (MT output, reference), and prints a report that is also
# appended to the log file.
# Globals:
#   LC_STAT, BERTSCORE_MODEL, BATCH, SRC, TGT, LOG   (read)
#   LC_INPUT_MT, SACREBLEU, BERTSCORE                (written)
# Arguments:
#   $1 - input file, passed to the length-compliance script as -s
#   $2 - MT output file, scored by all three tools
#   $3 - reference file for sacrebleu and bert-score
# Outputs:
#   A two-line preview of each file and the report on stdout; the report is
#   additionally appended to $LOG.
#######################################
compute_stat () {
        local INPUT=$1
        local MT=$2
        local REF=$3

        # Every expansion below is quoted so that paths containing whitespace
        # or glob characters reach each tool as a single argument.
        # printf '%s' prints the preview verbatim; `echo -e` would expand
        # backslash sequences that occur inside the text itself.
        printf 'Computing stat for:\n%s\n' "$(head -n 2 -- "$INPUT" "$MT" "$REF")"

        # compute length compliance (length ratio and range) between the MT output and the input
        LC_INPUT_MT=$(python "$LC_STAT" -s "$INPUT" -t "$MT")
        # compute detok-bleu and rescaled bert-score between the MT output and the reference
        SACREBLEU=$(sacrebleu "$REF" -i "$MT" -m bleu)
        # NOTE(review): `--m` is kept exactly as before; presumably it is accepted
        # as an abbreviation of bert-score's model option -- confirm against the CLI.
        BERTSCORE=$(bert-score --rescale_with_baseline --m "$BERTSCORE_MODEL" -r "$REF" -c "$MT" --lang "$TGT" --batch_size "$BATCH")

        # The three `%20s` fields (fed empty strings) reproduce the
        # whitespace-only spacer lines of the existing report layout, so
        # consumers of the log see the same format as before.
        printf 'PAIR: %s->%s \nINPUT: %s \nMT: %s \nREF: %s\n%20s\nSACRE Bleu: %s\n%20s\nBERT Score: %s\n%20s\nLength Stat: %s\n' \
                "$SRC" "$TGT" "$INPUT" "$MT" "$REF" \
                '' "$SACREBLEU" '' "$BERTSCORE" '' "$LC_INPUT_MT" \
                | tee -a "$LOG"
}

# Evaluate the files given on the command line.
compute_stat "${MT_INPUT}" "${MT_OUTPUT}" "${REFERENCE}"
# Terminate this log entry with a separator and a blank line. The log path is
# quoted so that a path containing whitespace is still a single tee target.
printf '===\n\n' | tee -a "$LOG"