Human evaluations of language generation tasks facilitate the development of both generation models and automatic metrics. We provide THumB (Transparent Human Benchmark) scores for several generation tasks; more tasks may be added in the future. If you use THumB in your work, please cite the following papers:
@inproceedings{kasai2022thumb,
    title     = {Transparent Human Evaluation for Image Captioning},
    author    = {Jungo Kasai and Keisuke Sakaguchi and Lavinia Dunagan and Jacob Morrison and Ronan Le Bras and Yejin Choi and Noah A. Smith},
    year      = {2022},
    booktitle = {Proc.\ of NAACL},
    url       = {https://arxiv.org/abs/2111.08940},
}
@article{fabbri2021summeval,
    title   = {{SummEval}: Re-evaluating Summarization Evaluation},
    author  = {Fabbri, Alexander R and Kry{\'s}ci{\'n}ski, Wojciech and McCann, Bryan and Xiong, Caiming and Socher, Richard and Radev, Dragomir},
    journal = {TACL},
    year    = {2021},
    url     = {https://arxiv.org/abs/2007.12626},
}