Code for the EMNLP 2023 paper "Towards Unsupervised Recognition of Token-level Semantic Differences in Related Documents".
- Requires Python >= 3.7 and PyTorch
- Install the remaining dependencies:

```bash
pip install -r requirements.txt
```
Usage example (the commented lines at the end show the value of `result`):

```python
from recognizers import DiffAlign, DiffDel, DiffMask
diff_align = DiffAlign("ZurichNLP/unsup-simcse-xlm-roberta-base")
a = "Chinese shares close higher Friday ."
b = "Chinese shares close lower Wednesday ."
result = diff_align.predict(a, b)
# DifferenceSample(
# tokens_a=('Chinese', 'shares', 'close', 'higher', 'Friday', '.'),
# tokens_b=('Chinese', 'shares', 'close', 'lower', 'Wednesday', '.'),
# labels_a=(0.07324671745300293, 0.06292498111724854, 0.082577645778656, 0.1421372890472412, 0.2610551714897156, 0.1118348240852356),
# labels_b=(0.07324671745300293, 0.06292498111724854, 0.082577645778656, 0.1421372890472412, 0.2709317207336426, 0.1118348240852356)
# )
```
To create the tables, run the following scripts:

```bash
python -m experiments.scripts.create_validation_table
python -m experiments.scripts.create_test_table
python -m experiments.scripts.create_dataset_statistics_table
python -m experiments.scripts.create_latency_table
python -m experiments.scripts.create_validation_table_del_ablations
```
To create the figures, run the following scripts:

```bash
python -m experiments.scripts.create_negative_ratio_figure
python -m experiments.scripts.create_document_length_figure
python -m experiments.scripts.create_permutation_figure
python -m experiments.scripts.create_languages_figure
```
Citation (BibTeX):

```bibtex
@inproceedings{vamvas-sennrich-2023-rsd,
    title = {Towards Unsupervised Recognition of Token-level Semantic Differences in Related Documents},
    author = {Jannis Vamvas and Rico Sennrich},
    month = dec,
    year = "2023",
    booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
}
```