From d40c6bcdee6e4adfc2cf70058b669db6d0414c8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20F=C3=BCrneisen?=
Date: Mon, 6 Dec 2021 09:21:48 +0000
Subject: [PATCH] Add XTransformer backend.
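
The backend wraps the XTransformer model from the pecos library. Training
data combines TF-IDF features from the shared vectorizer mixin with the raw
document text, and the pecos training and prediction parameters are exposed
through the regular backend configuration. The dependency is installed via
the new optional 'pecos' extra.

An illustrative project configuration for the new backend (section name,
vocabulary and parameter values are placeholders, not part of this patch):

    [xtransformer-en]
    name=XTransformer English
    language=en
    backend=xtransformer
    vocab=yso-en
    limit=100
    batch_size=16
    use_gpu=True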
---
 annif/backend/__init__.py          |   2 +
 annif/backend/xtransformer.py      | 248 +++++++++++++++++++++++++
 setup.py                           |   1 +
 tests/test_backend_xtransformer.py | 281 +++++++++++++++++++++++++++++
 4 files changed, 532 insertions(+)
 create mode 100644 annif/backend/xtransformer.py
 create mode 100644 tests/test_backend_xtransformer.py

diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py
index cabaf5625..8a0c0c9cd 100644
--- a/annif/backend/__init__.py
+++ b/annif/backend/__init__.py
@@ -9,6 +9,7 @@
 from . import stwfsa
 from . import mllm
 from . import svc
+from . import xtransformer
 import annif
@@ -35,6 +36,7 @@ def get_backend(backend_id):
 register_backend(stwfsa.StwfsaBackend)
 register_backend(mllm.MLLMBackend)
 register_backend(svc.SVCBackend)
+register_backend(xtransformer.XTransformerBackend)
 
 # Optional backends
 try:
diff --git a/annif/backend/xtransformer.py b/annif/backend/xtransformer.py
new file mode 100644
index 000000000..0abc60c78
--- /dev/null
+++ b/annif/backend/xtransformer.py
@@ -0,0 +1,248 @@
+"""Annif backend using the transformer variant of pecos."""
+
+from sys import stdout
+import os.path as osp
+import logging
+import scipy.sparse as sp
+import numpy as np
+
+from annif.exception import NotInitializedException, NotSupportedException
+from annif.suggestion import ListSuggestionResult, SubjectSuggestion
+from . import mixins
+from . import backend
+from annif.util import boolean, apply_param_parse_config, atomic_save
+
+from pecos.xmc.xtransformer.model import XTransformer
+from pecos.xmc.xtransformer.module import MLProblemWithText
+from pecos.utils.featurization.text.preprocess import Preprocessor
+from pecos.xmc.xtransformer import matcher
+
+
+class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
+    """XTransformer based backend for Annif"""
+    name = 'xtransformer'
+    needs_subject_index = True
+
+    _model = None
+
+    train_X_file = 'xtransformer-train-X.npz'
+    train_y_file = 'xtransformer-train-y.npz'
+    train_txt_file = 'xtransformer-train-raw.txt'
+    model_folder = 'xtransformer-model'
+
+    PARAM_CONFIG = {
+        'min_df': int,
+        'ngram': int,
+        'fix_clustering': boolean,
+        'nr_splits': int,
+        'min_codes': int,
+        'max_leaf_size': int,
+        'imbalanced_ratio': float,
+        'imbalanced_depth': int,
+        'max_match_clusters': int,
+        'do_fine_tune': boolean,
+        'model_shortcut': str,
+        'beam_size': int,
+        'limit': int,
+        'post_processor': str,
+        'negative_sampling': str,
+        'ensemble_method': str,
+        'threshold': float,
+        'loss_function': str,
+        'truncate_length': int,
+        'hidden_dropout_prob': float,
+        'batch_size': int,
+        'gradient_accumulation_steps': int,
+        'learning_rate': float,
+        'weight_decay': float,
+        'adam_epsilon': float,
+        'num_train_epochs': int,
+        'max_steps': int,
+        'lr_schedule': str,
+        'warmup_steps': int,
+        'logging_steps': int,
+        'save_steps': int,
+        'max_active_matching_labels': int,
+        'max_num_labels_in_gpu': int,
+        'use_gpu': boolean,
+        'bootstrap_model': str
+    }
+
+    DEFAULT_PARAMETERS = {
+        'min_df': 1,
+        'ngram': 1,
+        'fix_clustering': False,
+        'nr_splits': 16,
+        'min_codes': None,
+        'max_leaf_size': 100,
+        'imbalanced_ratio': 0.0,
+        'imbalanced_depth': 100,
+        'max_match_clusters': 32768,
+        'do_fine_tune': True,
+        # 'model_shortcut': 'distilbert-base-multilingual-cased',
+        'model_shortcut': 'bert-base-multilingual-uncased',
+        'beam_size': 20,
+        'limit': 100,
+        'post_processor': 'sigmoid',
+        'negative_sampling': 'tfn',
+        'ensemble_method': 'transformer-only',
+        'threshold': 0.1,
+        'loss_function': 'squared-hinge',
+        'truncate_length': 128,
+        'hidden_dropout_prob': 0.1,
+        'batch_size': 32,
+        'gradient_accumulation_steps': 1,
+        'learning_rate': 1e-4,
+        'weight_decay': 0.0,
+        'adam_epsilon': 1e-8,
+        'num_train_epochs': 1,
+        'max_steps': 0,
+        'lr_schedule': 'linear',
+        'warmup_steps': 0,
+        'logging_steps': 100,
+        'save_steps': 1000,
+        'max_active_matching_labels': None,
+        'max_num_labels_in_gpu': 65536,
+        'use_gpu': True,
+        'bootstrap_model': 'linear'
+    }
+
+    def _initialize_model(self):
+        if self._model is None:
+            path = osp.join(self.datadir, self.model_folder)
+            self.debug('loading model from {}'.format(path))
+            if osp.exists(path):
+                self._model = XTransformer.load(path)
+            else:
+                raise NotInitializedException(
+                    'model {} not found'.format(path),
+                    backend_id=self.backend_id)
+
+    def initialize(self, parallel=False):
+        self.initialize_vectorizer()
+        self._initialize_model()
+
+    def default_params(self):
+        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
+        params.update(self.DEFAULT_PARAMETERS)
+        return params
+
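+    # Training data layout written by _create_train_files (all three files
+    # share the same document order):
+    # - train_X_file: CSR matrix of TF-IDF features, one row per document
+    # - train_y_file: CSR indicator matrix, ones at the subject ids
+    #   assigned to each document
+    # - train_txt_file: whitespace-normalized document text, one per line
+    # Documents with no text or no known subjects are skipped.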
self.info("Reusing cached training data from previous run.") + else: + if corpus.is_empty(): + raise NotSupportedException( + 'Cannot t project with no documents') + input = (doc.text for doc in corpus.documents) + vecparams = {'min_df': int(params['min_df']), + 'tokenizer': self.project.analyzer.tokenize_words, + 'ngram_range': (1, int(params['ngram']))} + veccorpus = self.create_vectorizer(input, vecparams) + self._create_train_files(veccorpus, corpus) + self._create_model(params, jobs) + + def _suggest(self, text, params): + text = ' '.join(text.split()) + vector = self.vectorizer.transform([text]) + if vector.nnz == 0: # All zero vector, empty result + return ListSuggestionResult([]) + new_params = apply_param_parse_config( + self.PARAM_CONFIG, + params + ) + prediction = self._model.predict( + [text], + X_feat=vector.sorted_indices(), + batch_size=params['batch_size'], + use_gpu=new_params['use_gpu'], + only_top_k=new_params['limit'], + post_processor=new_params['post_processor']) + results = [] + for idx, score in zip(prediction.indices, prediction.data): + subject = self.project.subjects[idx] + results.append(SubjectSuggestion( + uri=subject[0], + label=subject[1], + notation=subject[2], + score=score + )) + return ListSuggestionResult(results) diff --git a/setup.py b/setup.py index 8eae48f68..7e115fb92 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ def read(fname): 'omikuji': ['omikuji==0.3.*'], 'yake': ['yake==0.4.5'], 'pycld3': ['pycld3'], + 'pecos': ['libpecos==0.2.3'], 'dev': [ 'codecov', 'pytest-cov', diff --git a/tests/test_backend_xtransformer.py b/tests/test_backend_xtransformer.py new file mode 100644 index 000000000..bbd2de9cd --- /dev/null +++ b/tests/test_backend_xtransformer.py @@ -0,0 +1,281 @@ +"""Unit tests for the XTransformer backend in Annif""" + +from scipy.sparse import load_npz, csr_matrix + +from os import mknod +import os.path as osp +import pytest +from unittest.mock import MagicMock, patch +import annif.backend +import annif.corpus +from annif.exception import NotInitializedException, NotSupportedException + + +pytest.importorskip('annif.backend.xtransformer') +XTransformer = annif.backend.xtransformer.XTransformer + + +@pytest.fixture +def mocked_xtransformer(datadir, project): + model_mock = MagicMock() + model_mock.save.side_effect = lambda x: mknod(osp.join(x, 'test')) + + return patch.object( + annif.backend.xtransformer.XTransformer, + 'train', + return_value=model_mock) + + +def test_xtransformer_default_params(project): + backend_type = annif.backend.get_backend('xtransformer') + xtransformer = backend_type( + backend_id='xtransfomer', + config_params={}, + project=project + ) + expected = { + 'min_df': 1, + 'ngram': 1, + 'fix_clustering': False, + 'nr_splits': 16, + 'min_codes': None, + 'max_leaf_size': 100, + 'imbalanced_ratio': 0.0, + 'imbalanced_depth': 100, + 'max_match_clusters': 32768, + 'do_fine_tune': True, + # 'model_shortcut': 'distilbert-base-multilingual-cased', + 'model_shortcut': 'bert-base-multilingual-uncased', + 'beam_size': 20, + 'limit': 100, + 'post_processor': 'sigmoid', + 'negative_sampling': 'tfn', + 'ensemble_method': 'transformer-only', + 'threshold': 0.1, + 'loss_function': 'squared-hinge', + 'truncate_length': 128, + 'hidden_droput_prob': 0.1, + 'batch_size': 32, + 'gradient_accumulation_steps': 1, + 'learning_rate': 1e-4, + 'weight_decay': 0.0, + 'adam_epsilon': 1e-8, + 'num_train_epochs': 1, + 'max_steps': 0, + 'lr_schedule': 'linear', + 'warmup_steps': 0, + 'logging_steps': 100, + 'save_steps': 1000, + 
+    def _suggest(self, text, params):
+        text = ' '.join(text.split())
+        vector = self.vectorizer.transform([text])
+        if vector.nnz == 0:  # All zero vector, empty result
+            return ListSuggestionResult([])
+        new_params = apply_param_parse_config(
+            self.PARAM_CONFIG,
+            params
+        )
+        prediction = self._model.predict(
+            [text],
+            X_feat=vector.sorted_indices(),
+            batch_size=new_params['batch_size'],
+            use_gpu=new_params['use_gpu'],
+            only_top_k=new_params['limit'],
+            post_processor=new_params['post_processor'])
+        results = []
+        for idx, score in zip(prediction.indices, prediction.data):
+            subject = self.project.subjects[idx]
+            results.append(SubjectSuggestion(
+                uri=subject[0],
+                label=subject[1],
+                notation=subject[2],
+                score=score
+            ))
+        return ListSuggestionResult(results)
diff --git a/setup.py b/setup.py
index 8eae48f68..7e115fb92 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@ def read(fname):
         'omikuji': ['omikuji==0.3.*'],
         'yake': ['yake==0.4.5'],
         'pycld3': ['pycld3'],
+        'pecos': ['libpecos==0.2.3'],
         'dev': [
             'codecov',
             'pytest-cov',
diff --git a/tests/test_backend_xtransformer.py b/tests/test_backend_xtransformer.py
new file mode 100644
index 000000000..bbd2de9cd
--- /dev/null
+++ b/tests/test_backend_xtransformer.py
@@ -0,0 +1,281 @@
+"""Unit tests for the XTransformer backend in Annif"""
+
+from scipy.sparse import load_npz, csr_matrix
+
+from os import mknod
+import os.path as osp
+import pytest
+from unittest.mock import MagicMock, patch
+import annif.backend
+import annif.corpus
+import annif.suggestion
+from annif.exception import NotInitializedException, NotSupportedException
+
+
+pytest.importorskip('annif.backend.xtransformer')
+XTransformer = annif.backend.xtransformer.XTransformer
+
+
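+# XTransformer.train is patched with a mock whose save() just writes a
+# marker file, so the tests below can check that a model directory is
+# created without running any real pecos training.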
+@pytest.fixture
+def mocked_xtransformer(datadir, project):
+    model_mock = MagicMock()
+    model_mock.save.side_effect = lambda x: mknod(osp.join(x, 'test'))
+
+    return patch.object(
+        annif.backend.xtransformer.XTransformer,
+        'train',
+        return_value=model_mock)
+
+
+def test_xtransformer_default_params(project):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project
+    )
+    expected = {
+        'min_df': 1,
+        'ngram': 1,
+        'fix_clustering': False,
+        'nr_splits': 16,
+        'min_codes': None,
+        'max_leaf_size': 100,
+        'imbalanced_ratio': 0.0,
+        'imbalanced_depth': 100,
+        'max_match_clusters': 32768,
+        'do_fine_tune': True,
+        # 'model_shortcut': 'distilbert-base-multilingual-cased',
+        'model_shortcut': 'bert-base-multilingual-uncased',
+        'beam_size': 20,
+        'limit': 100,
+        'post_processor': 'sigmoid',
+        'negative_sampling': 'tfn',
+        'ensemble_method': 'transformer-only',
+        'threshold': 0.1,
+        'loss_function': 'squared-hinge',
+        'truncate_length': 128,
+        'hidden_dropout_prob': 0.1,
+        'batch_size': 32,
+        'gradient_accumulation_steps': 1,
+        'learning_rate': 1e-4,
+        'weight_decay': 0.0,
+        'adam_epsilon': 1e-8,
+        'num_train_epochs': 1,
+        'max_steps': 0,
+        'lr_schedule': 'linear',
+        'warmup_steps': 0,
+        'logging_steps': 100,
+        'save_steps': 1000,
+        'max_active_matching_labels': None,
+        'max_num_labels_in_gpu': 65536,
+        'use_gpu': True,
+        'bootstrap_model': 'linear'
+    }
+    actual = xtransformer.params
+    assert len(actual) == len(expected)
+    for param, val in expected.items():
+        assert param in actual and actual[param] == val
+
+
+def test_xtransformer_suggest_no_vectorizer(project):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project
+    )
+    with pytest.raises(NotInitializedException):
+        xtransformer.suggest('example text')
+
+
+def test_xtransformer_create_train_files(tmpdir, project, datadir):
+    tmpfile = tmpdir.join('document.tsv')
+    tmpfile.write("nonexistent\thttp://example.com/nonexistent\n" +
+                  "arkeologia\thttp://www.yso.fi/onto/yso/p1265\n" +
+                  "...\thttp://example.com/none")
+    corpus = annif.corpus.DocumentFile(str(tmpfile))
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project)
+    input = (doc.text for doc in corpus.documents)
+    veccorpus = xtransformer.create_vectorizer(input, {})
+    xtransformer._create_train_files(veccorpus, corpus)
+    assert datadir.join('xtransformer-train-X.npz').exists()
+    assert datadir.join('xtransformer-train-y.npz').exists()
+    assert datadir.join('xtransformer-train-raw.txt').exists()
+    traindata = datadir.join('xtransformer-train-raw.txt').read().splitlines()
+    assert len(traindata) == 1
+    train_features = load_npz(str(datadir.join('xtransformer-train-X.npz')))
+    assert train_features.shape[0] == 1
+    train_labels = load_npz(str(datadir.join('xtransformer-train-y.npz')))
+    assert train_labels.shape[0] == 1
+
+
+def test_xtransformer_train(
+        datadir,
+        document_corpus,
+        project,
+        mocked_xtransformer):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project
+    )
+
+    with mocked_xtransformer as train_mock:
+        xtransformer.train(document_corpus)
+
+    train_mock.assert_called_once()
+    first_arg = train_mock.call_args.args[0]
+    kwargs = train_mock.call_args.kwargs
+    assert len(first_arg.X_text) == 6397
+    assert first_arg.X_feat.shape == (6397, 12480)
+    assert first_arg.Y.shape == (6397, 130)
+    expected_pred_params = XTransformer.PredParams.from_dict(
+        {
+            'beam_size': 20,
+            'only_topk': 100,
+            'post_processor': 'sigmoid',
+            'truncate_length': 128,
+        },
+        recursive=True).to_dict()
+
+    expected_train_params = XTransformer.TrainParams.from_dict(
+        {
+            'do_fine_tune': True,
+            'only_encoder': False,
+            'fix_clustering': False,
+            'max_match_clusters': 32768,
+            'nr_splits': 16,
+            'max_leaf_size': 100,
+            'imbalanced_ratio': 0.0,
+            'imbalanced_depth': 100,
+            'model_shortcut': 'bert-base-multilingual-uncased',
+            # 'model_shortcut': 'distilbert-base-multilingual-cased',
+            'post_processor': 'sigmoid',
+            'negative_sampling': 'tfn',
+            'ensemble_method': 'transformer-only',
+            'threshold': 0.1,
+            'loss_function': 'squared-hinge',
+            'truncate_length': 128,
+            'hidden_dropout_prob': 0.1,
+            'batch_size': 32,
+            'gradient_accumulation_steps': 1,
+            'learning_rate': 1e-4,
+            'weight_decay': 0.0,
+            'adam_epsilon': 1e-8,
+            'num_train_epochs': 1,
+            'max_steps': 0,
+            'lr_schedule': 'linear',
+            'warmup_steps': 0,
+            'logging_steps': 100,
+            'save_steps': 1000,
+            'max_active_matching_labels': None,
+            'max_num_labels_in_gpu': 65536,
+            'use_gpu': True,
+            'bootstrap_model': 'linear',
+        },
+        recursive=True).to_dict()
+
+    assert kwargs == {
+        'clustering': None,
+        'val_prob': None,
+        'steps_scale': None,
+        'label_feat': None,
+        'beam_size': 20,
+        'pred_params': expected_pred_params,
+        'train_params': expected_train_params
+    }
+    xtransformer._model.save.assert_called_once()
+    assert datadir.join('xtransformer-model').check()
+
+
+def test_xtransformer_train_cached(mocked_xtransformer, datadir, project):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project
+    )
+    xtransformer._create_train_files = MagicMock()
+    xtransformer._create_model = MagicMock()
+    with mocked_xtransformer:
+        xtransformer.train('cached')
+    xtransformer._create_train_files.assert_not_called()
+    xtransformer._create_model.assert_called_once()
+
+
+def test_xtransformer_train_no_documents(datadir, project, empty_corpus):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project
+    )
+    with pytest.raises(NotSupportedException):
+        xtransformer.train(empty_corpus)
+
+
+def test_xtransformer_suggest(project):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project
+    )
+    xtransformer._model = MagicMock()
+    xtransformer._model.predict.return_value = csr_matrix(
+        [0, 0.2, 0, 0, 0, 0.5, 0]
+    )
+    result = xtransformer.suggest("""Arkeologiaa sanotaan joskus myös
+        muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
+        tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
+        Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
+        joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
+        pohjaan.""")
+    xtransformer._model.predict.assert_called_once()
+
+    expected = [
+        annif.suggestion.SubjectSuggestion(
+            uri=project.subjects._uris[1],
+            label=project.subjects._labels[1],
+            notation=None,
+            score=0.2
+        ),
+        annif.suggestion.SubjectSuggestion(
+            uri=project.subjects._uris[5],
+            label=project.subjects._labels[5],
+            notation=None,
+            score=0.5
+        )
+    ]
+    assert result.as_list(None) == expected
+
+
+def test_xtransformer_suggest_no_input(project, datadir):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={'limit': 5},
+        project=project
+    )
+    xtransformer._model = MagicMock()
+    results = xtransformer.suggest('j')
+    assert len(results.as_list(None)) == 0
+
+
+def test_xtransformer_suggest_no_model(datadir, project):
+    backend_type = annif.backend.get_backend('xtransformer')
+    xtransformer = backend_type(
+        backend_id='xtransformer',
+        config_params={},
+        project=project
+    )
+    datadir.remove()
+    with pytest.raises(NotInitializedException):
+        xtransformer.suggest('example text')