Add Xtransformer to backend #798

Draft · wants to merge 21 commits into base: main

Changes from all commits
6 changes: 5 additions & 1 deletion Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.10-slim-bookworm
LABEL org.opencontainers.image.authors="grp-natlibfi-annif@helsinki.fi"
SHELL ["/bin/bash", "-c"]

-ARG optional_dependencies="voikko fasttext nn omikuji yake spacy stwfsa"
+ARG optional_dependencies="voikko fasttext nn omikuji yake spacy stwfsa pecos"
ARG POETRY_VIRTUALENVS_CREATE=false

# Install system dependencies needed at runtime:
@@ -37,6 +37,10 @@ RUN if [[ $optional_dependencies =~ "spacy" ]]; then \
python -m spacy download $model; \
done; \
fi
RUN if [[ $optional_dependencies =~ "pecos" ]]; then \
mkdir /.cache -m a=rwx; \
fi
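
(Presumably the world-writable /.cache directory is needed so that pecos, via Hugging Face transformers, can cache downloaded models when the container runs as an arbitrary non-root user.)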


# Second round of installation with the actual code:
COPY annif /Annif/annif
12 changes: 12 additions & 0 deletions annif/backend/__init__.py
@@ -89,6 +89,17 @@
    return tfidf.TFIDFBackend


def _xtransformer() -> Type[AnnifBackend]:
    try:
        from . import xtransformer

        return xtransformer.XTransformerBackend
    except ImportError:
        raise ValueError(
            "XTransformer not available, not enabling XTransformer backend"
        )


def _yake() -> Type[AnnifBackend]:
    try:
        from . import yake
@@ -111,6 +122,7 @@
"stwfsa": _stwfsa,
"svc": _svc,
"tfidf": _tfidf,
"xtransformer": _xtransformer,
"yake": _yake,
}
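
Each value in this registry is a zero-argument loader function, so a backend module is imported only when that backend is actually requested, and a missing optional dependency surfaces as the ValueError raised above. A minimal sketch of how such a registry is typically consumed (the names get_backend and _backend_fns are assumptions, not necessarily the actual Annif code):

from typing import Type


def get_backend(backend_id: str) -> Type[AnnifBackend]:
    # Look up the loader and call it; the import happens only here,
    # so optional dependencies are needed only for backends in use.
    try:
        return _backend_fns[backend_id]()
    except KeyError:
        raise ValueError("No such backend type {}".format(backend_id))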

6 changes: 1 addition & 5 deletions annif/backend/fasttext.py
@@ -124,11 +124,7 @@ def _create_model(self, params: dict[str, Any], jobs: int) -> None:
        self.info("creating fastText model")
        trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
        modelpath = os.path.join(self.datadir, self.MODEL_FILE)
-       params = {
-           param: self.FASTTEXT_PARAMS[param](val)
-           for param, val in params.items()
-           if param in self.FASTTEXT_PARAMS
-       }
+       params = annif.util.apply_param_parse_config(self.FASTTEXT_PARAMS, params)
        if jobs != 0:  # jobs set by user to non-default value
            params["thread"] = jobs
        self.debug("Model parameters: {}".format(params))
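
Both here and in the stwfsa change below, an inlined dict comprehension is replaced by the shared annif.util.apply_param_parse_config helper. Judging from the code it replaces, the helper plausibly amounts to the following sketch (the actual implementation in annif.util may differ):

def apply_param_parse_config(configs: dict, params: dict) -> dict:
    # Keep only known parameters and parse each raw value with its configured type.
    return {
        param: configs[param](val)
        for param, val in params.items()
        if param in configs
    }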
4 changes: 1 addition & 3 deletions annif/backend/omikuji.py
@@ -103,9 +103,7 @@ def _create_model(self, params: dict[str, Any], jobs: int) -> None:
        hyper_param.collapse_every_n_layers = int(params["collapse_every_n_layers"])

        self._model = omikuji.Model.train_on_data(train_path, hyper_param, jobs or None)
-       if os.path.exists(model_path):
-           shutil.rmtree(model_path)
-       self._model.save(os.path.join(self.datadir, self.MODEL_FILE))
+       annif.util.atomic_save_folder(self._model, model_path)

    def _train(
        self,
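
The atomic_save_folder helper replaces the manual remove-then-save sequence. Assuming it mirrors the existing annif.util.atomic_save but for directories, the pattern is roughly this hypothetical sketch:

import os
import shutil
import tempfile


def atomic_save_folder(obj, dirname: str) -> None:
    # Save into a temporary sibling directory first, so a crash mid-save
    # cannot leave a half-written model at the final path.
    tempdir = tempfile.mkdtemp(prefix="tmp-", dir=os.path.dirname(dirname))
    obj.save(tempdir)  # assumes the model object exposes save(path)
    if os.path.exists(dirname):
        shutil.rmtree(dirname)  # drop the previous model version
    os.rename(tempdir, dirname)  # move the finished save into place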
8 changes: 2 additions & 6 deletions annif/backend/stwfsa.py
@@ -7,7 +7,7 @@

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion
-from annif.util import atomic_save, boolean
+from annif.util import apply_param_parse_config, atomic_save, boolean

from . import backend

@@ -106,11 +106,7 @@ def _train(
        jobs: int = 0,
    ) -> None:
        X, y = self._load_data(corpus)
-       new_params = {
-           key: self.STWFSA_PARAMETERS[key](val)
-           for key, val in params.items()
-           if key in self.STWFSA_PARAMETERS
-       }
+       new_params = apply_param_parse_config(self.STWFSA_PARAMETERS, params)
        p = StwfsapyPredictor(
            graph=self.project.vocab.as_graph(),
            langs=frozenset([params["language"]]),
252 changes: 252 additions & 0 deletions annif/backend/xtransformer.py
@@ -0,0 +1,252 @@
"""Annif backend using the transformer variant of pecos."""

import logging
import os.path as osp
import sys
from typing import Any

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher, model
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

Check warning on line 13 in annif/backend/xtransformer.py

View check run for this annotation

Codecov / codecov/patch

annif/backend/xtransformer.py#L11-L13

Added lines #L11 - L13 were not covered by tests

from annif.corpus.document import DocumentCorpus
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch
from annif.util import (

Check warning on line 18 in annif/backend/xtransformer.py

View check run for this annotation

Codecov / codecov/patch

annif/backend/xtransformer.py#L15-L18

Added lines #L15 - L18 were not covered by tests
apply_param_parse_config,
atomic_save,
atomic_save_folder,
boolean,
)

from . import backend, mixins

Check warning on line 25 in annif/backend/xtransformer.py

View check run for this annotation

Codecov / codecov/patch

annif/backend/xtransformer.py#L25

Added line #L25 was not covered by tests


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""

    name = "xtransformer"
    needs_subject_index = True

    _model = None

    train_X_file = "xtransformer-train-X.npz"
    train_y_file = "xtransformer-train-y.npz"
    train_txt_file = "xtransformer-train-raw.txt"
    model_folder = "xtransformer-model"

    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,

Review comment by @katjakon (Nov 20, 2024):

Regarding my previous comments about hyperparameters: it should be fairly easy to incorporate additional hyperparameters. Adding the following lines to PARAM_CONFIG would allow us to make use of the hyperparameters Cp and Cn in the project configurations:

"Cn": float,
"Cp": float,

And similarly for the dict DEFAULT_PARAMETERS:

"Cn": 1.0,
"Cp": 1.0,

Let me know if there are any questions!

    }
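
To illustrate the suggestion above: parameter values arrive from a project configuration as strings, the PARAM_CONFIG entry supplies the parser that converts each one, and DEFAULT_PARAMETERS fills in anything unset. A hypothetical sketch of that flow:

param_config = {"Cn": float, "Cp": float}  # the suggested PARAM_CONFIG additions
defaults = {"Cn": 1.0, "Cp": 1.0}  # the suggested DEFAULT_PARAMETERS additions

raw_params = {"Cn": "0.9"}  # hypothetical value as read from projects.cfg
parsed = {key: param_config[key](val) for key, val in raw_params.items()}
print({**defaults, **parsed})  # {'Cn': 0.9, 'Cp': 1.0}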

    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "fix_clustering": False,
        "nr_splits": 16,
        "min_codes": None,
        "max_leaf_size": 100,
        "imbalanced_ratio": 0.0,
        "imbalanced_depth": 100,
        "max_match_clusters": 32768,
        "do_fine_tune": True,
        "model_shortcut": "distilbert-base-multilingual-uncased",
        "beam_size": 20,
        "limit": 100,
        "post_processor": "sigmoid",
        "negative_sampling": "tfn",
        "ensemble_method": "transformer-only",
        "threshold": 0.1,
        "loss_function": "squared-hinge",
        "truncate_length": 128,
        "hidden_dropout_prob": 0.1,
        "batch_size": 32,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-4,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 1,
        "max_steps": 0,
        "lr_schedule": "linear",
        "warmup_steps": 0,
        "logging_steps": 100,
        "save_steps": 1000,
        "max_active_matching_labels": None,
        "max_num_labels_in_gpu": 65536,
        "use_gpu": True,
        "bootstrap_model": "linear",
    }

    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel: bool = False) -> None:
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), [s for s in subject_set]),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
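
For intuition, each entry appended to ys is a one-row sparse indicator vector over the whole subject vocabulary. For example, with a six-subject vocabulary and a document indexed with subjects 2 and 5 (hypothetical numbers):

import numpy as np
import scipy.sparse as sp

subject_set = [2, 5]  # subject ids assigned to one document
n_subjects = 6  # size of the subject vocabulary

row = sp.csr_matrix(
    (
        np.ones(len(subject_set)),  # a 1.0 for each assigned subject
        (np.zeros(len(subject_set)), subject_set),  # row 0, subject columns
    ),
    shape=(1, n_subjects),
    dtype=np.float32,
).sorted_indices()

print(row.toarray())  # [[0. 0. 1. 0. 0. 1.]]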

    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, self.params)
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable progress
        matcher.LOGGER.setLevel(logging.DEBUG)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        model.LOGGER.setLevel(logging.DEBUG)
        model.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=int(params["beam_size"]),
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException("Cannot train project with no documents")
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        vector = self.vectorizer.transform(texts)
        if vector.nnz == 0:  # all-zero vectors: return an empty suggestion batch
            return SuggestionBatch.from_sequence(
                [[] for _ in texts], self.project.subjects
            )
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            texts,
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            use_gpu=True,  # note: hardcoded; the use_gpu parameter is not consulted here
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
        current_batchsize = prediction.get_shape()[0]
        batch_result = []
        for i in range(current_batchsize):
            results = []
            row = prediction.getrow(i)
            for idx, score in zip(row.indices, row.data):
                results.append(SubjectSuggestion(subject_id=idx, score=score))
            batch_result.append(results)
        return SuggestionBatch.from_sequence(batch_result, self.project.subjects)