"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""
from __future__ import annotations
import os.path
import tempfile
from typing import TYPE_CHECKING, Any
import gensim.similarities
from gensim.matutils import Sparse2Corpus
import annif.util
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import vector_to_suggestions
from . import backend, mixins
if TYPE_CHECKING:
from collections.abc import Iterator
from scipy.sparse._csr import csr_matrix
from annif.corpus.document import DocumentCorpus
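

# SubjectBuffer spills per-subject training text to files under a temporary
# directory once BUFFER_SIZE lines accumulate, so that building one
# pseudo-document per subject does not require holding the whole corpus
# in memory at once.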
class SubjectBuffer:
    """A file-backed buffer to store and retrieve subject text."""

    BUFFER_SIZE = 100

    def __init__(self, tempdir: str, subject_id: int) -> None:
        filename = "{:08d}.txt".format(subject_id)
        self._path = os.path.join(tempdir, filename)
        self._buffer = []
        self._created = False

    def flush(self) -> None:
        # append if the file already exists, otherwise create it
        if self._created:
            mode = "a"
        else:
            mode = "w"
        with open(self._path, mode, encoding="utf-8") as subjfile:
            for text in self._buffer:
                print(text, file=subjfile)
        self._buffer = []
        self._created = True

    def write(self, text: str) -> None:
        self._buffer.append(text)
        if len(self._buffer) >= self.BUFFER_SIZE:
            self.flush()

    def read(self) -> str:
        if not self._created:
            # file was never created - we can simply return the buffer content
            return "\n".join(self._buffer)
        else:
            with open(self._path, "r", encoding="utf-8") as subjfile:
                return subjfile.read() + "\n" + "\n".join(self._buffer)
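

# Illustrative usage sketch for SubjectBuffer (not part of the module logic):
#
#     with tempfile.TemporaryDirectory() as tmp:
#         buf = SubjectBuffer(tmp, 0)
#         buf.write("tokenized text of one document")
#         buf.write("tokenized text of another document")
#         # fewer than BUFFER_SIZE writes, so read() serves from memory
#         text = buf.read()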


class TFIDFBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """TF-IDF vector space similarity based backend for Annif"""

    name = "tfidf"

    # defaults for uninitialized instances
    _index = None

    INDEX_FILE = "tfidf-index"

    def _generate_subjects_from_documents(
        self, corpus: DocumentCorpus
    ) -> Iterator[str]:
        """Yield one pseudo-document per subject, concatenating the tokenized
        text of every training document assigned to that subject."""
        with tempfile.TemporaryDirectory() as tempdir:
            subject_buffer = {}
            for subject_id in range(len(self.project.subjects)):
                subject_buffer[subject_id] = SubjectBuffer(tempdir, subject_id)

            for doc in corpus.documents:
                tokens = self.project.analyzer.tokenize_words(doc.text)
                for subject_id in doc.subject_set:
                    subject_buffer[subject_id].write(" ".join(tokens))

            for sid in range(len(self.project.subjects)):
                yield subject_buffer[sid].read()

    def _initialize_index(self) -> None:
        if self._index is None:
            path = os.path.join(self.datadir, self.INDEX_FILE)
            self.debug("loading similarity index from {}".format(path))
            if os.path.exists(path):
                self._index = gensim.similarities.SparseMatrixSimilarity.load(path)
            else:
                raise NotInitializedException(
                    "similarity index {} not found".format(path),
                    backend_id=self.backend_id,
                )

    def initialize(self, parallel: bool = False) -> None:
        self.initialize_vectorizer()
        self._initialize_index()

    def _create_index(self, veccorpus: csr_matrix) -> None:
        self.info("creating similarity index")
        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
        self._index = gensim.similarities.SparseMatrixSimilarity(
            gscorpus, num_features=len(self.vectorizer.vocabulary_)
        )
        annif.util.atomic_save(self._index, self.datadir, self.INDEX_FILE)
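
    # The resulting index holds one TF-IDF row per subject; querying it with a
    # document vector (as in _suggest below) yields a similarity score against
    # every subject in a single operation.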

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        if corpus == "cached":
            raise NotSupportedException(
                "Training tfidf project from cached data not supported."
            )
        if corpus.is_empty():
            raise NotSupportedException("Cannot train tfidf project with no documents")
        self.info("transforming subject corpus")
        subjects = self._generate_subjects_from_documents(corpus)
        veccorpus = self.create_vectorizer(subjects)
        self._create_index(veccorpus)

    def _suggest(self, text: str, params: dict[str, Any]) -> Iterator:
        self.debug(
            'Suggesting subjects for text "{}..." (len={})'.format(text[:20], len(text))
        )
        tokens = self.project.analyzer.tokenize_words(text)
        vectors = self.vectorizer.transform([" ".join(tokens)])
        return vector_to_suggestions(self._index[vectors[0]], int(params["limit"]))
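

# End-to-end flow, summarized as a hypothetical sketch. In practice the backend
# is driven through Annif's project configuration rather than called directly;
# the private hooks are shown here only to illustrate the order of operations:
#
#     backend._train(corpus, params)  # one TF-IDF pseudo-document per subject
#     suggestions = backend._suggest("some input text", {"limit": "10"})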