-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add gensim.models.BaseKeyedVectors.add_entity
method for filling KeyedVectors
in a manual way. Fix #1942
#1957
Add gensim.models.BaseKeyedVectors.add_entity
method for filling KeyedVectors
in a manual way. Fix #1942
#1957
Changes from 10 commits
99bcf44
06955c4
089d346
f428571
0aff584
f6e5e79
d4b0ffe
912d462
3611320
437a142
737cd36
070fbed
2294c07
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -154,6 +154,66 @@ def get_vector(self, entity): | |
else: | ||
raise KeyError("'%s' not in vocabulary" % entity) | ||
|
||
def add(self, entities, weights, replace=False): | ||
"""Add entities and their vectors in a manual way. | ||
If some entity is already in the vocabulary, the old vector is kept unless the `replace` flag is True. | ||
|
||
Parameters | ||
---------- | ||
entities : {str, list of str} | ||
Entities specified by string tags. | ||
weights: {list of numpy.ndarray, numpy.ndarray} | ||
List of 1D np.array vectors or 2D np.array of vectors. | ||
replace: bool, optional | ||
Flag indicating whether to replace vectors for entities which are already in the vocabulary, | ||
if True - replace vectors, otherwise - keep old vectors. | ||
""" | ||
if isinstance(entities, string_types): | ||
entities = [entities] | ||
weights = weights.reshape(1, -1) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. probably, should be |
||
elif isinstance(weights, list): | ||
weights = np.array(weights) | ||
|
||
in_vocab_mask = np.zeros(len(entities), dtype=np.bool) | ||
for idx, entity in enumerate(entities): | ||
if entity in self.vocab: | ||
in_vocab_mask[idx] = True | ||
|
||
# add new entities to the vocab | ||
for idx in np.nonzero(~in_vocab_mask)[0]: | ||
entity = entities[idx] | ||
self.vocab[entity] = Vocab(index=len(self.vocab), count=1) | ||
self.index2entity.append(entity) | ||
|
||
# add vectors for new entities | ||
if len(self.vectors) == 0: | ||
self.vectors = weights[~in_vocab_mask] | ||
else: | ||
self.vectors = vstack((self.vectors, weights[~in_vocab_mask])) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Might this line work even in the case where There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it's not obvious how to do that, because when empty There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it possible for an empty |
||
|
||
# change vectors for in_vocab entities if `replace` flag is specified | ||
if replace: | ||
in_vocab_idxs = [self.vocab[entities[idx]].index for idx in np.nonzero(in_vocab_mask)[0]] | ||
self.vectors[in_vocab_idxs] = weights[in_vocab_mask] | ||
|
||
def __setitem__(self, entities, weights): | ||
"""Add entities and their vectors in a manual way. | ||
If some entity is already in the vocabulary, the old vector is replaced with the new one. | ||
This method is an alias for `add` with `replace=True`. | ||
|
||
Parameters | ||
---------- | ||
entities : {str, list of str} | ||
Entities specified by string tags. | ||
weights: {list of numpy.ndarray, numpy.ndarray} | ||
List of 1D np.array vectors or 2D np.array of vectors. | ||
""" | ||
if not isinstance(entities, list): | ||
entities = [entities] | ||
weights = weights.reshape(1, -1) | ||
|
||
self.add(entities, weights, replace=True) | ||
|
||
def __getitem__(self, entities): | ||
""" | ||
Accept a single entity (string tag) or list of entities as input. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nitpick: a multiline docstring should end with an empty line, i.e.