diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 1a55ad9b5f..ff93dbfcb5 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -790,7 +790,7 @@ def load(cls, *args, **kwargs):
         except AttributeError as ae:
             logger.error(
                 "Model load error. Was model saved using code from an older Gensim Version? "
-                "Try loading older model using gensim-3.8.1, then re-saving, to restore "
+                "Try loading older model using gensim-3.8.3, then re-saving, to restore "
                 "compatibility with current code.")
             raise ae
 
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 5c07a0b540..928142580f 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -279,6 +279,7 @@
 import os
 
 import numpy as np
+import itertools as it
 from numpy import ones, vstack, float32 as REAL
 import six
 from collections.abc import Iterable
@@ -822,7 +823,6 @@ def save(self, *args, **kwargs):
             Load :class:`~gensim.models.fasttext.FastText` model.
 
         """
-        kwargs['ignore'] = kwargs.get('ignore', []) + ['buckets_word', ]
         super(FastText, self).save(*args, **kwargs)
 
     @classmethod
@@ -845,25 +845,15 @@ def load(cls, *args, **kwargs):
             Save :class:`~gensim.models.fasttext.FastText` model.
 
         """
-        model = super(FastText, cls).load(*args, rethrow=True, **kwargs)
-
-        if not hasattr(model.wv, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
-            # TODO: try trainables-location
-            model.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
-        if not hasattr(model, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
-            # TODO: try trainables-location
-            model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
-        # fixup mistakenly overdimensioned gensim-3.x lockf arrays
-        if len(model.wv.vectors_vocab_lockf.shape) > 1:
-            model.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
-        if len(model.wv.vectors_ngrams_lockf.shape) > 1:
-            model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
-        if hasattr(model, 'bucket'):
-            del model.bucket  # should only exist in one place: the wv subcomponent
-        if not hasattr(model.wv, 'buckets_word') or not model.wv.buckets_word:
-            model.wv.recalc_char_ngram_buckets()
+        return super(FastText, cls).load(*args, rethrow=True, **kwargs)
 
-        return model
+    def _load_specials(self, *args, **kwargs):
+        """Handle special requirements of `.load()` protocol, usually up-converting older versions."""
+        super(FastText, self)._load_specials(*args, **kwargs)
+        if hasattr(self, 'bucket'):
+            # should only exist in one place: the wv subcomponent
+            self.wv.bucket = self.bucket
+            del self.bucket
 
 
 class FastTextVocab(utils.SaveLoad):
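Both overrides above lean on the `gensim.utils.SaveLoad` protocol: the public `save()`/`load()` stay thin, and per-class quirks move into `_save_specials()`/`_load_specials()` hooks that the base machinery invokes during (de)serialization. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of how such hooks compose, assuming a simplified pickle-based base class; none of these names are gensim's real implementation:

```python
import pickle
from types import SimpleNamespace


class SaveLoadSketch:
    """Toy stand-in for gensim.utils.SaveLoad (illustrative, not the real code)."""

    def save(self, fname):
        state = self._save_specials(dict(self.__dict__))  # hook: prune state
        with open(fname, 'wb') as f:
            pickle.dump((type(self), state), f)

    def _save_specials(self, state):
        return state  # subclasses drop recalculable attributes here

    @classmethod
    def load(cls, fname):
        with open(fname, 'rb') as f:
            klass, state = pickle.load(f)
        obj = klass.__new__(klass)
        obj.__dict__.update(state)
        obj._load_specials()  # hook: up-convert legacy attribute layouts
        return obj

    def _load_specials(self):
        pass


class TinyFastTextLike(SaveLoadSketch):
    def __init__(self):
        self.wv = SimpleNamespace(bucket=2_000_000)

    def _load_specials(self):
        super()._load_specials()
        if hasattr(self, 'bucket'):  # older files kept `bucket` on the model
            self.wv.bucket = self.bucket
            del self.bucket
```

Because every subclass level gets its own chance to massage state, the FastText override can shrink to just the `bucket` relocation while inherited hooks handle everything else.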
+ + """ + return super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs) + + def _load_specials(self, *args, **kwargs): + """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" + super(FastTextKeyedVectors, self)._load_specials(*args, **kwargs) + if not isinstance(self, FastTextKeyedVectors): + raise TypeError("Loaded object of type %s, not expected FastTextKeyedVectors" % type(self)) + if not hasattr(self, 'compatible_hash') or self.compatible_hash is False: + raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible." + "Loading into gensim-3.8.3 & re-saving may create a compatible model.") + if not hasattr(self, 'vectors_vocab_lockf') and hasattr(self, 'vectors_vocab'): + self.vectors_vocab_lockf = ones(1, dtype=REAL) + if not hasattr(self, 'vectors_ngrams_lockf') and hasattr(self, 'vectors_ngrams'): + self.vectors_ngrams_lockf = ones(1, dtype=REAL) + # fixup mistakenly overdimensioned gensim-3.x lockf arrays + if len(self.vectors_vocab_lockf.shape) > 1: + self.vectors_vocab_lockf = ones(1, dtype=REAL) + if len(self.vectors_ngrams_lockf.shape) > 1: + self.vectors_ngrams_lockf = ones(1, dtype=REAL) + if not hasattr(self, 'buckets_word') or not self.buckets_word: + self.recalc_char_ngram_buckets() + if not hasattr(self, 'vectors') or self.vectors is None: + self.adjust_vectors() # recompose full-word vectors def __contains__(self, word): """Check if `word` or any character ngrams in `word` are present in the vocabulary. @@ -1250,14 +1275,15 @@ def save(self, *args, **kwargs): Load object. """ - # don't bother storing the cached normalized vectors - ignore_attrs = [ - 'buckets_word', - 'hash2index', - ] - kwargs['ignore'] = kwargs.get('ignore', ignore_attrs) super(FastTextKeyedVectors, self).save(*args, **kwargs) + def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): + """Arrange any special handling for the gensim.utils.SaveLoad protocol""" + # don't save properties that are merely calculated from others + ignore = set(it.chain(ignore, ('buckets_word', 'vectors'))) + return super(FastTextKeyedVectors, self)._save_specials( + fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) + def get_vector(self, word, use_norm=False): """Get `word` representations in vector space, as a 1D numpy array. @@ -1401,6 +1427,7 @@ def adjust_vectors(self): """ if self.bucket == 0: + self.vectors = self.vectors_vocab # no ngrams influence return self.vectors = self.vectors_vocab[:].copy() diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index a6523babdf..6306a038c4 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -128,7 +128,7 @@ from collections import defaultdict, namedtuple from types import GeneratorType import threading -import itertools +import itertools as it import copy from gensim.utils import keep_vocab_item, call_on_class_only, deprecated @@ -779,6 +779,7 @@ def reset_weights(self): self.syn1neg = np.zeros((len(self.wv), self.layer1_size), dtype=REAL) self.wv.vectors_lockf = np.ones(1, dtype=REAL) # 0.0 values suppress word-backprop-updates; 1.0 allows + logger.info("weights initialized") def update_weights(self): """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" @@ -1788,20 +1789,14 @@ def save(self, *args, **kwargs): Path to the file. 
""" - # don't bother storing recalculable table - kwargs['ignore'] = kwargs.get('ignore', []) + ['cum_table', ] super(Word2Vec, self).save(*args, **kwargs) - def get_latest_training_loss(self): - """Get current value of the training loss. - - Returns - ------- - float - Current training loss. - - """ - return self.running_training_loss + def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): + """Arrange any special handling for the gensim.utils.SaveLoad protocol""" + # don't save properties that are merely calculated from others + ignore = set(it.chain(ignore, ('cum_table',))) + return super(Word2Vec, self)._save_specials( + fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) @classmethod def load(cls, *args, rethrow=False, **kwargs): @@ -1828,49 +1823,65 @@ def load(cls, *args, rethrow=False, **kwargs): if not isinstance(model, Word2Vec): rethrow = True raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls))) - # for backward compatibility - if not hasattr(model, 'ns_exponent'): - model.ns_exponent = 0.75 - if model.negative and hasattr(model.wv, 'index2word'): - model.make_cum_table() # rebuild cum_table from vocabulary ## TODO: ??? - if not hasattr(model, 'corpus_count'): - model.corpus_count = None - if not hasattr(model, 'corpus_total_words'): - model.corpus_total_words = None - if not hasattr(model.wv, 'vectors_lockf') and hasattr(model.wv, 'vectors'): - model.wv.vectors_lockf = getattr(model, 'vectors_lockf', np.ones(1, dtype=REAL)) - if not hasattr(model, 'random'): - model.random = np.random.RandomState(model.seed) - if not hasattr(model, 'train_count'): - model.train_count = 0 - model.total_train_time = 0 - if not hasattr(model, 'epochs'): - model.epochs = model.iter - del model.iter - if not hasattr(model, 'max_final_vocab'): - model.max_final_vocab = None - if hasattr(model, 'vocabulary'): # re-integrate state that had been moved - for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): - setattr(model, a, getattr(model.vocabulary, a)) - del model.vocabulary - if hasattr(model, 'trainables'): # re-integrate state that had been moved - for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): - if hasattr(model.trainables, a): - setattr(model, a, getattr(model.trainables, a)) - if hasattr(model, 'syn1'): - model.syn1 = model.syn1 - del model.syn1 - del model.trainables return model except AttributeError as ae: if rethrow: raise ae logger.error( "Model load error. Was model saved using code from an older Gensim Version? 
" - "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "Try loading older model using gensim-3.8.3, then re-saving, to restore " "compatibility with current code.") raise ae + def _load_specials(self, *args, **kwargs): + """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" + super(Word2Vec, self)._load_specials(*args, **kwargs) + # for backward compatibility, add/rearrange properties from prior versions + if not hasattr(self, 'ns_exponent'): + self.ns_exponent = 0.75 + if self.negative and hasattr(self.wv, 'index_to_key'): + self.make_cum_table() # rebuild cum_table from vocabulary + if not hasattr(self, 'corpus_count'): + self.corpus_count = None + if not hasattr(self, 'corpus_total_words'): + self.corpus_total_words = None + if not hasattr(self.wv, 'vectors_lockf') and hasattr(self.wv, 'vectors'): + self.wv.vectors_lockf = getattr(self, 'vectors_lockf', np.ones(1, dtype=REAL)) + if not hasattr(self, 'random'): + # use new instance of numpy's recommended generator/algorithm + self.random = np.random.default_rng(seed=self.seed) + if not hasattr(self, 'train_count'): + self.train_count = 0 + self.total_train_time = 0 + if not hasattr(self, 'epochs'): + self.epochs = self.iter + del self.iter + if not hasattr(self, 'max_final_vocab'): + self.max_final_vocab = None + if hasattr(self, 'vocabulary'): # re-integrate state that had been moved + for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): + setattr(self, a, getattr(self.vocabulary, a)) + del self.vocabulary + if hasattr(self, 'trainables'): # re-integrate state that had been moved + for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): + if hasattr(self.trainables, a): + setattr(self, a, getattr(self.trainables, a)) + if hasattr(self, 'syn1'): + self.syn1 = self.syn1 + del self.syn1 + del self.trainables + + def get_latest_training_loss(self): + """Get current value of the training loss. + + Returns + ------- + float + Current training loss. 
+ + """ + return self.running_training_loss + class BrownCorpus(object): def __init__(self, dirname): @@ -1958,7 +1969,7 @@ def __iter__(self): # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) - for line in itertools.islice(self.source, self.limit): + for line in it.islice(self.source, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -1967,7 +1978,7 @@ def __iter__(self): except AttributeError: # If it didn't work like a file, use it as a string filename with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): + for line in it.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -2021,7 +2032,7 @@ def __iter__(self): for file_name in self.input_files: logger.info('reading file %s', file_name) with utils.open(file_name, 'rb') as fin: - for line in itertools.islice(fin, self.limit): + for line in it.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index c8c9b0582c..94885be3d7 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -18,7 +18,8 @@ from gensim.models.word2vec import LineSentence from gensim.models.fasttext import FastText as FT_gensim, FastTextKeyedVectors, _unpack from gensim.models.keyedvectors import KeyedVectors -from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences +from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences, lee_corpus_list +from gensim.test.test_word2vec import TestWord2VecModel import gensim.models._fasttext_bin from gensim.models.fasttext_inner import compute_ngrams, compute_ngrams_bytes, ft_hash_bytes @@ -43,14 +44,7 @@ FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None -class LeeCorpus(object): - def __iter__(self): - with open(datapath('lee_background.cor')) as f: - for line in f: - yield utils.simple_preprocess(line) - - -list_corpus = list(LeeCorpus()) +list_corpus = lee_corpus_list new_sentences = [ ['computer', 'artificial', 'intelligence'], @@ -1371,6 +1365,7 @@ def _read_fb(fin): class ZeroBucketTest(unittest.TestCase): + """Tests FastText with no buckets/no-ngrams (essentially FastText-as-Word2Vec""" def test_in_vocab(self): model = train_gensim(bucket=0) self.assertIsNotNone(model.wv['anarchist']) @@ -1379,6 +1374,15 @@ def test_out_of_vocab(self): model = train_gensim(bucket=0) self.assertRaises(KeyError, model.wv.word_vec, 'streamtrain') + def test_cbow_neg(self): + """See gensim.test.test_word2vec.TestWord2VecModel.test_cbow_neg""" + model = FT_gensim( + sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, + min_count=5, epochs=10, workers=2, sample=0, + max_n=0 # force no char-ngram buckets + ) + TestWord2VecModel.model_sanity(self, model) + class UnicodeVocabTest(unittest.TestCase): def test_ascii(self):