diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 1a55ad9b5f..ff93dbfcb5 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -790,7 +790,7 @@ def load(cls, *args, **kwargs):
         except AttributeError as ae:
             logger.error(
                 "Model load error. Was model saved using code from an older Gensim Version? "
-                "Try loading older model using gensim-3.8.1, then re-saving, to restore "
+                "Try loading older model using gensim-3.8.3, then re-saving, to restore "
                 "compatibility with current code.")
             raise ae
 
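The doc2vec hunk only bumps the gensim version named in the load-failure message. As a worked example of the advice that message gives, the sketch below (hypothetical file names) shows the round-trip; the load/save pair must run under gensim==3.8.3 before the result is loaded with current code.

# Hypothetical paths; run this under gensim==3.8.3, per the error message.
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load('old_doc2vec.model')   # model written by an older gensim
model.save('resaved_doc2vec.model')         # re-save to restore compatibility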
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 921878e128..928142580f 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -279,6 +279,7 @@
 import os
 
 import numpy as np
+import itertools as it
 from numpy import ones, vstack, float32 as REAL
 import six
 from collections.abc import Iterable
@@ -822,7 +823,6 @@ def save(self, *args, **kwargs):
         Load :class:`~gensim.models.fasttext.FastText` model.
 
         """
-        kwargs['ignore'] = kwargs.get('ignore', []) + ['buckets_word', ]
         super(FastText, self).save(*args, **kwargs)
 
     @classmethod
@@ -845,25 +845,15 @@ def load(cls, *args, **kwargs):
         Save :class:`~gensim.models.fasttext.FastText` model.
 
         """
-        model = super(FastText, cls).load(*args, rethrow=True, **kwargs)
-
-        if not hasattr(model.wv, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
-            # TODO: try trainables-location
-            model.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
-        if not hasattr(model, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
-            # TODO: try trainables-location
-            model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
-        # fixup mistakenly overdimensioned gensim-3.x lockf arrays
-        if len(model.wv.vectors_vocab_lockf.shape) > 1:
-            model.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
-        if len(model.wv.vectors_ngrams_lockf.shape) > 1:
-            model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
-        if hasattr(model, 'bucket'):
-            del model.bucket  # should only exist in one place: the wv subcomponent
-        if not hasattr(model.wv, 'buckets_word') or not model.wv.buckets_word:
-            model.wv.recalc_char_ngram_buckets()
+        return super(FastText, cls).load(*args, rethrow=True, **kwargs)
 
-        return model
+    def _load_specials(self, *args, **kwargs):
+        """Handle special requirements of `.load()` protocol, usually up-converting older versions."""
+        super(FastText, self)._load_specials(*args, **kwargs)
+        if hasattr(self, 'bucket'):
+            # should only exist in one place: the wv subcomponent
+            self.wv.bucket = self.bucket
+            del self.bucket
 
 
 class FastTextVocab(utils.SaveLoad):
@@ -1197,12 +1187,47 @@ def __init__(self, vector_size, min_n, max_n, bucket):
 
     @classmethod
     def load(cls, fname_or_handle, **kwargs):
-        model = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs)
-        if isinstance(model, FastTextKeyedVectors):
-            if not hasattr(model, 'compatible_hash') or model.compatible_hash is False:
-                raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible."
-                                "Loading into gensim-3.8.3 & re-saving may create a compatible model.")
-        return model
+        """Load a previously saved `FastTextKeyedVectors` model.
+
+        Parameters
+        ----------
+        fname_or_handle : str
+            Path to the saved file.
+
+        Returns
+        -------
+        :class:`~gensim.models.fasttext.FastTextKeyedVectors`
+            Loaded model.
+
+        See Also
+        --------
+        :meth:`~gensim.models.fasttext.FastTextKeyedVectors.save`
+            Save :class:`~gensim.models.fasttext.FastTextKeyedVectors` model.
+
+        """
+        return super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs)
+
+    def _load_specials(self, *args, **kwargs):
+        """Handle special requirements of `.load()` protocol, usually up-converting older versions."""
+        super(FastTextKeyedVectors, self)._load_specials(*args, **kwargs)
+        if not isinstance(self, FastTextKeyedVectors):
+            raise TypeError("Loaded object of type %s, not expected FastTextKeyedVectors" % type(self))
+        if not hasattr(self, 'compatible_hash') or self.compatible_hash is False:
+            raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible. "
+                            "Loading into gensim-3.8.3 & re-saving may create a compatible model.")
+        if not hasattr(self, 'vectors_vocab_lockf') and hasattr(self, 'vectors_vocab'):
+            self.vectors_vocab_lockf = ones(1, dtype=REAL)
+        if not hasattr(self, 'vectors_ngrams_lockf') and hasattr(self, 'vectors_ngrams'):
+            self.vectors_ngrams_lockf = ones(1, dtype=REAL)
+        # fixup mistakenly overdimensioned gensim-3.x lockf arrays
+        if len(self.vectors_vocab_lockf.shape) > 1:
+            self.vectors_vocab_lockf = ones(1, dtype=REAL)
+        if len(self.vectors_ngrams_lockf.shape) > 1:
+            self.vectors_ngrams_lockf = ones(1, dtype=REAL)
+        if not hasattr(self, 'buckets_word') or not self.buckets_word:
+            self.recalc_char_ngram_buckets()
+        if not hasattr(self, 'vectors') or self.vectors is None:
+            self.adjust_vectors()  # recompose full-word vectors
 
     def __contains__(self, word):
         """Check if `word` or any character ngrams in `word` are present in the vocabulary.
@@ -1250,14 +1275,15 @@ def save(self, *args, **kwargs):
             Load object.
 
         """
-        # don't bother storing the cached normalized vectors
-        ignore_attrs = [
-            'buckets_word',
-            'hash2index',
-        ]
-        kwargs['ignore'] = kwargs.get('ignore', ignore_attrs)
         super(FastTextKeyedVectors, self).save(*args, **kwargs)
 
+    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
+        """Arrange any special handling for the gensim.utils.SaveLoad protocol"""
+        # don't save properties that are merely calculated from others
+        ignore = set(it.chain(ignore, ('buckets_word', 'vectors')))
+        return super(FastTextKeyedVectors, self)._save_specials(
+            fname, separately, sep_limit, ignore, pickle_protocol, compress, subname)
+
     def get_vector(self, word, use_norm=False):
         """Get `word` representations in vector space, as a 1D numpy array.
 
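The fasttext.py changes move per-class load/save fixups out of the `load()`/`save()` overrides and into the `_load_specials()`/`_save_specials()` hooks of the `gensim.utils.SaveLoad` protocol, so the same fixups also run when the object is a subcomponent of another saved model (as the `wv` attribute is). A minimal sketch of that hook pattern follows, with a hypothetical `CachedThing` class; the hook signatures are the ones used in the diff.

# Sketch of the SaveLoad hook pattern; CachedThing is illustrative only.
import itertools as it
from gensim import utils

class CachedThing(utils.SaveLoad):
    def __init__(self):
        self.data = [1, 2, 3]
        self.total = sum(self.data)  # derived attribute, cheap to recompute

    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
        # mirror FastTextKeyedVectors: don't persist what can be recalculated
        ignore = set(it.chain(ignore, ('total',)))
        return super(CachedThing, self)._save_specials(
            fname, separately, sep_limit, ignore, pickle_protocol, compress, subname)

    def _load_specials(self, *args, **kwargs):
        # mirror the up-conversion hooks: recreate derived state after load
        super(CachedThing, self)._load_specials(*args, **kwargs)
        if not hasattr(self, 'total'):
            self.total = sum(self.data)

The hooks fire through the ordinary `thing.save('f')` / `CachedThing.load('f')` calls, so callers need no changes.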
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index a6523babdf..307a97537f 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -128,7 +128,7 @@
 from collections import defaultdict, namedtuple
 from types import GeneratorType
 import threading
-import itertools
+import itertools as it
 import copy
 
 from gensim.utils import keep_vocab_item, call_on_class_only, deprecated
@@ -1788,20 +1788,14 @@ def save(self, *args, **kwargs):
             Path to the file.
 
         """
-        # don't bother storing recalculable table
-        kwargs['ignore'] = kwargs.get('ignore', []) + ['cum_table', ]
         super(Word2Vec, self).save(*args, **kwargs)
 
-    def get_latest_training_loss(self):
-        """Get current value of the training loss.
-
-        Returns
-        -------
-        float
-            Current training loss.
-
-        """
-        return self.running_training_loss
+    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
+        """Arrange any special handling for the gensim.utils.SaveLoad protocol"""
+        # don't save properties that are merely calculated from others
+        ignore = set(it.chain(ignore, ('cum_table',)))
+        return super(Word2Vec, self)._save_specials(
+            fname, separately, sep_limit, ignore, pickle_protocol, compress, subname)
 
     @classmethod
     def load(cls, *args, rethrow=False, **kwargs):
@@ -1828,49 +1822,65 @@ def load(cls, *args, rethrow=False, **kwargs):
             if not isinstance(model, Word2Vec):
                 rethrow = True
                 raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls)))
-            # for backward compatibility
-            if not hasattr(model, 'ns_exponent'):
-                model.ns_exponent = 0.75
-            if model.negative and hasattr(model.wv, 'index2word'):
-                model.make_cum_table()  # rebuild cum_table from vocabulary  ## TODO: ???
-            if not hasattr(model, 'corpus_count'):
-                model.corpus_count = None
-            if not hasattr(model, 'corpus_total_words'):
-                model.corpus_total_words = None
-            if not hasattr(model.wv, 'vectors_lockf') and hasattr(model.wv, 'vectors'):
-                model.wv.vectors_lockf = getattr(model, 'vectors_lockf', np.ones(1, dtype=REAL))
-            if not hasattr(model, 'random'):
-                model.random = np.random.RandomState(model.seed)
-            if not hasattr(model, 'train_count'):
-                model.train_count = 0
-                model.total_train_time = 0
-            if not hasattr(model, 'epochs'):
-                model.epochs = model.iter
-                del model.iter
-            if not hasattr(model, 'max_final_vocab'):
-                model.max_final_vocab = None
-            if hasattr(model, 'vocabulary'):  # re-integrate state that had been moved
-                for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'):
-                    setattr(model, a, getattr(model.vocabulary, a))
-                del model.vocabulary
-            if hasattr(model, 'trainables'):  # re-integrate state that had been moved
-                for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'):
-                    if hasattr(model.trainables, a):
-                        setattr(model, a, getattr(model.trainables, a))
-                if hasattr(model, 'syn1'):
-                    model.syn1 = model.syn1
-                    del model.syn1
-                del model.trainables
             return model
         except AttributeError as ae:
             if rethrow:
                 raise ae
             logger.error(
                 "Model load error. Was model saved using code from an older Gensim Version? "
-                "Try loading older model using gensim-3.8.1, then re-saving, to restore "
+                "Try loading older model using gensim-3.8.3, then re-saving, to restore "
                 "compatibility with current code.")
             raise ae
 
+    def _load_specials(self, *args, **kwargs):
+        """Handle special requirements of `.load()` protocol, usually up-converting older versions."""
+        super(Word2Vec, self)._load_specials(*args, **kwargs)
+        # for backward compatibility, add/rearrange properties from prior versions
+        if not hasattr(self, 'ns_exponent'):
+            self.ns_exponent = 0.75
+        if self.negative and hasattr(self.wv, 'index_to_key'):
+            self.make_cum_table()  # rebuild cum_table from vocabulary
+        if not hasattr(self, 'corpus_count'):
+            self.corpus_count = None
+        if not hasattr(self, 'corpus_total_words'):
+            self.corpus_total_words = None
+        if not hasattr(self.wv, 'vectors_lockf') and hasattr(self.wv, 'vectors'):
+            self.wv.vectors_lockf = getattr(self, 'vectors_lockf', np.ones(1, dtype=REAL))
+        if not hasattr(self, 'random'):
+            # use new instance of numpy's recommended generator/algorithm
+            self.random = np.random.default_rng(seed=self.seed)
+        if not hasattr(self, 'train_count'):
+            self.train_count = 0
+            self.total_train_time = 0
+        if not hasattr(self, 'epochs'):
+            self.epochs = self.iter
+            del self.iter
+        if not hasattr(self, 'max_final_vocab'):
+            self.max_final_vocab = None
+        if hasattr(self, 'vocabulary'):  # re-integrate state that had been moved
+            for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'):
+                setattr(self, a, getattr(self.vocabulary, a))
+            del self.vocabulary
+        if hasattr(self, 'trainables'):  # re-integrate state that had been moved
+            for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'):
+                if hasattr(self.trainables, a):
+                    setattr(self, a, getattr(self.trainables, a))
+            if hasattr(self, 'syn1'):
+                self.syn1 = self.syn1
+                del self.syn1
+            del self.trainables
+
+    def get_latest_training_loss(self):
+        """Get current value of the training loss.
+
+        Returns
+        -------
+        float
+            Current training loss.
+
+        """
+        return self.running_training_loss
+
 
 class BrownCorpus(object):
     def __init__(self, dirname):
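One behavioral detail in the up-conversion above: an old model missing a `random` attribute now gets numpy's newer `Generator` (via `np.random.default_rng`) instead of the legacy `RandomState` the removed code created. Both are seedable, but they do not produce the same stream, as this quick illustration shows:

import numpy as np

legacy = np.random.RandomState(42)     # what the removed load() code attached
rng = np.random.default_rng(seed=42)   # what _load_specials now attaches
print(legacy.rand(3))                  # legacy uniform draws
print(rng.random(3))                   # Generator draws; a different sequence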
" - "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "Try loading older model using gensim-3.8.3, then re-saving, to restore " "compatibility with current code.") raise ae + def _load_specials(self, *args, **kwargs): + """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" + super(Word2Vec, self)._load_specials(*args, **kwargs) + # for backward compatibility, add/rearrange properties from prior versions + if not hasattr(self, 'ns_exponent'): + self.ns_exponent = 0.75 + if self.negative and hasattr(self.wv, 'index_to_key'): + self.make_cum_table() # rebuild cum_table from vocabulary + if not hasattr(self, 'corpus_count'): + self.corpus_count = None + if not hasattr(self, 'corpus_total_words'): + self.corpus_total_words = None + if not hasattr(self.wv, 'vectors_lockf') and hasattr(self.wv, 'vectors'): + self.wv.vectors_lockf = getattr(self, 'vectors_lockf', np.ones(1, dtype=REAL)) + if not hasattr(self, 'random'): + # use new instance of numpy's recommended generator/algorithm + self.random = np.random.default_rng(seed=self.seed) + if not hasattr(self, 'train_count'): + self.train_count = 0 + self.total_train_time = 0 + if not hasattr(self, 'epochs'): + self.epochs = self.iter + del self.iter + if not hasattr(self, 'max_final_vocab'): + self.max_final_vocab = None + if hasattr(self, 'vocabulary'): # re-integrate state that had been moved + for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): + setattr(self, a, getattr(self.vocabulary, a)) + del self.vocabulary + if hasattr(self, 'trainables'): # re-integrate state that had been moved + for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): + if hasattr(self.trainables, a): + setattr(self, a, getattr(self.trainables, a)) + if hasattr(self, 'syn1'): + self.syn1 = self.syn1 + del self.syn1 + del self.trainables + + def get_latest_training_loss(self): + """Get current value of the training loss. + + Returns + ------- + float + Current training loss. + + """ + return self.running_training_loss + class BrownCorpus(object): def __init__(self, dirname): @@ -1958,7 +1968,7 @@ def __iter__(self): # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) - for line in itertools.islice(self.source, self.limit): + for line in it.islice(self.source, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -1967,7 +1977,7 @@ def __iter__(self): except AttributeError: # If it didn't work like a file, use it as a string filename with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): + for line in it.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -2021,7 +2031,7 @@ def __iter__(self): for file_name in self.input_files: logger.info('reading file %s', file_name) with utils.open(file_name, 'rb') as fin: - for line in itertools.islice(fin, self.limit): + for line in it.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line):