diff --git a/CHANGELOG.md b/CHANGELOG.md index 25737c095f..ff77cb3035 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ Changes - fix RuntimeError in export_phrases (change defaultdict to dict) (PR [#3041](https://github.com/RaRe-Technologies/gensim/pull/3041), [@thalishsajeed](https://github.com/thalishsajeed)) - Record lifecycle events in Gensim models (PR [#3060](https://github.com/RaRe-Technologies/gensim/pull/3060), [@piskvorky](https://github.com/piskvorky)) +- Default to pickle protocol 4 when saving models (PR [#3065](https://github.com/RaRe-Technologies/gensim/pull/3065), [@piskvorky](https://github.com/piskvorky)) ## 4.0.0beta, 2020-10-31 diff --git a/gensim/similarities/annoy.py b/gensim/similarities/annoy.py index b237c11a99..e586b2d2e3 100644 --- a/gensim/similarities/annoy.py +++ b/gensim/similarities/annoy.py @@ -84,7 +84,7 @@ def __init__(self, model=None, num_trees=None): raise ValueError("Only a Word2Vec, Doc2Vec, FastText or KeyedVectors instance can be used") self._build_from_model(kv.get_normed_vectors(), kv.index_to_key, kv.vector_size) - def save(self, fname, protocol=2): + def save(self, fname, protocol=utils.PICKLE_PROTOCOL): """Save AnnoyIndexer instance to disk. Parameters diff --git a/gensim/similarities/nmslib.py b/gensim/similarities/nmslib.py index 620f32e519..752976862b 100644 --- a/gensim/similarities/nmslib.py +++ b/gensim/similarities/nmslib.py @@ -85,6 +85,7 @@ except ImportError: raise ImportError("NMSLIB not installed. To use the NMSLIB indexer, please run `pip install nmslib`.") +from gensim import utils from gensim.models.doc2vec import Doc2Vec from gensim.models.word2vec import Word2Vec from gensim.models.fasttext import FastText @@ -141,7 +142,7 @@ def __init__(self, model, index_params=None, query_time_params=None): else: raise ValueError("model must be a Word2Vec, Doc2Vec, FastText or KeyedVectors instance") - def save(self, fname, protocol=2): + def save(self, fname, protocol=utils.PICKLE_PROTOCOL): """Save this NmslibIndexer instance to a file. Parameters diff --git a/gensim/utils.py b/gensim/utils.py index cf5d6b6499..910dab4dff 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -39,6 +39,12 @@ logger = logging.getLogger(__name__) +# When pickling objects for persistence, use this protocol by default. +# Note that users won't be able to load models saved with high protocols on older environments that do +# not support that protocol (e.g. Python 2). +# In the rare cases where this matters, users can explicitly pass `model.save(pickle_protocol=2)`. +# See also https://github.com/RaRe-Technologies/gensim/pull/3065 +PICKLE_PROTOCOL = 4 PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) @@ -567,7 +573,10 @@ def _adapt_by_suffix(fname): compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy') return compress, lambda *args: '.'.join(args + (suffix,)) - def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): + def _smart_save( + self, fname, + separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=PICKLE_PROTOCOL, + ): """Save the object to a file. Used internally by :meth:`gensim.utils.SaveLoad.save()`. Parameters @@ -595,8 +604,9 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro """ compress, subname = SaveLoad._adapt_by_suffix(fname) - restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, - compress, subname) + restores = self._save_specials( + fname, separately, sep_limit, ignore, pickle_protocol, compress, subname, + ) try: pickle(self, fname, protocol=pickle_protocol) finally: @@ -711,7 +721,10 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, raise return restores + [(self, asides)] - def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): + def save( + self, fname_or_handle, + separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=PICKLE_PROTOCOL, + ): """Save the object to a file. Parameters @@ -1410,7 +1423,7 @@ def smart_extension(fname, ext): return fname -def pickle(obj, fname, protocol=2): +def pickle(obj, fname, protocol=PICKLE_PROTOCOL): """Pickle object `obj` to file `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc. Parameters @@ -1420,7 +1433,7 @@ def pickle(obj, fname, protocol=2): fname : str Path to pickle file. protocol : int, optional - Pickle protocol number. Default is 2 in order to support compatibility across python 2.x and 3.x. + Pickle protocol number. """ with open(fname, 'wb') as fout: # 'b' for binary, needed on Windows