Skip to content

Commit

Permalink
Restore CloudPickler-scoped globals namespace isolation for pickled f…
Browse files Browse the repository at this point in the history
…unctions (#240)
  • Loading branch information
pierreglaser authored and ogrisel committed Feb 7, 2019
1 parent 7b31ced commit 9184a67
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 210 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
- Add support for pickling interactively defined dataclasses.
([issue #245](https://github.com/cloudpipe/cloudpickle/pull/245))

- Global variables referenced by functions pickled by cloudpickle are now
unpickled in a new and isolated namespace scoped by the CloudPickler
instance. This restores the (previously untested) behavior of cloudpickle
prior to changes done in 0.5.4 for functions defined in the `__main__`
module, and 0.6.0/1 for other dynamic functions.

0.7.0
=====
Expand Down
70 changes: 25 additions & 45 deletions cloudpickle/cloudpickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,22 +79,6 @@
PY3 = True


# Container for the global namespace to ensure consistent unpickling of
# functions defined in dynamic modules (modules not registed in sys.modules).
_dynamic_modules_globals = weakref.WeakValueDictionary()


class _DynamicModuleFuncGlobals(dict):
"""Global variables referenced by a function defined in a dynamic module
To avoid leaking references we store such context in a WeakValueDictionary
instance. However instances of python builtin types such as dict cannot
be used directly as values in such a construct, hence the need for a
derived class.
"""
pass


def _make_cell_set_template_code():
"""Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF
Expand Down Expand Up @@ -670,17 +654,17 @@ def extract_func_data(self, func):
# save the dict
dct = func.__dict__

base_globals = self.globals_ref.get(id(func.__globals__), None)
if base_globals is None:
# For functions defined in a well behaved module use
# vars(func.__module__) for base_globals. This is necessary to
# share the global variables across multiple pickled functions from
# this module.
if func.__module__ is not None:
base_globals = func.__module__
else:
base_globals = {}
self.globals_ref[id(func.__globals__)] = base_globals
# base_globals represents the future global namespace of func at
# unpickling time. Looking it up and storing it in globals_ref allow
# functions sharing the same globals at pickling time to also
# share them once unpickled, at one condition: since globals_ref is
# an attribute of a Cloudpickler instance, and that a new CloudPickler is
# created each time pickle.dump or pickle.dumps is called, functions
# also need to be saved within the same invokation of
# cloudpickle.dump/cloudpickle.dumps (for example: cloudpickle.dumps([f1, f2])). There
# is no such limitation when using Cloudpickler.dump, as long as the
# multiple invokations are bound to the same Cloudpickler.
base_globals = self.globals_ref.setdefault(id(func.__globals__), {})

return (code, f_globals, defaults, closure, dct, base_globals)

Expand Down Expand Up @@ -1096,10 +1080,16 @@ def _fill_function(*args):
else:
raise ValueError('Unexpected _fill_value arguments: %r' % (args,))

# Only set global variables that do not exist.
for k, v in state['globals'].items():
if k not in func.__globals__:
func.__globals__[k] = v
# - At pickling time, any dynamic global variable used by func is
# serialized by value (in state['globals']).
# - At unpickling time, func's __globals__ attribute is initialized by
# first retrieving an empty isolated namespace that will be shared
# with other functions pickled from the same original module
# by the same CloudPickler instance and then updated with the
# content of state['globals'] to populate the shared isolated
# namespace with all the global variables that are specifically
# referenced for this function.
func.__globals__.update(state['globals'])

func.__defaults__ = state['defaults']
func.__dict__ = state['dict']
Expand Down Expand Up @@ -1137,21 +1127,11 @@ def _make_skel_func(code, cell_count, base_globals=None):
code and the correct number of cells in func_closure. All other
func attributes (e.g. func_globals) are empty.
"""
if base_globals is None:
# This is backward-compatibility code: for cloudpickle versions between
# 0.5.4 and 0.7, base_globals could be a string or None. base_globals
# should now always be a dictionary.
if base_globals is None or isinstance(base_globals, str):
base_globals = {}
elif isinstance(base_globals, str):
base_globals_name = base_globals
try:
# First try to reuse the globals from the module containing the
# function. If it is not possible to retrieve it, fallback to an
# empty dictionary.
base_globals = vars(importlib.import_module(base_globals))
except ImportError:
base_globals = _dynamic_modules_globals.get(
base_globals_name, None)
if base_globals is None:
base_globals = _DynamicModuleFuncGlobals()
_dynamic_modules_globals[base_globals_name] = base_globals

base_globals['__builtins__'] = __builtins__

Expand Down
228 changes: 63 additions & 165 deletions tests/cloudpickle_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,57 +451,6 @@ def method(self, x):
mod1, mod2 = pickle_depickle([mod, mod])
self.assertEqual(id(mod1), id(mod2))

def test_dynamic_modules_globals(self):
# _dynamic_modules_globals is a WeakValueDictionary, so if a value
# in this dict (containing a set of global variables from a dynamic
# module created in the parent process) has no other reference than in
# this dict in the child process, it will be garbage collected.

# We first create a module
mod = types.ModuleType('mod')
code = '''
x = 1
def func():
return
'''
exec(textwrap.dedent(code), mod.__dict__)

pickled_module_path = os.path.join(self.tmpdir, 'mod_f.pkl')
child_process_script = '''
import pickle
from cloudpickle.cloudpickle import _dynamic_modules_globals
import gc
with open("{pickled_module_path}", 'rb') as f:
func = pickle.load(f)
# A dictionnary storing the globals of the newly unpickled function
# should have been created
assert list(_dynamic_modules_globals.keys()) == ['mod']
# func.__globals__ is the only non-weak reference to
# _dynamic_modules_globals['mod']. By deleting func, we delete also
# _dynamic_modules_globals['mod']
del func
gc.collect()
# There is no reference to the globals of func since func has been
# deleted and _dynamic_modules_globals is a WeakValueDictionary,
# so _dynamic_modules_globals should now be empty
assert list(_dynamic_modules_globals.keys()) == []
'''

child_process_script = child_process_script.format(
pickled_module_path=_escape(pickled_module_path))

try:
with open(pickled_module_path, 'wb') as f:
cloudpickle.dump(mod.func, f, protocol=self.protocol)

assert_run_python_script(textwrap.dedent(child_process_script))

finally:
os.unlink(pickled_module_path)

def test_module_locals_behavior(self):
# Makes sure that a local function defined in another module is
# correctly serialized. This notably checks that the globals are
Expand Down Expand Up @@ -1083,20 +1032,42 @@ def f0():
def f1():
return VARIABLE
cloned_f0 = {clone_func}(f0, protocol={protocol})
cloned_f1 = {clone_func}(f1, protocol={protocol})
assert f0.__globals__ is f1.__globals__
# pickle f0 and f1 inside the same pickle_string
cloned_f0, cloned_f1 = {clone_func}([f0, f1], protocol={protocol})
# cloned_f0 and cloned_f1 now share a global namespace that is isolated
# from any previously existing namespace
assert cloned_f0.__globals__ is cloned_f1.__globals__
assert cloned_f0.__globals__ is not f0.__globals__
# pickle f1 another time, but in a new pickle string
pickled_f1 = dumps(f1, protocol={protocol})
# Change the value of the global variable
# Change the value of the global variable in f0's new global namespace
cloned_f0()
# Ensure that the global variable is the same for another function
result_f1 = cloned_f1()
assert result_f1 == "changed_by_f0", result_f1
# Ensure that unpickling the global variable does not change its value
result_pickled_f1 = loads(pickled_f1)()
assert result_pickled_f1 == "changed_by_f0", result_pickled_f1
# thanks to cloudpickle isolation, depickling and calling f0 and f1
# should not affect the globals of already existing modules
assert VARIABLE == "default_value", VARIABLE
# Ensure that cloned_f1 and cloned_f0 share the same globals, as f1 and
# f0 shared the same globals at pickling time, and cloned_f1 was
# depickled from the same pickle string as cloned_f0
shared_global_var = cloned_f1()
assert shared_global_var == "changed_by_f0", shared_global_var
# f1 is unpickled another time, but because it comes from another
# pickle string than pickled_f1 and pickled_f0, it will not share the
# same globals as the latter two.
new_cloned_f1 = loads(pickled_f1)
assert new_cloned_f1.__globals__ is not cloned_f1.__globals__
assert new_cloned_f1.__globals__ is not f1.__globals__
# get the value of new_cloned_f1's VARIABLE
new_global_var = new_cloned_f1()
assert new_global_var == "default_value", new_global_var
"""
for clone_func in ['local_clone', 'subprocess_pickle_echo']:
code = code_template.format(protocol=self.protocol,
Expand All @@ -1115,117 +1086,44 @@ def f0():
def f1():
return _TEST_GLOBAL_VARIABLE

cloned_f0 = cloudpickle.loads(cloudpickle.dumps(
f0, protocol=self.protocol))
cloned_f1 = cloudpickle.loads(cloudpickle.dumps(
f1, protocol=self.protocol))
# pickle f0 and f1 inside the same pickle_string
cloned_f0, cloned_f1 = pickle_depickle([f0, f1],
protocol=self.protocol)

# cloned_f0 and cloned_f1 now share a global namespace that is
# isolated from any previously existing namespace
assert cloned_f0.__globals__ is cloned_f1.__globals__
assert cloned_f0.__globals__ is not f0.__globals__

# pickle f1 another time, but in a new pickle string
pickled_f1 = cloudpickle.dumps(f1, protocol=self.protocol)

# Change the value of the global variable
# Change the global variable's value in f0's new global namespace
cloned_f0()
assert _TEST_GLOBAL_VARIABLE == "changed_by_f0"

# Ensure that the global variable is the same for another function
result_cloned_f1 = cloned_f1()
assert result_cloned_f1 == "changed_by_f0", result_cloned_f1
assert f1() == result_cloned_f1

# Ensure that unpickling the global variable does not change its
# value
result_pickled_f1 = cloudpickle.loads(pickled_f1)()
assert result_pickled_f1 == "changed_by_f0", result_pickled_f1
# depickling f0 and f1 should not affect the globals of already
# existing modules
assert _TEST_GLOBAL_VARIABLE == "default_value"

# Ensure that cloned_f1 and cloned_f0 share the same globals, as f1
# and f0 shared the same globals at pickling time, and cloned_f1
# was depickled from the same pickle string as cloned_f0
shared_global_var = cloned_f1()
assert shared_global_var == "changed_by_f0", shared_global_var

# f1 is unpickled another time, but because it comes from another
# pickle string than pickled_f1 and pickled_f0, it will not share
# the same globals as the latter two.
new_cloned_f1 = pickle.loads(pickled_f1)
assert new_cloned_f1.__globals__ is not cloned_f1.__globals__
assert new_cloned_f1.__globals__ is not f1.__globals__

# get the value of new_cloned_f1's VARIABLE
new_global_var = new_cloned_f1()
assert new_global_var == "default_value", new_global_var
finally:
_TEST_GLOBAL_VARIABLE = orig_value

def test_function_from_dynamic_module_with_globals_modifications(self):
# This test verifies that the global variable state of a function
# defined in a dynamic module in a child process are not reset by
# subsequent uplickling.

# first, we create a dynamic module in the parent process
mod = types.ModuleType('mod')
code = '''
GLOBAL_STATE = "initial value"
def func_defined_in_dynamic_module(v=None):
global GLOBAL_STATE
if v is not None:
GLOBAL_STATE = v
return GLOBAL_STATE
'''
exec(textwrap.dedent(code), mod.__dict__)

with_initial_globals_file = os.path.join(
self.tmpdir, 'function_with_initial_globals.pkl')
with_modified_globals_file = os.path.join(
self.tmpdir, 'function_with_modified_globals.pkl')

try:
# Simple sanity check on the function's output
assert mod.func_defined_in_dynamic_module() == "initial value"

# The function of mod is pickled two times, with two different
# values for the global variable GLOBAL_STATE.
# Then we launch a child process that sequentially unpickles the
# two functions. Those unpickle functions should share the same
# global variables in the child process:
# Once the first function gets unpickled, mod is created and
# tracked in the child environment. This is state is preserved
# when unpickling the second function whatever the global variable
# GLOBAL_STATE's value at the time of pickling.

with open(with_initial_globals_file, 'wb') as f:
cloudpickle.dump(mod.func_defined_in_dynamic_module, f)

# Change the mod's global variable
mod.GLOBAL_STATE = 'changed value'

# At this point, mod.func_defined_in_dynamic_module()
# returns the updated value. Let's pickle it again.
assert mod.func_defined_in_dynamic_module() == 'changed value'
with open(with_modified_globals_file, 'wb') as f:
cloudpickle.dump(mod.func_defined_in_dynamic_module, f,
protocol=self.protocol)

child_process_code = """
import pickle
with open({with_initial_globals_file!r},'rb') as f:
func_with_initial_globals = pickle.load(f)
# At this point, a module called 'mod' should exist in
# _dynamic_modules_globals. Further function loading
# will use the globals living in mod.
assert func_with_initial_globals() == 'initial value'
# Load a function with initial global variable that was
# pickled after a change in the global variable
with open({with_modified_globals_file!r},'rb') as f:
func_with_modified_globals = pickle.load(f)
# assert the this unpickling did not modify the value of
# the local
assert func_with_modified_globals() == 'initial value'
# Update the value from the child process and check that
# unpickling again does not reset our change.
assert func_with_initial_globals('new value') == 'new value'
assert func_with_modified_globals() == 'new value'
with open({with_initial_globals_file!r},'rb') as f:
func_with_initial_globals = pickle.load(f)
assert func_with_initial_globals() == 'new value'
assert func_with_modified_globals() == 'new value'
""".format(
with_initial_globals_file=_escape(with_initial_globals_file),
with_modified_globals_file=_escape(with_modified_globals_file))
assert_run_python_script(textwrap.dedent(child_process_code))

finally:
os.unlink(with_initial_globals_file)
os.unlink(with_modified_globals_file)

@pytest.mark.skipif(sys.version_info >= (3, 0),
reason="hardcoded pickle bytes for 2.7")
def test_function_pickle_compat_0_4_0(self):
Expand Down

0 comments on commit 9184a67

Please sign in to comment.